I am calculating weighted average post outlier removal in pandas.When I am going to like 1M rows, I am seeing significant time taken in filter_and_compute method. Why is that? What can be done better? I am only showing a dummy example below so the size is alot smaller and compute may not be noticeable. Could there be better approach to this?
import pandas as pdimport numpy as npdata_raw = {'date': pd.date_range(start='2024-01-01', end='2024-01-10')}df_raw = pd.DataFrame(data_raw)data_past = {'date': pd.date_range(start='2023-12-15', end='2023-12-31'),'bid': [10, 20, 30, 40, 50, 60, 270, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170],'costprice': [20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 430, 140, 150, 160, 170, 180],'sellprice': [30, 450, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190]}df_past = pd.DataFrame(data_past)lags = [10, 11, 12, 13, 14, 15]for lag in lags: df_raw['date_lag_%s'%(lag)] = df_raw['date'] - pd.Timedelta(days=lag) df_raw = df_raw.merge(df_past, left_on=['date_lag_%s'%(lag)], right_on=['date'], how='left') df_raw.drop('date_y', axis=1, inplace=True) df_raw.rename(columns={'bid': 'bid_lag_%s'%(lag), 'costprice': 'costprice_lag_%s'%(lag), 'date_x':'date', 'sellprice': 'sellprice_lag_%s'%(lag)}, inplace=True)def filter_and_compute(row, pre_lag, post_lag, prefactor=0.6, postfactor=0.4): values_pre_bid, values_pre_cost, values_pre_sell = [], [], [] median_pre_bid = np.median([row[f'bid_lag_{lag}'] for lag in pre_lag]) median_pre_cost = np.median([row[f'costprice_lag_{lag}'] for lag in pre_lag]) median_pre_sell = np.median([row[f'sellprice_lag_{lag}'] for lag in pre_lag]) for lag in pre_lag: bid_col = f'bid_lag_{lag}' cost_col = f'costprice_lag_{lag}' sell_col = f'sellprice_lag_{lag}' if (row[bid_col] > 2*median_pre_bid) or (row[cost_col] > 2*median_pre_cost) or (row[sell_col] > 2*median_pre_sell): continue values_pre_bid.append(row[bid_col]) values_pre_cost.append(row[cost_col]) values_pre_sell.append(row[sell_col]) if values_pre_bid: agg_pre_bid, agg_pre_cost, agg_pre_sell = np.mean(values_pre_bid), np.mean(values_pre_cost), np.mean(values_pre_sell) else: agg_pre_bid, agg_pre_cost, agg_pre_sell = median_pre_bid, median_pre_cost, median_pre_sell values_post_bid, values_post_cost, values_post_sell = [], [], [] median_post_bid = np.median([row[f'bid_lag_{lag}'] for lag in post_lag]) median_post_cost = np.median([row[f'costprice_lag_{lag}'] for lag in post_lag]) median_post_sell = np.median([row[f'sellprice_lag_{lag}'] for lag in post_lag]) for lag in post_lag: bid_col = f'bid_lag_{lag}' cost_col = f'costprice_lag_{lag}' sell_col = f'sellprice_lag_{lag}' if (row[bid_col] > 2*median_post_bid) or (row[cost_col] > 2*median_post_cost) or (row[sell_col] > 2*median_post_sell): continue values_post_bid.append(row[bid_col]) values_post_cost.append(row[cost_col]) values_post_sell.append(row[sell_col]) if values_post_bid: agg_post_bid, agg_post_cost, agg_post_sell = np.mean(values_post_bid), np.mean(values_post_cost), np.mean(values_post_sell) else: agg_post_bid, agg_post_cost, agg_post_sell = median_pre_bid, median_pre_cost, median_pre_sell return prefactor*agg_pre_bid+postfactor*agg_post_bid, prefactor*agg_pre_cost+postfactor*agg_post_costdf_raw['agg_bid'], df_raw['cost'] = df_raw.apply(lambda row: filter_and_compute(row, pre_lag=[10, 11, 12], post_lag = [13, 14, 15]), axis=1).apply(pd.Series)