
Pandas Apply running slow


I am computing a weighted average after outlier removal in pandas. When I scale up to around 1M rows, most of the time is spent in the filter_and_compute method. Why is that, and what can be done better? I am only showing a dummy example below, so the data is a lot smaller and the cost may not be noticeable. Is there a better approach to this?

import pandas as pd
import numpy as np

data_raw = {'date': pd.date_range(start='2024-01-01', end='2024-01-10')}
df_raw = pd.DataFrame(data_raw)

data_past = {'date': pd.date_range(start='2023-12-15', end='2023-12-31'),
             'bid': [10, 20, 30, 40, 50, 60, 270, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170],
             'costprice': [20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 430, 140, 150, 160, 170, 180],
             'sellprice': [30, 450, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190]}
df_past = pd.DataFrame(data_past)

# Build one bid/costprice/sellprice column per lag by merging against the past data
lags = [10, 11, 12, 13, 14, 15]
for lag in lags:
    df_raw['date_lag_%s' % lag] = df_raw['date'] - pd.Timedelta(days=lag)
    df_raw = df_raw.merge(df_past, left_on='date_lag_%s' % lag, right_on='date', how='left')
    df_raw.drop('date_y', axis=1, inplace=True)
    df_raw.rename(columns={'bid': 'bid_lag_%s' % lag,
                           'costprice': 'costprice_lag_%s' % lag,
                           'sellprice': 'sellprice_lag_%s' % lag,
                           'date_x': 'date'}, inplace=True)

def filter_and_compute(row, pre_lag, post_lag, prefactor=0.6, postfactor=0.4):
    values_pre_bid, values_pre_cost, values_pre_sell = [], [], []
    median_pre_bid = np.median([row[f'bid_lag_{lag}'] for lag in pre_lag])
    median_pre_cost = np.median([row[f'costprice_lag_{lag}'] for lag in pre_lag])
    median_pre_sell = np.median([row[f'sellprice_lag_{lag}'] for lag in pre_lag])
    for lag in pre_lag:
        bid_col = f'bid_lag_{lag}'
        cost_col = f'costprice_lag_{lag}'
        sell_col = f'sellprice_lag_{lag}'
        # Drop a lag if any of its three metrics exceeds twice the median
        if (row[bid_col] > 2*median_pre_bid) or (row[cost_col] > 2*median_pre_cost) or (row[sell_col] > 2*median_pre_sell):
            continue
        values_pre_bid.append(row[bid_col])
        values_pre_cost.append(row[cost_col])
        values_pre_sell.append(row[sell_col])
    if values_pre_bid:
        agg_pre_bid, agg_pre_cost, agg_pre_sell = np.mean(values_pre_bid), np.mean(values_pre_cost), np.mean(values_pre_sell)
    else:
        # Fall back to the median when every lag was filtered out
        agg_pre_bid, agg_pre_cost, agg_pre_sell = median_pre_bid, median_pre_cost, median_pre_sell

    values_post_bid, values_post_cost, values_post_sell = [], [], []
    median_post_bid = np.median([row[f'bid_lag_{lag}'] for lag in post_lag])
    median_post_cost = np.median([row[f'costprice_lag_{lag}'] for lag in post_lag])
    median_post_sell = np.median([row[f'sellprice_lag_{lag}'] for lag in post_lag])
    for lag in post_lag:
        bid_col = f'bid_lag_{lag}'
        cost_col = f'costprice_lag_{lag}'
        sell_col = f'sellprice_lag_{lag}'
        if (row[bid_col] > 2*median_post_bid) or (row[cost_col] > 2*median_post_cost) or (row[sell_col] > 2*median_post_sell):
            continue
        values_post_bid.append(row[bid_col])
        values_post_cost.append(row[cost_col])
        values_post_sell.append(row[sell_col])
    if values_post_bid:
        agg_post_bid, agg_post_cost, agg_post_sell = np.mean(values_post_bid), np.mean(values_post_cost), np.mean(values_post_sell)
    else:
        agg_post_bid, agg_post_cost, agg_post_sell = median_post_bid, median_post_cost, median_post_sell

    return prefactor*agg_pre_bid + postfactor*agg_post_bid, prefactor*agg_pre_cost + postfactor*agg_post_cost

df_raw[['agg_bid', 'cost']] = df_raw.apply(
    lambda row: filter_and_compute(row, pre_lag=[10, 11, 12], post_lag=[13, 14, 15]),
    axis=1, result_type='expand')
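The slowness comes from apply(..., axis=1): it calls a Python function once per row and does label-based Series indexing inside, which is far slower than column-wise NumPy operations. One way this could be rewritten is to stack each metric's lag columns into an (n_rows, n_lags) array and compute the medians, outlier masks, and means for all rows at once. Below is a minimal sketch of that idea, assuming the lag columns built above and no missing merges (as in the toy data); vectorized_compute and filtered_mean are names introduced here for illustration, not part of the original code.

import numpy as np

def vectorized_compute(df, pre_lag, post_lag, prefactor=0.6, postfactor=0.4):
    def filtered_mean(lags):
        # Stack each metric's lag columns into an (n_rows, n_lags) array
        bid = df[[f'bid_lag_{lag}' for lag in lags]].to_numpy(dtype=float)
        cost = df[[f'costprice_lag_{lag}' for lag in lags]].to_numpy(dtype=float)
        sell = df[[f'sellprice_lag_{lag}' for lag in lags]].to_numpy(dtype=float)
        med_bid = np.median(bid, axis=1, keepdims=True)
        med_cost = np.median(cost, axis=1, keepdims=True)
        med_sell = np.median(sell, axis=1, keepdims=True)
        # Keep a lag only if none of its three metrics exceeds twice the row median
        keep = (bid <= 2*med_bid) & (cost <= 2*med_cost) & (sell <= 2*med_sell)
        counts = keep.sum(axis=1)
        safe = np.maximum(counts, 1)  # avoid division by zero when every lag is filtered
        agg_bid = np.where(keep, bid, 0).sum(axis=1) / safe
        agg_cost = np.where(keep, cost, 0).sum(axis=1) / safe
        # Fall back to the median where everything was filtered out
        agg_bid = np.where(counts > 0, agg_bid, med_bid[:, 0])
        agg_cost = np.where(counts > 0, agg_cost, med_cost[:, 0])
        return agg_bid, agg_cost

    pre_bid, pre_cost = filtered_mean(pre_lag)
    post_bid, post_cost = filtered_mean(post_lag)
    df['agg_bid'] = prefactor*pre_bid + postfactor*post_bid
    df['cost'] = prefactor*pre_cost + postfactor*post_cost
    return df

df_raw = vectorized_compute(df_raw, pre_lag=[10, 11, 12], post_lag=[13, 14, 15])

Since every operation here is a whole-array NumPy call rather than a per-row Python call, the cost grows with a handful of vectorized passes over the data instead of 1M function invocations, which is typically orders of magnitude faster at that scale.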


