Trying to build model where user will provide input of factors(presented as #hash code for now) based on input predict product price with lags.
Example: Heavy rain during summer will impact price of Mango. Rain accurse today and price increase/decrease after 1-2 week. in my case monthly.
Suppose this is the data set.
ORG_data = {"Month": range(1, 37),"Year": [2021]*12 + [2022]*12 + [2023]*12,"Product": ["Product_A"]*36,"Price": [5.259233, 6.053424, 4.605903, 11.394058, 14, 10, 9, 12, 8, 7, 5, 4, 6, 4, 18, 6, 5, 5, 19, 8, 5, 4, 3, 7, 4, 5, 6, 7, 7, 7, 6, 5, 6, 6, 7, 6],"d21d-4e17-bbee": [58, 20, 58, 54, 8, 57, 13, 19, 29, 25, 39, 1, 33, 36, 26, 39, 11, 44, 2, 57, 50, 39, 42, 20, 29, 12, 3, 48, 30, 45, 38, 55, 27, 6, 13, 8],"b61e-4d06-b404": [79, 123, 59, 50, 94, 148, 84, 99, 78, 120, 74, 79, 104, 122, 119, 61, 98, 118, 147, 68, 105, 98, 132, 119, 140, 103, 142, 147, 122, 115, 77, 66, 129, 128, 97, 107],"f756-4415-821a": [108, 105, 127, 127, 112, 107, 107, 104, 127, 100, 126, 124, 100, 121, 122, 124, 124, 127, 119, 100, 103, 111, 100, 104, 125, 116, 121, 120, 130, 119, 118, 114, 118, 102, 124, 116],"df5b-4f06-a9b0": [81, 84, 85, 82, 77, 85, 85, 71, 80, 85, 73, 83, 82, 85, 84, 71, 85, 82, 79, 84, 76, 80, 70, 74, 79,76, 81, 74, 84, 77, 74, 73, 72, 83, 74, 78]"1367-4eaf-b6d0": [84, 99, 78, 120, 74, 85, 82, 77, 85, 85, 71, 80, 61, 98, 118, 147, 68, 105, 98, 80, 70, 74, 79, 76, 81, 74, 84, 77, 74, 73, 72, 83, 74, 78, 88, 77]}Here is the code of for generate Lag for now consider Lag=2.
ORG_data = ORG_data.drop(['Product','Year','Month'], axis=1)def generate_lagged_features(df, lag=2): lagged_data = Final_DF_W_P.copy() for i in range(1, lag+1): for col in df.iloc[:,1:]: lagged_data[f'{col}_lag_{i}'] = lagged_data[col].shift(i) return lagged_data.dropna()And here is the code for predictive model
def predict_price_with_lags(data, factors_input, lag=2): if not isinstance(data, pd.DataFrame): # try converting the input to a Dataframe try: data = pd.DataFrame(data) except: raise ValueError('Input Data must be a DataFrame or convertible to a DataFrame') if isinstance(factors_input, dict): factors_input = pd.Series(factors_input) factors = data.columns[1:] factors_to_use = factors_input[factors_input.index.isin(factors)] print(factors_to_use) data_filtered = data[['Price'] + list(factors_to_use)] # Create lagged features for price data_with_lags = generate_lagged_features(data_filtered, lag=lag) # Separate the data X = data_with_lags.drop(columns='Price') y = data_with_lags['Price'] # Initialize and fit the SARIMAX model model = SARIMAX(y, exog=X, order=(1, 1, 1)) results = model.fit() # Prepare exogenous variables for prediction df1 = pd.DataFrame({"Cols": factors_to_use}) exog_pred = df1.T exog_pred.to_csv('abcd.csv') #exog_pred = factors_to_use.to_frame().T print(exog_pred) exog_pred.columns = [f'{col}_lag_{i+1}' for i in range(lag) for col in exog_pred.columns] # Predictions prediction_data = pd.DataFrame(columns=X.columns) prediction_data = prediction_data.append(exog_pred, ignore_index=True) predicted_price = results.predict(start=0, end=0, exog=prediction_data) return predicted_price.values[0]Input Data
user_input_factors = {'b61e-4d06-b404': 30,'d21d-4e17-bbee': 110,'f756-4415-821a': 87,'df5b-4f06-a9b0': 80,'1367-4eaf-b6d0': 138}Call the model function
predicted_price = predict_price_with_lags(ORG_data, user_input_factors, lag=2)Error Getting:KeyError: '[30,110,87,80,138] not in index'