When running the code below, the product demand is being predicted. I am using a dataset from Kaggle:
https://www.kaggle.com/datasets/vikramxd/amazon-business-research-analyst-dataset/data
from HeterogeneousGraph import heterogeneous_graphimport numpy as npimport randomimport pandas as pdimport matplotlib.pyplot as pltfrom tensorflow.keras.models import Sequentialfrom tensorflow.keras.layers import LSTM, Densefrom tensorflow.keras.layers import Dropoutfrom sklearn.model_selection import train_test_splitfrom keras.utils import to_categoricalhetero = heterogeneous_graph()hetero.create_mdigraph_from_df()class lstm:def __init__(self, graph=hetero): self.graph = graph self.df = pd.read_csv('mydataset.csv') self.orig_df = pd.read_csv('mydataset.csv') self.lags = [1, 7, 30] self.sales = [] self.datetimes = [] self.festivals = [] self.weathers = [] self.lstm_model, self.lstm_history = None,None self.x_train, self.x_valid, self.y_train, self.y_valid = None,None,None,None self.preprocess_data() # self.show_graph() self.create_model() self.train_model() self.evaluate_model() self.make_predictions()def preprocess_data(self): times = [] for time_str in self.df["Time_Orderd"]: if type(time_str)==float or '.' in time_str: hours, mins, secs = random.randint(0, 12), random.randint(0, 59), random.randint(0, 59) tmp = f"{hours}:{mins}:{secs}" elif time_str.count(':')==3: pass elif type(time_str) == str and time_str.count(':')==1: tmp = time_str + f":{random.randint(0, 59)}" else: hours, mins, secs = random.randint(0, 12), random.randint(0, 59), random.randint(0, 59) tmp = f"{hours}:{mins}:{secs}" times.append(tmp) self.df.loc[self.df["Time_Orderd"] == time_str, "Time_Orderd"] = tmp for i, dt in enumerate(self.df["Order_Date"]): tmp = dt + f" {times[i]}" tmp = pd.to_datetime(tmp, format='%d/%m/%y %H:%M:%S', dayfirst=True) self.datetimes.append(tmp) # self.df.loc[self.df["Order_Date"] == dt, "Order_Date"] = tmp self.df["Order_Date"] = self.datetimes self.sales = [i for i in self.df["Type_of_order"]] for i, sales_str in enumerate(self.sales): if sales_str == "Drinks": self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 0 self.sales[i] = 0 elif sales_str == "Snack": self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 1 self.sales[i] = 1 elif sales_str == "Meal": self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 2 self.sales[i] = 2 elif sales_str == "Buffet": self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 3 self.sales[i] = 3 else: self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 4 self.sales[i] = 4 self.weathers = [i for i in self.df["Weather"]] for i, weather_str in enumerate(self.weathers): if weather_str == "Stormy": self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 0 self.weathers[i] = 0 # worst weather is lowest elif weather_str == "Fog": self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 1 self.weathers[i] = 1 elif weather_str == "Windy": self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 2 self.weathers[i] = 2 elif weather_str == "Cloudy": self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 3 self.weathers[i] = 3 else: # sunny self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 4 self.weathers[i] = 4 # best weather is highest # everytime the weather is sunny, there will be a festival. for i, festivaL_str in enumerate(self.df["Festival"]): if self.weathers[i] == 4: # if sunny self.df.loc[self.df["Festival"] == festivaL_str, "Festival"] = 1 self.festivals.append(1) # there is festival else: self.df.loc[self.df["Festival"] == festivaL_str, "Festival"] = 0 self.festivals.append(0) self.df = pd.DataFrame({"datetimes": self.datetimes,"sales": self.sales,"festivals": self.festivals,"weathers": self.weathers }) self.df = self.df.set_index("datetimes").sort_index(ascending=True) for col in self.df.columns: self.df[col].fillna(method='ffill', inplace=True) # print(self.df["datetimes"].dtype) # exit() # fig,ax = plt.subplots() # ax.plot(self.df["sales"]) # ax.set_xlabel("datetimes") # ax.set_ylabel("sales") # plt.xticks(np.arange(7,400,24),['Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']) # plt.xlim(0,400) # fig.autofmt_xdate() # plt.tight_layout() # plt.show()def series_to_supervised(self, data, window, lag, dropnan=True): cols, names = list(), list() # Input sequence (t-n, ... t-1) for i in range(window, -1, -1): cols.append(data.shift(i)) names += [('%s(t-%d)' % (col, i)) for col in data.columns] # Current timestep (t=0) cols.append(data) names += [('%s(t)' % (col)) for col in data.columns] # Target timestep (t=lag) cols.append(data.shift(-lag)) names += [('%s(t+%d)' % (col, lag)) for col in data.columns] agg = pd.concat(cols, axis=1) agg.columns = names # Drop rows with NaN values if dropnan: agg.dropna(inplace=True) return agg# method that builds the lstm modeldef create_model(self): window = 29 lag = 90 print(self.df.head()) self.df = self.series_to_supervised(self.df,window=window,lag=lag) print(self.df.head()) labels_columns = 'sales(t+%d)' % lag labels = self.df[labels_columns] # split into features x and target y. x = self.df y = labels.values self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(x, y, test_size=0.2, random_state=42) print('X Train shape: ', self.x_train.shape) print('X Test shape: ', self.x_valid.shape) print(self.x_train.head()) self.x_train = self.x_train.values.reshape((self.x_train.shape[0], self.x_train.shape[1],1)) self.x_valid = self.x_valid.values.reshape((self.x_valid.shape[0], self.x_valid.shape[1],1)) print('X Train shape: ', self.x_train.shape) print('X Test shape: ', self.x_valid.shape) print('Y Train shape: ', self.y_train.shape) print('Y Test shape: ', self.y_valid.shape) self.lstm_model = Sequential() # Use return_sequences=True for all layers except the last to enable information flow across layers. self.lstm_model.add(LSTM(50, return_sequences=False, input_shape=(self.x_train.shape[1],self.x_train.shape[2]))) # self.lstm_model.add(Dropout(0.2)) # self.lstm_model.add(LSTM(32, return_sequences=True)) # self.lstm_model.add(Dropout(0.2)) # self.lstm_model.add(LSTM(16, return_sequences=True)) # self.lstm_model.add(Dropout(0.2)) # self.lstm_model.add(LSTM(8)) # self.lstm_model.add(Dropout(0.2)) self.lstm_model.add(Dense(1)) self.lstm_model.compile(optimizer='adam', loss='mse') print(self.lstm_model.summary())# model is trained using x_train and y_train. During training, the model learns the relationship between the features and the target values.def train_model(self, epochs=10, batch_size=32): self.lstm_history = self.lstm_model.fit(self.x_train, self.y_train, validation_data=(self.x_valid, self.y_valid), epochs=epochs, batch_size=batch_size, verbose=2)# the trained model is evaluated using x_test and y_test to assess how well it performs on unseen data.def evaluate_model(self): # Evaluate model performance using appropriate metrics plt.plot(self.lstm_history.history["loss"]) plt.plot(self.lstm_history.history["val_loss"]) plt.title("Model Loss") plt.ylabel("Loss") plt.xlabel("Epoch") plt.legend(["Train Loss", "Validation Loss"], fancybox=True, shadow=True) plt.show()def make_predictions(self): # Preprocess new data with lagging # Make predictions predicted_sales = self.lstm_model.predict(self.x_valid) # We flatten the 2D array so we can plot it with matplotlib self.y_valid = self.y_valid.flatten() predicted_sales = predicted_sales.flatten() plt.plot(self.y_valid, color='black', label=f'Actual Sales') plt.plot(predicted_sales, color='green', label='Predicted Sales') plt.title("Actual Sales vs Predicted Sales") plt.xlabel("Days in test period") plt.ylabel("Sales") plt.legend(fancybox=True, shadow=True) plt.show()m = lstm()The model ends up with the validation loss being much lower than the training loss, which doesn't seem right I think.
I ran the code and I got the below output:
Adding nodes and edges to the graph.sales festivals weathersdatetimes
2022-02-11 00:01:17 3 0 22022-02-11 00:02:12 3 0 02022-02-11 00:11:26 3 0 12022-02-11 00:17:10 0 1 42022-02-11 00:18:27 2 0 0sales(t-29) ... weathers(t+90)datetimes ...
2022-02-11 01:30:20 3.0 ... 3.02022-02-11 01:31:06 3.0 ... 1.02022-02-11 01:35:03 3.0 ... 1.02022-02-11 01:36:51 0.0 ... 3.02022-02-11 01:40:00 2.0 ... 3.0
[5 rows x 96 columns]X Train shape: (9024, 96)X Test shape: (2257, 96)sales(t-29) ... weathers(t+90)datetimes ...
2022-03-09 00:00:39 1.0 ... 2.02022-04-06 03:57:43 2.0 ... 4.02022-03-01 04:59:09 3.0 ... 3.02022-04-04 10:28:57 2.0 ... 1.02022-03-20 05:58:29 0.0 ... 3.0
[5 rows x 96 columns]X Train shape: (9024, 96, 1)X Test shape: (2257, 96, 1)Y Train shape: (9024,)Y Test shape: (2257,)
Model: "sequential"
Layer (type) Output Shape Param #
lstm (LSTM) (None, 50) 10400
dense (Dense) (None, 1) 51
=================================================================Total params: 10451 (40.82 KB)Trainable params: 10451 (40.82 KB)Non-trainable params: 0 (0.00 Byte)
NoneEpoch 1/10282/282 - 8s - loss: 0.7972 - val_loss: 0.0966 - 8s/epoch - 30ms/stepEpoch 2/10282/282 - 6s - loss: 0.0472 - val_loss: 0.0241 - 6s/epoch - 23ms/stepEpoch 3/10282/282 - 7s - loss: 0.0142 - val_loss: 0.0107 - 7s/epoch - 23ms/stepEpoch 4/10282/282 - 6s - loss: 0.0067 - val_loss: 0.0051 - 6s/epoch - 22ms/stepEpoch 5/10282/282 - 6s - loss: 0.0033 - val_loss: 0.0022 - 6s/epoch - 23ms/stepEpoch 6/10282/282 - 6s - loss: 0.0019 - val_loss: 0.0013 - 6s/epoch - 22ms/stepEpoch 7/10282/282 - 7s - loss: 0.0012 - val_loss: 8.9631e-04 - 7s/epoch - 23ms/stepEpoch 8/10282/282 - 6s - loss: 9.1112e-04 - val_loss: 0.0015 - 6s/epoch - 23ms/stepEpoch 9/10282/282 - 7s - loss: 6.8309e-04 - val_loss: 5.2418e-04 - 7s/epoch - 24ms/stepEpoch 10/10282/282 - 6s - loss: 5.3130e-04 - val_loss: 3.7712e-04 - 6s/epoch - 23ms/step
I was expecting the training loss to be lower than the validation loss but this is not the case. Please can someone help me understand why this is or if this is how it's supposed to be?
