Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 23131

Why is my training loss so much higher than the validation loss? Is this normal or how do I fix it?

$
0
0

When running the code below, the product demand is being predicted. I am using a dataset from Kaggle:

https://www.kaggle.com/datasets/vikramxd/amazon-business-research-analyst-dataset/data

from HeterogeneousGraph import heterogeneous_graphimport numpy as npimport randomimport pandas as pdimport matplotlib.pyplot as pltfrom tensorflow.keras.models import Sequentialfrom tensorflow.keras.layers import LSTM, Densefrom tensorflow.keras.layers import Dropoutfrom sklearn.model_selection import train_test_splitfrom keras.utils import to_categoricalhetero = heterogeneous_graph()hetero.create_mdigraph_from_df()class lstm:def __init__(self, graph=hetero):    self.graph = graph    self.df = pd.read_csv('mydataset.csv')    self.orig_df = pd.read_csv('mydataset.csv')    self.lags = [1, 7, 30]    self.sales = []    self.datetimes = []    self.festivals = []    self.weathers = []    self.lstm_model, self.lstm_history = None,None    self.x_train, self.x_valid, self.y_train, self.y_valid = None,None,None,None    self.preprocess_data()    # self.show_graph()    self.create_model()    self.train_model()    self.evaluate_model()    self.make_predictions()def preprocess_data(self):    times = []    for time_str in self.df["Time_Orderd"]:        if type(time_str)==float or '.' in time_str:            hours, mins, secs = random.randint(0, 12), random.randint(0, 59), random.randint(0, 59)            tmp = f"{hours}:{mins}:{secs}"        elif time_str.count(':')==3:            pass        elif type(time_str) == str and time_str.count(':')==1:            tmp = time_str + f":{random.randint(0, 59)}"        else:            hours, mins, secs = random.randint(0, 12), random.randint(0, 59), random.randint(0, 59)            tmp = f"{hours}:{mins}:{secs}"        times.append(tmp)        self.df.loc[self.df["Time_Orderd"] == time_str, "Time_Orderd"] = tmp    for i, dt in enumerate(self.df["Order_Date"]):        tmp = dt + f" {times[i]}"        tmp = pd.to_datetime(tmp, format='%d/%m/%y %H:%M:%S', dayfirst=True)        self.datetimes.append(tmp)        # self.df.loc[self.df["Order_Date"] == dt, "Order_Date"] = tmp    self.df["Order_Date"] = self.datetimes    self.sales = [i for i in self.df["Type_of_order"]]    for i, sales_str in enumerate(self.sales):        if sales_str == "Drinks":            self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 0            self.sales[i] = 0        elif sales_str == "Snack":            self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 1            self.sales[i] = 1        elif sales_str == "Meal":            self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 2            self.sales[i] = 2        elif sales_str == "Buffet":            self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 3            self.sales[i] = 3        else:            self.df.loc[self.df["Type_of_order"] == sales_str, "Type_of_order"] = 4            self.sales[i] = 4    self.weathers = [i for i in self.df["Weather"]]    for i, weather_str in enumerate(self.weathers):        if weather_str == "Stormy":            self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 0            self.weathers[i] = 0  # worst weather is lowest        elif weather_str == "Fog":            self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 1            self.weathers[i] = 1        elif weather_str == "Windy":            self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 2            self.weathers[i] = 2        elif weather_str == "Cloudy":            self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 3            self.weathers[i] = 3        else:  # sunny            self.df.loc[self.df["Weather"] == weather_str, "Weather"] = 4            self.weathers[i] = 4  # best weather is highest    # everytime the weather is sunny, there will be a festival.    for i, festivaL_str in enumerate(self.df["Festival"]):        if self.weathers[i] == 4:  # if sunny            self.df.loc[self.df["Festival"] == festivaL_str, "Festival"] = 1            self.festivals.append(1)  # there is festival        else:            self.df.loc[self.df["Festival"] == festivaL_str, "Festival"] = 0            self.festivals.append(0)    self.df = pd.DataFrame({"datetimes": self.datetimes,"sales": self.sales,"festivals": self.festivals,"weathers": self.weathers    })    self.df = self.df.set_index("datetimes").sort_index(ascending=True)    for col in self.df.columns:        self.df[col].fillna(method='ffill', inplace=True)    # print(self.df["datetimes"].dtype)    # exit()    # fig,ax = plt.subplots()    # ax.plot(self.df["sales"])    # ax.set_xlabel("datetimes")    # ax.set_ylabel("sales")    # plt.xticks(np.arange(7,400,24),['Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])    # plt.xlim(0,400)    # fig.autofmt_xdate()    # plt.tight_layout()    # plt.show()def series_to_supervised(self, data, window, lag, dropnan=True):    cols, names = list(), list()    # Input sequence (t-n, ... t-1)    for i in range(window, -1, -1):        cols.append(data.shift(i))        names += [('%s(t-%d)' % (col, i)) for col in data.columns]    # Current timestep (t=0)    cols.append(data)    names += [('%s(t)' % (col)) for col in data.columns]    # Target timestep (t=lag)    cols.append(data.shift(-lag))    names += [('%s(t+%d)' % (col, lag)) for col in data.columns]    agg = pd.concat(cols, axis=1)    agg.columns = names    # Drop rows with NaN values    if dropnan:        agg.dropna(inplace=True)    return agg# method that builds the lstm modeldef create_model(self):    window = 29    lag = 90    print(self.df.head())    self.df = self.series_to_supervised(self.df,window=window,lag=lag)    print(self.df.head())    labels_columns = 'sales(t+%d)' % lag    labels = self.df[labels_columns]    # split into features x and target y.    x = self.df    y = labels.values    self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(x, y, test_size=0.2, random_state=42)    print('X Train shape: ', self.x_train.shape)    print('X Test shape: ', self.x_valid.shape)    print(self.x_train.head())    self.x_train = self.x_train.values.reshape((self.x_train.shape[0], self.x_train.shape[1],1))    self.x_valid = self.x_valid.values.reshape((self.x_valid.shape[0], self.x_valid.shape[1],1))    print('X Train shape: ', self.x_train.shape)    print('X Test shape: ', self.x_valid.shape)    print('Y Train shape: ', self.y_train.shape)    print('Y Test shape: ', self.y_valid.shape)    self.lstm_model = Sequential()    # Use return_sequences=True for all layers except the last to enable information flow across layers.    self.lstm_model.add(LSTM(50, return_sequences=False, input_shape=(self.x_train.shape[1],self.x_train.shape[2])))    # self.lstm_model.add(Dropout(0.2))    # self.lstm_model.add(LSTM(32, return_sequences=True))    # self.lstm_model.add(Dropout(0.2))    # self.lstm_model.add(LSTM(16, return_sequences=True))    # self.lstm_model.add(Dropout(0.2))    # self.lstm_model.add(LSTM(8))    # self.lstm_model.add(Dropout(0.2))    self.lstm_model.add(Dense(1))    self.lstm_model.compile(optimizer='adam', loss='mse')    print(self.lstm_model.summary())# model is trained using x_train and y_train. During training, the model learns the relationship between the features and the target values.def train_model(self, epochs=10, batch_size=32):    self.lstm_history = self.lstm_model.fit(self.x_train, self.y_train, validation_data=(self.x_valid, self.y_valid), epochs=epochs, batch_size=batch_size, verbose=2)# the trained model is evaluated using x_test and y_test to assess how well it performs on unseen data.def evaluate_model(self):    # Evaluate model performance using appropriate metrics    plt.plot(self.lstm_history.history["loss"])    plt.plot(self.lstm_history.history["val_loss"])    plt.title("Model Loss")    plt.ylabel("Loss")    plt.xlabel("Epoch")    plt.legend(["Train Loss", "Validation Loss"], fancybox=True, shadow=True)    plt.show()def make_predictions(self):    # Preprocess new data with lagging    # Make predictions    predicted_sales = self.lstm_model.predict(self.x_valid)    # We flatten the 2D array so we can plot it with matplotlib    self.y_valid = self.y_valid.flatten()    predicted_sales = predicted_sales.flatten()    plt.plot(self.y_valid, color='black', label=f'Actual Sales')    plt.plot(predicted_sales, color='green', label='Predicted Sales')    plt.title("Actual Sales vs Predicted Sales")    plt.xlabel("Days in test period")    plt.ylabel("Sales")    plt.legend(fancybox=True, shadow=True)    plt.show()m = lstm()

The model ends up with the validation loss being much lower than the training loss, which doesn't seem right I think.

I ran the code and I got the below output:

Adding nodes and edges to the graph.sales festivals weathersdatetimes
2022-02-11 00:01:17 3 0 22022-02-11 00:02:12 3 0 02022-02-11 00:11:26 3 0 12022-02-11 00:17:10 0 1 42022-02-11 00:18:27 2 0 0sales(t-29) ... weathers(t+90)datetimes ...
2022-02-11 01:30:20 3.0 ... 3.02022-02-11 01:31:06 3.0 ... 1.02022-02-11 01:35:03 3.0 ... 1.02022-02-11 01:36:51 0.0 ... 3.02022-02-11 01:40:00 2.0 ... 3.0

[5 rows x 96 columns]X Train shape: (9024, 96)X Test shape: (2257, 96)sales(t-29) ... weathers(t+90)datetimes ...
2022-03-09 00:00:39 1.0 ... 2.02022-04-06 03:57:43 2.0 ... 4.02022-03-01 04:59:09 3.0 ... 3.02022-04-04 10:28:57 2.0 ... 1.02022-03-20 05:58:29 0.0 ... 3.0

[5 rows x 96 columns]X Train shape: (9024, 96, 1)X Test shape: (2257, 96, 1)Y Train shape: (9024,)Y Test shape: (2257,)

Model: "sequential"

Layer (type) Output Shape Param #
lstm (LSTM) (None, 50) 10400
dense (Dense) (None, 1) 51

=================================================================Total params: 10451 (40.82 KB)Trainable params: 10451 (40.82 KB)Non-trainable params: 0 (0.00 Byte)


NoneEpoch 1/10282/282 - 8s - loss: 0.7972 - val_loss: 0.0966 - 8s/epoch - 30ms/stepEpoch 2/10282/282 - 6s - loss: 0.0472 - val_loss: 0.0241 - 6s/epoch - 23ms/stepEpoch 3/10282/282 - 7s - loss: 0.0142 - val_loss: 0.0107 - 7s/epoch - 23ms/stepEpoch 4/10282/282 - 6s - loss: 0.0067 - val_loss: 0.0051 - 6s/epoch - 22ms/stepEpoch 5/10282/282 - 6s - loss: 0.0033 - val_loss: 0.0022 - 6s/epoch - 23ms/stepEpoch 6/10282/282 - 6s - loss: 0.0019 - val_loss: 0.0013 - 6s/epoch - 22ms/stepEpoch 7/10282/282 - 7s - loss: 0.0012 - val_loss: 8.9631e-04 - 7s/epoch - 23ms/stepEpoch 8/10282/282 - 6s - loss: 9.1112e-04 - val_loss: 0.0015 - 6s/epoch - 23ms/stepEpoch 9/10282/282 - 7s - loss: 6.8309e-04 - val_loss: 5.2418e-04 - 7s/epoch - 24ms/stepEpoch 10/10282/282 - 6s - loss: 5.3130e-04 - val_loss: 3.7712e-04 - 6s/epoch - 23ms/step

I was expecting the training loss to be lower than the validation loss but this is not the case. Please can someone help me understand why this is or if this is how it's supposed to be?

click here to see the graph


Viewing all articles
Browse latest Browse all 23131

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>