I am applying StandardScaler normalization to my DataFrame, but it is inserting NaN values into some of the rows.
Main Function
games = prepare_data()train_size = int(0.7 * len(games))test_size = len(games) - train_sizetrain_set = games.iloc[:train_size]test_set = games.iloc[-test_size:]X_train = train_set.iloc[:,:-1] y_train = train_set.iloc[:,-1]X_test = test_set.iloc[:,:-1] y_test = test_set.iloc[:,-1]train_set_normalized = normalize_data(train_set,'HOME_TEAM_WINS')prepare_data
def prepare_data(): #Import data from pkl files games = pd.read_pickle('cleaned_data/merged.pkl') teams = pd.read_pickle('cleaned_data/teams.pkl') #Add foundation year of home team games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='HOME_TEAM_ID', right_on='TEAM_ID', how='left') games['FOUNDATION_HOME'] = games['YEARFOUNDED'] games.drop('TEAM_ID', axis=1, inplace=True) games.drop('YEARFOUNDED', axis=1, inplace=True) #Add foundation year of away team games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='VISITOR_TEAM_ID', right_on='TEAM_ID', how='left') games['FOUNDATION_VISITOR'] = games['YEARFOUNDED'] games.drop('TEAM_ID', axis=1, inplace=True) games.drop('YEARFOUNDED', axis=1, inplace=True) games = games[games['SEASON'] != 2003] #Remove IDs and redundant columns # Lista das colunas que você deseja remover remove = ['GAME_DATE_EST','GAME_ID','HOME_TEAM_ID','VISITOR_TEAM_ID','SEASON','HOME_TEAM_REGULAR_SEASON_GAMES','HOME_TEAM_REGULAR_SEASON_TOTAL_WINS','HOME_TEAM_REGULAR_SEASON_TOTAL_LOSSES','HOME_TEAM_REGULAR_SEASON_WINNING_PERCENTAGE','HOME_TEAM_REGULAR_SEASON_WINS_AT_HOME','HOME_TEAM_REGULAR_SEASON_LOSSES_AT_HOME','HOME_TEAM_REGULAR_SEASON_WINS_AWAY','HOME_TEAM_REGULAR_SEASON_AWAY_LOSSES','STANDINGSDATE','AWAY_TEAM_REGULAR_SEASON_GAMES','AWAY_TEAM_REGULAR_SEASON_TOTAL_WINS','AWAY_TEAM_REGULAR_SEASON_TOTAL_LOSSES','AWAY_TEAM_REGULAR_SEASON_WINNING_PERCENTAGE','AWAY_TEAM_REGULAR_SEASON_WINS_AT_HOME','AWAY_TEAM_REGULAR_SEASON_LOSSES_AT_HOME','AWAY_TEAM_REGULAR_SEASON_WINS_AWAY','AWAY_TEAM_REGULAR_SEASON_AWAY_LOSSES' ] games = games.drop(columns=remove) #HOME_TEAM_WINS TO LAST COLUMN target = games.pop('HOME_TEAM_WINS') games.insert(len(games.columns), 'HOME_TEAM_WINS', target) print(games.head(50)) print("HOME TEAM WINS VALUE COUNTS:") print(games['HOME_TEAM_WINS'].value_counts()) games.to_csv('2.csv', index=False) return gamesnormalize_data
def normalize_data(data, target_column): result = data.isna().sum() result.to_csv('nan_counts.csv', header=['NaN_count']) data_normalized = data.copy() subset = data_normalized[['HOME_TEAM_TOTAL_GAMES','AWAY_TEAM_TOTAL_GAMES','HOME_TEAM_TOTAL_WINS','AWAY_TEAM_TOTAL_WINS','HOME_TEAM_TOTAL_LOSSES','AWAY_TEAM_TOTAL_LOSSES','HOME_TEAM_WIN_PERCENTAGE','AWAY_TEAM_WIN_PERCENTAGE','HOME_TEAM_TOTAL_GAMES_AT_HOME','AWAY_TEAM_TOTAL_GAMES_AWAY','HOME_TEAM_TOTAL_WINS_AT_HOME','AWAY_TEAM_TOTAL_WINS_AWAY','HOME_TEAM_TOTAL_LOSSES_AT_HOME','AWAY_TEAM_TOTAL_LOSSES_AWAY','HOME_TEAM_WIN_PERCENTAGE_AT_HOME','AWAY_TEAM_WIN_PERCENTAGE_AWAY','HOME_TEAM_WIN/LOSS_STREAK','VISITOR_TEAM_WIN/LOSS_STREAK','HOME_TEAM_WIN/LOSS_STREAK_AT_HOME','VISITOR_TEAM_WIN/LOSS_STREAK_AWAY','AVG_POINTS_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_HOME_TEAM','AVG_REB_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM','FOUNDATION_HOME','FOUNDATION_VISITOR' ]] scaler = StandardScaler() 
scaler.fit(subset) scaled_data = scaler.transform(subset) scaled_df = pd.DataFrame(scaled_data, columns=['HOME_TEAM_TOTAL_GAMES','AWAY_TEAM_TOTAL_GAMES','HOME_TEAM_TOTAL_WINS','AWAY_TEAM_TOTAL_WINS','HOME_TEAM_TOTAL_LOSSES','AWAY_TEAM_TOTAL_LOSSES','HOME_TEAM_WIN_PERCENTAGE','AWAY_TEAM_WIN_PERCENTAGE','HOME_TEAM_TOTAL_GAMES_AT_HOME','AWAY_TEAM_TOTAL_GAMES_AWAY','HOME_TEAM_TOTAL_WINS_AT_HOME','AWAY_TEAM_TOTAL_WINS_AWAY','HOME_TEAM_TOTAL_LOSSES_AT_HOME','AWAY_TEAM_TOTAL_LOSSES_AWAY','HOME_TEAM_WIN_PERCENTAGE_AT_HOME','AWAY_TEAM_WIN_PERCENTAGE_AWAY','HOME_TEAM_WIN/LOSS_STREAK','VISITOR_TEAM_WIN/LOSS_STREAK','HOME_TEAM_WIN/LOSS_STREAK_AT_HOME','VISITOR_TEAM_WIN/LOSS_STREAK_AWAY','AVG_POINTS_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_HOME_TEAM','AVG_REB_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM','FOUNDATION_HOME','FOUNDATION_VISITOR' ]) 
data_normalized[['HOME_TEAM_TOTAL_GAMES','AWAY_TEAM_TOTAL_GAMES','HOME_TEAM_TOTAL_WINS','AWAY_TEAM_TOTAL_WINS','HOME_TEAM_TOTAL_LOSSES','AWAY_TEAM_TOTAL_LOSSES','HOME_TEAM_WIN_PERCENTAGE','AWAY_TEAM_WIN_PERCENTAGE','HOME_TEAM_TOTAL_GAMES_AT_HOME','AWAY_TEAM_TOTAL_GAMES_AWAY','HOME_TEAM_TOTAL_WINS_AT_HOME','AWAY_TEAM_TOTAL_WINS_AWAY','HOME_TEAM_TOTAL_LOSSES_AT_HOME','AWAY_TEAM_TOTAL_LOSSES_AWAY','HOME_TEAM_WIN_PERCENTAGE_AT_HOME','AWAY_TEAM_WIN_PERCENTAGE_AWAY','HOME_TEAM_WIN/LOSS_STREAK','VISITOR_TEAM_WIN/LOSS_STREAK','HOME_TEAM_WIN/LOSS_STREAK_AT_HOME','VISITOR_TEAM_WIN/LOSS_STREAK_AWAY','AVG_POINTS_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_HOME_TEAM','AVG_REB_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM','FOUNDATION_HOME','FOUNDATION_VISITOR' ]] = scaled_df result = data_normalized.isna().sum() result.to_csv('nan_counts2.csv', header=['NaN_count']) return data_normalizedI checked the csv file that is generate in the beginning of the 
normalize_data function and there are no NaN values, but in the CSV file written at the end of the function there are around 7000 rows with NaN values.
Am I making a mistake when normalizing my data?
Also, I tried deleting the rows where SEASON is 2003 before merging the two DataFrames, and the problem doesn't happen:
#Import data from pkl files games = pd.read_pickle('cleaned_data/merged.pkl') teams = pd.read_pickle('cleaned_data/teams.pkl') games = games[games['SEASON'] != 2003] #Add foundation year of home team games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='HOME_TEAM_ID', right_on='TEAM_ID', how='left') games['FOUNDATION_HOME'] = games['YEARFOUNDED'] games.drop('TEAM_ID', axis=1, inplace=True) games.drop('YEARFOUNDED', axis=1, inplace=True)I runned both versions while saving the resultant games dataframe to a csv file, and when comparing them I reached the conclusion that they are equal.
What can be causing this problem?