Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 23305

Normalization inserting NaN values into my dataframe

$
0
0

I am doing StandardScaler normalization to my dataframe but it is inserting NaN values into some of the rows

Main Function

games = prepare_data()train_size = int(0.7 * len(games))test_size = len(games) - train_sizetrain_set = games.iloc[:train_size]test_set = games.iloc[-test_size:]X_train = train_set.iloc[:,:-1] y_train = train_set.iloc[:,-1]X_test = test_set.iloc[:,:-1] y_test = test_set.iloc[:,-1]train_set_normalized = normalize_data(train_set,'HOME_TEAM_WINS')

prepare_data

def prepare_data():    #Import data from pkl files    games = pd.read_pickle('cleaned_data/merged.pkl')    teams = pd.read_pickle('cleaned_data/teams.pkl')    #Add foundation year of home team    games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='HOME_TEAM_ID', right_on='TEAM_ID', how='left')    games['FOUNDATION_HOME'] = games['YEARFOUNDED']    games.drop('TEAM_ID', axis=1, inplace=True)    games.drop('YEARFOUNDED', axis=1, inplace=True)    #Add foundation year of away team    games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='VISITOR_TEAM_ID', right_on='TEAM_ID', how='left')    games['FOUNDATION_VISITOR'] = games['YEARFOUNDED']    games.drop('TEAM_ID', axis=1, inplace=True)    games.drop('YEARFOUNDED', axis=1, inplace=True)    games = games[games['SEASON'] != 2003]    #Remove IDs and redundant columns    # Lista das colunas que você deseja remover    remove = ['GAME_DATE_EST','GAME_ID','HOME_TEAM_ID','VISITOR_TEAM_ID','SEASON','HOME_TEAM_REGULAR_SEASON_GAMES','HOME_TEAM_REGULAR_SEASON_TOTAL_WINS','HOME_TEAM_REGULAR_SEASON_TOTAL_LOSSES','HOME_TEAM_REGULAR_SEASON_WINNING_PERCENTAGE','HOME_TEAM_REGULAR_SEASON_WINS_AT_HOME','HOME_TEAM_REGULAR_SEASON_LOSSES_AT_HOME','HOME_TEAM_REGULAR_SEASON_WINS_AWAY','HOME_TEAM_REGULAR_SEASON_AWAY_LOSSES','STANDINGSDATE','AWAY_TEAM_REGULAR_SEASON_GAMES','AWAY_TEAM_REGULAR_SEASON_TOTAL_WINS','AWAY_TEAM_REGULAR_SEASON_TOTAL_LOSSES','AWAY_TEAM_REGULAR_SEASON_WINNING_PERCENTAGE','AWAY_TEAM_REGULAR_SEASON_WINS_AT_HOME','AWAY_TEAM_REGULAR_SEASON_LOSSES_AT_HOME','AWAY_TEAM_REGULAR_SEASON_WINS_AWAY','AWAY_TEAM_REGULAR_SEASON_AWAY_LOSSES'    ]    games = games.drop(columns=remove)    #HOME_TEAM_WINS TO LAST COLUMN    target = games.pop('HOME_TEAM_WINS')    games.insert(len(games.columns), 'HOME_TEAM_WINS', target)    print(games.head(50))    print("HOME TEAM WINS VALUE COUNTS:")    print(games['HOME_TEAM_WINS'].value_counts())    games.to_csv('2.csv', index=False)    return games

normalize_data

def normalize_data(data, target_column):    result = data.isna().sum()    result.to_csv('nan_counts.csv', header=['NaN_count'])    data_normalized = data.copy()    subset = data_normalized[['HOME_TEAM_TOTAL_GAMES','AWAY_TEAM_TOTAL_GAMES','HOME_TEAM_TOTAL_WINS','AWAY_TEAM_TOTAL_WINS','HOME_TEAM_TOTAL_LOSSES','AWAY_TEAM_TOTAL_LOSSES','HOME_TEAM_WIN_PERCENTAGE','AWAY_TEAM_WIN_PERCENTAGE','HOME_TEAM_TOTAL_GAMES_AT_HOME','AWAY_TEAM_TOTAL_GAMES_AWAY','HOME_TEAM_TOTAL_WINS_AT_HOME','AWAY_TEAM_TOTAL_WINS_AWAY','HOME_TEAM_TOTAL_LOSSES_AT_HOME','AWAY_TEAM_TOTAL_LOSSES_AWAY','HOME_TEAM_WIN_PERCENTAGE_AT_HOME','AWAY_TEAM_WIN_PERCENTAGE_AWAY','HOME_TEAM_WIN/LOSS_STREAK','VISITOR_TEAM_WIN/LOSS_STREAK','HOME_TEAM_WIN/LOSS_STREAK_AT_HOME','VISITOR_TEAM_WIN/LOSS_STREAK_AWAY','AVG_POINTS_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_HOME_TEAM','AVG_REB_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM','FOUNDATION_HOME','FOUNDATION_VISITOR'    ]]    scaler = StandardScaler()    scaler.fit(subset)    scaled_data = scaler.transform(subset)    scaled_df = pd.DataFrame(scaled_data, columns=['HOME_TEAM_TOTAL_GAMES','AWAY_TEAM_TOTAL_GAMES','HOME_TEAM_TOTAL_WINS','AWAY_TEAM_TOTAL_WINS','HOME_TEAM_TOTAL_LOSSES','AWAY_TEAM_TOTAL_LOSSES','HOME_TEAM_WIN_PERCENTAGE','AWAY_TEAM_WIN_PERCENTAGE','HOME_TEAM_TOTAL_GAMES_AT_HOME','AWAY_TEAM_TOTAL_GAMES_AWAY','HOME_TEAM_TOTAL_WINS_AT_HOME','AWAY_TEAM_TOTAL_WINS_AWAY','HOME_TEAM_TOTAL_LOSSES_AT_HOME','AWAY_TEAM_TOTAL_LOSSES_AWAY','HOME_TEAM_WIN_PERCENTAGE_AT_HOME','AWAY_TEAM_WIN_PERCENTAGE_AWAY','HOME_TEAM_WIN/LOSS_STREAK','VISITOR_TEAM_WIN/LOSS_STREAK','HOME_TEAM_WIN/LOSS_STREAK_AT_HOME','VISITOR_TEAM_WIN/LOSS_STREAK_AWAY','AVG_POINTS_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_HOME_TEAM','AVG_REB_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM','FOUNDATION_HOME','FOUNDATION_VISITOR'    ])    data_normalized[['HOME_TEAM_TOTAL_GAMES','AWAY_TEAM_TOTAL_GAMES','HOME_TEAM_TOTAL_WINS','AWAY_TEAM_TOTAL_WINS','HOME_TEAM_TOTAL_LOSSES','AWAY_TEAM_TOTAL_LOSSES','HOME_TEAM_WIN_PERCENTAGE','AWAY_TEAM_WIN_PERCENTAGE','HOME_TEAM_TOTAL_GAMES_AT_HOME','AWAY_TEAM_TOTAL_GAMES_AWAY','HOME_TEAM_TOTAL_WINS_AT_HOME','AWAY_TEAM_TOTAL_WINS_AWAY','HOME_TEAM_TOTAL_LOSSES_AT_HOME','AWAY_TEAM_TOTAL_LOSSES_AWAY','HOME_TEAM_WIN_PERCENTAGE_AT_HOME','AWAY_TEAM_WIN_PERCENTAGE_AWAY','HOME_TEAM_WIN/LOSS_STREAK','VISITOR_TEAM_WIN/LOSS_STREAK','HOME_TEAM_WIN/LOSS_STREAK_AT_HOME','VISITOR_TEAM_WIN/LOSS_STREAK_AWAY','AVG_POINTS_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_HOME_TEAM','AVG_REB_LAST_4_GAMES_VISITOR_TEAM','AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM','AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM','AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM','FOUNDATION_HOME','FOUNDATION_VISITOR'    ]] = scaled_df    result = data_normalized.isna().sum()    result.to_csv('nan_counts2.csv', header=['NaN_count'])    return data_normalized

I checked the csv file that is generate in the beginning of the normalize_data function and there are no NaN, but in the csv file at the end of the function there are around 7000 lines with NaN values.

Am I doing some mistake when I am normalizing my data?

Also, I tried deleting the rows where Season is 2003 before merging the two dataframes and the problem doesn't happen.

#Import data from pkl files    games = pd.read_pickle('cleaned_data/merged.pkl')    teams = pd.read_pickle('cleaned_data/teams.pkl')    games = games[games['SEASON'] != 2003]    #Add foundation year of home team    games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='HOME_TEAM_ID', right_on='TEAM_ID', how='left')    games['FOUNDATION_HOME'] = games['YEARFOUNDED']    games.drop('TEAM_ID', axis=1, inplace=True)    games.drop('YEARFOUNDED', axis=1, inplace=True)

I runned both versions while saving the resultant games dataframe to a csv file, and when comparing them I reached the conclusion that they are equal.

What can be causing this problem?


Viewing all articles
Browse latest Browse all 23305

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>