Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 16595

Python: I need to match rows and attributes while creating a clean_data function

$
0
0

`I don't know how to make the rows and columns match, and the more I try the more lost I get.

# Columns dropped wholesale before the per-row missing-data filter is applied.
_SPARSE_COLUMNS = [
    'KK_KUNDENTYP', 'W_KEIT_KIND_HH', 'KBA05_ANTG1', 'KBA05_ANTG2',
    'KBA05_ANTG3', 'KBA05_ANTG4', 'KBA05_BAUMAX', 'KBA05_GBZ', 'KKK',
    'MOBI_REGIO', 'REGIOTYP', 'PLZ8_ANTG1', 'PLZ8_ANTG2', 'PLZ8_ANTG3',
    'PLZ8_ANTG4', 'PLZ8_BAUMAX', 'PLZ8_HHZ', 'PLZ8_GBZ',
]

# Multi-level categorical columns that are dropped rather than re-encoded.
_MULTI_LEVEL_COLUMNS = [
    'AGER_TYP', 'CJT_GESAMTTYP', 'FINANZTYP', 'GFK_URLAUBERTYP',
    'LP_FAMILIE_FEIN', 'LP_FAMILIE_GROB', 'LP_STATUS_FEIN', 'LP_STATUS_GROB',
    'NATIONALITAET_KZ', 'SHOPPER_TYP', 'TITEL_KZ', 'VERS_TYP', 'ZABEOTYP',
    'GEBAEUDETYP', 'CAMEO_DEUG_2015', 'CAMEO_DEU_2015',
]

# PRAEGENDE_JUGENDJAHRE code -> engineered DECADE / MOVEMENT value.
_DECADE_BY_CODE = {
    1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 3, 8: 4, 9: 4,
    10: 5, 11: 5, 12: 5, 13: 5, 14: 6, 15: 6,
}
_MOVEMENT_BY_CODE = {
    1: 1, 2: 0, 3: 1, 4: 0, 5: 1, 6: 0, 7: 0, 8: 1, 9: 0,
    10: 1, 11: 0, 12: 1, 13: 0, 14: 1, 15: 0,
}

# CAMEO_INTL_2015 is a two-character code: the first digit becomes WEALTH and
# the second becomes LIFE_STAGE ('11'..'55', both digits in 1..5).
_WEALTH_BY_CODE = {
    f"{tens}{ones}": tens for tens in range(1, 6) for ones in range(1, 6)
}
_LIFE_STAGE_BY_CODE = {
    f"{tens}{ones}": ones for tens in range(1, 6) for ones in range(1, 6)
}


def _parse_missing_codes(raw):
    """Turn a string such as '[-1,XX]' into a list of codes, e.g. [-1, 'XX']."""
    if not isinstance(raw, str):
        return []
    codes = []
    for token in raw.strip("[]").split(","):
        token = token.strip()
        if not token:
            continue
        try:
            codes.append(int(token))
        except ValueError:
            # Non-numeric markers such as 'X' / 'XX' are kept as strings.
            codes.append(token)
    return codes


def clean_data(df, *, feature_info=None, row_missing_threshold=0.20):
    """Perform feature trimming, re-encoding, and engineering for demographics data.

    Steps, in order:
      1. Replace each attribute's missing/unknown codes (and any 'X'/'XX'
         marker) with NaN.
      2. Drop the hard-coded sparse columns.
      3. Drop rows whose missing-value count exceeds
         ``int(row_missing_threshold * number_of_remaining_columns)``.
      4. Re-encode OST_WEST_KZ ('W' -> 0, 'O' -> 1).
      5. Drop the hard-coded multi-level categorical columns.
      6. Derive DECADE/MOVEMENT from PRAEGENDE_JUGENDJAHRE and
         WEALTH/LIFE_STAGE from CAMEO_INTL_2015, then drop every attribute
         that ``feature_info`` marks as type "mixed".

    Args:
        df: Demographics DataFrame. It is copied, never modified.
        feature_info: DataFrame with "attribute", "type" and
            "missing_or_unknown" columns. Defaults to the notebook-level
            ``features_info_clean``.
        row_missing_threshold: Fraction of columns a row may have missing
            before it is dropped.

    Returns:
        The trimmed and cleaned copy of ``df``.

    Raises:
        KeyError: If ``df`` lacks PRAEGENDE_JUGENDJAHRE or CAMEO_INTL_2015.
    """
    if feature_info is None:
        feature_info = features_info_clean

    frame = df.copy()

    # 1. Convert missing/unknown value codes into NaN, one column at a time so
    #    the whole frame is not copied once per attribute.
    code_specs = zip(feature_info["attribute"], feature_info["missing_or_unknown"])
    for attribute, raw_codes in code_specs:
        codes = _parse_missing_codes(raw_codes)
        if codes and attribute in frame.columns:
            frame[attribute] = frame[attribute].replace(codes, np.nan)
    frame = frame.replace(["XX", "X"], np.nan)

    # 2. Drop the sparse columns. Columns that are already absent are skipped
    #    so the function can be re-run on a partially trimmed frame.
    frame = frame.drop(columns=_SPARSE_COLUMNS, errors="ignore")

    # 3. Drop rows with too many missing values. The count is taken on the
    #    frame being cleaned, not on a notebook global, so the function works
    #    for any input frame.
    max_missing = int(row_missing_threshold * len(frame.columns))
    missing_per_row = frame.isnull().sum(axis=1)
    frame = frame.loc[missing_per_row <= max_missing].copy()

    # 4. Re-encode the binary letter column as numbers.
    if "OST_WEST_KZ" in frame.columns:
        frame["OST_WEST_KZ"] = frame["OST_WEST_KZ"].replace({"W": 0, "O": 1})

    # 5. Drop multi-level categoricals.
    frame = frame.drop(columns=_MULTI_LEVEL_COLUMNS, errors="ignore")

    # 6. Engineer the mixed-type features. Results are assigned back instead of
    #    using a chained in-place replace, which may act on a temporary copy.
    generation = frame["PRAEGENDE_JUGENDJAHRE"]
    frame["DECADE"] = generation.replace(_DECADE_BY_CODE)
    frame["MOVEMENT"] = generation.replace(_MOVEMENT_BY_CODE)

    cameo = frame["CAMEO_INTL_2015"]
    frame["WEALTH"] = cameo.replace(_WEALTH_BY_CODE)
    frame["LIFE_STAGE"] = cameo.replace(_LIFE_STAGE_BY_CODE)

    mixed_attributes = feature_info.loc[feature_info["type"] == "mixed", "attribute"]
    frame = frame.drop(columns=list(mixed_attributes), errors="ignore")

    # Print shape after final cleaning
    print("After final cleaning:", frame.shape)

    # Return the clean version
    return frame

`

# Sanity checks run after clean_data is defined: they compare the shape of
# `azdias` against the shape of `features_info_clean`.
# NOTE(review): the shapes reported for this session are azdias (768921, 55)
# and features_info_clean (67, 4), so neither comparison can hold and the
# first assert raises the "rows mismatch" AssertionError. Presumably the
# intended right-hand side was the frame returned by clean_data(...) rather
# than the feature summary table -- confirm before relying on these checks.
assert azdias.shape[0] == features_info_clean.shape[0], "clean_data function is not working properly, rows mismatch"
assert azdias.shape[1] == features_info_clean.shape[1], "clean_data function is not working properly, columns mismatch"
print("If this is all you see, you passed the tests")


AssertionError Traceback (most recent call last) in ()----> 1 assert azdias.shape[0] == features_info_clean.shape[0], "clean_data function is not working properly, rows mismatch"2 assert azdias.shape[1] == features_info_clean.shape[1], "clean_data function is not working properly, columns mismatch"3 print("If this is all you see, you passed the tests")

AssertionError: clean_data function is not working properly, rows mismatch

azdias.shape is (768921, 55)

features_info_clean.shape is (67, 4)

I tried different data sets. What I expect is for my main customer demographics DataFrame (azdias) to have shape (768921, 55), and somehow the feature-info data (features_info_clean) needs to match it. I am lost on this part.


Viewing all articles
Browse latest Browse all 16595

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>