"""Fit a random forest on 'airlines-corgis.csv' and report how it performs.

Steps: load the CSV, expand the comma-separated carrier list to one row per
carrier, one-hot encode the categorical columns, train a
RandomForestClassifier to predict the 'Delayed' column, print the ranked
feature importances, and plot a row-normalized confusion matrix.
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# How many of the highest-ranked features to print.
TOP_FEATURES_SHOWN = 20

airline_data = pd.read_csv('airlines-corgis.csv')

# 'CarrierNames' holds a comma-separated list: record how many carriers each
# row had, then explode to one row per carrier.
airline_data['CarrierNames'] = airline_data['CarrierNames'].str.split(',')
airline_data['TotalCarriers'] = airline_data['CarrierNames'].apply(len)
airline_data = airline_data.explode('CarrierNames')

# Values in 'Time' that cannot be parsed as numbers become NaN.
airline_data['Time'] = pd.to_numeric(airline_data['Time'], errors='coerce')

airline_data_encoded = pd.get_dummies(
    airline_data,
    columns=['CarrierNames', 'Code', 'Name', 'MonthName'],
)

# NOTE(review): 'Delays- NAS' contains a space that the other delay columns do
# not; confirm it matches the CSV header exactly, otherwise drop() raises
# KeyError.
# NOTE(review): these per-cause delay columns are removed from the features,
# so they can never show up in the importance ranking printed below. If the
# goal is to rank delay causes, they presumably need to stay in X - confirm.
delay_cause_columns = [
    'Delays-Carrier',
    'Delays-LateAircraft',
    'Delays- NAS',
    'Delays-Security',
    'Delays-Weather',
]

# The target must be removed from the features as well; leaving 'Delayed' in X
# lets the model read the answer directly (target leakage).
X = airline_data_encoded.drop(delay_cause_columns + ['Delayed'], axis=1)
y = airline_data_encoded['Delayed']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=16
)

my_forest = RandomForestClassifier()
my_forest.fit(X_train, y_train)
y_pred = my_forest.predict(X_test)

# Print the text results BEFORE plt.show(): in a normal script run show()
# blocks until the figure window is closed, so anything printed after it
# stays invisible until then.
print('Length of X_train:', len(X_train))

# Pair each importance with its column name and rank them, so the output says
# which feature matters rather than just listing anonymous numbers.
feature_importances = pd.Series(
    my_forest.feature_importances_, index=X.columns
).sort_values(ascending=False)
print('Feature Importances:')
print(feature_importances.head(TOP_FEATURES_SHOWN))

# normalize='true' scales each row (true class) to sum to 1.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, normalize='true')

plt.figure(figsize=(8, 6))
sns.heatmap(
    cnf_matrix,
    annot=True,
    fmt='.2f',
    xticklabels=my_forest.classes_,
    yticklabels=my_forest.classes_,
)
plt.title('Normalized Confusion Matrix')
plt.xlabel('Predicted Value')
plt.ylabel('True Value')
plt.show()
I have been trying to create a random forest classifier, plot a heat map, and determine the most important features impacting delays (i.e., which delay type), but when I run the code, nothing appears.