So I've written this code after consulting chatgpt and it works for the most part:
import numpy as npfrom sklearn.datasets import make_classificationfrom sklearn.model_selection import train_test_splitfrom sklearn.ensemble import RandomForestClassifierfrom sklearn.metrics import accuracy_scorefrom multiprocessing import Pool, cpu_countdef evaluate_subset(model,scoring, X_in, y_in, subset = None): #model = RandomForestClassifier(n_estimators=100, random_state=42) list_scores = [] for train_index, test_index in skf.split(X_in, y_in): X_train, y_train = X_in.values[train_index], y_in.values[train_index] X_test, y_test = X_in.values[test_index], y_in.values[test_index] model.fit(X_train[:, subset], y_train) y_pred = model.predict(X_test[:, subset]) list_scores.append(scoring(y_test, y_pred)) return np.mean(list_scores)def stepwise_add_selection(model, scoring, X_in, y_in, n_processes=None): if n_processes is None: n_processes = cpu_count() pool = Pool(processes=n_processes) remaining_features = set(range(X_in.shape[1])) selected_features = [] best_accuracy = 0 while remaining_features: results = [] for feature in remaining_features: subset = selected_features + [feature] results.append(pool.apply_async(evaluate_subset, args=(model, scoring, X_in, y_in, subset))) accuracies = [res.get() for res in results] best_index = np.argmax(accuracies) print("Current Best") print(max(accuracies)) print("Previous Best") print(best_accuracy) print(selected_features) if best_accuracy < max(accuracies): selected_features.append(list(remaining_features)[(best_index)]) best_accuracy = accuracies[best_index] else: break pool.close() pool.join() return selected_features, best_accuracyHowever, I am trying to create another greedy search which removes features:
def stepwise_feature_removal(model, scoring, X_in, y_in, n_processes=None): remaining_features = set(range(X_train.shape[1])) selected_features = list(remaining_features) best_accuracy = evaluate_subset(model, scoring, X_in, y_in, selected_features) print("Initial accuracy score:", best_accuracy) while remaining_features: results = [] worst_feature = None pool = Pool(processes=n_processes) for feature in remaining_features: temp_features = selected_features[:] temp_features.remove(feature) results.append(pool.apply_async(evaluate_subset, args=(model, scoring, X_in, y_in, temp_features))) pool.close() pool.join() accuracies = [res.get() for res in results] best_index = np.argmax(accuracies) if accuracies[best_index] > best_accuracy: best_accuracy = accuracies[best_index] worst_feature = temp_features[best_index] print("Current Best") print(accuracy) print("Previous Best") print(best_accuracy) print("Feature removed:") print(worst_feature) if worst_feature is not None: selected_features.remove(worst_feature) remaining_features.remove(worst_feature) else: break return selected_features, best_accuracyIn the feature removal approach the issue I am running into is that the program just stops running. It does not give an indication that there is an error or anything. I added both pool.close() and pool.join() but its not fixing the issue.
Thanks ahead of time.
I am trying to write a greedy feature reduction function that works similar to the greedy feature addition function. Not sure why it is freezing so that would be helpful as well.
Edit: I should've clarified the issue occurs when I run this code with imblearn packages. Without imblearn the multiprocessing can occur and the program runs.
def use_pipeline(clf, resample = False): if resample == False: pipe = make_pipeline(MinMaxScaler(), clf) else: pipe = make_pipeline(resample, MinMaxScaler(), clf) return pipesm = SMOTE (random_state=38)pipe_clf = use_pipeline(clf1, sm)stepwise_feature_removal(pipe_clf, matthews_corrcoef, X_train, y_train, 15)