I am running nested cross-validation with stratification at both the inner and outer loops on a binary classification task with a small sample size. The models run fine without stratification on the version of this dataset that has a balanced class distribution (Task A), but on the version with an unbalanced class distribution (Task B), where I introduced stratification, the splitter seems to receive an empty 'y'.
Here is the error:
File "/xxxxxxxx/main.py", line 68, in all models = run model selection (model name, X scaled df, y encoded,feature_sets, model_config, training_strategy, scoring)xxxxxxxx/model_selection.py" , line 40, in run model selection for train_index, test_index in outer_cv.split(X selected)File "/share/pkg.&/python3/3.10.12/install/lib/python3.10/site-packages/sklearn/model_selection/_split.py", line 1507, in split for train_index, test_index in cv.split(X, y, groups) :File"share/pko.8/python3/3.10.12/install/lib/python3.io/site-packages/sklearn/model_selection/_split.py",line 796, in splity = check_array (y, input_name="y", ensure_2d=False, dtype=None)File "/share/pkg.8/python373.10.12/install/Tib/python3.10/site-packages/sklearn/utils/validation.py",line 967, in check_arrayn samples = num samples (array)File "/share/pkg .87python3/3.10.12/install/lib/python3.10/site-packages/sklearn/utils/validation.py",line347,in_num_samples raise TypeError (TypeError: Singleton array array (None, type=object) cannot be considered a valid collection.
Relevant code snippets:
from itertools import chain, combinations

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score, matthews_corrcoef)
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler

from config import specificity_scorer


def all_combinations(feature_sets):
    return chain(*map(lambda x: combinations(feature_sets.values(), x),
                      range(1, len(feature_sets) + 1)))


def run_model_selection(model_name, X, y_encoded, feature_sets, config, training_strategy, scoring):
    top_models = []
    all_models = {}
    scaler = MinMaxScaler()
    outer_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

    for feature_set_combination in all_combinations(feature_sets):
        selected_features = [item for sublist in feature_set_combination for item in sublist]
        if not set(selected_features).issubset(X.columns):
            continue
        X_selected = X[selected_features]
        if not isinstance(X_selected, pd.DataFrame):
            X_selected = pd.DataFrame(X_selected, columns=selected_features)

        metrics_summary = {'accuracy': [], 'f1': [], 'precision': [], 'recall': [],
                           'roc_auc': [], 'mcc': [], 'specificity': []}

        for train_index, test_index in outer_cv.split(X_selected):
            X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
            y_train, y_test = y_encoded[train_index], y_encoded[test_index]
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            grid_search = GridSearchCV(
                estimator=config['model'],
                param_grid=config['param_grid'],
                scoring=scoring,
                cv=training_strategy,
                n_jobs=-1,
                refit='F1',
                return_train_score=True
            )
            grid_search.fit(X_train_scaled, y_train)
            best_model = grid_search.best_estimator_
            predictions = best_model.predict(X_test_scaled)

            if 'roc_auc' in scoring and hasattr(best_model, "predict_proba"):
                probas = best_model.predict_proba(X_test_scaled)
                roc_auc_val = roc_auc_score(y_test, probas[:, 1])
                metrics_summary['roc_auc'].append(roc_auc_val)

            metrics_summary['accuracy'].append(accuracy_score(y_test, predictions))
            metrics_summary['f1'].append(f1_score(y_test, predictions, average='macro'))
            metrics_summary['precision'].append(precision_score(y_test, predictions, average='macro'))
            metrics_summary['recall'].append(recall_score(y_test, predictions, average='macro'))
            metrics_summary['mcc'].append(matthews_corrcoef(y_test, predictions))
            metrics_summary['specificity'].append(specificity_scorer(best_model, X_test_scaled, y_test))

        # After completing the outer fold loop and having the final metrics summary
        model_key = f"{model_name}_{','.join(selected_features)}"
        all_models[model_key] = {
            'model_params': grid_search.best_estimator_.get_params(),  # Adjusted to 'model_params'
            'selected_features': ', '.join(selected_features),         # Adjusted to 'selected_features'
            'Metrics': {  # Keeping 'Metrics' for detailed scoring information
                metric: {'Mean': np.mean(values), 'Std': np.std(values)}
                for metric, values in metrics_summary.items()
            }
        }

    return all_models
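To isolate the problem, I also tried the stratified splitter on its own with random toy data (this snippet is only my attempt at a minimal reproduction, not part of the pipeline):

import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold

X_toy = np.random.rand(20, 3)
y_toy = np.array([0, 1] * 10)

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

# Omitting y raises the same TypeError, since y defaults to None and the
# stratified splitter then has no labels to compute class proportions from:
# for train_idx, test_idx in cv.split(X_toy):
#     ...

# Passing y explicitly splits without error:
for train_idx, test_idx in cv.split(X_toy, y_toy):
    pass

So it looks like the outer split in my pipeline is not receiving the labels it needs, but I am not sure where they are dropped.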
My config.py applies StratifiedKFold(n_splits=4, shuffle=True) for the inner loop.
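For reference, the relevant part of config.py looks roughly like this (the estimator, parameter grid, and scoring dictionary below are simplified placeholders, not my exact settings):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold


def specificity_scorer(estimator, X, y):
    # specificity = TN / (TN + FP) from the binary confusion matrix
    tn, fp, fn, tp = confusion_matrix(y, estimator.predict(X)).ravel()
    return tn / (tn + fp)


# Inner-loop CV passed to GridSearchCV as training_strategy
training_strategy = StratifiedKFold(n_splits=4, shuffle=True)

# Placeholder model and grid; the real config has one entry per model_name
model_config = {
    'model': LogisticRegression(max_iter=1000),
    'param_grid': {'C': [0.1, 1, 10]},
}

# Scoring dict with an 'F1' key (GridSearchCV refits on 'F1') and 'roc_auc'
scoring = {
    'F1': 'f1_macro',
    'roc_auc': 'roc_auc',
}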
What might be going wrong?