diff --git a/model_selection/cv_metric_gen.py b/model_selection/cv_metric_gen.py
index 378551883cab4961ed66c5e5028a60086a188cd5..2042928a9e286f77500e901a1175c1209796fe29 100644
--- a/model_selection/cv_metric_gen.py
+++ b/model_selection/cv_metric_gen.py
@@ -191,23 +191,40 @@ if __name__ == "__main__":
             # Metric generation for each model
             for model_idx, (model_name, model) in enumerate(models.items()):
                 print(f"{group}-{method_names[j]}-{model_name}")
-                # Retrieve cv scores for our metrics of interest
-                scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
-                # Save results of each fold
-                for metric_name in scorings.keys():
-                    scores_df.loc[model_name + f'_{metric_name}']=list(np.around(np.array(scores[f"test_{metric_name}"]),4))
-                # ---------------------------------------- Generate curves ----------------------------------------
+                # # Retrieve cv scores for our metrics of interest
+                # scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
+                # # Save results of each fold
+                # for metric_name in scorings.keys():
+                #     scores_df.loc[model_name + f'_{metric_name}']=list(np.around(np.array(scores[f"test_{metric_name}"]),4))
                 mean_fpr = np.linspace(0, 1, 100)
                 tprs, aucs = [], []
                 mean_recall = np.linspace(0, 1, 100)
                 precisions, pr_aucs = [], []
                 cmap = plt.get_cmap('tab10') # Colormap
+                # Initialize storage for scores for each fold
+                fold_scores = {metric_name: [] for metric_name in scorings.keys()}
                 # Loop through each fold in the cross-validation
-                for fold_idx, (train, test) in enumerate(cv.split(X_train, y_train)):
+                for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
+                    X_train_fold, X_test_fold = X_train[train_idx], X_train[test_idx]
+                    y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]
                     # Fit the model on the training data
-                    model.fit(X_train[train], y_train[train])
+                    model.fit(X_train_fold, y_train_fold)
+                    # Predict on the test data
+                    if hasattr(model, "decision_function"):
+                        y_score = model.decision_function(X_test_fold)
+                    else:
+                        y_score = model.predict_proba(X_test_fold)[:, 1] # Use probability of positive class
+                    y_pred = model.predict(X_test_fold)
+                    # Calculate and store the scores for each metric
+                    for metric_name, scorer in scorings.items():
+                        if metric_name in ['AUROC', 'AUPRC']:
+                            score = scorer._score_func(y_test_fold, y_score)
+                        else:
+                            score = scorer._score_func(y_test_fold, y_pred)
+                        fold_scores[metric_name].append(score)
+                    # --------------------- CURVES ---------------------------
                     # Generate ROC curve for the fold
-                    roc_display = RocCurveDisplay.from_estimator(model, X_train[test], y_train[test],
+                    roc_display = RocCurveDisplay.from_estimator(model, X_test_fold, y_test_fold,
                                                                  name=f"ROC fold {fold_idx}", alpha=0.6, lw=2,
                                                                  ax=axes[model_idx][0], color=cmap(fold_idx % 10))
                     interp_tpr = np.interp(mean_fpr, roc_display.fpr, roc_display.tpr)
@@ -215,12 +232,15 @@ if __name__ == "__main__":
                     tprs.append(interp_tpr)
                     aucs.append(roc_display.roc_auc)
                     # Generate Precision-Recall curve for the fold
-                    pr_display = PrecisionRecallDisplay.from_estimator(model, X_train[test], y_train[test],
+                    pr_display = PrecisionRecallDisplay.from_estimator(model, X_test_fold, y_test_fold,
                                                                        name=f"PR fold {fold_idx}", alpha=0.6, lw=2,
                                                                        ax=axes[model_idx][1], color=cmap(fold_idx % 10))
                     interp_precision = np.interp(mean_recall, pr_display.recall[::-1], pr_display.precision[::-1])
                     precisions.append(interp_precision)
                     pr_aucs.append(pr_display.average_precision)
+                # Store the fold scores in the dataframe
+                for metric_name, scores in fold_scores.items():
+                    scores_df.loc[f"{model_name}_{metric_name}"] = np.around(scores, 4)
                 # Plot diagonal line for random guessing in ROC curve
                 axes[model_idx][0].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
                 # Compute mean ROC curve
@@ -241,7 +261,6 @@ if __name__ == "__main__":
                 # Set Precision-Recall plot limits and title
                 axes[model_idx][1].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"Precision-Recall Curve - {model_name} ({group}-{method_names[j]})")
                 axes[model_idx][1].legend(loc="lower right")
-                # ---------------------------------------- End Generate Curves ----------------------------------------
             # Store the DataFrame in the dictionary with a unique key for each sheet
             sheet_name = f"{group}_{method_names[j]}"
             scores_sheets[sheet_name] = scores_df
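For reference, the per-fold scoring pattern this patch introduces can be exercised in isolation. The sketch below is only an illustration under assumptions: it assumes, as the script appears to, that `scorings` maps metric names to scikit-learn scorer objects, and the dataset, model, and splitter are placeholder stand-ins rather than anything from the repository. It also calls each scorer directly as `scorer(estimator, X, y)`, which lets scikit-learn dispatch to `predict`, `predict_proba`, or `decision_function` per metric, instead of reaching into the private `_score_func` attribute as the patch does.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import StratifiedKFold

# Placeholder stand-ins for the script's `scorings`, `model`, and `cv` objects.
scorings = {"AUROC": get_scorer("roc_auc"),
            "AUPRC": get_scorer("average_precision"),
            "F1": get_scorer("f1")}
X, y = make_classification(n_samples=300, random_state=0)
model = LogisticRegression(max_iter=1000)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold scoring loop mirroring the patch: fit on the train split,
# score on the held-out split, collect one value per fold and metric.
fold_scores = {name: [] for name in scorings}
for train_idx, test_idx in cv.split(X, y):
    model.fit(X[train_idx], y[train_idx])
    for name, scorer in scorings.items():
        # scorer(estimator, X, y) picks the right prediction method
        # (predict / predict_proba / decision_function) for each metric.
        fold_scores[name].append(scorer(model, X[test_idx], y[test_idx]))

print({name: list(np.around(vals, 4)) for name, vals in fold_scores.items()})

In the patch itself, the collected per-fold lists are then written into `scores_df` via `scores_df.loc[f"{model_name}_{metric_name}"] = np.around(scores, 4)`, one row per model/metric pair.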