Commit f128a515 authored by Joaquin Torres

Testing PR curve

parent 9a51e5c3
@@ -16,7 +16,7 @@ from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.model_selection import StratifiedKFold, cross_validate
-from sklearn.metrics import RocCurveDisplay, roc_curve, auc
+from sklearn.metrics import RocCurveDisplay, auc
 from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
 import matplotlib.pyplot as plt
 import ast # String to dictionary
@@ -185,49 +185,64 @@ if __name__ == "__main__":
 # Scores df -> one column per cv split, one row for each model-metric
 scores_df = pd.DataFrame(columns=range(1,11), index=[f"{model_name}_{metric_name}" for model_name in models.keys() for metric_name in scorings.keys()])
 # Create a figure for all models in this group-method
-fig, axes = plt.subplots(len(models), 1, figsize=(10, 8 * len(models)))
+fig, axes = plt.subplots(len(models), 2, figsize=(10, 8 * len(models)))
 if len(models) == 1: # Adjustment if there's only one model (axes indexing issue)
     axes = [axes]
 # Metric generation for each model
 for model_idx, (model_name, model) in enumerate(models.items()):
-    print(f"{group}-{method_names[j]}-{model_name}")
-    # Retrieve cv scores for our metrics of interest
-    scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
-    # Save results of each fold
-    for metric_name in scorings.keys():
-        scores_df.loc[model_name + f'_{metric_name}']=list(np.around(np.array(scores[f"test_{metric_name}"]),4))
-    # ---------- Generate ROC curves ----------
-    mean_fpr = np.linspace(0, 1, 100)
-    tprs, aucs = [], []
-    cmap = plt.get_cmap('tab10') # Colormap
-    # Loop through each fold in the cross-validation (redoing cv for simplicity)
-    for fold_idx, (train, test) in enumerate(cv.split(X_train, y_train)):
-        # Fit the model on the training data
-        model.fit(X_train[train], y_train[train])
-        # Use RocCurveDisplay to generate the ROC curve
-        roc_display = RocCurveDisplay.from_estimator(model, X_train[test], y_train[test],
-                                                     name=f"ROC fold {fold_idx}", alpha=0.6, lw=2,
-                                                     ax=axes[model_idx], color=cmap(fold_idx % 10))
-        # Interpolate the true positive rates to get a smooth curve
-        interp_tpr = np.interp(mean_fpr, roc_display.fpr, roc_display.tpr)
-        interp_tpr[0] = 0.0
-        # Append the interpolated TPR and AUC for this fold
-        tprs.append(interp_tpr)
-        aucs.append(roc_display.roc_auc)
-    # Plot the diagonal line representing random guessing
-    axes[model_idx].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
-    # Compute the mean of the TPRs
-    mean_tpr = np.mean(tprs, axis=0)
-    mean_tpr[-1] = 1.0
-    mean_auc = auc(mean_fpr, mean_tpr) # Calculate the mean AUC
-    # Plot the mean ROC curve with a thicker line and distinct color
-    axes[model_idx].plot(mean_fpr, mean_tpr, color='b', lw=4,
-                         label=r'Mean ROC (AUC = %0.2f)' % mean_auc, alpha=.8)
-    # Set plot limits and title
-    axes[model_idx].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
-                        title=f"ROC Curve - {model_name} ({group}-{method_names[j]})")
-    axes[model_idx].legend(loc="lower right")
-    # ---------- END ROC curves Generation ----------
+    if model_name == 'XGB':
+        print(f"{group}-{method_names[j]}-{model_name}")
+        # Retrieve cv scores for our metrics of interest
+        scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
+        # Save results of each fold
+        for metric_name in scorings.keys():
+            scores_df.loc[model_name + f'_{metric_name}']=list(np.around(np.array(scores[f"test_{metric_name}"]),4))
+        # ---------------------------------------- Generate curves ----------------------------------------
+        mean_fpr = np.linspace(0, 1, 100)
+        tprs, aucs = [], []
+        mean_recall = np.linspace(0, 1, 100)
+        precisions, pr_aucs = [], []
+        cmap = plt.get_cmap('tab10') # Colormap
+        # Loop through each fold in the cross-validation
+        for fold_idx, (train, test) in enumerate(cv.split(X_train, y_train)):
+            # Fit the model on the training data
+            model.fit(X_train[train], y_train[train])
+            # Generate ROC curve for the fold
+            roc_display = RocCurveDisplay.from_estimator(model, X_train[test], y_train[test],
+                                                         name=f"ROC fold {fold_idx}", alpha=0.6, lw=2,
+                                                         ax=axes[model_idx][0], color=cmap(fold_idx % 10))
+            interp_tpr = np.interp(mean_fpr, roc_display.fpr, roc_display.tpr)
+            interp_tpr[0] = 0.0
+            tprs.append(interp_tpr)
+            aucs.append(roc_display.roc_auc)
+            # Generate Precision-Recall curve for the fold
+            pr_display = PrecisionRecallDisplay.from_estimator(model, X_train[test], y_train[test],
+                                                               name=f"PR fold {fold_idx}", alpha=0.6, lw=2,
+                                                               ax=axes[model_idx][1], color=cmap(fold_idx % 10))
+            interp_precision = np.interp(mean_recall, pr_display.recall[::-1], pr_display.precision[::-1])
+            precisions.append(interp_precision)
+            pr_aucs.append(pr_display.average_precision)
+        # Plot diagonal line for random guessing in ROC curve
+        axes[model_idx][0].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
+        # Compute mean ROC curve
+        mean_tpr = np.mean(tprs, axis=0)
+        mean_tpr[-1] = 1.0
+        mean_auc = auc(mean_fpr, mean_tpr)
+        axes[model_idx][0].plot(mean_fpr, mean_tpr, color='b', lw=4, label=r'Mean ROC (AUC = %0.2f)' % mean_auc, alpha=.8)
+        # Set ROC plot limits and title
+        axes[model_idx][0].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"ROC Curve - {model_name} ({group}-{method_names[j]})")
+        axes[model_idx][0].legend(loc="lower right")
+        # Compute mean Precision-Recall curve
+        mean_precision = np.mean(precisions, axis=0)
+        mean_pr_auc = np.mean(pr_aucs)
+        axes[model_idx][1].plot(mean_recall, mean_precision, color='b', lw=4, label=r'Mean PR (AUC = %0.2f)' % mean_pr_auc, alpha=.8)
+        # # Plot baseline precision (proportion of positive samples)
+        # baseline = np.sum(y_train) / len(y_train)
+        # axes[model_idx][1].plot([0, 1], [baseline, baseline], linestyle='--', lw=2, color='r', alpha=.8, label='Baseline')
+        # Set Precision-Recall plot limits and title
+        axes[model_idx][1].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"Precision-Recall Curve - {model_name} ({group}-{method_names[j]})")
+        axes[model_idx][1].legend(loc="lower right")
+        # ---------------------------------------- End Generate Curves ----------------------------------------
 # Store the DataFrame in the dictionary with a unique key for each sheet
 sheet_name = f"{group}_{method_names[j]}"
 scores_sheets[sheet_name] = scores_df
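Editor's note on the hunk above: the new precision-recall logic mirrors the existing ROC averaging. Each fold's precision is interpolated onto a shared recall grid (the arrays are reversed because precision_recall_curve returns recall in decreasing order, while np.interp expects increasing x), then the mean curve and mean average precision are taken across folds. Below is a minimal standalone sketch of that step using synthetic data and LogisticRegression as illustrative stand-ins for the project's X_train, y_train, and tuned models.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import StratifiedKFold

# Synthetic stand-ins for the project's X_train / y_train (illustrative only)
X, y = make_classification(n_samples=500, weights=[0.8, 0.2], random_state=0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

mean_recall = np.linspace(0, 1, 100)   # common recall grid shared by all folds
precisions, pr_aucs = [], []

for train, test in cv.split(X, y):
    clf = LogisticRegression(max_iter=1000).fit(X[train], y[train])
    probas = clf.predict_proba(X[test])[:, 1]
    precision, recall, _ = precision_recall_curve(y[test], probas)
    # recall comes back in decreasing order; np.interp needs increasing x,
    # hence the [::-1] reversal, the same trick used in the committed code
    interp_precision = np.interp(mean_recall, recall[::-1], precision[::-1])
    precisions.append(interp_precision)
    pr_aucs.append(average_precision_score(y[test], probas))

mean_precision = np.mean(precisions, axis=0)   # mean PR curve over folds
mean_pr_auc = np.mean(pr_aucs)                 # mean average precision
print(f"Mean AP over folds: {mean_pr_auc:.3f}")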
@@ -239,7 +254,4 @@ if __name__ == "__main__":
 with pd.ExcelWriter('./output_cv_metrics/metrics.xlsx') as writer:
     for sheet_name, data in scores_sheets.items():
         data.to_excel(writer, sheet_name=sheet_name)
 print("Successful cv metric generation for tuned models")
\ No newline at end of file
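A second detail in this commit is the figure layout change from one to two columns per model, which is why the ROC and PR calls now index axes[model_idx][0] and axes[model_idx][1]. The existing axes = [axes] adjustment still covers the single-model case, since plt.subplots(1, 2) returns a 1-D array of two axes. A quick sketch of that behaviour, with illustrative names only:

import matplotlib
matplotlib.use("Agg")  # headless backend so the check runs anywhere
import matplotlib.pyplot as plt

for n_models in (1, 3):
    fig, axes = plt.subplots(n_models, 2, figsize=(10, 8 * n_models))
    if n_models == 1:
        axes = [axes]  # wrap the 1-D array so axes[0][0] / axes[0][1] work uniformly
    axes[0][0].set_title("ROC")
    axes[0][1].set_title("Precision-Recall")
    plt.close(fig)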