Ready to implement the PR (precision-recall) curves

9a51e5c3 · Joaquin Torres · 9fa990e0 · 9a51e5c3
Commit 9a51e5c3 authored May 23, 2024 by Joaquin Torres
Hide whitespace changes
Inline Side-by-side

Showing with 38 additions and 39 deletions

model_selection/cv_metric_gen.py model_selection/cv_metric_gen.py +38 -39

No files found.
--- a/model_selection/cv_metric_gen.py
+++ b/model_selection/cv_metric_gen.py
@@ -177,7 +177,6 @@ if __name__ == "__main__":
    scores_sheets = {} # To store score dfs as sheets in the same excel file
    for i, group in enumerate(['pre']): # 'post'
        for j, method in enumerate(['']): # '', 'over_', 'under_'
-            # print(f"{group}-{method_names[j]}")
            # Get train dataset based on group and method
            X_train = data_dic['X_train_' + method + group]
            y_train = data_dic['y_train_' + method + group]
@@ -191,44 +190,44 @@ if __name__ == "__main__":
                axes = [axes]
            # Metric generation for each model
            for model_idx, (model_name, model) in enumerate(models.items()):
-                if model_name == 'DT':
+                print(f"{group}-{method_names[j]}-{model_name}")
-                    print(f"{group}-{method_names[j]}-{model_name}")
+                # Retrieve cv scores for our metrics of interest
-                    # Retrieve cv scores for our metrics of interest
+                scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
-                    scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
+                # Save results of each fold
-                    # Save results of each fold
+                for metric_name in scorings.keys():
-                    for metric_name in scorings.keys():
+                    scores_df.loc[model_name + f'_{metric_name}']=list(np.around(np.array(scores[f"test_{metric_name}"]),4)) 
-                        scores_df.loc[model_name + f'_{metric_name}']=list(np.around(np.array(scores[f"test_{metric_name}"]),4)) 
+                # ---------- Generate ROC curves ----------
-                    # Generate ROC curves
+                mean_fpr = np.linspace(0, 1, 100) 
-                    mean_fpr = np.linspace(0, 1, 100)
+                tprs, aucs = [], []
-                    tprs, aucs = [], []
+                cmap = plt.get_cmap('tab10')  # Colormap
-                    cmap = plt.get_cmap('tab10')  # Colormap for stronger colors
+                # Loop through each fold in the cross-validation (redoing cv for simplicity)
-                    # Loop through each fold in the cross-validation
+                for fold_idx, (train, test) in enumerate(cv.split(X_train, y_train)):
-                    for fold_idx, (train, test) in enumerate(cv.split(X_train, y_train)):
+                    # Fit the model on the training data
-                        # Fit the model on the training data
+                    model.fit(X_train[train], y_train[train])
-                        model.fit(X_train[train], y_train[train])
+                    # Use RocCurveDisplay to generate the ROC curve
-                        # Use RocCurveDisplay to generate the ROC curve
+                    roc_display = RocCurveDisplay.from_estimator(model, X_train[test], y_train[test],
-                        roc_display = RocCurveDisplay.from_estimator(model, X_train[test], y_train[test],
+                                                                name=f"ROC fold {fold_idx}", alpha=0.6, lw=2, 
-                                                                    name=f"ROC fold {fold_idx}", alpha=0.6, lw=2, 
+                                                                ax=axes[model_idx], color=cmap(fold_idx % 10))
-                                                                    ax=axes[model_idx], color=cmap(fold_idx % 10))
+                    # Interpolate the true positive rates to get a smooth curve
-                        # Interpolate the true positive rates to get a smooth curve
+                    interp_tpr = np.interp(mean_fpr, roc_display.fpr, roc_display.tpr)
-                        interp_tpr = np.interp(mean_fpr, roc_display.fpr, roc_display.tpr)
+                    interp_tpr[0] = 0.0
-                        interp_tpr[0] = 0.0
+                    # Append the interpolated TPR and AUC for this fold
-                        # Append the interpolated TPR and AUC for this fold
+                    tprs.append(interp_tpr)
-                        tprs.append(interp_tpr)
+                    aucs.append(roc_display.roc_auc)
-                        aucs.append(roc_display.roc_auc)
+                # Plot the diagonal line representing random guessing
-                    # Plot the diagonal line representing random guessing
+                axes[model_idx].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
-                    axes[model_idx].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
+                # Compute the mean of the TPRs
-                    # Compute the mean of the TPRs
+                mean_tpr = np.mean(tprs, axis=0)
-                    mean_tpr = np.mean(tprs, axis=0)
+                mean_tpr[-1] = 1.0
-                    mean_tpr[-1] = 1.0
+                mean_auc = auc(mean_fpr, mean_tpr)  # Calculate the mean AUC
-                    mean_auc = auc(mean_fpr, mean_tpr)  # Calculate the mean AUC
+                # Plot the mean ROC curve with a thicker line and distinct color
-                    # Plot the mean ROC curve with a thicker line and distinct color
+                axes[model_idx].plot(mean_fpr, mean_tpr, color='b', lw=4,
-                    axes[model_idx].plot(mean_fpr, mean_tpr, color='b', lw=4,
+                                        label=r'Mean ROC (AUC = %0.2f)' % mean_auc, alpha=.8)
-                                         label=r'Mean ROC (AUC = %0.2f)' % mean_auc, alpha=.8)
+                # Set plot limits and title
-                    # Set plot limits and title
+                axes[model_idx].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
-                    axes[model_idx].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
+                                    title=f"ROC Curve - {model_name} ({group}-{method_names[j]})")
-                                        title=f"ROC Curve - {model_name} ({group}-{method_names[j]})")
+                axes[model_idx].legend(loc="lower right")
-                    axes[model_idx].legend(loc="lower right")
+                # ---------- END ROC curves Generation ----------
            # Store the DataFrame in the dictionary with a unique key for each sheet
            sheet_name = f"{group}_{method_names[j]}"
            scores_sheets[sheet_name] = scores_df