Thinking about merging scorings and plots

20a35749 · Joaquin Torres · 18034aa8 · 20a35749 · 20a35749
Commit 20a35749 authored May 23, 2024 by Joaquin Torres
Showing with 81726 additions and 15353 deletions

model_selection/cv_metric_gen.py model_selection/cv_metric_gen.py +54 -55

model_selection/output_cv_metrics/curves/pre_ORIG.svg model_selection/output_cv_metrics/curves/pre_ORIG.svg +81672 -15298

No files found.
--- a/model_selection/cv_metric_gen.py
+++ b/model_selection/cv_metric_gen.py
@@ -175,8 +175,8 @@ if __name__ == "__main__":
    # Metric generation through cv for tuned models
    # --------------------------------------------------------------------------------------------------------
    scores_sheets = {} # To store score dfs as sheets in the same excel file
-    for i, group in enumerate(['pre']): # 'post'
+    for i, group in enumerate(['pre', 'post']): # 'post'
-        for j, method in enumerate(['']): # '', 'over_', 'under_'
+        for j, method in enumerate(['', 'over_', 'under_']): 
            # Get train dataset based on group and method
            X_train = data_dic['X_train_' + method + group]
            y_train = data_dic['y_train_' + method + group]
@@ -190,59 +190,58 @@ if __name__ == "__main__":
                axes = [axes]
            # Metric generation for each model
            for model_idx, (model_name, model) in enumerate(models.items()):
-                if model_name == 'XGB':
+                print(f"{group}-{method_names[j]}-{model_name}")
-                    print(f"{group}-{method_names[j]}-{model_name}")
+                # Retrieve cv scores for our metrics of interest
-                    # Retrieve cv scores for our metrics of interest
+                scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
-                    scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
+                # Save results of each fold
-                    # Save results of each fold
+                for metric_name in scorings.keys():
-                    for metric_name in scorings.keys():
+                    scores_df.loc[model_name + f'_{metric_name}']=list(np.around(np.array(scores[f"test_{metric_name}"]),4)) 
-                        scores_df.loc[model_name + f'_{metric_name}']=list(np.around(np.array(scores[f"test_{metric_name}"]),4)) 
+                # ---------------------------------------- Generate curves ----------------------------------------
-                    # ---------------------------------------- Generate curves ----------------------------------------
+                mean_fpr = np.linspace(0, 1, 100)
-                    mean_fpr = np.linspace(0, 1, 100)
+                tprs, aucs = [], []
-                    tprs, aucs = [], []
+                mean_recall = np.linspace(0, 1, 100)
-                    mean_recall = np.linspace(0, 1, 100)
+                precisions, pr_aucs = [], []
-                    precisions, pr_aucs = [], []
+                cmap = plt.get_cmap('tab10')  # Colormap
-                    cmap = plt.get_cmap('tab10')  # Colormap
+                # Loop through each fold in the cross-validation
-                    # Loop through each fold in the cross-validation
+                for fold_idx, (train, test) in enumerate(cv.split(X_train, y_train)):
-                    for fold_idx, (train, test) in enumerate(cv.split(X_train, y_train)):
+                    # Fit the model on the training data
-                        # Fit the model on the training data
+                    model.fit(X_train[train], y_train[train])
-                        model.fit(X_train[train], y_train[train])
+                    # Generate ROC curve for the fold
-                        # Generate ROC curve for the fold
+                    roc_display = RocCurveDisplay.from_estimator(model, X_train[test], y_train[test],
-                        roc_display = RocCurveDisplay.from_estimator(model, X_train[test], y_train[test],
+                                                                name=f"ROC fold {fold_idx}", alpha=0.6, lw=2,
-                                                                    name=f"ROC fold {fold_idx}", alpha=0.6, lw=2,
+                                                                ax=axes[model_idx][0], color=cmap(fold_idx % 10))
-                                                                    ax=axes[model_idx][0], color=cmap(fold_idx % 10))
+                    interp_tpr = np.interp(mean_fpr, roc_display.fpr, roc_display.tpr)
-                        interp_tpr = np.interp(mean_fpr, roc_display.fpr, roc_display.tpr)
+                    interp_tpr[0] = 0.0
-                        interp_tpr[0] = 0.0
+                    tprs.append(interp_tpr)
-                        tprs.append(interp_tpr)
+                    aucs.append(roc_display.roc_auc)
-                        aucs.append(roc_display.roc_auc)
+                    # Generate Precision-Recall curve for the fold
-                        # Generate Precision-Recall curve for the fold
+                    pr_display = PrecisionRecallDisplay.from_estimator(model, X_train[test], y_train[test],
-                        pr_display = PrecisionRecallDisplay.from_estimator(model, X_train[test], y_train[test],
+                                                                    name=f"PR fold {fold_idx}", alpha=0.6, lw=2,
-                                                                        name=f"PR fold {fold_idx}", alpha=0.6, lw=2,
+                                                                    ax=axes[model_idx][1], color=cmap(fold_idx % 10))
-                                                                        ax=axes[model_idx][1], color=cmap(fold_idx % 10))
+                    interp_precision = np.interp(mean_recall, pr_display.recall[::-1], pr_display.precision[::-1])
-                        interp_precision = np.interp(mean_recall, pr_display.recall[::-1], pr_display.precision[::-1])
+                    precisions.append(interp_precision)
-                        precisions.append(interp_precision)
+                    pr_aucs.append(pr_display.average_precision)
-                        pr_aucs.append(pr_display.average_precision)
+                # Plot diagonal line for random guessing in ROC curve
-                    # Plot diagonal line for random guessing in ROC curve
+                axes[model_idx][0].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
-                    axes[model_idx][0].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
+                # Compute mean ROC curve
-                    # Compute mean ROC curve
+                mean_tpr = np.mean(tprs, axis=0)
-                    mean_tpr = np.mean(tprs, axis=0)
+                mean_tpr[-1] = 1.0
-                    mean_tpr[-1] = 1.0
+                mean_auc = auc(mean_fpr, mean_tpr)
-                    mean_auc = auc(mean_fpr, mean_tpr)
+                axes[model_idx][0].plot(mean_fpr, mean_tpr, color='b', lw=4, label=r'Mean ROC (AUC = %0.2f)' % mean_auc, alpha=.8)
-                    axes[model_idx][0].plot(mean_fpr, mean_tpr, color='b', lw=4, label=r'Mean ROC (AUC = %0.2f)' % mean_auc, alpha=.8)
+                # Set ROC plot limits and title
-                    # Set ROC plot limits and title
+                axes[model_idx][0].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"ROC Curve - {model_name} ({group}-{method_names[j]})")
-                    axes[model_idx][0].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"ROC Curve - {model_name} ({group}-{method_names[j]})")
+                axes[model_idx][0].legend(loc="lower right")
-                    axes[model_idx][0].legend(loc="lower right")
+                # Compute mean Precision-Recall curve
-                    # Compute mean Precision-Recall curve
+                mean_precision = np.mean(precisions, axis=0)
-                    mean_precision = np.mean(precisions, axis=0)
+                mean_pr_auc = np.mean(pr_aucs)
-                    mean_pr_auc = np.mean(pr_aucs)
+                axes[model_idx][1].plot(mean_recall, mean_precision, color='b', lw=4, label=r'Mean PR (AUC = %0.2f)' % mean_pr_auc, alpha=.8)
-                    axes[model_idx][1].plot(mean_recall, mean_precision, color='b', lw=4, label=r'Mean PR (AUC = %0.2f)' % mean_pr_auc, alpha=.8)
+                # Plot baseline precision (proportion of positive samples)
-                    # Plot baseline precision (proportion of positive samples)
+                baseline = np.sum(y_train) / len(y_train)
-                    baseline = np.sum(y_train) / len(y_train)
+                axes[model_idx][1].plot([0, 1], [baseline, baseline], linestyle='--', lw=2, color='r', alpha=.8, label='Baseline')
-                    axes[model_idx][1].plot([0, 1], [baseline, baseline], linestyle='--', lw=2, color='r', alpha=.8, label='Baseline')
+                # Set Precision-Recall plot limits and title
-                    # Set Precision-Recall plot limits and title
+                axes[model_idx][1].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"Precision-Recall Curve - {model_name} ({group}-{method_names[j]})")
-                    axes[model_idx][1].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"Precision-Recall Curve - {model_name} ({group}-{method_names[j]})")
+                axes[model_idx][1].legend(loc="lower right")
-                    axes[model_idx][1].legend(loc="lower right")
+                # ---------------------------------------- End Generate Curves  ----------------------------------------
-                    # ---------------------------------------- End Generate Curves  ----------------------------------------
            # Store the DataFrame in the dictionary with a unique key for each sheet
            sheet_name = f"{group}_{method_names[j]}"
            scores_sheets[sheet_name] = scores_df

--- a/model_selection/output_cv_metrics/curves/pre_ORIG.svg
+++ b/model_selection/output_cv_metrics/curves/pre_ORIG.svg