Commit 9aee207f authored by Joaquin Torres

Tested tuned models and metrics-plots generated

parent c703ac3d
@@ -89,10 +89,10 @@ def get_tuned_models(group_str, method_str):
     }
     tuned_models = {}
     # Iterate through each row of the DataFrame
-    for index, row in tuned_models_df.iterrows():
-        model_name = row[0]
+    for _, row in tuned_models_df.iterrows():
+        model_name = row.iloc[0]
         # Read dictionary
-        parameters = ast.literal_eval(row['Parameters'])
+        parameters = ast.literal_eval(row['Best Parameters'])
         # Add extra parameters
         if model_name == 'AB':
             parameters['algorithm'] = 'SAMME'
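Side note on the get_tuned_models() change above: the 'Best Parameters' column stores each parameter dict as a string, and ast.literal_eval turns it back into keyword arguments for the model constructor. A minimal, self-contained sketch follows; the toy table, the 'Model' column name, and the mapping of 'AB' to AdaBoostClassifier are assumptions for illustration and not part of this commit.

    import ast
    import pandas as pd
    from sklearn.ensemble import AdaBoostClassifier

    # Toy stand-in for the tuning results table; the real file and columns
    # come from the hyperparameter-tuning step and are not shown in this diff.
    tuned_models_df = pd.DataFrame({
        'Model': ['AB'],
        'Best Parameters': ["{'n_estimators': 150, 'learning_rate': 0.5}"],
    })

    tuned_models = {}
    for _, row in tuned_models_df.iterrows():
        model_name = row.iloc[0]  # first column holds the model identifier
        # The stored string is a dict literal, so ast.literal_eval safely rebuilds it
        parameters = ast.literal_eval(row['Best Parameters'])
        if model_name == 'AB':
            parameters['algorithm'] = 'SAMME'  # extra parameter, as in the diff
            # Assumption for illustration: 'AB' maps to AdaBoostClassifier
            tuned_models[model_name] = AdaBoostClassifier(**parameters)
    print(tuned_models)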
@@ -202,7 +202,6 @@ if __name__ == "__main__":
         X_test = data_dic['X_test_' + group]
         y_test = data_dic['y_test_' + group]
         for j, method in enumerate(['', '', 'over_', 'under_']):
-            print(f"{group}-{method_names[j]}")
             # Get train dataset based on group and method
             X_train = data_dic['X_train_' + method + group]
             y_train = data_dic['y_train_' + method + group]
@@ -214,44 +213,66 @@ if __name__ == "__main__":
             fig, axes = plt.subplots(len(models), 3, figsize=(10, 8 * len(models)))
             if len(models) == 1: # Adjustment if there's only one model (axes indexing issue)
                 axes = [axes]
-            # Evaluate each model
+            # Evaluate each model with test dataset
             for model_idx, (model_name, model) in enumerate(models.items()):
-                # ----------- TEMPORAL -------------
-                # Train the model (it was just initialized above)
+                print(f"{group}-{method_names[j]}-{model_name}")
+                # Fit the model on the training data
                 model.fit(X_train, y_train)
-                # --------------------- SCORINGS ---------------------------
-                # Calculate and store the scores for each metric
-                for metric_name, scorer in scorings.items():
-                    score = scorer(model, X_test, y_test)
-                    scores_df.at[model_name, metric_name] = round(score, 4)
-                # -----------------------------------------------------------
-                # --------------------- PLOTS ---------------------------
-                # Check if the model has a decision_function method
                 if hasattr(model, "decision_function"):
+                    # Use the decision function to get scores
                     y_score = model.decision_function(X_test)
                 else:
-                    y_score = model.predict_proba(X_test)[:, 1] # Use probability of positive class
+                    # Otherwise, use the probability estimates and take the probability of the positive class
+                    y_score = model.predict_proba(X_test)[:, 1]
                 # Calculate ROC curve and ROC area for each class
                 fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=model.classes_[1])
-                roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=axes[model_idx][0])
+                # Plot the ROC curve with thicker line
+                roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr)
+                roc_display.plot(ax=axes[model_idx][0], lw=2)
+                # Plot the diagonal line for the ROC curve
+                axes[model_idx][0].plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
+                axes[model_idx][0].set_title(f'ROC Curve for {group}-{method}-{model_name}')
+                axes[model_idx][0].set_xlabel('False Positive Rate')
+                axes[model_idx][0].set_ylabel('True Positive Rate')
+                axes[model_idx][0].legend(loc='lower right')
                 # Calculate precision-recall curve
                 precision, recall, _ = precision_recall_curve(y_test, y_score, pos_label=model.classes_[1])
-                pr_display = PrecisionRecallDisplay(precision=precision, recall=recall).plot(ax=axes[model_idx][1])
-                # Get confusion matrix plot
+                # Plot the precision-recall curve with thicker line
+                pr_display = PrecisionRecallDisplay(precision=precision, recall=recall)
+                pr_display.plot(ax=axes[model_idx][1], lw=2)
+                # Plot the baseline for the PR curve
+                no_skill = len(y_test[y_test == 1]) / len(y_test)
+                axes[model_idx][1].plot([0, 1], [no_skill, no_skill], 'k--', lw=2, label='No Skill')
+                axes[model_idx][1].set_title(f'PR Curve for {group}-{method}-{model_name}')
+                axes[model_idx][1].set_xlabel('Recall')
+                axes[model_idx][1].set_ylabel('Precision')
+                axes[model_idx][1].legend(loc='lower left')
+                # Predict the test data to get confusion matrix
                 y_pred = model.predict(X_test)
+                # Compute confusion matrix
                 cm = confusion_matrix(y_test, y_pred)
+                # Plot the confusion matrix
                 ConfusionMatrixDisplay(cm).plot(ax=axes[model_idx][2])
-                # Give name to plots
-                axes[model_idx][0].set_title(f'ROC Curve for {model_name}')
-                axes[model_idx][1].set_title(f'PR Curve for {model_name}')
-                axes[model_idx][2].set_title(f'CM for {model_name}')
+                axes[model_idx][2].set_title(f'CM for {group}-{method}-{model_name}')
+                # ----------------------------------------------------------
+                # Evaluate at each of the scores of interest
+                for score_name, scorer in scorings.items():
+                    score_value = scorer(model, X_test, y_test)
+                    scores_df.at[model_name, score_name] = score_value
             # Adjust layout and save/show figure
             plt.tight_layout()
-            plt.savefig(f'./test_results/aux_plots/{group}_{method_names[j]}.svg', format='svg', dpi=500)
+            plt.savefig(f'./output_test/plots/{group}_{method_names[j]}.svg', format='svg', dpi=500)
            plt.close(fig)
             # Store the DataFrame in the dictionary with a unique key for each sheet
             sheet_name = f"{group}_{method_names[j]}"
             scores_sheets[sheet_name] = scores_df
     # Write results to Excel file
-    with pd.ExcelWriter('./test_results/testing_tuned_models.xlsx') as writer:
+    with pd.ExcelWriter('./output_test/testing_tuned_models.xlsx') as writer:
         for sheet_name, data in scores_sheets.items():
             data.to_excel(writer, sheet_name=sheet_name)
+    print("Successful evaluation with test dataset")
     # --------------------------------------------------------------------------------------------------------
\ No newline at end of file
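Side note on the scoring loop that now runs after the plots: scorer(model, X_test, y_test) works because each value in scorings is an sklearn scorer callable with the (estimator, X, y) signature. A minimal, self-contained sketch, assuming scorings is built with make_scorer/get_scorer; its actual construction lies outside the hunks shown, and the toy data and model here are purely illustrative.

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score, recall_score, make_scorer, get_scorer

    # Toy data and model purely for illustration.
    X_test, y_test = make_classification(n_samples=200, random_state=0)
    model = LogisticRegression(max_iter=1000).fit(X_test, y_test)

    # Assumed construction of the 'scorings' dict; the real one is defined
    # elsewhere in the script and is not part of this diff.
    scorings = {
        'F1': make_scorer(f1_score),
        'Recall': make_scorer(recall_score),
        'AUROC': get_scorer('roc_auc'),
    }

    # Each scorer is called with (estimator, X, y), matching
    # scorer(model, X_test, y_test) in the evaluation loop above.
    scores_df = pd.DataFrame(index=['LR'], columns=list(scorings.keys()))
    for score_name, scorer in scorings.items():
        scores_df.at['LR', score_name] = scorer(model, X_test, y_test)
    print(scores_df)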