COMPARA / covid_analysis · Commits

Commit 6d590283 · authored Jul 08, 2024 by Joaquin Torres
Completed comments
parent 762a245e
Showing 1 changed file with 23 additions and 9 deletions.

model_selection/cv_metric_gen.py (+23 −9) · view file @ 6d590283
# CV Metric Generation
# Author: Joaquín Torres Bravo
"""
Metric generation for each tuned model.
Done in a different script for performance and clarity purposes.
...
@@ -5,20 +7,25 @@
 # Libraries
 # --------------------------------------------------------------------------------------------------------
 # Basics
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 # Models
 from xgboost import XGBClassifier
-from sklearn.metrics import confusion_matrix
-from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, accuracy_score, roc_auc_score, average_precision_score
 from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
 from sklearn.neural_network import MLPClassifier
 from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
-from sklearn.model_selection import StratifiedKFold
+# Metrics
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, accuracy_score, roc_auc_score, average_precision_score
+from sklearn.metrics import RocCurveDisplay, auc
+from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
+import matplotlib.pyplot as plt
+# CV
+from sklearn.model_selection import StratifiedKFold
+# Misc
+import ast  # String to dictionary
...
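Among the metric imports above, make_scorer is the scikit-learn helper that adapts a plain metric function with signature metric(y_true, y_pred) into the scorer(estimator, X, y) interface that the cross-validation utilities expect. A minimal sketch of that wrapping (the names here are illustrative, not taken from the commit):

import pandas as pd
from sklearn.metrics import f1_score, make_scorer

# Wrap f1_score into a scorer callable with the scorer(estimator, X, y) interface
f1_scorer = make_scorer(f1_score)

# Hypothetical usage inside any CV routine:
# cross_validate(clf, X, y, scoring={'F1': f1_scorer})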
@@ -82,9 +89,9 @@ def get_tuned_models(group_str, method_str):
     # Iterate through each row of the DataFrame
     for _, row in tuned_models_df.iterrows():
         model_name = row.iloc[0]
-        # Read dictionary
+        # Read dictionary with parameters
         parameters = ast.literal_eval(row['Best Parameters'])
-        # Add extra parameters
+        # Add extra parameters if needed
         if model_name == 'AB':
             parameters['algorithm'] = 'SAMME'
         elif model_name == 'LR':
...
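For context on the hunk above: ast.literal_eval safely parses a Python-literal string (here the 'Best Parameters' cell of the tuning-results sheet) into an actual dict without executing arbitrary code, after which extra keys can be patched in before the estimator is built. A minimal sketch with a made-up parameter string; the constructor call at the end is an assumption about how the script uses the dict:

import ast
from sklearn.ensemble import AdaBoostClassifier

# Hypothetical 'Best Parameters' cell as it might be stored in the results sheet
best_params_str = "{'n_estimators': 200, 'learning_rate': 0.5}"
parameters = ast.literal_eval(best_params_str)  # parses literals only, no code execution
parameters['algorithm'] = 'SAMME'               # extra parameter, as for model_name == 'AB'
model = AdaBoostClassifier(**parameters)        # presumably how the tuned estimator is rebuilt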
@@ -140,24 +147,31 @@ def negative_recall_scorer(clf, X, y):
     TN_prop = cm[0,0] / (cm[0,1] + cm[0,0])
     return TN_prop
-# Custom scorers for AUROC and AUPRC
+# Custom scorers for AUROC (Area Under the Receiver Operating Characteristic Curve) and AUPRC (Area Under the Precision-Recall Curve)
 def AUROC_scorer(clf, X, y):
+    # Check if the classifier has a decision_function method
     if hasattr(clf, "decision_function"):
+        # If so, use the decision function to get the scores for X
         y_score = clf.decision_function(X)
     else:
+        # Otherwise, use predict_proba to get the probabilities, and take the probabilities for the positive class (index 1)
         y_score = clf.predict_proba(X)[:, 1]
+    # Compute and return the ROC AUC score using the true labels and the predicted scores
     return roc_auc_score(y, y_score)
 def AUPRC_scorer(clf, X, y):
+    # Check if the classifier has a decision_function method
     if hasattr(clf, "decision_function"):
+        # If so, use the decision function to get the scores for X
         y_score = clf.decision_function(X)
     else:
+        # Otherwise, use predict_proba to get the probabilities, and take the probabilities for the positive class (index 1)
         y_score = clf.predict_proba(X)[:, 1]
+    # Compute and return the average precision score using the true labels and the predicted scores
     return average_precision_score(y, y_score)
 # --------------------------------------------------------------------------------------------------------
 if __name__ == "__main__":
     # Setup
     # --------------------------------------------------------------------------------------------------------
     # Reading training data
...
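Because AUROC_scorer and AUPRC_scorer follow scikit-learn's callable-scorer signature scorer(clf, X, y), they can be passed straight to cross_validate together with a StratifiedKFold splitter. A minimal sketch, assuming the two scorer functions from the hunk above are in scope; the toy data and LogisticRegression stand in for the script's real inputs:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))      # toy feature matrix
y = rng.integers(0, 2, size=200)   # toy binary labels

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(
    LogisticRegression(),
    X, y,
    cv=cv,
    scoring={'AUROC': AUROC_scorer, 'AUPRC': AUPRC_scorer},  # scorers defined in the hunk above
)
print(scores['test_AUROC'].mean(), scores['test_AUPRC'].mean())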
@@ -188,7 +202,7 @@ if __name__ == "__main__":
     cmap = plt.get_cmap('tab10')
     # --------------------------------------------------------------------------------------------------------
-    # Metric generation through cv for tuned models 3
+    # Metric generation through cv for tuned models
     # --------------------------------------------------------------------------------------------------------
     scores_sheets = {}  # To store score dfs as sheets in the same excel file
     for i, group in enumerate(['pre', 'post']):
...
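The scores_sheets dict is commented as collecting one score DataFrame per group ('pre', 'post') to be written as sheets of a single Excel file. A plausible sketch of that final write using pandas' ExcelWriter; the file name and the score frames are illustrative, not from the commit:

import pandas as pd

# Illustrative stand-ins for the per-group score DataFrames
scores_sheets = {
    'pre':  pd.DataFrame({'F1': [0.81, 0.79]}, index=['XGB', 'RF']),
    'post': pd.DataFrame({'F1': [0.77, 0.80]}, index=['XGB', 'RF']),
}
# One workbook, one sheet per group
with pd.ExcelWriter('cv_metrics.xlsx') as writer:
    for sheet_name, df in scores_sheets.items():
        df.to_excel(writer, sheet_name=sheet_name)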