diff --git a/model_selection/cv_metric_gen.py b/model_selection/cv_metric_gen.py index e48b8a9fc7992fa39035960725f70b04f69520a4..2820cd335998e357ef847d6bfb1e40694b29f979 100644 --- a/model_selection/cv_metric_gen.py +++ b/model_selection/cv_metric_gen.py @@ -1,3 +1,5 @@ +# CV Metric Generation +# Author: Joaquín Torres Bravo """ Metric generation for each tuned model. Done in a different script for perfomance and clarity purposes. @@ -5,20 +7,25 @@ # Libraries # -------------------------------------------------------------------------------------------------------- +# Basics import pandas as pd import numpy as np +import matplotlib.pyplot as plt +# Models from xgboost import XGBClassifier -from sklearn.metrics import confusion_matrix -from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, accuracy_score, roc_auc_score, average_precision_score from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier from sklearn.neural_network import MLPClassifier from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier -from sklearn.model_selection import StratifiedKFold +# Metrics +from sklearn.metrics import confusion_matrix +from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, accuracy_score, roc_auc_score, average_precision_score from sklearn.metrics import RocCurveDisplay, auc from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve -import matplotlib.pyplot as plt +# CV +from sklearn.model_selection import StratifiedKFold +# Misc import ast # String to dictionary # -------------------------------------------------------------------------------------------------------- @@ -82,9 +89,9 @@ def get_tuned_models(group_str, method_str): # Iterate through each row of the DataFrame for _, row in tuned_models_df.iterrows(): model_name = row.iloc[0] - # Read dictionary + # Read dictionary with parameters 
parameters = ast.literal_eval(row['Best Parameters']) - # Add extra parameters + # Add extra parameters if needed if model_name == 'AB': parameters['algorithm'] = 'SAMME' elif model_name == 'LR': @@ -140,24 +147,31 @@ def negative_recall_scorer(clf, X, y): TN_prop = cm[0,0]/(cm[0,1]+cm[0,0]) return TN_prop -# Custom scorers for AUROC and AUPRC +# Custom scorers for AUROC (Area Under the Receiver Operating Characteristic Curve) and AUPRC (Area Under the Precision-Recall Curve) def AUROC_scorer(clf, X, y): + # Check if the classifier has a decision_function method if hasattr(clf, "decision_function"): + # If so, use the decision function to get the scores for X y_score = clf.decision_function(X) else: + # Otherwise, use predict_proba to get the probabilities, and take the probabilities for the positive class (index 1) y_score = clf.predict_proba(X)[:, 1] + # Compute and return the ROC AUC score using the true labels and the predicted scores return roc_auc_score(y, y_score) def AUPRC_scorer(clf, X, y): + # Check if the classifier has a decision_function method if hasattr(clf, "decision_function"): + # If so, use the decision function to get the scores for X y_score = clf.decision_function(X) else: + # Otherwise, use predict_proba to get the probabilities, and take the probabilities for the positive class (index 1) y_score = clf.predict_proba(X)[:, 1] + # Compute and return the average precision score using the true labels and the predicted scores return average_precision_score(y, y_score) # -------------------------------------------------------------------------------------------------------- if __name__ == "__main__": - # Setup # -------------------------------------------------------------------------------------------------------- # Reading training data @@ -188,7 +202,7 @@ if __name__ == "__main__": cmap = plt.get_cmap('tab10') # -------------------------------------------------------------------------------------------------------- - # Metric generation through 
cv for tuned models3 + # Metric generation through cv for tuned models # -------------------------------------------------------------------------------------------------------- scores_sheets = {} # To store score dfs as sheets in the same excel file for i, group in enumerate(['pre', 'post']):