From f26ac8cd803e7745825766ba92dbbfe46d14b285 Mon Sep 17 00:00:00 2001
From: joaquintb
Date: Sun, 5 May 2024 14:33:01 +0200
Subject: [PATCH] working on loop for model evaluation

---
 training_models/train_models.py | 117 ++++++++++++++++++++++++++------
 1 file changed, 97 insertions(+), 20 deletions(-)

diff --git a/training_models/train_models.py b/training_models/train_models.py
index a16f1bd..7b8873a 100644
--- a/training_models/train_models.py
+++ b/training_models/train_models.py
@@ -23,10 +23,41 @@ from sklearn.tree import DecisionTreeClassifier
 # --------------------------------------------------------------------------------------------------------
 
-if __name__ == "__main__":
+def negative_recall_scorer(clf, X, y):
+    """Gives the negative recall, defined as (number of true negative samples) / (total number of negative samples)"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TN_prop = cm[0,0]/(cm[0,1]+cm[0,0])
+    return TN_prop
+
+def TN_scorer(clf, X, y):
+    """Gives the number of samples predicted as true negatives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TN = cm[0,0]
+    return TN
+
+def FN_scorer(clf, X, y):
+    """Gives the number of samples predicted as false negatives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    FN = cm[1,0]
+    return FN
+
+def FP_scorer(clf, X, y):
+    """Gives the number of samples predicted as false positives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    FP = cm[0,1]
+    return FP
+
+def TP_scorer(clf, X, y):
+    """Gives the number of samples predicted as true positives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TP = cm[1,1]
+    return TP
+
+def read_data():
+    import numpy as np
 
-    # Reading training data
-    # --------------------------------------------------------------------------------------------------------
     # Load test data
     X_test_pre = np.load('gen_train_data/data/output/pre/X_test_pre.npy', allow_pickle=True)
     y_test_pre = np.load('gen_train_data/data/output/pre/y_test_pre.npy', allow_pickle=True)
@@ -50,31 +81,77 @@ if __name__ == "__main__":
     y_train_under_pre = np.load('gen_train_data/data/output/pre/y_train_under_pre.npy', allow_pickle=True)
     X_train_under_post = np.load('gen_train_data/data/output/post/X_train_under_post.npy', allow_pickle=True)
     y_train_under_post = np.load('gen_train_data/data/output/post/y_train_under_post.npy', allow_pickle=True)
-    # --------------------------------------------------------------------------------------------------------
+
+    data_dic = {
+        "X_test_pre": X_test_pre,
+        "y_test_pre": y_test_pre,
+        "X_test_post": X_test_post,
+        "y_test_post": y_test_post,
+        "X_train_pre": X_train_pre,
+        "y_train_pre": y_train_pre,
+        "X_train_post": X_train_post,
+        "y_train_post": y_train_post,
+        "X_train_over_pre": X_train_over_pre,
+        "y_train_over_pre": y_train_over_pre,
+        "X_train_over_post": X_train_over_post,
+        "y_train_over_post": y_train_over_post,
+        "X_train_under_pre": X_train_under_pre,
+        "y_train_under_pre": y_train_under_pre,
+        "X_train_under_post": X_train_under_post,
+        "y_train_under_post": y_train_under_post,
+    }
+
+    return data_dic
+
+if __name__ == "__main__":
+
+    # Reading training data
+    data_dic = read_data()
 
     # Defining the models to train
     # --------------------------------------------------------------------------------------------------------
     # 1. No class weight
     models_1 = {"DT" : DecisionTreeClassifier(),
-                "RF" : RandomForestClassifier(),
-                "Bagging" : BaggingClassifier(),
-                "AB" : AdaBoostClassifier(),
-                "XGB": XGBClassifier(),
-                "LR" : LogisticRegression(),
-                "ElNet" : LogisticRegression(penalty='elasticnet'),
-                "SVM" : SVC(),
-                "MLP" : MLPClassifier(),
+                # "RF" : RandomForestClassifier(),
+                # "Bagging" : BaggingClassifier(),
+                # "AB" : AdaBoostClassifier(),
+                # "XGB": XGBClassifier(),
+                # "LR" : LogisticRegression(),
+                # "ElNet" : LogisticRegression(penalty='elasticnet'),
+                # "SVM" : SVC(),
+                # "MLP" : MLPClassifier(),
                 }
     # 2. Class weight
     models_2 = {"DT" : DecisionTreeClassifier(class_weight='balanced'),
-                "RF" : RandomForestClassifier(class_weight='balanced'),
-                "Bagging" : BaggingClassifier(), # <-
-                "AB" : AdaBoostClassifier(), # <-
-                "XGB": XGBClassifier(), # <-
-                "LR" : LogisticRegression(class_weight='balanced'),
-                "ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'),
-                "SVM" : SVC(class_weight='balanced'),
-                "MLP" : MLPClassifier(), # <-
+                # "RF" : RandomForestClassifier(class_weight='balanced'),
+                # "Bagging" : BaggingClassifier(), # <-
+                # "AB" : AdaBoostClassifier(), # <-
+                # "XGB": XGBClassifier(), # <-
+                # "LR" : LogisticRegression(class_weight='balanced'),
+                # "ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'),
+                # "SVM" : SVC(class_weight='balanced'),
+                # "MLP" : MLPClassifier(), # <-
                 }
     # --------------------------------------------------------------------------------------------------------
+
+    # Setup
+    # --------------------------------------------------------------------------------------------------------
+    # Scorings to use for model evaluation
+    scorings = {'f1': make_scorer(f1_score), 'negative_recall': negative_recall_scorer,
+                'recall': make_scorer(recall_score), 'precision': make_scorer(precision_score),
+                'TN': TN_scorer, 'FN': FN_scorer, 'FP': FP_scorer, 'TP': TP_scorer}
+    # Defining cross-validation protocol
+    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
+    # --------------------------------------------------------------------------------------------------------
+
+    for i, group in enumerate(['pre', 'post']):
+        for j, method in enumerate(['', '', 'over_', 'under_']):
+            # Get dataset based on group and method
+            X = data_dic['X_train_' + method + group]
+            y = data_dic['y_train_' + method + group]
+            # Second pass over the original data (j == 1) uses the class-weighted models
+            models = models_2 if j == 1 else models_1
+            # Create df to keep track of each group-method for all its models
+            results = pd.DataFrame()
+            for model_name, model in models.items():
+                cv_results = cross_validate(model, X, y, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
+
-- 
2.24.1
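
For reference, the four count scorers in the first hunk depend on scikit-learn's confusion-matrix layout: for binary labels {0, 1}, confusion_matrix(y_true, y_pred) returns a matrix C where C[i, j] counts samples of true class i predicted as class j, so C[0,0] is TN, C[0,1] is FP, C[1,0] is FN, and C[1,1] is TP. A minimal standalone check (not part of the patch):

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = np.array([0, 0, 0, 1, 1])
    y_pred = np.array([0, 1, 0, 0, 1])
    cm = confusion_matrix(y_true, y_pred)
    # Rows = true class, columns = predicted class:
    # cm[0,0]=TN=2, cm[0,1]=FP=1, cm[1,0]=FN=1, cm[1,1]=TP=1
    print(cm)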
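
The inner loop currently ends at cross_validate without storing cv_results, consistent with the work-in-progress subject line. Below is a sketch of one way the aggregation could be completed, assuming the goal is one column of fold-averaged train/test scores per model in results; the synthetic data, the single-model dict, and the column-per-model layout are illustrative assumptions, not part of the patch:

    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.metrics import make_scorer, f1_score
    from sklearn.model_selection import StratifiedKFold, cross_validate
    from sklearn.tree import DecisionTreeClassifier

    # Stand-ins for the X, y, scorings, cv, and results objects in the patch
    X, y = make_classification(n_samples=200, weights=[0.8], random_state=1)
    scorings = {'f1': make_scorer(f1_score)}
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    results = pd.DataFrame()

    for model_name, model in {"DT": DecisionTreeClassifier()}.items():
        cv_results = cross_validate(model, X, y, scoring=scorings, cv=cv,
                                    return_train_score=True, n_jobs=1)
        # Average each scorer over the folds; store one column per model
        summary = {f'{split}_{name}': np.mean(cv_results[f'{split}_{name}'])
                   for name in scorings for split in ('train', 'test')}
        results[model_name] = pd.Series(summary)

    print(results)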