Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
C
covid_analysis
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
COMPARA
covid_analysis
Commits
f26ac8cd
Commit
f26ac8cd
authored
May 05, 2024
by
Joaquin Torres
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
working on loop for model evaluation
parent
d72df2cb
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
97 additions
and
20 deletions
+97
-20
training_models/train_models.py
training_models/train_models.py
+97
-20
No files found.
training_models/train_models.py
View file @
f26ac8cd
...
@@ -23,10 +23,41 @@ from sklearn.tree import DecisionTreeClassifier
...
@@ -23,10 +23,41 @@ from sklearn.tree import DecisionTreeClassifier
# --------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------
if
__name__
==
"__main__"
:
def negative_recall_scorer(clf, X, y):
    """Specificity of *clf* on (X, y): TN / (TN + FP).

    The fraction of truly negative samples that the classifier predicts
    as negative, taken from the first row of the confusion matrix
    (rows = true labels, columns = predicted labels).
    """
    predictions = clf.predict(X)
    matrix = confusion_matrix(y, predictions)
    true_negatives = matrix[0, 0]
    total_negatives = matrix[0, 0] + matrix[0, 1]
    return true_negatives / total_negatives
def TN_scorer(clf, X, y):
    """Count of true negatives produced by *clf* on (X, y).

    Cell [0, 0] of sklearn's confusion matrix: true label 0,
    predicted label 0.
    """
    predictions = clf.predict(X)
    matrix = confusion_matrix(y, predictions)
    return matrix[0, 0]
def FN_scorer(clf, X, y):
    """Count of false negatives produced by *clf* on (X, y).

    A false negative is a sample whose true label is 1 but which is
    predicted as 0. In sklearn's ``confusion_matrix`` layout
    (rows = true labels, columns = predicted labels) that is cell
    ``cm[1, 0]``. The original returned ``cm[0, 1]``, which is the
    false-POSITIVE cell — the FN/FP scorers were swapped.
    """
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    # cm[1, 0]: true label 1, predicted 0 -> false negative
    FN = cm[1, 0]
    return FN
def FP_scorer(clf, X, y):
    """Count of false positives produced by *clf* on (X, y).

    A false positive is a sample whose true label is 0 but which is
    predicted as 1. In sklearn's ``confusion_matrix`` layout
    (rows = true labels, columns = predicted labels) that is cell
    ``cm[0, 1]``. The original returned ``cm[1, 0]``, which is the
    false-NEGATIVE cell — the FN/FP scorers were swapped.
    """
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    # cm[0, 1]: true label 0, predicted 1 -> false positive
    FP = cm[0, 1]
    return FP
def TP_scorer(clf, X, y):
    """Count of true positives produced by *clf* on (X, y).

    Cell [1, 1] of sklearn's confusion matrix: true label 1,
    predicted label 1.
    """
    predictions = clf.predict(X)
    matrix = confusion_matrix(y, predictions)
    return matrix[1, 1]
def
read_data
():
import
numpy
as
np
# Reading training data
# --------------------------------------------------------------------------------------------------------
# Load test data
# Load test data
X_test_pre
=
np
.
load
(
'gen_train_data/data/output/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
X_test_pre
=
np
.
load
(
'gen_train_data/data/output/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'gen_train_data/data/output/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'gen_train_data/data/output/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
...
@@ -50,31 +81,77 @@ if __name__ == "__main__":
...
@@ -50,31 +81,77 @@ if __name__ == "__main__":
y_train_under_pre
=
np
.
load
(
'gen_train_data/data/output/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'gen_train_data/data/output/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'gen_train_data/data/output/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'gen_train_data/data/output/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'gen_train_data/data/output/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'gen_train_data/data/output/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
# --------------------------------------------------------------------------------------------------------
data_dic
=
{
"X_test_pre"
:
X_test_pre
,
"y_test_pre"
:
y_test_pre
,
"X_test_post"
:
X_test_post
,
"y_test_post"
:
y_test_post
,
"X_train_pre"
:
X_train_pre
,
"y_train_pre"
:
y_train_pre
,
"X_train_post"
:
X_train_post
,
"y_train_post"
:
y_train_post
,
"X_train_over_pre"
:
X_train_over_pre
,
"y_train_over_pre"
:
y_train_over_pre
,
"X_train_over_post"
:
X_train_over_post
,
"y_train_over_post"
:
y_train_over_post
,
"X_train_under_pre"
:
X_train_under_pre
,
"y_train_under_pre"
:
y_train_under_pre
,
"X_train_under_post"
:
X_train_under_post
,
"y_train_under_post"
:
y_train_under_post
,
}
return
data_dic
if
__name__
==
"__main__"
:
# Reading training data
data_dic
=
read_data
()
# Defining the models to train
# Defining the models to train
# --------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------
# 1. No class weight
# 1. No class weight
models_1
=
{
"DT"
:
DecisionTreeClassifier
(),
models_1
=
{
"DT"
:
DecisionTreeClassifier
(),
"RF"
:
RandomForestClassifier
(),
#
"RF" : RandomForestClassifier(),
"Bagging"
:
BaggingClassifier
(),
#
"Bagging" : BaggingClassifier(),
"AB"
:
AdaBoostClassifier
(),
#
"AB" : AdaBoostClassifier(),
"XGB"
:
XGBClassifier
(),
#
"XGB": XGBClassifier(),
"LR"
:
LogisticRegression
(),
#
"LR" : LogisticRegression(),
"ElNet"
:
LogisticRegression
(
penalty
=
'elasticnet'
),
#
"ElNet" : LogisticRegression(penalty='elasticnet'),
"SVM"
:
SVC
(),
#
"SVM" : SVC(),
"MLP"
:
MLPClassifier
(),
#
"MLP" : MLPClassifier(),
}
}
# 2. Class weight
# 2. Class weight
models_2
=
{
"DT"
:
DecisionTreeClassifier
(
class_weight
=
'balanced'
),
models_2
=
{
"DT"
:
DecisionTreeClassifier
(
class_weight
=
'balanced'
),
"RF"
:
RandomForestClassifier
(
class_weight
=
'balanced'
),
#
"RF" : RandomForestClassifier(class_weight='balanced'),
"Bagging"
:
BaggingClassifier
(),
# <-
#
"Bagging" : BaggingClassifier(), # <-
"AB"
:
AdaBoostClassifier
(),
# <-
#
"AB" : AdaBoostClassifier(), # <-
"XGB"
:
XGBClassifier
(),
# <-
#
"XGB": XGBClassifier(), # <-
"LR"
:
LogisticRegression
(
class_weight
=
'balanced'
),
#
"LR" : LogisticRegression(class_weight='balanced'),
"ElNet"
:
LogisticRegression
(
penalty
=
'elasticnet'
,
class_weight
=
'balanced'
),
#
"ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'),
"SVM"
:
SVC
(
class_weight
=
'balanced'
),
#
"SVM" : SVC(class_weight='balanced'),
"MLP"
:
MLPClassifier
(),
# <-
#
"MLP" : MLPClassifier(), # <-
}
}
# --------------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------------
# Setup
# --------------------------------------------------------------------------------------------------------
# Scorings to use for model evaluation
scorings
=
{
'f1'
:
make_scorer
(
f1_score
),
'negative_recall'
:
negative_recall_scorer
,
'recall'
:
make_scorer
(
recall_score
),
'precision'
:
make_scorer
(
precision_score
),
'TN'
:
TN_scorer
,
'FN'
:
FN_scorer
,
'FP'
:
FP_scorer
,
'TP'
:
TP_scorer
}
# Defining cross-validation protocol
cv
=
StratifiedKFold
(
n_splits
=
10
,
shuffle
=
True
,
random_state
=
1
)
# --------------------------------------------------------------------------------------------------------
for
i
,
group
in
enumerate
([
'pre'
,
'post'
]):
for
j
,
method
in
enumerate
([
''
,
''
,
'over_'
,
'under_'
]):
# Get dataset based on group and method
X
=
data_dic
[
'X_train_'
+
method
+
group
]
y
=
data_dic
[
'y_train_'
+
method
+
group
]
# Use group of models with class weight if needed
models
=
models_2
if
j
==
2
else
models_1
# Create df to keep track of each group-method for all its models
results
=
pd
.
DataFrame
()
for
model_name
,
model
in
models
.
items
():
cv_results
=
cross_validate
(
model
,
X
,
y
,
scoring
=
scorings
,
cv
=
cv
,
return_train_score
=
True
,
n_jobs
=
1
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment