From f26ac8cd803e7745825766ba92dbbfe46d14b285 Mon Sep 17 00:00:00 2001
From: joaquintb
Date: Sun, 5 May 2024 14:33:01 +0200
Subject: [PATCH] working on loop for model evaluation

---
 training_models/train_models.py | 117 ++++++++++++++++++++++++++------
 1 file changed, 97 insertions(+), 20 deletions(-)

diff --git a/training_models/train_models.py b/training_models/train_models.py
index a16f1bd..7b8873a 100644
--- a/training_models/train_models.py
+++ b/training_models/train_models.py
@@ -23,10 +23,41 @@ from sklearn.tree import DecisionTreeClassifier
 # --------------------------------------------------------------------------------------------------------
 
-if __name__ == "__main__":
+def negative_recall_scorer(clf, X, y):
+    """Gives the negative recall, defined as (number of true negative samples) / (total number of negative samples)"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TN_prop = cm[0,0]/(cm[0,1]+cm[0,0])
+    return TN_prop
+
+def TN_scorer(clf, X, y):
+    """Gives the number of samples predicted as true negatives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TN = cm[0,0]
+    return TN
+
+def FN_scorer(clf, X, y):
+    """Gives the number of samples predicted as false negatives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    FN = cm[1,0]
+    return FN
+
+def FP_scorer(clf, X, y):
+    """Gives the number of samples predicted as false positives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    FP = cm[0,1]
+    return FP
+
+def TP_scorer(clf, X, y):
+    """Gives the number of samples predicted as true positives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TP = cm[1,1]
+    return TP
+
+def read_data():
+    import numpy as np
 
-    # Reading training data
-    # --------------------------------------------------------------------------------------------------------
     # Load test data
     X_test_pre = np.load('gen_train_data/data/output/pre/X_test_pre.npy', allow_pickle=True)
     y_test_pre = np.load('gen_train_data/data/output/pre/y_test_pre.npy', allow_pickle=True)
@@ -50,31 +81,77 @@ if __name__ == "__main__":
     y_train_under_pre = np.load('gen_train_data/data/output/pre/y_train_under_pre.npy', allow_pickle=True)
     X_train_under_post = np.load('gen_train_data/data/output/post/X_train_under_post.npy', allow_pickle=True)
     y_train_under_post = np.load('gen_train_data/data/output/post/y_train_under_post.npy', allow_pickle=True)
-    # --------------------------------------------------------------------------------------------------------
+
+    data_dic = {
+        "X_test_pre": X_test_pre,
+        "y_test_pre": y_test_pre,
+        "X_test_post": X_test_post,
+        "y_test_post": y_test_post,
+        "X_train_pre": X_train_pre,
+        "y_train_pre": y_train_pre,
+        "X_train_post": X_train_post,
+        "y_train_post": y_train_post,
+        "X_train_over_pre": X_train_over_pre,
+        "y_train_over_pre": y_train_over_pre,
+        "X_train_over_post": X_train_over_post,
+        "y_train_over_post": y_train_over_post,
+        "X_train_under_pre": X_train_under_pre,
+        "y_train_under_pre": y_train_under_pre,
+        "X_train_under_post": X_train_under_post,
+        "y_train_under_post": y_train_under_post,
+    }
+
+    return data_dic
+
+if __name__ == "__main__":
+
+    # Reading training data
+    data_dic = read_data()
 
     # Defining the models to train
     # --------------------------------------------------------------------------------------------------------
     # 1. No class weight
     models_1 = {"DT" : DecisionTreeClassifier(),
-                "RF" : RandomForestClassifier(),
-                "Bagging" : BaggingClassifier(),
-                "AB" : AdaBoostClassifier(),
-                "XGB": XGBClassifier(),
-                "LR" : LogisticRegression(),
-                "ElNet" : LogisticRegression(penalty='elasticnet'),
-                "SVM" : SVC(),
-                "MLP" : MLPClassifier(),
+                # "RF" : RandomForestClassifier(),
+                # "Bagging" : BaggingClassifier(),
+                # "AB" : AdaBoostClassifier(),
+                # "XGB": XGBClassifier(),
+                # "LR" : LogisticRegression(),
+                # "ElNet" : LogisticRegression(penalty='elasticnet'),
+                # "SVM" : SVC(),
+                # "MLP" : MLPClassifier(),
                 }
     # 2. Class weight
     models_2 = {"DT" : DecisionTreeClassifier(class_weight='balanced'),
-                "RF" : RandomForestClassifier(class_weight='balanced'),
-                "Bagging" : BaggingClassifier(), # <-
-                "AB" : AdaBoostClassifier(), # <-
-                "XGB": XGBClassifier(), # <-
-                "LR" : LogisticRegression(class_weight='balanced'),
-                "ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'),
-                "SVM" : SVC(class_weight='balanced'),
-                "MLP" : MLPClassifier(), # <-
+                # "RF" : RandomForestClassifier(class_weight='balanced'),
+                # "Bagging" : BaggingClassifier(), # <-
+                # "AB" : AdaBoostClassifier(), # <-
+                # "XGB": XGBClassifier(), # <-
+                # "LR" : LogisticRegression(class_weight='balanced'),
+                # "ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'),
+                # "SVM" : SVC(class_weight='balanced'),
+                # "MLP" : MLPClassifier(), # <-
                 }
     # --------------------------------------------------------------------------------------------------------
+
+    # Setup
+    # --------------------------------------------------------------------------------------------------------
+    # Scorings to use for model evaluation
+    scorings = {'f1': make_scorer(f1_score), 'negative_recall': negative_recall_scorer,
+                'recall': make_scorer(recall_score), 'precision': make_scorer(precision_score),
+                'TN': TN_scorer, 'FN': FN_scorer, 'FP': FP_scorer, 'TP': TP_scorer}
+    # Defining cross-validation protocol
+    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
+    # --------------------------------------------------------------------------------------------------------
+
+    for i, group in enumerate(['pre', 'post']):
+        for j, method in enumerate(['', '', 'over_', 'under_']):
+            # Get dataset based on group and method
+            X = data_dic['X_train_' + method + group]
+            y = data_dic['y_train_' + method + group]
+            # Second pass over the original data (j == 1) uses the class-weighted models
+            models = models_2 if j == 1 else models_1
+            # Create df to keep track of each group-method for all its models
+            results = pd.DataFrame()
+            for model_name, model in models.items():
+                cv_results = cross_validate(model, X, y, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
+
-- 
2.24.1
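
For reference, the four count scorers in the first hunk depend on scikit-learn's confusion-matrix layout: for binary labels {0, 1}, confusion_matrix(y_true, y_pred) returns a matrix C where C[i, j] counts samples of true class i predicted as class j, so C[0,0] is TN, C[0,1] is FP, C[1,0] is FN, and C[1,1] is TP. A minimal standalone check (not part of the patch):

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = np.array([0, 0, 0, 1, 1])
    y_pred = np.array([0, 1, 0, 0, 1])
    cm = confusion_matrix(y_true, y_pred)
    # Rows = true class, columns = predicted class:
    # cm[0,0]=TN=2, cm[0,1]=FP=1, cm[1,0]=FN=1, cm[1,1]=TP=1
    print(cm)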
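
The inner loop currently ends at cross_validate without storing cv_results, consistent with the work-in-progress subject line. Below is a sketch of one way the aggregation could be completed, assuming the goal is one column of fold-averaged train/test scores per model in results; the synthetic data, the single-model dict, and the column-per-model layout are illustrative assumptions, not part of the patch:

    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.metrics import make_scorer, f1_score
    from sklearn.model_selection import StratifiedKFold, cross_validate
    from sklearn.tree import DecisionTreeClassifier

    # Stand-ins for the X, y, scorings, cv, and results objects in the patch
    X, y = make_classification(n_samples=200, weights=[0.8], random_state=1)
    scorings = {'f1': make_scorer(f1_score)}
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    results = pd.DataFrame()

    for model_name, model in {"DT": DecisionTreeClassifier()}.items():
        cv_results = cross_validate(model, X, y, scoring=scorings, cv=cv,
                                    return_train_score=True, n_jobs=1)
        # Average each scorer over the folds; store one column per model
        summary = {f'{split}_{name}': np.mean(cv_results[f'{split}_{name}'])
                   for name in scorings for split in ('train', 'test')}
        results[model_name] = pd.Series(summary)

    print(results)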