diff --git a/training_models/train_models.py b/training_models/eval_models.py similarity index 90% rename from training_models/train_models.py rename to training_models/eval_models.py index 04cf6085927535256481cbd7297bbb4d4e96c587..ad3a781233f8ccddae6779cc928cbc476b8c40af 100644 --- a/training_models/train_models.py +++ b/training_models/eval_models.py @@ -79,26 +79,26 @@ if __name__ == "__main__": # -------------------------------------------------------------------------------------------------------- # 1. No class weight models_1 = {#"DT" : DecisionTreeClassifier(), - "RF" : RandomForestClassifier(), + "RF" : RandomForestClassifier(n_estimators=50), # "Bagging" : BaggingClassifier(), # "AB" : AdaBoostClassifier(), # "XGB": XGBClassifier(), - # "LR" : LogisticRegression(), - # "ElNet" : LogisticRegression(penalty='elasticnet'), - # "SVM" : SVC(), - # "MLP" : MLPClassifier(), + # "LR" : LogisticRegression(max_iter=1000), + # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet'), + # "SVM" : SVC(probability=True), + # "MLP" : MLPClassifier(max_iter=500), } # 2. 
Class weight models_2 = {#"DT" : DecisionTreeClassifier(class_weight='balanced'), - "RF" : RandomForestClassifier(class_weight='balanced'), + "RF" : RandomForestClassifier(n_estimators=50, class_weight='balanced'), # "Bagging" : BaggingClassifier(), # <- # "AB" : AdaBoostClassifier(), # <- # "XGB": XGBClassifier(), # <- - # "LR" : LogisticRegression(class_weight='balanced'), - # "ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'), - # "SVM" : SVC(class_weight='balanced'), - # "MLP" : MLPClassifier(), # <- + # "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'), + # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'), + # "SVM" : SVC(probability=True, class_weight='balanced'), + # "MLP" : MLPClassifier(max_iter=500), # <- } # -------------------------------------------------------------------------------------------------------- diff --git a/training_models/hyperparam_tuning.py b/training_models/hyperparam_tuning.py new file mode 100644 index 0000000000000000000000000000000000000000..05e0bcd5c29310673cfa0f3734049776f5bf0cec --- /dev/null +++ b/training_models/hyperparam_tuning.py @@ -0,0 +1,182 @@ +""" + Selecting best models through cross validation and hyperparameter tuning + for each method: + 1. Original training dataset + 2. Original training dataset - Cost sensitive + 3. Oversampling + 4. 
# Function to read datasets
# --------------------------------------------------------------------------------------------------------
def read_data(data_dir='./gen_train_data/data/output'):
    """Load every train/test array used for model selection into one dictionary.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains the ``pre`` and ``post`` sub-folders with the
        ``.npy`` files. Defaults to the location the script has always used.

    Returns
    -------
    dict
        Maps ``"{X|y}_{split}_{group}"`` to the loaded array, where ``split`` is
        one of ``test``, ``train``, ``train_over``, ``train_under`` and ``group``
        is ``pre`` or ``post`` (e.g. ``"X_train_over_post"``). Each array is read
        from ``{data_dir}/{group}/{key}.npy``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``np.load`` when one of the expected files is missing.
    """
    import numpy as np

    splits = ('test', 'train', 'train_over', 'train_under')
    groups = ('pre', 'post')

    data_dic = {}
    # Same key order as the former hand-written dictionary: split -> group -> X/y
    for split in splits:
        for group in groups:
            for var in ('X', 'y'):
                key = f"{var}_{split}_{group}"
                # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
                # only keep it while these files are produced by this project itself.
                data_dic[key] = np.load(f"{data_dir}/{group}/{key}.npy", allow_pickle=True)

    return data_dic
# --------------------------------------------------------------------------------------------------------
# Tuning runs: one per (group, balancing method)
# --------------------------------------------------------------------------------------------------------
GROUPS = ('pre', 'post')
# (prefix inside the data_dic key, label used in the sheet name, use class-weighted models?)
METHODS = (
    ('', 'ORIG', False),
    ('', 'ORIG_CW', True),   # same original data, but cost-sensitive estimators
    ('over_', 'OVER', False),
    ('under_', 'UNDER', False),
)


def iter_tuning_runs():
    """Yield ``(run_no, group, prefix, label, class_weighted)`` for every tuning run.

    ``run_no`` counts from 0 across all groups and methods, ``prefix`` is the
    piece inserted into the ``data_dic`` key (``'X_train_' + prefix + group``),
    ``label`` names the method in the output sheet and ``class_weighted`` tells
    whether the class-weighted model set must be used.
    """
    run_no = 0
    for group in GROUPS:
        for prefix, label, class_weighted in METHODS:
            yield run_no, group, prefix, label, class_weighted
            run_no += 1
# --------------------------------------------------------------------------------------------------------

if __name__ == "__main__":

    # Reading training data
    data_dic = read_data()

    # Defining the models to train
    # --------------------------------------------------------------------------------------------------------
    # The commented entries are the remaining candidates; uncomment to tune them.
    # 1. No class weight
    models_1 = {"DT" : DecisionTreeClassifier(),
            # "RF" : RandomForestClassifier(n_estimators=50),
            # "Bagging" : BaggingClassifier(),
            # "AB" : AdaBoostClassifier(),
            # "XGB": XGBClassifier(),
            # "LR" : LogisticRegression(max_iter=1000),
            # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet'),
            # "SVM" : SVC(probability=True),
            # "MLP" : MLPClassifier(max_iter=500),
            }

    # 2. Class weight (entries marked "<-" have no class_weight option and stay unweighted)
    models_2 = {"DT" : DecisionTreeClassifier(class_weight='balanced'),
            # "RF" : RandomForestClassifier(n_estimators=50, class_weight='balanced'),
            # "Bagging" : BaggingClassifier(), # <-
            # "AB" : AdaBoostClassifier(), # <-
            # "XGB": XGBClassifier(), # <-
            # "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'),
            # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'),
            # "SVM" : SVC(probability=True, class_weight='balanced'),
            # "MLP" : MLPClassifier(max_iter=500), # <-
            }
    # --------------------------------------------------------------------------------------------------------

    # Hyperparameter tuning setup
    # --------------------------------------------------------------------------------------------------------
    # Keys must match the keys of models_1 / models_2.
    # NOTE(review): scipy's uniform(loc, scale) samples from [loc, loc + scale], so
    # uniform(0.8, 1.2) covers [0.8, 2.0] and uniform(0.01, 0.3) covers [0.01, 0.31];
    # confirm these are the intended ranges.
    hyperparameters = {
        "DT": {'splitter': ['best', 'random'],
               'max_features': ['sqrt', 'log2'],
               'criterion': ['gini', 'entropy', 'log_loss']},
        "RF": {'n_estimators': randint(100, 250),
               'max_features': ['sqrt', 'log2'],
               'criterion': ['gini', 'entropy']},
        "Bagging": {'n_estimators': randint(10, 100),
                    'max_samples': [0.8, 1.0],
                    'max_features': [0.8, 1.0],
                    'warm_start': [True, False]},
        "AB": {'n_estimators': randint(50, 150),
               'learning_rate': uniform(0.8, 1.2)},
        "XGB": {'n_estimators': randint(100, 1000),
                'max_depth': randint(3, 10),
                'learning_rate': uniform(0.01, 0.3)},
        # Split into two spaces so that only valid penalty/solver pairs are sampled:
        # among these solvers, 'l1' is supported by 'saga' only.
        "LR": [{'penalty': ['l1'],
                'solver': ['saga']},
               {'penalty': ['l2', None],
                'solver': ['lbfgs', 'sag', 'saga']}],
        # Elastic net is only implemented by 'saga' and requires an l1_ratio.
        "ElNet": {'solver': ['saga'],
                  'l1_ratio': uniform(0, 1)},
        "SVM": {'C': uniform(0.8, 1.2),
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
        "MLP": {'activation': ['identity', 'logistic', 'tanh', 'relu'],
                'hidden_layer_sizes': randint(50, 150),
                'learning_rate': ['constant', 'invscaling', 'adaptive']}
    }
    # --------------------------------------------------------------------------------------------------------

    # Cross-validation setup
    # --------------------------------------------------------------------------------------------------------
    # Defining cross-validation protocol
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    # --------------------------------------------------------------------------------------------------------

    # Hyperparameter tuning loop and exporting results
    # --------------------------------------------------------------------------------------------------------
    # Store each df as a sheet in an excel file
    sheets_dict = {}
    for run_no, group, prefix, label, class_weighted in iter_tuning_runs():
        print(f"ITERATION {run_no}")
        # Get dataset based on group and method
        X = data_dic['X_train_' + prefix + group]
        y = data_dic['y_train_' + prefix + group]
        # Use group of models with class weight if needed (cost-sensitive run only)
        models = models_2 if class_weighted else models_1
        # Save results: params and best score for each of the models of this method and group
        hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Parameters', 'Score'])
        for model_name, model in models.items():
            # Find optimal hyperparams for curr model
            params = hyperparameters[model_name]
            # random_state makes the sampled candidates reproducible, like the seeded CV splitter
            search = RandomizedSearchCV(model, param_distributions=params, cv=cv, n_jobs=1,
                                        scoring='precision', random_state=1)
            search.fit(X, y)
            hyperparam_df.at[model_name, 'Parameters'] = search.best_params_
            hyperparam_df.at[model_name, 'Score'] = round(search.best_score_, 4)

        # Store the DataFrame in the dictionary with a unique key for each sheet
        sheets_dict[f"{group}_{label}"] = hyperparam_df

    # Write results to Excel file
    with pd.ExcelWriter('./training_models/output/hyperparam.xlsx') as writer:
        for sheet_name, data in sheets_dict.items():
            data.to_excel(writer, sheet_name=sheet_name)
    # --------------------------------------------------------------------------------------------------------