Commit 203b75e8 authored by Joaquin Torres

Found optimal hyperparameters and exported them; further analysis to be done in separate files

parent 620b8a59
""" """
Selecting best models through cross validation and hyperparameter tunning Finding optimal hyperparameters through RandomSearchCV for each group (1. pre - 2. post)
for each method: and method:
1. Original training dataset 1. Original training dataset
2. Original training dataset - Cost sensitive 2. Original training dataset - Cost sensitive
3. Oversampling 3. Oversampling
@@ -21,7 +21,6 @@ from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
-import os
# --------------------------------------------------------------------------------------------------------
# Function to read training datasets
@@ -73,26 +72,22 @@ if __name__ == "__main__":
    # --------------------------------------------------------------------------------------------------------
    # 1. No class weight
    models_simple = {"DT" : DecisionTreeClassifier(),
-                    "RF" : RandomForestClassifier(),
-                    "Bagging" : BaggingClassifier(),
-                    "AB" : AdaBoostClassifier(algorithm='SAMME'),
-                    "XGB": XGBClassifier(),
-                    "LR" : LogisticRegression(max_iter=1000),
-                    "SVM" : SVC(probability=True, max_iter=1000),
-                    "MLP" : MLPClassifier(max_iter=500)
-                    # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet')
+                    # "RF" : RandomForestClassifier(),
+                    # "Bagging" : BaggingClassifier(),
+                    # "AB" : AdaBoostClassifier(algorithm='SAMME'),
+                    # "XGB": XGBClassifier(),
+                    # "LR" : LogisticRegression(max_iter=1000),
+                    # "SVM" : SVC(probability=True, max_iter=1000),
+                    # "MLP" : MLPClassifier(max_iter=500)
                     }
    # 2. Class weight: cost-sensitive learning
    models_CS = {"DT" : DecisionTreeClassifier(class_weight='balanced'),
-                "RF" : RandomForestClassifier(class_weight='balanced'),
-                "Bagging" : BaggingClassifier(estimator= DecisionTreeClassifier(class_weight='balanced')),
-                "AB" : AdaBoostClassifier(estimator= DecisionTreeClassifier(class_weight='balanced'), algorithm='SAMME'),
-                "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'),
-                "SVM" : SVC(probability=True, max_iter = 1000, class_weight='balanced'),
-                # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'),
-                # "XGB": XGBClassifier(), # <-
-                # "MLP" : MLPClassifier(max_iter=500) # <-
+                # "RF" : RandomForestClassifier(class_weight='balanced'),
+                # "Bagging" : BaggingClassifier(estimator= DecisionTreeClassifier(class_weight='balanced')),
+                # "AB" : AdaBoostClassifier(estimator= DecisionTreeClassifier(class_weight='balanced'), algorithm='SAMME'),
+                # "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'),
+                # "SVM" : SVC(probability=True, max_iter = 1000, class_weight='balanced'),
                 }
    # --------------------------------------------------------------------------------------------------------
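The cost-sensitive variants above rely on class_weight='balanced', which weights each class inversely to its frequency in the training data. Below is a minimal sketch of what that weighting computes, using scikit-learn's compute_class_weight on a made-up 90/10 label vector (the toy array is illustrative, not the project's data). It also hints at why XGB and MLP were dropped from models_CS: neither estimator exposes a class_weight parameter (XGBoost uses scale_pos_weight instead).

# Sketch: what class_weight='balanced' amounts to. The label vector is a toy example.
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight

y_toy = np.array([0] * 90 + [1] * 10)        # 90/10 imbalance
classes = np.unique(y_toy)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_toy)
print(dict(zip(classes, weights)))           # ~{0: 0.56, 1: 5.0} = n_samples / (n_classes * bincount)

# Passing the string does the same reweighting internally, as in models_CS:
clf = DecisionTreeClassifier(class_weight='balanced').fit(np.arange(100).reshape(-1, 1), y_toy)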
@@ -121,7 +116,6 @@ if __name__ == "__main__":
        "MLP": {'activation': ['identity', 'logistic', 'tanh', 'relu'],
                'hidden_layer_sizes': randint(50, 150),
                'learning_rate': ['constant', 'invscaling', 'adaptive']}
-       # "ElNet": {'solver': ['lbfgs', 'sag', 'saga']},
        }
    # --------------------------------------------------------------------------------------------------------
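The search spaces mix plain lists, which RandomizedSearchCV samples uniformly, with scipy.stats distributions such as randint and uniform. A quick way to preview the candidates it will draw is scikit-learn's ParameterSampler on the same dictionary; the sketch below uses the MLP space from this file, with arbitrary n_iter and random_state.

# Sketch: preview candidate configurations sampled from the MLP search space above.
from scipy.stats import randint
from sklearn.model_selection import ParameterSampler

mlp_space = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
             'hidden_layer_sizes': randint(50, 150),
             'learning_rate': ['constant', 'invscaling', 'adaptive']}

# RandomizedSearchCV evaluates n_iter such candidates per model (its default n_iter is 10).
for candidate in ParameterSampler(mlp_space, n_iter=5, random_state=0):
    print(candidate)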
@@ -148,25 +142,17 @@ if __name__ == "__main__":
            y = data_dic['y_train_' + method + group]
            # Use group of models with class weight if needed
            models = models_CS if j == 1 else models_simple
-           # Save results: set of optimal hyperpameters -> mean precision and sd for those parameters across folds
-           hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Best Parameters','Mean Precision', 'SD'])
+           # Save optimal hyperparameters for each of the models -> metrics will be computed in a different file
+           hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Model Name', 'Best Parameters'])
            for model_name, model in models.items():
                print(f"{group}-{method_names[j]}-{model_name}")
                # Find optimal hyperparams for curr model
                params = hyperparameters[model_name]
                search = RandomizedSearchCV(model, param_distributions=params, cv=cv, n_jobs=10, scoring='precision')
                search.fit(X,y)
-               # Access the results
-               results = search.cv_results_
-               best_index = search.best_index_
-               # Get sd and mean across folds for best set of hyperpameters
+               # Keep optimal parameters
                best_params = search.best_params_
-               mean_precision_best = results['mean_test_score'][best_index]
-               std_precision_best = results['std_test_score'][best_index]
-               # Storing these values
                hyperparam_df.at[model_name, 'Best Parameters'] = best_params
-               hyperparam_df.at[model_name, 'Mean Precision'] = round(mean_precision_best, 4)
-               hyperparam_df.at[model_name, 'SD'] = round(std_precision_best, 4)
            # Store the DataFrame in the dictionary with a unique key for each sheet
            sheet_name = f"{group}_{method_names[j]}"
            sheets_dict[sheet_name] = hyperparam_df
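sheets_dict accumulates one DataFrame per group/method combination. The actual export sits in the collapsed tail of the diff, so the following is only a sketch of the usual multi-sheet pattern; the file name hyperparams.xlsx is an assumption, not taken from the repository.

# Assumed export pattern: one Excel sheet per group/method key, rows indexed by model name.
import pandas as pd

def export_sheets(sheets_dict, path="hyperparams.xlsx"):   # placeholder file name
    with pd.ExcelWriter(path) as writer:
        for sheet_name, df in sheets_dict.items():
            df.to_excel(writer, sheet_name=sheet_name)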
@@ -178,5 +164,3 @@ if __name__ == "__main__":
    print("Successful tuning")
    # --------------------------------------------------------------------------------------------------------
\ No newline at end of file
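Because the commit drops the in-search precision columns, the downstream analysis files mentioned in the commit message have to rebuild each estimator from the stored parameters. One possible reading pattern is sketched below; the workbook name, the "pre_ORIG" sheet label, and the use of ast.literal_eval (Excel stores the parameter dict as text) are all assumptions based on this diff, not code from the repository.

# Sketch of the downstream side: reload tuned parameters and rebuild a model for evaluation.
# File name, sheet name and group/method labels are illustrative assumptions.
import ast
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

sheet = pd.read_excel("hyperparams.xlsx", sheet_name="pre_ORIG", index_col=0)
best_params = ast.literal_eval(str(sheet.loc["DT", "Best Parameters"]))
model = DecisionTreeClassifier(**best_params)   # ready to fit/evaluate on held-out data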