Commit 203b75e8 authored by Joaquin Torres

Found optimal hyperparameters and exported them; further analysis to be done in separate files

parent 620b8a59
"""
Selecting best models through cross-validation and hyperparameter tuning
for each method:
Finding optimal hyperparameters through RandomizedSearchCV for each group (1. pre - 2. post)
and method:
1. Original training dataset
2. Original training dataset - Cost sensitive
3. Oversampling
@@ -21,7 +21,6 @@ from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
import os
# --------------------------------------------------------------------------------------------------------
# Function to read training datasets
@@ -73,26 +72,22 @@ if __name__ == "__main__":
# --------------------------------------------------------------------------------------------------------
# 1. No class weight
models_simple = {"DT" : DecisionTreeClassifier(),
"RF" : RandomForestClassifier(),
"Bagging" : BaggingClassifier(),
"AB" : AdaBoostClassifier(algorithm='SAMME'),
"XGB": XGBClassifier(),
"LR" : LogisticRegression(max_iter=1000),
"SVM" : SVC(probability=True, max_iter=1000),
"MLP" : MLPClassifier(max_iter=500)
# "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet')
# "RF" : RandomForestClassifier(),
# "Bagging" : BaggingClassifier(),
# "AB" : AdaBoostClassifier(algorithm='SAMME'),
# "XGB": XGBClassifier(),
# "LR" : LogisticRegression(max_iter=1000),
# "SVM" : SVC(probability=True, max_iter=1000),
# "MLP" : MLPClassifier(max_iter=500)
}
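# Note: algorithm='SAMME' sidesteps AdaBoost's 'SAMME.R' option, which
# scikit-learn deprecated in 1.4; probability=True makes SVC expose
# predict_proba via an internal cross-validated Platt scaling fit.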
# 2. Class weight: cost-sensitive learning
models_CS = {"DT" : DecisionTreeClassifier(class_weight='balanced'),
"RF" : RandomForestClassifier(class_weight='balanced'),
"Bagging" : BaggingClassifier(estimator= DecisionTreeClassifier(class_weight='balanced')),
"AB" : AdaBoostClassifier(estimator= DecisionTreeClassifier(class_weight='balanced'), algorithm='SAMME'),
"LR" : LogisticRegression(max_iter=1000, class_weight='balanced'),
"SVM" : SVC(probability=True, max_iter = 1000, class_weight='balanced'),
# "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'),
# "XGB": XGBClassifier(), # <-
# "MLP" : MLPClassifier(max_iter=500) # <-
# "RF" : RandomForestClassifier(class_weight='balanced'),
# "Bagging" : BaggingClassifier(estimator= DecisionTreeClassifier(class_weight='balanced')),
# "AB" : AdaBoostClassifier(estimator= DecisionTreeClassifier(class_weight='balanced'), algorithm='SAMME'),
# "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'),
# "SVM" : SVC(probability=True, max_iter = 1000, class_weight='balanced'),
}
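# Note: class_weight='balanced' weights samples by n_samples / (n_classes * np.bincount(y)),
# so minority-class mistakes cost more. BaggingClassifier and AdaBoostClassifier take no
# class_weight argument themselves, so the weighting lives in their base
# DecisionTreeClassifier; XGB and MLP expose no class_weight parameter either,
# which is presumably why they are commented out above.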
# --------------------------------------------------------------------------------------------------------
@@ -121,7 +116,6 @@ if __name__ == "__main__":
"MLP": {'activation': ['identity', 'logistic', 'tanh', 'relu'],
'hidden_layer_sizes': randint(50, 150),
'learning_rate': ['constant', 'invscaling', 'adaptive']}
# "ElNet": {'solver': ['lbfgs', 'sag', 'saga']},
}
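# randint(a, b) and uniform(loc, scale) are frozen scipy distributions;
# RandomizedSearchCV calls .rvs() on them to draw one value per sampled
# candidate, so each candidate gets an independently drawn parameter set,
# while plain lists (e.g. the 'activation' options) are sampled uniformly.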
# --------------------------------------------------------------------------------------------------------
@@ -148,25 +142,17 @@ if __name__ == "__main__":
y = data_dic['y_train_' + method + group]
# Use group of models with class weight if needed
models = models_CS if j == 1 else models_simple
# Save results: set of optimal hyperparameters -> mean precision and sd for those parameters across folds
hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Best Parameters','Mean Precision', 'SD'])
# Save optimal hyperparameters for each of the models -> metrics will be computed in a different file
hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Model Name', 'Best Parameters'])
for model_name, model in models.items():
print(f"{group}-{method_names[j]}-{model_name}")
# Find optimal hyperparams for curr model
params = hyperparameters[model_name]
search = RandomizedSearchCV(model, param_distributions=params, cv=cv, n_jobs=10, scoring='precision')
search.fit(X, y)
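# n_iter is left at its default of 10 parameter settings per model; with
# cross-validation this costs n_iter * n_splits fits, so raising n_iter
# widens the search at a linear cost in training time.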
# Access the results
results = search.cv_results_
best_index = search.best_index_
# Get sd and mean across folds for best set of hyperparameters
# Keep optimal parameters
best_params = search.best_params_
mean_precision_best = results['mean_test_score'][best_index]
std_precision_best = results['std_test_score'][best_index]
# Storing these values
hyperparam_df.at[model_name, 'Best Parameters'] = best_params
hyperparam_df.at[model_name, 'Mean Precision'] = round(mean_precision_best, 4)
hyperparam_df.at[model_name, 'SD'] = round(std_precision_best, 4)
# Store the DataFrame in the dictionary with a unique key for each sheet
sheet_name = f"{group}_{method_names[j]}"
sheets_dict[sheet_name] = hyperparam_df
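# The export step is elided by the hunk below; a typical pattern (illustrative
# only -- the actual output path is not shown in this diff) would be:
#   with pd.ExcelWriter('hyperparameters.xlsx') as writer:
#       for name, df in sheets_dict.items():
#           df.to_excel(writer, sheet_name=name)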
@@ -178,5 +164,3 @@ if __name__ == "__main__":
print("Successful tuning")
# --------------------------------------------------------------------------------------------------------
\ No newline at end of file