diff --git a/model_selection/hyperparam_tuning.py b/model_selection/hyperparam_tuning.py index 6d8af49771572dbf17ad8a2c86d8c3a4897d5115..4f7734de5bdb3d955d3695293852b06423615934 100644 --- a/model_selection/hyperparam_tuning.py +++ b/model_selection/hyperparam_tuning.py @@ -1,6 +1,6 @@ """ - Selecting best models through cross validation and hyperparameter tunning - for each method: + Finding optimal hyperparameters through RandomSearchCV for each group (1. pre - 2. post) + and method: 1. Original training dataset 2. Original training dataset - Cost sensitive 3. Oversampling @@ -21,7 +21,6 @@ from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from scipy.stats import randint, uniform from sklearn.model_selection import RandomizedSearchCV -import os # -------------------------------------------------------------------------------------------------------- # Function to read training datasets @@ -73,26 +72,22 @@ if __name__ == "__main__": # -------------------------------------------------------------------------------------------------------- # 1. No class weight models_simple = {"DT" : DecisionTreeClassifier(), - "RF" : RandomForestClassifier(), - "Bagging" : BaggingClassifier(), - "AB" : AdaBoostClassifier(algorithm='SAMME'), - "XGB": XGBClassifier(), - "LR" : LogisticRegression(max_iter=1000), - "SVM" : SVC(probability=True, max_iter=1000), - "MLP" : MLPClassifier(max_iter=500) - # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet') + # "RF" : RandomForestClassifier(), + # "Bagging" : BaggingClassifier(), + # "AB" : AdaBoostClassifier(algorithm='SAMME'), + # "XGB": XGBClassifier(), + # "LR" : LogisticRegression(max_iter=1000), + # "SVM" : SVC(probability=True, max_iter=1000), + # "MLP" : MLPClassifier(max_iter=500) } # 2. Class weight: cost-sensitive learning models_CS = {"DT" : DecisionTreeClassifier(class_weight='balanced'), - "RF" : RandomForestClassifier(class_weight='balanced'), - "Bagging" : BaggingClassifier(estimator= DecisionTreeClassifier(class_weight='balanced')), - "AB" : AdaBoostClassifier(estimator= DecisionTreeClassifier(class_weight='balanced'), algorithm='SAMME'), - "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'), - "SVM" : SVC(probability=True, max_iter = 1000, class_weight='balanced'), - # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'), - # "XGB": XGBClassifier(), # <- - # "MLP" : MLPClassifier(max_iter=500) # <- + # "RF" : RandomForestClassifier(class_weight='balanced'), + # "Bagging" : BaggingClassifier(estimator= DecisionTreeClassifier(class_weight='balanced')), + # "AB" : AdaBoostClassifier(estimator= DecisionTreeClassifier(class_weight='balanced'), algorithm='SAMME'), + # "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'), + # "SVM" : SVC(probability=True, max_iter = 1000, class_weight='balanced'), } # -------------------------------------------------------------------------------------------------------- @@ -121,7 +116,6 @@ if __name__ == "__main__": "MLP": {'activation': ['identity', 'logistic', 'tanh', 'relu'], 'hidden_layer_sizes': randint(50, 150), 'learning_rate': ['constant', 'invscaling', 'adaptive']} - # "ElNet": {'solver': ['lbfgs', 'sag', 'saga']}, } # -------------------------------------------------------------------------------------------------------- @@ -148,25 +142,17 @@ if __name__ == "__main__": y = data_dic['y_train_' + method + group] # Use group of models with class weight if needed models = models_CS if j == 1 else models_simple - # Save results: set of optimal hyperpameters -> mean precision and sd for those parameters across folds - hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Best Parameters','Mean Precision', 'SD']) + # Save optimal hyperparameters for each of the models -> metrics will be computed in a different file + hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Model Name', 'Best Parameters']) for model_name, model in models.items(): print(f"{group}-{method_names[j]}-{model_name}") # Find optimal hyperparams for curr model params = hyperparameters[model_name] search = RandomizedSearchCV(model, param_distributions=params, cv=cv, n_jobs=10, scoring='precision') search.fit(X,y) - # Access the results - results = search.cv_results_ - best_index = search.best_index_ - # Get sd and mean across folds for best set of hyperpameters + # Keep optimal parameters best_params = search.best_params_ - mean_precision_best = results['mean_test_score'][best_index] - std_precision_best = results['std_test_score'][best_index] - # Storing these values hyperparam_df.at[model_name, 'Best Parameters'] = best_params - hyperparam_df.at[model_name, 'Mean Precision'] = round(mean_precision_best, 4) - hyperparam_df.at[model_name, 'SD'] = round(std_precision_best, 4) # Store the DataFrame in the dictionary with a unique key for each sheet sheet_name = f"{group}_{method_names[j]}" sheets_dict[sheet_name] = hyperparam_df @@ -177,6 +163,4 @@ if __name__ == "__main__": data.to_excel(writer, sheet_name=sheet_name) print("Successful tuning") - # -------------------------------------------------------------------------------------------------------- - - + # -------------------------------------------------------------------------------------------------------- \ No newline at end of file diff --git a/model_selection/output_hyperparam/hyperparamers.xlsx b/model_selection/output_hyperparam/hyperparamers.xlsx index 0f4ba7ea4cbbfa8585dcc2d47f8faffc61ec8ab8..96eff997c434ccdd40d61a42fe25cf01d2f6fdd3 100644 Binary files a/model_selection/output_hyperparam/hyperparamers.xlsx and b/model_selection/output_hyperparam/hyperparamers.xlsx differ