Found optimal hyperparameters and exported them; more analysis to be done in different files

203b75e8 · Joaquin Torres · 620b8a59 · 203b75e8 · 203b75e8
Commit 203b75e8 authored May 22, 2024 by Joaquin Torres
Showing with 18 additions and 34 deletions

model_selection/hyperparam_tuning.py model_selection/hyperparam_tuning.py +18 -34

model_selection/output_hyperparam/hyperparamers.xlsx model_selection/output_hyperparam/hyperparamers.xlsx +0 -0

No files found.
--- a/model_selection/hyperparam_tuning.py
+++ b/model_selection/hyperparam_tuning.py
 """
-    Selecting best models through cross validation and hyperparameter tunning 
+    Finding optimal hyperparameters through RandomizedSearchCV for each group (1. pre - 2. post) 
-    for each method: 
+    and method: 
        1. Original training dataset
        2. Original training dataset - Cost sensitive
        3. Oversampling
@@ -21,7 +21,6 @@ from sklearn.linear_model import  LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from scipy.stats import randint, uniform
 from sklearn.model_selection import RandomizedSearchCV
-import os 
 # --------------------------------------------------------------------------------------------------------
 # Function to read training datasets
@@ -73,26 +72,22 @@ if __name__ == "__main__":
    # --------------------------------------------------------------------------------------------------------
    # 1. No class weight
    models_simple = {"DT" : DecisionTreeClassifier(), 
-            "RF" : RandomForestClassifier(), 
+            # "RF" : RandomForestClassifier(), 
-            "Bagging" : BaggingClassifier(),
+            # "Bagging" : BaggingClassifier(),
-            "AB" : AdaBoostClassifier(algorithm='SAMME'), 
+            # "AB" : AdaBoostClassifier(algorithm='SAMME'), 
-            "XGB": XGBClassifier(),
+            # "XGB": XGBClassifier(),
-            "LR" : LogisticRegression(max_iter=1000), 
+            # "LR" : LogisticRegression(max_iter=1000), 
-            "SVM" : SVC(probability=True, max_iter=1000), 
+            # "SVM" : SVC(probability=True, max_iter=1000), 
-            "MLP" : MLPClassifier(max_iter=500)
+            # "MLP" : MLPClassifier(max_iter=500)
-            # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet')
            }
    # 2. Class weight: cost-sensitive learning
    models_CS = {"DT" : DecisionTreeClassifier(class_weight='balanced'), 
-            "RF" : RandomForestClassifier(class_weight='balanced'), 
+            # "RF" : RandomForestClassifier(class_weight='balanced'), 
-            "Bagging" : BaggingClassifier(estimator= DecisionTreeClassifier(class_weight='balanced')),
+            # "Bagging" : BaggingClassifier(estimator= DecisionTreeClassifier(class_weight='balanced')),
-            "AB" : AdaBoostClassifier(estimator= DecisionTreeClassifier(class_weight='balanced'), algorithm='SAMME'),  
+            # "AB" : AdaBoostClassifier(estimator= DecisionTreeClassifier(class_weight='balanced'), algorithm='SAMME'),  
-            "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'), 
+            # "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'), 
-            "SVM" : SVC(probability=True, max_iter = 1000, class_weight='balanced'), 
+            # "SVM" : SVC(probability=True, max_iter = 1000, class_weight='balanced'), 
-            # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'), 
-            # "XGB": XGBClassifier(), # <-
-            # "MLP" : MLPClassifier(max_iter=500) # <-
            }
    # --------------------------------------------------------------------------------------------------------
@@ -121,7 +116,6 @@ if __name__ == "__main__":
        "MLP": {'activation': ['identity', 'logistic', 'tanh', 'relu'], 
                'hidden_layer_sizes': randint(50, 150), 
                'learning_rate': ['constant', 'invscaling', 'adaptive']}
-        # "ElNet": {'solver': ['lbfgs', 'sag', 'saga']},
    }
    # --------------------------------------------------------------------------------------------------------
@@ -148,25 +142,17 @@ if __name__ == "__main__":
            y = data_dic['y_train_' + method + group]
            # Use group of models with class weight if needed
            models = models_CS if j == 1 else models_simple 
-            # Save results: set of optimal hyperpameters -> mean precision and sd for those parameters across folds
+            # Save optimal hyperparameters for each of the models -> metrics will be computed in a different file
-            hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Best Parameters','Mean Precision', 'SD'])
+            hyperparam_df = pd.DataFrame(index=list(models.keys()), columns=['Model Name', 'Best Parameters'])
            for model_name, model in models.items():
                print(f"{group}-{method_names[j]}-{model_name}")
                # Find optimal hyperparams for curr model
                params = hyperparameters[model_name]
                search = RandomizedSearchCV(model, param_distributions=params, cv=cv, n_jobs=10, scoring='precision')
                search.fit(X,y)
-                # Access the results
+                # Keep optimal parameters
-                results = search.cv_results_
-                best_index = search.best_index_
-                # Get sd and mean across folds for best set of hyperpameters
                best_params = search.best_params_
-                mean_precision_best = results['mean_test_score'][best_index]
-                std_precision_best = results['std_test_score'][best_index]
-                # Storing these values
                hyperparam_df.at[model_name, 'Best Parameters'] = best_params
-                hyperparam_df.at[model_name, 'Mean Precision'] = round(mean_precision_best, 4)
-                hyperparam_df.at[model_name, 'SD'] = round(std_precision_best, 4)
            # Store the DataFrame in the dictionary with a unique key for each sheet
            sheet_name = f"{group}_{method_names[j]}"
            sheets_dict[sheet_name] = hyperparam_df
@@ -177,6 +163,4 @@ if __name__ == "__main__":
            data.to_excel(writer, sheet_name=sheet_name)
    print("Successful tuning")
    # --------------------------------------------------------------------------------------------------------
\ No newline at end of file
--- a/model_selection/output_hyperparam/hyperparamers.xlsx
+++ b/model_selection/output_hyperparam/hyperparamers.xlsx