COMPARA / covid_analysis · Commits

Commit 2d351899, authored May 14, 2024 by Joaquin Torres
Parent: 6674b724

Commit message: automated reading of the parameters for tuned models

Showing 1 changed file, model_selection/test_models.py, with 45 additions and 100 deletions (+45 / -100).
model_selection/test_models.py
@@ -18,6 +18,7 @@ from sklearn.metrics import RocCurveDisplay, roc_curve
 from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
 import matplotlib.pyplot as plt
 from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
+import ast # String to dictionary
 # --------------------------------------------------------------------------------------------------------
 # Reading test data
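The new ast import exists so that hyperparameter dictionaries stored as strings in a spreadsheet can be parsed back into Python dicts. A minimal sketch of that conversion (the example string is illustrative, not taken from the project's spreadsheet):

    import ast

    # literal_eval safely parses Python literals (dicts, lists, strings, numbers)
    # without executing arbitrary code, unlike eval()
    params = ast.literal_eval("{'splitter': 'best', 'max_features': 'sqrt'}")
    print(params['splitter'])  # -> best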
@@ -71,103 +72,47 @@ def read_test_data():
 # Returning tuned models for each situation
 # --------------------------------------------------------------------------------------------------------
-def get_tuned_models(group_id, method_id):
-    # 1. PRE
-    if group_id == 0:
-        # 1.1) Trained with original dataset
-        if method_id == 0:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'entropy'}),
-                "RF": RandomForestClassifier(**{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 123}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 13, 'warm_start': False}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.8473150336970519, 'n_estimators': 96, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.21528982071549305, 'max_depth': 6, 'n_estimators': 804}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.051871311397777, 'kernel': 'linear', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'identity', 'hidden_layer_sizes': 78, 'learning_rate': 'constant', 'max_iter': 500})
-            }
-        # 1.2) Trained with original dataset and cost-sensitive learning
-        elif method_id == 1:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'log2', 'criterion': 'entropy', 'class_weight': 'balanced'}),
-                "RF": RandomForestClassifier(**{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 238, 'class_weight': 'balanced'}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 22, 'warm_start': False, 'estimator': DecisionTreeClassifier(class_weight='balanced')}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.7136783954287846, 'n_estimators': 99, 'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(class_weight='balanced')}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000, 'class_weight': 'balanced'}),
-                "SVM": SVC(**{'C': 1.480857958217729, 'kernel': 'linear', 'max_iter': 1000, 'class_weight': 'balanced', 'probability': True}),
-            }
-        # 1.3) Trained with oversampled training dataset
-        elif method_id == 2:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'log_loss'}),
-                "RF": RandomForestClassifier(**{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 121}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 22, 'warm_start': True}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.4640913091426446, 'n_estimators': 145, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.19621698151985992, 'max_depth': 7, 'n_estimators': 840}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.590799972846728, 'kernel': 'poly', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'relu', 'hidden_layer_sizes': 112, 'learning_rate': 'constant', 'max_iter': 500})
-            }
-        # 1.4) Trained with undersampled training dataset
-        elif method_id == 3:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'log_loss'}),
-                "RF": RandomForestClassifier(**{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 148}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 24, 'warm_start': True}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.7970533619575801, 'n_estimators': 122, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.13148624656904934, 'max_depth': 9, 'n_estimators': 723}),
-                "LR": LogisticRegression(**{'solver': 'sag', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.383651513577477, 'kernel': 'poly', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'relu', 'hidden_layer_sizes': 89, 'learning_rate': 'invscaling', 'max_iter': 500})
-            }
-    # 2. POST
-    else:
-        # 2.1) Trained with original dataset
-        if method_id == 0:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'log_loss'}),
-                "RF": RandomForestClassifier(**{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 120}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 38, 'warm_start': True}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.9069394544838472, 'n_estimators': 121, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.24787889985627387, 'max_depth': 4, 'n_estimators': 956}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2'}),
-                "SVM": SVC(**{'C': 1.7965537393241109, 'kernel': 'linear', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'relu', 'hidden_layer_sizes': 147, 'learning_rate': 'invscaling', 'max_iter': 500})
-            }
-        # 2.2) Trained with original dataset and cost-sensitive learning
-        elif method_id == 1:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'gini', 'class_weight': 'balanced'}),
-                "RF": RandomForestClassifier(**{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 138, 'class_weight': 'balanced'}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 66, 'warm_start': True, 'estimator': DecisionTreeClassifier(class_weight='balanced')}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.92541653518023, 'n_estimators': 114, 'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(class_weight='balanced')}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000, 'class_weight': 'balanced'}),
-                "SVM": SVC(**{'C': 0.8395104850983046, 'kernel': 'linear', 'max_iter': 1000, 'class_weight': 'balanced', 'probability': True})
-            }
-        # 2.3) Trained with oversampled training dataset
-        elif method_id == 2:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'log2', 'criterion': 'entropy'}),
-                "RF": RandomForestClassifier(**{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 118}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 56, 'warm_start': False}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.5933610622176648, 'n_estimators': 114, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.059934879882855396, 'max_depth': 9, 'n_estimators': 660}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.2237930722499044, 'kernel': 'poly', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'identity', 'hidden_layer_sizes': 134, 'learning_rate': 'invscaling', 'max_iter': 500})
-            }
-        # 2.4) Trained with undersampled training dataset
-        elif method_id == 3:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'log2', 'criterion': 'log_loss'}),
-                "RF": RandomForestClassifier(**{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 151}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 20, 'warm_start': False}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.6523810056317618, 'n_estimators': 89, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.18430397856234193, 'max_depth': 4, 'n_estimators': 956}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.1807459108651588, 'kernel': 'linear', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'identity', 'hidden_layer_sizes': 55, 'learning_rate': 'constant', 'max_iter': 500})
-            }
-    return tuned_models
+def get_tuned_models(group_str, method_str):
+    # Read sheet corresponding to group and method with tuned models and their hyperparam
+    tuned_models_df = pd.read_excel("./output_hyperparam/hyperparamers.xlsx", sheet_name=f"{group_str}_{method_str}")
+    # Mapping from model abbreviations to sklearn model classes
+    model_mapping = {
+        'DT': DecisionTreeClassifier,
+        'RF': RandomForestClassifier,
+        'Bagging': BaggingClassifier,
+        'AB': AdaBoostClassifier,
+        'XGB': XGBClassifier,
+        'LR': LogisticRegression,
+        'SVM': SVC,
+        'MLP': MLPClassifier
+    }
+    tuned_models = {}
+    # Iterate through each row of the DataFrame
+    for index, row in tuned_models_df.iterrows():
+        model_name = row[0]
+        # Read dictionary
+        parameters = ast.literal_eval(row['Parameters'])
+        # Add extra parameters
+        if model_name == 'AB':
+            parameters['algorithm'] = 'SAMME'
+        elif model_name == 'LR':
+            parameters['max_iter'] = 1000
+        elif model_name == 'SVM':
+            parameters['max_iter'] = 1000
+            parameters['probability'] = True
+        elif model_name == "MLP":
+            parameters['max_iter'] = 500
+        # Add class_weight argument for cost-sensitive learning method
+        if 'CW' in method_str:
+            if model_name == 'Bagging' or model_name == 'AB':
+                parameters['estimator'] = DecisionTreeClassifier(class_weight='balanced')
+            else:
+                parameters['class_weight'] = 'balanced'
+        # Fetch class
+        model_class = model_mapping[model_name]
+        # Initialize model
+        tuned_models[model_name] = model_class(**parameters)
+    return tuned_models
 # --------------------------------------------------------------------------------------------------------
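For reference, a minimal sketch of the spreadsheet layout the rewritten function appears to expect, and of a call site. Only the column name 'Parameters', the workbook path, and the sheet-name pattern f"{group_str}_{method_str}" come from the code above; the sheet name 'pre_ORIG', the first-column header, and the rows shown are assumptions for illustration:

    # Hypothetical sheet "pre_ORIG" in ./output_hyperparam/hyperparamers.xlsx:
    #
    #   Model   Parameters
    #   DT      {'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'entropy'}
    #   RF      {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 123}
    #
    # Each row becomes one ready-to-fit estimator:
    models = get_tuned_models('pre', 'ORIG')  # sheet name is an assumption
    models['DT'].fit(X_train, y_train)        # standard sklearn usage

This replaces the eight hardcoded hyperparameter dictionaries with a single data-driven loop, keeping the testing script in sync with whatever the tuning stage writes to the workbook.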
@@ -242,12 +187,12 @@ if __name__ == "__main__":
         X_test = data_dic['X_test_' + group]
         y_test = data_dic['y_test_' + group]
         for j, method in enumerate(['', '', 'over_', 'under_']):
-            print(f"{group}-{method}")
+            print(f"{group}-{method_names[j]}")
             # Get train dataset based on group and method
             X_train = data_dic['X_train_' + method + group]
             y_train = data_dic['y_train_' + method + group]
             # Get tuned models for this group and method
-            models = get_tuned_models(group_id=i, method_id=j)
+            models = get_tuned_models(group, method_names[j])
             # Scores df
             scores_df = pd.DataFrame(index=models.keys(), columns=scorings.keys())
             # Create a figure for all models in this group-method
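scores_df is indexed by model abbreviation with one column per metric. A hedged sketch of how such a table could be filled in the evaluation loop that follows, assuming scorings maps metric names to sklearn scorer callables (inferred from scorings.keys() above; the actual loop body is not shown in this diff):

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        for score_name, scorer in scorings.items():
            # sklearn scorers follow the signature scorer(estimator, X, y)
            scores_df.at[model_name, score_name] = scorer(model, X_test, y_test)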
@@ -292,6 +237,6 @@ if __name__ == "__main__":
     with pd.ExcelWriter('./test_results/testing_tuned_models.xlsx') as writer:
         for sheet_name, data in scores_sheets.items():
             data.to_excel(writer, sheet_name=sheet_name)
 # --------------------------------------------------------------------------------------------------------
 # --------------------------------------------------------------------------------------------------------