Able to retrieve chosen tuned models based on model name easily

aa9797c1 · Joaquin Torres · 050d8a00 · aa9797c1
Commit aa9797c1 authored Jun 06, 2024 by Joaquin Torres
Hide whitespace changes
Inline Side-by-side

Showing with 80 additions and 26 deletions

explicability/shap_vals.py explicability/shap_vals.py +80 -26

No files found.
--- a/explicability/shap_vals.py
+++ b/explicability/shap_vals.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import numpy as np
 import shap
+import ast
 from xgboost import XGBClassifier
 from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
@@ -61,6 +62,58 @@ def read_data():
    return data_dic
 # --------------------------------------------------------------------------------------------------------
+# get_chosen_model: build an unfitted model instance for one (group, method, model name) choice from the tuned-hyperparameter spreadsheet
+# --------------------------------------------------------------------------------------------------------
+def get_chosen_model(group_str, method_str, model_name):
+    # Load the sheet named "<group_str>_<method_str>"; the path is relative, so it resolves against the current working directory
+    tuned_models_df = pd.read_excel("../model_selection/output_hyperparam/hyperparamers.xlsx", sheet_name=f"{group_str}_{method_str}")
+    tuned_models_df.columns = ['Model', 'Best Parameters']  # assumes the sheet has exactly two columns - TODO confirm
+    # Map model abbreviations (the values matched against the 'Model' column) to estimator classes: scikit-learn ones plus xgboost's XGBClassifier
+    model_mapping = {
+        'DT': DecisionTreeClassifier,
+        'RF': RandomForestClassifier,
+        'Bagging': BaggingClassifier,
+        'AB': AdaBoostClassifier,
+        'XGB': XGBClassifier,
+        'LR': LogisticRegression,
+        'SVM': SVC,
+        'MLP': MLPClassifier
+    }
+    # Select the first row whose 'Model' column equals model_name (.iloc[0] raises IndexError when no row matches)
+    row = tuned_models_df[tuned_models_df['Model'] == model_name].iloc[0]
+    # 'Best Parameters' is expected to hold a Python dict literal stored as text; literal_eval parses it without executing code
+    parameters = ast.literal_eval(row['Best Parameters'])
+    # Force fixed settings per model type; these overwrite any same-named key read from the sheet
+    if model_name == 'AB':
+        parameters['algorithm'] = 'SAMME'
+    elif model_name == 'LR':
+        parameters['max_iter'] = 1000
+    elif model_name == 'SVM':
+        parameters['max_iter'] = 1000
+        parameters['probability'] = True
+    elif model_name == "MLP":
+        parameters['max_iter'] = 500
+    # Cost-sensitive methods (method_str containing 'CW'): Bagging/AB get a class-weighted DecisionTreeClassifier as 'estimator', every other model gets class_weight directly
+    if 'CW' in method_str:
+        if model_name in ['Bagging', 'AB']:
+            parameters['estimator'] = DecisionTreeClassifier(class_weight='balanced')
+        else:  # NOTE(review): assumes each remaining model class accepts class_weight - confirm for 'MLP' and 'XGB'
+            parameters['class_weight'] = 'balanced'
+    # Resolve the estimator class (KeyError for an abbreviation missing from model_mapping)
+    model_class = model_mapping[model_name]
+    # Instantiate it with the assembled keyword arguments; the model is returned without being fitted
+    chosen_model = model_class(**parameters)
+    return chosen_model
+# --------------------------------------------------------------------------------------------------------
 if __name__ == "__main__":
    # Setup
@@ -73,18 +126,12 @@ if __name__ == "__main__":
        2: "OVER",
        3: "UNDER"
    }
-    # Best model initialization (to be completed - manually)
-    # Mapping group-method -> (isTreeModel:bool, model)
+    model_choices = {
-    models = {
+        "ORIG": "XGB",
-        "pre_ORIG": (None,None),
+        "ORIG_CW": "RF",
-        "pre_ORIG_CW": (None,None), 
+        "OVER": "XGB",
-        "pre_OVER": (None,None),
+        "UNDER": "XGB"
-        "pre_UNDER": (None,None),
-        "post_ORIG": (None,None),
-        "post_ORIG": (None,None),
-        "post_ORIG_CW": (None,None), 
-        "post_OVER": (None,None),
-        "post_UNDER": (None,None),
    }
    # # Retrieve attribute names in order
    # df = pd.read_csv("..\gen_train_data\data\input\pre_dataset.csv")
@@ -102,19 +149,26 @@ if __name__ == "__main__":
            # Get train dataset based on group and method
            X_train = data_dic['X_train_' + method + group]
            y_train = data_dic['y_train_' + method + group]
-            # Retrieve best model for this group-method context
+            method_name = method_names[j]
-            model_info = models[group + '_' + method_names[j]]
+            # Get chosen tuned model for this group and method context
-            is_tree = model_info[0]
+            model = get_chosen_model(group_str=group, method_str=method_name, model_name=model_choices[method_name])
-            model = model_info[1]
+            print(f'Name: {model_choices[method_name]}')
-            # Fit model with training data
+            print(model.get_params())
-            fitted_model = model.fit(X_train[:500], y_train[:500])
+            # # --------------------------------------------------------------------------------------------------------
-            # Check if we are dealing with a tree vs nn model
+            # # Retrieve best model for this group-method context
-            if is_tree:
+            # model_info = models[group + '_' + method_names[j]]
-                explainer = shap.TreeExplainer(fitted_model, X_test[:500])
+            # is_tree = model_info[0]
-            else:
+            # model = model_info[1]
-                explainer = shap.KernelExplainer(fitted_model.predict, X_test[:500])
+            # # Fit model with training data
-            # Compute shap values
+            # fitted_model = model.fit(X_train[:500], y_train[:500])
-            shap_vals = explainer.shap_values(X_test[:500], check_additivity=False) # Change to true for final results
+            # # Check if we are dealing with a tree vs nn model
+            # if is_tree:
+            #     explainer = shap.TreeExplainer(fitted_model, X_test[:500])
+            # else:
+            #     explainer = shap.KernelExplainer(fitted_model.predict, X_test[:500])
+            # # Compute shap values
+            # shap_vals = explainer.shap_values(X_test[:500], check_additivity=False) # Change to true for final results
+            # # ---------------------------------------------------------------------------------------------------------
            # Save results
-            np.save(f"shap_values/{group}_{method_names[j]}", shap_vals)
+            # np.save(f"shap_values/{group}_{method_names[j]}", shap_vals)
    # --------------------------------------------------------------------------------------------------------
\ No newline at end of file