Merge branch 'model_sel_and_shap' of...

Merge branch 'model_sel_and_shap' of https://medal.ctb.upm.es/internal/gitlab/compara/covid_analysis into model_sel_and_shap

Merge branch 'model_sel_and_shap' of...
Merge branch 'model_sel_and_shap' of https://medal.ctb.upm.es/internal/gitlab/compara/covid_analysis into model_sel_and_shap
9d601f43 · Joaquin Torres · 7cc7f28b · f919e066 · 9d601f43
Commit 9d601f43 authored Jun 07, 2024 by Joaquin Torres
Show whitespace changes
Inline Side-by-side

Showing with 78 additions and 27 deletions

explicability/shap_vals.py explicability/shap_vals.py +78 -27

No files found.
--- a/explicability/shap_vals.py
+++ b/explicability/shap_vals.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import numpy as np
 import shap
+import ast

 from xgboost import XGBClassifier
 from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
@@ -61,6 +62,60 @@ def read_data():
    return data_dic
 # --------------------------------------------------------------------------------------------------------

+# Retrieving parameters for chosen models
+# --------------------------------------------------------------------------------------------------------
+def get_chosen_model(group_str, method_str, model_name):
+    """Build the untrained, tuned classifier chosen for a group/method pair.
+
+    Hyperparameters come from the sheet f"{group_str}_{method_str}" of the tuning
+    workbook, whose two columns are relabeled 'Model' and 'Best Parameters'.
+
+    Args:
+        group_str: first part of the sheet name (the script passes 'pre'/'post').
+        method_str: second part of the sheet name; when it contains 'CW',
+            balanced class weights are requested for cost-sensitive learning.
+        model_name: model abbreviation; must be a key of model_mapping below.
+
+    Returns:
+        (chosen_model, is_tree): the unfitted estimator and a flag that is False
+        only for 'LR', 'SVM' and 'MLP' (used to choose the SHAP explainer).
+
+    Raises:
+        KeyError: model_name is not a known abbreviation.
+        ValueError: the sheet has no row whose 'Model' value equals model_name.
+    """
+    # Define the mapping from model abbreviations to model classes
+    model_mapping = {
+        'DT': DecisionTreeClassifier, 'RF': RandomForestClassifier,
+        'Bagging': BaggingClassifier, 'AB': AdaBoostClassifier,
+        'XGB': XGBClassifier, 'LR': LogisticRegression,
+        'SVM': SVC, 'MLP': MLPClassifier,
+    }
+    # Fail fast, before any file I/O, if the abbreviation is unknown
+    model_class = model_mapping[model_name]
+    # Read sheet corresponding to group and method with tuned models and their hyperparameters
+    tuned_models_df = pd.read_excel("../model_selection/output_hyperparam/hyperparamers.xlsx", sheet_name=f"{group_str}_{method_str}")
+    tuned_models_df.columns = ['Model', 'Best Parameters']
+    matches = tuned_models_df[tuned_models_df['Model'] == model_name]
+    if matches.empty:
+        raise ValueError(f"No tuned parameters for model '{model_name}' in sheet '{group_str}_{method_str}'")
+    # Parse the dictionary of parameters from the 'Best Parameters' column
+    parameters = ast.literal_eval(matches.iloc[0]['Best Parameters'])
+    # Fixed settings layered on top of the tuned values, per model
+    fixed_settings = {'AB': {'algorithm': 'SAMME'}, 'LR': {'max_iter': 1000},
+                      'SVM': {'max_iter': 1000, 'probability': True}, 'MLP': {'max_iter': 500}}
+    parameters.update(fixed_settings.get(model_name, {}))
+    # Add class_weight argument for cost-sensitive learning method
+    if 'CW' in method_str:
+        if model_name in ['Bagging', 'AB']:
+            parameters['estimator'] = DecisionTreeClassifier(class_weight='balanced')
+        else:
+            # NOTE(review): confirm every remaining model accepts class_weight
+            parameters['class_weight'] = 'balanced'
+    # Everything except LR, SVM and MLP is reported as a tree model, for SHAP
+    return model_class(**parameters), model_name not in ['LR', 'SVM', 'MLP']
+# --------------------------------------------------------------------------------------------------------
+
 if __name__ == "__main__":

    # Setup
@@ -73,48 +128,44 @@ if __name__ == "__main__":
        2: "OVER",
        3: "UNDER"
    }
-    # Best model initialization (to be completed - manually)
-    # Mapping group-method -> (isTreeModel:bool, model)
-    models = {
-        "pre_ORIG": (None,None),
-        "pre_ORIG_CW": (None,None), 
-        "pre_OVER": (None,None),
-        "pre_UNDER": (None,None),
-        "post_ORIG": (None,None),
-        "post_ORIG": (None,None),
-        "post_ORIG_CW": (None,None), 
-        "post_OVER": (None,None),
-        "post_UNDER": (None,None),
+    
+    model_choices = {
+        "ORIG": "XGB",
+        "ORIG_CW": "RF",
+        "OVER": "XGB",
+        "UNDER": "XGB"
    }
-    # # Retrieve attribute names in order
-    # df = pd.read_csv("..\gen_train_data\data\input\pre_dataset.csv")
-    # attribute_names = list(df.columns.values)
+    # Retrieve attribute names in order
+    df = pd.read_csv("../gen_train_data/data/input/pre_dataset.csv")
+    attribute_names = list(df.columns.values)
    # --------------------------------------------------------------------------------------------------------

    # Shap value generation
    # --------------------------------------------------------------------------------------------------------
    for i, group in enumerate(['pre', 'post']):
-        # Get test dataset based on group
-        X_test = data_dic['X_test_' + group]
+        # Get test dataset based on group, add column names
+        X_test = pd.DataFrame(data_dic['X_test_' + group], columns=attribute_names)
        y_test = data_dic['y_test_' + group]
        for j, method in enumerate(['', '', 'over_', 'under_']):
            print(f"{group}-{method_names[j]}")
            # Get train dataset based on group and method
-            X_train = data_dic['X_train_' + method + group]
+            X_train = pd.DataFrame(data_dic['X_train_' + method + group], columns=attribute_names)
            y_train = data_dic['y_train_' + method + group]
-            # Retrieve best model for this group-method context
-            model_info = models[group + '_' + method_names[j]]
-            is_tree = model_info[0]
-            model = model_info[1]
+            method_name = method_names[j]
+            # Get chosen tuned model for this group and method context
+            model, is_tree = get_chosen_model(group_str=group, method_str=method_name, model_name=model_choices[method_name])
+            # --------------------------------------------------------------------------------------------------------
            # Fit model with training data
            fitted_model = model.fit(X_train[:500], y_train[:500])
-            # Check if we are dealing with a tree vs nn model
+            # # Check if we are dealing with a tree vs nn model
            if is_tree:
-                explainer = shap.TreeExplainer(fitted_model, X_test[:500])
-            else:
-                explainer = shap.KernelExplainer(fitted_model.predict, X_test[:500])
+                 explainer = shap.TreeExplainer(fitted_model)
+            # else:
+            #     explainer = shap.KernelExplainer(fitted_model.predict_proba, X_test[:500])
            # Compute shap values
            shap_vals = explainer.shap_values(X_test[:500], check_additivity=False) # Change to true for final results
+            # ---------------------------------------------------------------------------------------------------------
            # Save results
-            np.save(f"shap_values/{group}_{method_names[j]}", shap_vals)
+            np.save(f"./output/shap_values/{group}_{method_names[j]}", shap_vals)
+            print(f'Shape of numpy array: {shap_vals.shape}')
    # --------------------------------------------------------------------------------------------------------
\ No newline at end of file