From 36a3534ea7d776890c1b149287e5ebadb59b60f2 Mon Sep 17 00:00:00 2001
From: joaquintb
Date: Tue, 21 May 2024 15:55:50 +0200
Subject: [PATCH] script prepared to compute shap values

---
 gen_train_data/.gitignore => .gitignore |   0
 explicability/shap_vals.py              | 108 +++++++++++++++++++++++-
 model_selection/test_models.py          |   8 +-
 3 files changed, 109 insertions(+), 7 deletions(-)
 rename gen_train_data/.gitignore => .gitignore (100%)

diff --git a/gen_train_data/.gitignore b/.gitignore
similarity index 100%
rename from gen_train_data/.gitignore
rename to .gitignore
diff --git a/explicability/shap_vals.py b/explicability/shap_vals.py
index 09a7cda..8511020 100644
--- a/explicability/shap_vals.py
+++ b/explicability/shap_vals.py
@@ -2,12 +2,114 @@
 # --------------------------------------------------------------------------------------------------------
 import pandas as pd
 import numpy as np
+import shap
+
 from xgboost import XGBClassifier
-from sklearn.metrics import confusion_matrix
-from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, accuracy_score
 from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
 from sklearn.neural_network import MLPClassifier
 from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
-# --------------------------------------------------------------------------------------------------------
\ No newline at end of file
+# --------------------------------------------------------------------------------------------------------
+
+# Reading test and training data
+# --------------------------------------------------------------------------------------------------------
+def read_data():
+    """Load the pre/post test and training .npy arrays and return them in a dict keyed by variable name."""
+    # Load test data
+    X_test_pre = np.load('../gen_train_data/data/output/pre/X_test_pre.npy', allow_pickle=True)
+    y_test_pre = np.load('../gen_train_data/data/output/pre/y_test_pre.npy', allow_pickle=True)
+    X_test_post = np.load('../gen_train_data/data/output/post/X_test_post.npy', allow_pickle=True)
+    y_test_post = np.load('../gen_train_data/data/output/post/y_test_post.npy', allow_pickle=True)
+
+    # Load ORIGINAL training data
+    X_train_pre = np.load('../gen_train_data/data/output/pre/X_train_pre.npy', allow_pickle=True)
+    y_train_pre = np.load('../gen_train_data/data/output/pre/y_train_pre.npy', allow_pickle=True)
+    X_train_post = np.load('../gen_train_data/data/output/post/X_train_post.npy', allow_pickle=True)
+    y_train_post = np.load('../gen_train_data/data/output/post/y_train_post.npy', allow_pickle=True)
+
+    # Load oversampled training data
+    X_train_over_pre = np.load('../gen_train_data/data/output/pre/X_train_over_pre.npy', allow_pickle=True)
+    y_train_over_pre = np.load('../gen_train_data/data/output/pre/y_train_over_pre.npy', allow_pickle=True)
+    X_train_over_post = np.load('../gen_train_data/data/output/post/X_train_over_post.npy', allow_pickle=True)
+    y_train_over_post = np.load('../gen_train_data/data/output/post/y_train_over_post.npy', allow_pickle=True)
+
+    # Load undersampled training data
+    X_train_under_pre = np.load('../gen_train_data/data/output/pre/X_train_under_pre.npy', allow_pickle=True)
+    y_train_under_pre = np.load('../gen_train_data/data/output/pre/y_train_under_pre.npy', allow_pickle=True)
+    X_train_under_post = np.load('../gen_train_data/data/output/post/X_train_under_post.npy', allow_pickle=True)
+    y_train_under_post = np.load('../gen_train_data/data/output/post/y_train_under_post.npy', allow_pickle=True)
+
+    data_dic = {
+        "X_test_pre": X_test_pre,
+        "y_test_pre": y_test_pre,
+        "X_test_post": X_test_post,
+        "y_test_post": y_test_post,
+        "X_train_pre": X_train_pre,
+        "y_train_pre": y_train_pre,
+        "X_train_post": X_train_post,
+        "y_train_post": y_train_post,
+        "X_train_over_pre": X_train_over_pre,
+        "y_train_over_pre": y_train_over_pre,
+        "X_train_over_post": X_train_over_post,
+        "y_train_over_post": y_train_over_post,
+        "X_train_under_pre": X_train_under_pre,
+        "y_train_under_pre": y_train_under_pre,
+        "X_train_under_post": X_train_under_post,
+        "y_train_under_post": y_train_under_post,
+    }
+
+    return data_dic
+# --------------------------------------------------------------------------------------------------------
+
+if __name__ == "__main__":
+
+    # Setup
+    # --------------------------------------------------------------------------------------------------------
+    # Reading data
+    data_dic = read_data()
+    method_names = {
+        0: "ORIG",
+        1: "ORIG_CW",
+        2: "OVER",
+        3: "UNDER"
+    }
+    # Best model initialization (to be completed - manually)
+    # Mapping group-method -> (isTreeModel:bool, model)
+    models = {
+        "pre_ORIG": (None,None),
+        "pre_ORIG_CW": (None,None),
+        "pre_OVER": (None,None),
+        "pre_UNDER": (None,None),
+        "post_ORIG": (None,None),
+        "post_ORIG_CW": (None,None),
+        "post_OVER": (None,None),
+        "post_UNDER": (None,None),
+    }
+    # --------------------------------------------------------------------------------------------------------
+
+    # Shap value generation
+    # --------------------------------------------------------------------------------------------------------
+    shap_values = {} # Mapping group-method -> shap values
+    for group in ['pre', 'post']:
+        # Get test dataset based on group
+        X_test = data_dic['X_test_' + group]
+        y_test = data_dic['y_test_' + group]
+        for j, method in enumerate(['', '', 'over_', 'under_']):
+            print(f"{group}-{method_names[j]}")
+            # Get train dataset based on group and method
+            X_train = data_dic['X_train_' + method + group]
+            y_train = data_dic['y_train_' + method + group]
+            # Retrieve best model for this group-method context
+            model_info = models[group + '_' + method_names[j]]
+            is_tree = model_info[0]
+            model = model_info[1]
+            # Entries still set to the (None, None) placeholder have nothing to fit yet
+            if model is None:
+                continue
+            # Fit model with training data
+            fitted_model = model.fit(X_train, y_train) # [:500]?
+            # Check if we are dealing with a tree vs nn model
+            if is_tree:
+                explainer = shap.TreeExplainer(fitted_model, X_test) # [:500]?
+    # --------------------------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/model_selection/test_models.py b/model_selection/test_models.py
index cb79d9c..e26fcfb 100644
--- a/model_selection/test_models.py
+++ b/model_selection/test_models.py
@@ -21,9 +21,9 @@ from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
 import ast # String to dictionary
 # --------------------------------------------------------------------------------------------------------
 
-# Reading test data
+# Reading data
 # --------------------------------------------------------------------------------------------------------
-def read_test_data():
+def read_data():
     # Load test data
     X_test_pre = np.load('../gen_train_data/data/output/pre/X_test_pre.npy', allow_pickle=True)
     y_test_pre = np.load('../gen_train_data/data/output/pre/y_test_pre.npy', allow_pickle=True)
@@ -152,8 +152,8 @@ def negative_recall_scorer(clf, X, y):
 # --------------------------------------------------------------------------------------------------------
 
 if __name__ == "__main__":
-    # Reading testing data
-    data_dic = read_test_data()
+    # Reading data
+    data_dic = read_data()
 
     # Setup
     # --------------------------------------------------------------------------------------------------------
-- 
2.24.1