Upload plotSHAP.py

d126fbfa · Lucia Prieto · 4d3411be · d126fbfa
Commit d126fbfa authored Oct 03, 2023 by Lucia Prieto
Show whitespace changes
Inline Side-by-side

Showing with 96 additions and 0 deletions

code/shap/plotSHAP.py code/shap/plotSHAP.py +96 -0

No files found.
--- a/code/shap/plotSHAP.py
+++ b/code/shap/plotSHAP.py
+import pandas as pd
+import numpy as np
+import shap
+
+import matplotlib.pyplot as plt
+
+from os import listdir
+
+from sklearn.model_selection import train_test_split
+
+
+def getDatasets(dataset, f):
+    # Import of database
+    label = pd.read_csv("labels.csv")
+    if dataset == "Dropout_1":
+        if f == "_Filtered":
+            db_cluster = pd.read_csv("featsGR_cluster.csv", sep=",")
+            db = pd.read_csv("featsGR.csv", sep=",")
+        else:
+            db_cluster = pd.read_csv("featsCluster.csv", sep=",").drop(columns="Unnamed: 0")
+            db = pd.read_csv("feats.csv", sep=",").drop(columns="Unnamed: 0")
+
+
+    # Creation of train and test sets for the set without cluster
+    columns_to_be_changed = db.select_dtypes(exclude='number').columns.values
+    sin_cluster_data_features = pd.get_dummies(db, columns=columns_to_be_changed)
+
+    # Creation of train and test sets for the set with cluster
+    columns_to_be_changed = db_cluster.select_dtypes(exclude='number').columns.values
+    cluster_data_features = pd.get_dummies(db_cluster, columns=columns_to_be_changed)
+
+    for col1 in sin_cluster_data_features:
+        sin_cluster_data_features[col1] = sin_cluster_data_features[col1].astype(float)
+    for col2 in cluster_data_features:
+        cluster_data_features[col2] = cluster_data_features[col2].astype(float)
+
+
+    return sin_cluster_data_features, cluster_data_features, label
+
+def plots(shap_values, tFeatures, name):
+
+    print(shap_values.shape)
+    print(tFeatures.shape)
+    print(name)
+
+    shap.summary_plot(shap_values, tFeatures, plot_type="bar", show=False, max_display=10, plot_size=(20, 8))
+    plt.savefig('figures/'+name+'_bar.svg', format='svg', dpi=1200)
+    plt.clf()
+    plt.xscale('log')
+    shap.summary_plot(shap_values, tFeatures, plot_type="dot", show=False,  max_display=10, plot_size=(20, 8))
+    plt.savefig('figures/'+name+'_dot.svg', format='svg', dpi=1200)
+    plt.clf()
+    print(":::::::::::::::::::::::::::::::::::::::::::::::::::")
+
+if __name__ == "__main__":
+
+    datasets = ["Dropout_1"]
+    filtered = ["","_Filtered"]
+    for f in filtered:
+        for d in datasets:
+            sin_cluster_data_features, cluster_data_features, label = getDatasets(d, f)
+
+            shap.initjs()
+
+            train_data_features, test_data_features, train_data_label, test_data_label = train_test_split(sin_cluster_data_features,
+                                                                                                          label,
+                                                                                                          test_size=0.2,
+                                                                                                          random_state=25)
+            train_data_features_cluster, test_data_features_cluster, train_data_label_cluster, test_data_label_cluster = train_test_split(
+                cluster_data_features, label, test_size=0.2, random_state=25)
+
+            features = list(train_data_features.columns.values)  # beware that this will change in case of FSS
+            featuresC = list(train_data_features_cluster.columns.values)
+
+            for file in listdir("shapValues/" + d + f):
+                shapValue = np.load("shapValues/" + d + f + '/' + file)
+                nameO = file.split(".")[0]
+
+                cluster = nameO[-1]
+                tree = nameO[10] == 'T' or nameO[10] == 'R'
+                if cluster == 'C':
+                    test_data_feats = test_data_features_cluster[:500]
+                    feats = featuresC
+                else:
+                    test_data_feats = test_data_features[:500]
+                    feats = features
+
+                if tree:
+                    shape = [2, 500, len(feats)]
+                else:
+                    shape = [500, len(feats)]
+
+                shapValue = np.reshape(shapValue, shape)
+                if tree:
+                    shapValue = shapValue[0]
+                plots(shapValue, test_data_feats, nameO)