import pandas as pd import numpy as np import shap import matplotlib.pyplot as plt from os import listdir from sklearn.model_selection import train_test_split def getDatasets(dataset, f): # Import of database label = pd.read_csv("labels.csv") if dataset == "Dropout_1": if f == "_Filtered": db_cluster = pd.read_csv("featsGR_cluster.csv", sep=",") db = pd.read_csv("featsGR.csv", sep=",") else: db_cluster = pd.read_csv("featsCluster.csv", sep=",").drop(columns="Unnamed: 0") db = pd.read_csv("feats.csv", sep=",").drop(columns="Unnamed: 0") # Creation of train and test sets for the set without cluster columns_to_be_changed = db.select_dtypes(exclude='number').columns.values sin_cluster_data_features = pd.get_dummies(db, columns=columns_to_be_changed) # Creation of train and test sets for the set with cluster columns_to_be_changed = db_cluster.select_dtypes(exclude='number').columns.values cluster_data_features = pd.get_dummies(db_cluster, columns=columns_to_be_changed) for col1 in sin_cluster_data_features: sin_cluster_data_features[col1] = sin_cluster_data_features[col1].astype(float) for col2 in cluster_data_features: cluster_data_features[col2] = cluster_data_features[col2].astype(float) return sin_cluster_data_features, cluster_data_features, label def plots(shap_values, tFeatures, name): print(shap_values.shape) print(tFeatures.shape) print(name) shap.summary_plot(shap_values, tFeatures, plot_type="bar", show=False, max_display=10, plot_size=(20, 8)) plt.savefig('figures/'+name+'_bar.svg', format='svg', dpi=1200) plt.clf() plt.xscale('log') shap.summary_plot(shap_values, tFeatures, plot_type="dot", show=False, max_display=10, plot_size=(20, 8)) plt.savefig('figures/'+name+'_dot.svg', format='svg', dpi=1200) plt.clf() print(":::::::::::::::::::::::::::::::::::::::::::::::::::") if __name__ == "__main__": datasets = ["Dropout_1"] filtered = ["","_Filtered"] for f in filtered: for d in datasets: sin_cluster_data_features, cluster_data_features, label = getDatasets(d, f) shap.initjs() train_data_features, test_data_features, train_data_label, test_data_label = train_test_split(sin_cluster_data_features, label, test_size=0.2, random_state=25) train_data_features_cluster, test_data_features_cluster, train_data_label_cluster, test_data_label_cluster = train_test_split( cluster_data_features, label, test_size=0.2, random_state=25) features = list(train_data_features.columns.values) # beware that this will change in case of FSS featuresC = list(train_data_features_cluster.columns.values) for file in listdir("shapValues/" + d + f): shapValue = np.load("shapValues/" + d + f + '/' + file) nameO = file.split(".")[0] cluster = nameO[-1] tree = nameO[10] == 'T' or nameO[10] == 'R' if cluster == 'C': test_data_feats = test_data_features_cluster[:500] feats = featuresC else: test_data_feats = test_data_features[:500] feats = features if tree: shape = [2, 500, len(feats)] else: shape = [500, len(feats)] shapValue = np.reshape(shapValue, shape) if tree: shapValue = shapValue[0] plots(shapValue, test_data_feats, nameO)