From 4d3411be76edcc8a6e12d21f6c7e4ee2d0abe708 Mon Sep 17 00:00:00 2001
From: Lucia Prieto
Date: Tue, 3 Oct 2023 04:37:56 +0000
Subject: [PATCH] Upload explicability1.py

---
 code/shap/explicability1.py | 179 ++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 code/shap/explicability1.py

diff --git a/code/shap/explicability1.py b/code/shap/explicability1.py
new file mode 100644
index 0000000..22d51ca
--- /dev/null
+++ b/code/shap/explicability1.py
@@ -0,0 +1,179 @@
+import os
+
+import pandas as pd
+import numpy as np
+import shap
+
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from imblearn.combine import SMOTETomek
+from imblearn.pipeline import Pipeline
+
+
+def getDatasets(dataset, f):
+    # Load the labels and the two feature tables (with and without cluster features)
+    label = pd.read_csv("labels.csv")
+    if dataset == "Dropout_1":
+        if f == "_Filtered":
+            db_cluster = pd.read_csv("featsGR_cluster.csv", sep=",")
+            db = pd.read_csv("featsGR.csv", sep=",")
+        else:
+            db_cluster = pd.read_csv("featsCluster.csv", sep=",").drop(columns="Unnamed: 0")
+            db = pd.read_csv("feats.csv", sep=",").drop(columns="Unnamed: 0")
+
+    # One-hot encode the non-numeric columns of the set without cluster
+    columns_to_be_changed = db.select_dtypes(exclude='number').columns.values
+    sin_cluster_data_features = pd.get_dummies(db, columns=columns_to_be_changed)
+
+    # One-hot encode the non-numeric columns of the set with cluster
+    columns_to_be_changed = db_cluster.select_dtypes(exclude='number').columns.values
+    cluster_data_features = pd.get_dummies(db_cluster, columns=columns_to_be_changed)
+
+    # Cast every column to float so the explainers receive a purely numeric matrix
+    for col1 in sin_cluster_data_features:
+        sin_cluster_data_features[col1] = sin_cluster_data_features[col1].astype(float)
+    for col2 in cluster_data_features:
+        cluster_data_features[col2] = cluster_data_features[col2].astype(float)
+
+    return sin_cluster_data_features, cluster_data_features, label
+
+
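+# getModels returns one imblearn Pipeline per classifier: a SMOTETomek resampling
+# step ('r') that balances the classes, followed by the estimator ('m'), in a
+# plain variant and a cluster variant (the *C names). The hard-coded
+# hyperparameters presumably come from an earlier tuning run.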
+def getModels(dataset, f):
+    if dataset == "Dropout_1":
+        if f == "_Filtered":
+            modelT = DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')
+            modelTC = DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')
+
+            modelRF = RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111)
+            modelRFC = RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)
+
+            modelBG = BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=13, warm_start=True)
+            modelBGC = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=True)
+
+            modelBS = GradientBoostingClassifier(learning_rate=1.906133726818843, n_estimators=62)
+            modelBSC = GradientBoostingClassifier(learning_rate=1.9184233056461408, n_estimators=83)
+
+            modelLR = LogisticRegression(solver='lbfgs', penalty='l2')
+            modelLRC = LogisticRegression(solver='newton-cholesky', penalty='l2')
+
+            modelSVM = SVC(C=1.666308029510168, kernel='linear')
+            modelSVMC = SVC(C=0.9893908052093191, kernel='linear')
+
+            modelMLP = MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling')
+            modelMLPC = MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive')
+
+        else:
+            modelT = DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini')
+            modelTC = DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')
+
+            modelRF = RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134)
+            modelRFC = RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)
+
+            modelBG = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=11, warm_start=True)
+            modelBGC = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=16, warm_start=False)
+
+            modelBS = GradientBoostingClassifier(learning_rate=0.9249002333174023, n_estimators=134)
+            modelBSC = GradientBoostingClassifier(learning_rate=0.998432567508207, n_estimators=91)
+
+            modelLR = LogisticRegression(solver='sag', penalty='l2')
+            modelLRC = LogisticRegression(solver='lbfgs', penalty='l2')
+
+            modelSVM = SVC(C=0.9151969366500319, kernel='linear')
+            modelSVMC = SVC(C=1.3078813689652904, kernel='linear')
+
+            modelMLP = MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant')
+            modelMLPC = MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant')
+
+    t = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelT)])
+    tC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelTC)])
+
+    rf = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelRF)])
+    rfC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelRFC)])
+
+    bag = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBG)])
+    bagC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBGC)])
+
+    boos = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBS)])
+    boosC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBSC)])
+
+    lr = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelLR)])
+    lrC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelLRC)])
+
+    svm = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelSVM)])
+    svmC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelSVMC)])
+
+    mlp = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelMLP)])
+    mlpC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelMLPC)])
+
+    return t, tC, rf, rfC, bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC
+
+
+if __name__ == '__main__':
+    datasets = ["Dropout_1"]
+    filtered = ["", "_Filtered"]
+    os.makedirs("./shapValues", exist_ok=True)  # ensure the output folder exists before saving
+    for f in filtered:
+        for d in datasets:
+            sin_cluster_data_features, cluster_data_features, label = getDatasets(d, f)
+
+            shap.initjs()
+
+            train_data_features, test_data_features, train_data_label, test_data_label = train_test_split(
+                sin_cluster_data_features, label, test_size=0.2, random_state=25)
+            train_data_features_cluster, test_data_features_cluster, train_data_label_cluster, test_data_label_cluster = train_test_split(
+                cluster_data_features, label, test_size=0.2, random_state=25)
+
+            features = list(train_data_features.columns.values)  # beware that this will change in case of FSS
+            shap_set = test_data_features
+            shap_setC = test_data_features_cluster
+
+            t, tC, rf, rfC, bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC = getModels(d, f)
+
+            tree_models = [t, tC, rf, rfC]
+            tree_names = ["T", "TC", "RF", "RFC"]
+
+            nn_models = [bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC]
+            nn_names = ["BG", "BGC", "BOOS", "BOOSC", "LR", "LRC", "SVM", "SVMC", "MLP", "MLPC"]
+
+            shap_values = {}
+
+            # DRAFT OF THE LOOPING
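+            # Models alternate by position: even indices are fit on the plain
+            # feature set and scored on its test split, odd indices on the
+            # cluster-augmented one. Tree-based models use the exact
+            # shap.TreeExplainer; the remaining models fall back to the
+            # model-agnostic shap.KernelExplainer in the second loop. The
+            # [:500] caps presumably keep KernelExplainer affordable.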
+            for i, (m, mN) in enumerate(zip(tree_models, tree_names)):
+                if i % 2 == 0:
+                    fitted_model = m.fit(train_data_features.values[:500], train_data_label.values[:500])
+                    print('\n' + mN, ":", m.score(test_data_features.values, test_data_label.values))
+                    explainer = shap.TreeExplainer(fitted_model['m'], shap_set[:500])
+                    shap_values[mN] = explainer.shap_values(shap_set[:500], check_additivity=False)  # check_additivity to be changed for final computation
+                else:
+                    fitted_model = m.fit(train_data_features_cluster.values[:500], train_data_label_cluster.values[:500])
+                    print('\n' + mN, ":", m.score(test_data_features_cluster.values, test_data_label_cluster.values))
+                    explainer = shap.TreeExplainer(fitted_model['m'], shap_setC[:500])
+                    shap_values[mN] = explainer.shap_values(shap_setC[:500], check_additivity=False)  # check_additivity to be changed for final computation
+
+                # debug: inspect the shapes of the returned SHAP arrays
+                print(np.array(shap_values[mN]).shape)
+                print(shap_values[mN][0][0].shape)
+                print(shap_values[mN][1].shape)
+                np.save("./shapValues/" + d + "_" + mN + f, shap_values[mN])
+
+            for i, (m, mN) in enumerate(zip(nn_models, nn_names)):
+                if i % 2 == 0:
+                    fitted_model = m.fit(train_data_features.values[:500], train_data_label.values[:500])
+                    print('\n' + mN, ":", m.score(test_data_features.values, test_data_label.values))
+                    explainer = shap.KernelExplainer(fitted_model['m'].predict, shap_set[:500])
+                    shap_values[mN] = explainer.shap_values(shap_set[:500])
+                else:
+                    fitted_model = m.fit(train_data_features_cluster.values[:500], train_data_label_cluster.values[:500])
+                    print('\n' + mN, ":", m.score(test_data_features_cluster.values, test_data_label_cluster.values))
+                    explainer = shap.KernelExplainer(fitted_model['m'].predict, shap_setC[:500])
+                    shap_values[mN] = explainer.shap_values(shap_setC[:500])
+
+                np.save("./shapValues/" + d + "_" + mN + f, shap_values[mN])
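+
+# A minimal sketch of how the saved arrays could be inspected afterwards,
+# assuming the same 500-row test slice is rebuilt first (the file name follows
+# the np.save calls above; for the tree models, index 1 holds the
+# positive-class SHAP values):
+#
+#   vals = np.load("./shapValues/Dropout_1_RF.npy")
+#   shap.summary_plot(vals[1], test_data_features[:500])
-- 
2.24.1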