import os

import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline


def getDatasets(dataset, f):
    # Load the labels and both feature tables (with and without the cluster column)
    label = pd.read_csv("labels.csv")
    if dataset == "Dropout_1":
        if f == "_Filtered":
            db_cluster = pd.read_csv("featsGR_cluster.csv", sep=",")
            db = pd.read_csv("featsGR.csv", sep=",")
        else:
            db_cluster = pd.read_csv("featsCluster.csv", sep=",").drop(columns="Unnamed: 0")
            db = pd.read_csv("feats.csv", sep=",").drop(columns="Unnamed: 0")

    # One-hot encode the non-numeric columns of the set without cluster
    columns_to_be_changed = db.select_dtypes(exclude='number').columns.values
    sin_cluster_data_features = pd.get_dummies(db, columns=columns_to_be_changed)

    # One-hot encode the non-numeric columns of the set with cluster
    columns_to_be_changed = db_cluster.select_dtypes(exclude='number').columns.values
    cluster_data_features = pd.get_dummies(db_cluster, columns=columns_to_be_changed)

    # Cast every column to float so the explainers receive a homogeneous numeric matrix
    for col1 in sin_cluster_data_features:
        sin_cluster_data_features[col1] = sin_cluster_data_features[col1].astype(float)
    for col2 in cluster_data_features:
        cluster_data_features[col2] = cluster_data_features[col2].astype(float)

    return sin_cluster_data_features, cluster_data_features, label
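# Illustrative sketch (not part of the pipeline; the column names are made up):
# pd.get_dummies replaces each non-numeric column with one boolean (or 0/1)
# indicator column per category, so the float cast above yields a fully
# numeric frame, e.g.
#
#     df = pd.DataFrame({"gender": ["F", "M"], "age": [21, 23]})
#     pd.get_dummies(df, columns=["gender"])
#     #    age  gender_F  gender_M
#     # 0   21      True     False
#     # 1   23     False      True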
def getModels(dataset, f):
    # Hyperparameters come from a previous tuning run; the *C variants are the
    # models tuned for the dataset that includes the cluster column.
    if dataset == "Dropout_1":
        if f == "_Filtered":
            modelT = DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')
            modelTC = DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')
            modelRF = RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111)
            modelRFC = RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)
            modelBG = BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=13, warm_start=True)
            modelBGC = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=True)
            modelBS = GradientBoostingClassifier(learning_rate=1.906133726818843, n_estimators=62)
            modelBSC = GradientBoostingClassifier(learning_rate=1.9184233056461408, n_estimators=83)
            modelLR = LogisticRegression(solver='lbfgs', penalty='l2')
            modelLRC = LogisticRegression(solver='newton-cholesky', penalty='l2')
            modelSVM = SVC(C=1.666308029510168, kernel='linear')
            modelSVMC = SVC(C=0.9893908052093191, kernel='linear')
            modelMLP = MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling')
            modelMLPC = MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive')
        else:
            modelT = DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini')
            modelTC = DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')
            modelRF = RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134)
            modelRFC = RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)
            modelBG = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=11, warm_start=True)
            modelBGC = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=16, warm_start=False)
            modelBS = GradientBoostingClassifier(learning_rate=0.9249002333174023, n_estimators=134)
            modelBSC = GradientBoostingClassifier(learning_rate=0.998432567508207, n_estimators=91)
            modelLR = LogisticRegression(solver='sag', penalty='l2')
            modelLRC = LogisticRegression(solver='lbfgs', penalty='l2')
            modelSVM = SVC(C=0.9151969366500319, kernel='linear')
            modelSVMC = SVC(C=1.3078813689652904, kernel='linear')
            modelMLP = MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant')
            modelMLPC = MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant')

    # Wrap every model in a pipeline that rebalances the training data with
    # SMOTETomek (oversample to a 1:1 class ratio, then remove Tomek links)
    t = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelT)])
    tC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelTC)])
    rf = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelRF)])
    rfC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelRFC)])
    bag = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBG)])
    bagC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBGC)])
    boos = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBS)])
    boosC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBSC)])
    lr = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelLR)])
    lrC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelLRC)])
    svm = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelSVM)])
    svmC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelSVMC)])
    mlp = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelMLP)])
    mlpC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelMLPC)])

    return t, tC, rf, rfC, bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC
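# Why imblearn's Pipeline rather than sklearn's: resamplers such as SMOTETomek
# run only inside fit(), so the model trains on rebalanced data while score()
# and predict() see the original samples untouched. A minimal sketch with toy
# data (make_classification is used purely for illustration):
#
#     from sklearn.datasets import make_classification
#     X, y = make_classification(weights=[0.9, 0.1], random_state=0)
#     pipe = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', LogisticRegression())])
#     pipe.fit(X, y)       # X, y are resampled before reaching the classifier
#     pipe.predict(X)      # no resampling happens at prediction time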
if __name__ == '__main__':
    datasets = ["Dropout_1"]
    filtered = ["", "_Filtered"]
    for f in filtered:
        for d in datasets:
            sin_cluster_data_features, cluster_data_features, label = getDatasets(d, f)
            shap.initjs()  # loads the JS visualisation library (only matters in notebooks)

            train_data_features, test_data_features, train_data_label, test_data_label = train_test_split(
                sin_cluster_data_features, label, test_size=0.2, random_state=25)
            train_data_features_cluster, test_data_features_cluster, train_data_label_cluster, test_data_label_cluster = train_test_split(
                cluster_data_features, label, test_size=0.2, random_state=25)
            features = list(train_data_features.columns.values)  # beware that this will change in case of FSS

            shap_set = test_data_features
            shap_setC = test_data_features_cluster

            t, tC, rf, rfC, bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC = getModels(d, f)
            tree_models = [t, tC, rf, rfC]
            tree_names = ["T", "TC", "RF", "RFC"]
            # Models without a tree structure; explained with KernelExplainer below
            nn_models = [bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC]
            nn_names = ["BG", "BGC", "BOOS", "BOOSC", "LR", "LRC", "SVM", "SVMC", "MLP", "MLPC"]

            shap_values = {}
            os.makedirs("./shapValues", exist_ok=True)  # np.save below fails if the folder is missing

            # DRAFT OF THE LOOPING: the [:500] subsets and check_additivity=False
            # keep runs fast while debugging; drop both for the final computation.
            # Even indices hold the models for the set without cluster, odd indices
            # the ones for the set with cluster. ravel() flattens the single label
            # column into the 1-D shape sklearn expects.
            for i, (m, mN) in enumerate(zip(tree_models, tree_names)):
                if i % 2 == 0:
                    fitted_model = m.fit(train_data_features.values[:500], train_data_label.values.ravel()[:500])
                    print('\n' + mN, ":", m.score(test_data_features.values, test_data_label.values))
                    explainer = shap.TreeExplainer(fitted_model['m'], shap_set[:500])
                    shap_values[mN] = explainer.shap_values(shap_set[:500], check_additivity=False)
                else:
                    fitted_model = m.fit(train_data_features_cluster.values[:500], train_data_label_cluster.values.ravel()[:500])
                    # Score on the held-out test set (the original scored on the training set here)
                    print('\n' + mN, ":", m.score(test_data_features_cluster.values, test_data_label_cluster.values))
                    explainer = shap.TreeExplainer(fitted_model['m'], shap_setC[:500])
                    shap_values[mN] = explainer.shap_values(shap_setC[:500], check_additivity=False)

                # Debug prints: overall shape, one row of the first output, the full second output
                print(np.array(shap_values[mN]).shape)
                print(shap_values[mN][0][0].shape)
                print(shap_values[mN][1].shape)
                np.save("./shapValues/" + d + "_" + mN + f, shap_values[mN])

            # NOTE: a 500-sample background makes KernelExplainer slow; shap
            # recommends ~100 samples or a kmeans summary of the background.
            for i, (m, mN) in enumerate(zip(nn_models, nn_names)):
                if i % 2 == 0:
                    fitted_model = m.fit(train_data_features.values[:500], train_data_label.values.ravel()[:500])
                    print('\n' + mN, ":", m.score(test_data_features.values, test_data_label.values))
                    explainer = shap.KernelExplainer(fitted_model['m'].predict, shap_set[:500])
                    # check_additivity is a TreeExplainer-only argument, so it is not passed here
                    shap_values[mN] = explainer.shap_values(shap_set[:500])
                else:
                    fitted_model = m.fit(train_data_features_cluster.values[:500], train_data_label_cluster.values.ravel()[:500])
                    # Score on the held-out test set (the original scored on the training set here)
                    print('\n' + mN, ":", m.score(test_data_features_cluster.values, test_data_label_cluster.values))
                    explainer = shap.KernelExplainer(fitted_model['m'].predict, shap_setC[:500])
                    shap_values[mN] = explainer.shap_values(shap_setC[:500])
                np.save("./shapValues/" + d + "_" + mN + f, shap_values[mN])
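# Downstream use (a sketch, not part of the run): reload a saved array and plot
# it. Tree models store one array per class, so they need list(...); the
# KernelExplainer outputs are a single 2-D array and can be passed directly.
#
#     vals = np.load("./shapValues/Dropout_1_RF.npy", allow_pickle=True)
#     shap.summary_plot(list(vals), shap_set[:500], feature_names=features)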