Commit 4d3411be authored by Lucia Prieto

Upload explicability1.py

parent 1ac7e915
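# explicability1.py
# Fits the configured classifiers on the Dropout_1 dataset (with and without
# the cluster feature, in filtered and unfiltered variants), computes SHAP
# values for each fitted model, and saves them under ./shapValues/.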
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
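# Load the label and feature CSVs for the requested dataset/filter variant and
# return the one-hot encoded feature frames (without and with cluster) plus labels.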
def getDatasets(dataset, f):
    # Load the labels and the feature files; the "_Filtered" variant reads the
    # pre-filtered (GR) feature files, the other variant drops the index column
    label = pd.read_csv("labels.csv")
    if dataset == "Dropout_1":
        if f == "_Filtered":
            db_cluster = pd.read_csv("featsGR_cluster.csv", sep=",")
            db = pd.read_csv("featsGR.csv", sep=",")
        else:
            db_cluster = pd.read_csv("featsCluster.csv", sep=",").drop(columns="Unnamed: 0")
            db = pd.read_csv("feats.csv", sep=",").drop(columns="Unnamed: 0")
    # One-hot encode the non-numeric columns of the set without cluster
    columns_to_be_changed = db.select_dtypes(exclude='number').columns.values
    sin_cluster_data_features = pd.get_dummies(db, columns=columns_to_be_changed)
    # One-hot encode the non-numeric columns of the set with cluster
    columns_to_be_changed = db_cluster.select_dtypes(exclude='number').columns.values
    cluster_data_features = pd.get_dummies(db_cluster, columns=columns_to_be_changed)
    # Cast every column to float so the models and explainers get a purely numeric matrix
    for col1 in sin_cluster_data_features:
        sin_cluster_data_features[col1] = sin_cluster_data_features[col1].astype(float)
    for col2 in cluster_data_features:
        cluster_data_features[col2] = cluster_data_features[col2].astype(float)
    return sin_cluster_data_features, cluster_data_features, label

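# Build one pipeline per model family, in pairs: the plain variant for the
# no-cluster data and the *C variant for the data that includes the cluster.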
def getModels(dataset, f):
    if dataset == "Dropout_1":
        if f == "_Filtered":
            modelT = DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')
            modelTC = DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')
            modelRF = RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111)
            modelRFC = RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)
            modelBG = BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=13, warm_start=True)
            modelBGC = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=True)
            modelBS = GradientBoostingClassifier(learning_rate=1.906133726818843, n_estimators=62)
            modelBSC = GradientBoostingClassifier(learning_rate=1.9184233056461408, n_estimators=83)
            modelLR = LogisticRegression(solver='lbfgs', penalty='l2')
            modelLRC = LogisticRegression(solver='newton-cholesky', penalty='l2')
            modelSVM = SVC(C=1.666308029510168, kernel='linear')
            modelSVMC = SVC(C=0.9893908052093191, kernel='linear')
            modelMLP = MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling')
            modelMLPC = MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive')
        else:
            modelT = DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini')
            modelTC = DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')
            modelRF = RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134)
            modelRFC = RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)
            modelBG = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=11, warm_start=True)
            modelBGC = BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=16, warm_start=False)
            modelBS = GradientBoostingClassifier(learning_rate=0.9249002333174023, n_estimators=134)
            modelBSC = GradientBoostingClassifier(learning_rate=0.998432567508207, n_estimators=91)
            modelLR = LogisticRegression(solver='sag', penalty='l2')
            modelLRC = LogisticRegression(solver='lbfgs', penalty='l2')
            modelSVM = SVC(C=0.9151969366500319, kernel='linear')
            modelSVMC = SVC(C=1.3078813689652904, kernel='linear')
            modelMLP = MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant')
            modelMLPC = MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant')
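    # Wrap every model in an imblearn Pipeline with SMOTETomek, so the class
    # rebalancing is applied only to the training data passed to fit()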
    t = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelT)])
    tC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelTC)])
    rf = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelRF)])
    rfC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelRFC)])
    bag = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBG)])
    bagC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBGC)])
    boos = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBS)])
    boosC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelBSC)])
    lr = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelLR)])
    lrC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelLRC)])
    svm = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelSVM)])
    svmC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelSVMC)])
    mlp = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelMLP)])
    mlpC = Pipeline(steps=[('r', SMOTETomek(sampling_strategy=1)), ('m', modelMLPC)])
    return t, tC, rf, rfC, bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC

if __name__ == '__main__':
    datasets = ["Dropout_1"]
    filtered = ["", "_Filtered"]
    for f in filtered:
        for d in datasets:
            sin_cluster_data_features, cluster_data_features, label = getDatasets(d, f)
            shap.initjs()
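            # Same 80/20 split and seed for both feature sets, so the two
            # variants are evaluated on comparable held-out rows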
            train_data_features, test_data_features, train_data_label, test_data_label = train_test_split(
                sin_cluster_data_features, label, test_size=0.2, random_state=25)
            train_data_features_cluster, test_data_features_cluster, train_data_label_cluster, test_data_label_cluster = train_test_split(
                cluster_data_features, label, test_size=0.2, random_state=25)
            features = list(train_data_features.columns.values)  # beware that this will change in case of FSS
            shap_set = test_data_features
            shap_setC = test_data_features_cluster
            t, tC, rf, rfC, bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC = getModels(d, f)
            tree_models = [t, tC, rf, rfC]
            tree_names = ["T", "TC", "RF", "RFC"]
            nn_models = [bag, bagC, boos, boosC, lr, lrC, svm, svmC, mlp, mlpC]
            nn_names = ["BG", "BGC", "BOOS", "BOOSC", "LR", "LRC", "SVM", "SVMC", "MLP", "MLPC"]
            shap_values = {}
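            # Models come in pairs: even indices are the no-cluster variants,
            # odd indices the cluster variants, so i % 2 selects the matching data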
            # Draft of the looping over the tree-based models (shap.TreeExplainer)
            for i, (m, mN) in enumerate(zip(tree_models, tree_names)):
                if i % 2 == 0:
                    fitted_model = m.fit(train_data_features.values[:500], train_data_label.values[:500])
                    print('\n' + mN, ":", m.score(test_data_features.values, test_data_label.values))
                    explainer = shap.TreeExplainer(fitted_model['m'], shap_set[:500])
                    shap_values[mN] = explainer.shap_values(shap_set[:500], check_additivity=False)  # check_additivity to be changed for final computation
                else:
                    fitted_model = m.fit(train_data_features_cluster.values[:500], train_data_label_cluster.values[:500])
                    print('\n' + mN, ":", m.score(test_data_features_cluster.values, test_data_label_cluster.values))
                    explainer = shap.TreeExplainer(fitted_model['m'], shap_setC[:500])
                    shap_values[mN] = explainer.shap_values(shap_setC[:500], check_additivity=False)  # check_additivity to be changed for final computation
                # Sanity-check the shapes of the returned SHAP arrays
                print(np.array(shap_values[mN]).shape)
                print(shap_values[mN][0][0].shape)
                print(shap_values[mN][1].shape)
                np.save("./shapValues/" + d + "_" + mN + f, shap_values[mN])
            for i, (m, mN) in enumerate(zip(nn_models, nn_names)):
                if i % 2 == 0:
                    fitted_model = m.fit(train_data_features.values[:500], train_data_label.values[:500])
                    print('\n' + mN, ":", m.score(test_data_features.values, test_data_label.values))
                    explainer = shap.KernelExplainer(fitted_model['m'].predict, shap_set[:500])
                    # check_additivity is a TreeExplainer-only argument, so it is not passed here
                    shap_values[mN] = explainer.shap_values(shap_set[:500])
                else:
                    fitted_model = m.fit(train_data_features_cluster.values[:500], train_data_label_cluster.values[:500])
                    print('\n' + mN, ":", m.score(test_data_features_cluster.values, test_data_label_cluster.values))
                    explainer = shap.KernelExplainer(fitted_model['m'].predict, shap_setC[:500])
                    shap_values[mN] = explainer.shap_values(shap_setC[:500])
                np.save("./shapValues/" + d + "_" + mN + f, shap_values[mN])