# The aim of this code is to get an overview of the performances of the
# selected models on the filtered dataset, whether they are clustered or not,
# and for both outputs.
# We train with a 10-fold scenario and get as an output the following metrics
# for each fold:
#   - F1
#   - Precision
#   - Recall
#   - Negative recall (true-negative proportion, "TN-prop")
#   - Confusion-matrix counts (TN, FN, FP, TP)

# Import of libraries for training
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Row-label building blocks for the result dataframes. The model order matches
# the row order of the exported CSV files.
_MODEL_NAMES = ("SVM", "NN", "LR", "Bag", "RF", "Boost", "Tree")
# Row suffix -> key in the dictionary returned by cross_validate
_METRIC_KEYS = {
    "F1": "test_f1",
    "Precision": "test_precision",
    "Recall": "test_recall",
    "TN-prop": "test_negative_recall",
}
_COUNT_KEYS = {
    "TN": "test_TN",
    "FN": "test_FN",
    "FP": "test_FP",
    "TP": "test_TP",
}


def _prepare_features_labels(db, dataset, filtering, selected_features):
    """Split one database into an encoded feature frame and its label series.

    Parameters
    ----------
    db : pandas.DataFrame
        Database containing the output column and the candidate features.
    dataset : str
        Name of the output column (e.g. "Dropout_1").
    filtering : str
        "FSS" to keep only ``selected_features``; any other value keeps all
        the encoded columns.
    selected_features : array-like of str
        Column names retained by the feature filtering step.

    Returns
    -------
    tuple
        ``(features, labels)`` with booleans replaced by 0/1.
    """
    labels = db[dataset]
    features = db.drop(dataset, axis=1)  # elimination of the output from the training set
    # use of one hot encoding for the non-numeric (categorical) variables
    categorical_columns = features.select_dtypes(exclude='number').columns.values
    features = pd.get_dummies(features, columns=categorical_columns)
    if filtering == "FSS":
        # selection of features in case the filtering is activated for the dataset
        features = features.filter(selected_features, axis=1)
    # conversion of booleans to integers (per the original author, necessary
    # for newest numpy versions); reassignment is used instead of
    # inplace=True to avoid modifying a derived object in place
    features = features.replace({False: 0, True: 1})
    labels = labels.replace({False: 0, True: 1})
    return features, labels


def output_datasets(dataset, filtering):
    """Gives the features and labels to train the model on depending on the dataset considered.

    Parameters
    ----------
    dataset : str
        Name of the output column. "Dropout_1" loads the ``dropout_*`` CSV
        files; any other value loads the ``relapse_*`` CSV files.
    filtering : str
        "FSS" restricts the features to the columns of the matching
        ``*_FSS.csv`` file; any other value keeps every encoded column.

    Returns
    -------
    tuple
        ``(sin_cluster_data_features, sin_cluster_data_label,
        cluster_data_features, cluster_data_label)``.
    """
    prefix = "dropout" if dataset == "Dropout_1" else "relapse"

    # Import of csv database from the feature engineering R code
    db_cluster = pd.read_csv(prefix + "_cluster.csv", sep=",")
    db = pd.read_csv(prefix + "_sin_cluster.csv", sep=",")
    # Features to be selected from the feature filtering step
    features_cluster = pd.read_csv(prefix + "_cluster_FSS.csv", sep=",").columns.values
    features = pd.read_csv(prefix + "_sin_cluster_FSS.csv", sep=",").columns.values

    # Features and labels for the dataset without cluster
    sin_cluster_data_features, sin_cluster_data_label = _prepare_features_labels(
        db, dataset, filtering, features)
    # Features and labels for the dataset with cluster (same steps)
    cluster_data_features, cluster_data_label = _prepare_features_labels(
        db_cluster, dataset, filtering, features_cluster)

    return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label


def models_input(dataset, filtering):
    """Gives a dictionary of models to train with.

    Each entry is ``model_name: (model optimized without cluster,
    model optimized with clusters)``.

    Parameters
    ----------
    dataset : str
        "Dropout_1" or "Relapse_1".
    filtering : str
        "FSS" or "noFSS".

    Returns
    -------
    dict
        Mapping of model name to a 2-tuple of unfitted estimators.

    Raises
    ------
    ValueError
        If the (filtering, dataset) combination is not one of the four
        configurations defined below.
    """
    if filtering == "FSS" and dataset == "Dropout_1":
        models = {
            "Tree": (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'),
                     DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')),
            "RF": (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111),
                   RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)),
            "Boost": (AdaBoostClassifier(learning_rate=1.9061, n_estimators=62),
                      AdaBoostClassifier(learning_rate=1.9184, n_estimators=83)),
            "Bag": (BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=13, warm_start=True),
                    BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=True)),
            "LR": (LogisticRegression(solver='lbfgs', penalty='l2'),
                   LogisticRegression(solver='newton-cholesky', penalty='l2')),
            "SVM": (SVC(C=1.6663, kernel='linear'),
                    SVC(C=0.9894, kernel='linear')),
            "NN": (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'),
                   MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive')),
        }
    elif filtering == "FSS" and dataset == "Relapse_1":
        models = {
            "Tree": (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss'),
                     DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')),
            "RF": (RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=158),
                   RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=242)),
            "Boost": (AdaBoostClassifier(learning_rate=0.994, n_estimators=117),
                      AdaBoostClassifier(learning_rate=1.672, n_estimators=144)),
            # max_samples is the float 1.0 (all samples): the integer 1 would
            # draw a single sample per base estimator.
            # NOTE(review): the original had max_samples=1 here; confirm that
            # the fraction 1.0 is what the hyperparameter search produced.
            "Bag": (BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=18, warm_start=False),
                    BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=False)),
            "LR": (LogisticRegression(solver='sag', penalty='l2'),
                   LogisticRegression(solver='saga', penalty='l2')),
            "SVM": (SVC(C=1.511, kernel='rbf'),
                    SVC(C=1.033, kernel='rbf')),
            "NN": (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'),
                   MLPClassifier(activation='tanh', hidden_layer_sizes=67, learning_rate='constant')),
        }
    elif filtering == "noFSS" and dataset == "Dropout_1":
        models = {
            "Tree": (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'),
                     DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
            "RF": (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134),
                   RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)),
            "Boost": (AdaBoostClassifier(learning_rate=0.9249, n_estimators=54),
                      AdaBoostClassifier(learning_rate=0.9984, n_estimators=91)),
            "Bag": (BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=11, warm_start=True),
                    BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=16, warm_start=False)),
            "LR": (LogisticRegression(solver='sag', penalty='l2'),
                   LogisticRegression(solver='lbfgs', penalty='l2')),
            "SVM": (SVC(C=0.9152, kernel='linear'),
                    SVC(C=1.3079, kernel='linear')),
            "NN": (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'),
                   MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant')),
        }
    elif filtering == "noFSS" and dataset == "Relapse_1":
        models = {
            "Tree": (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='entropy'),
                     DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
            "RF": (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=128),
                   RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=131)),
            "Boost": (AdaBoostClassifier(learning_rate=1.259, n_estimators=127),
                      AdaBoostClassifier(learning_rate=1.393, n_estimators=135)),
            "Bag": (BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=80, warm_start=False),
                    BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=51, warm_start=False)),
            "LR": (LogisticRegression(solver='saga', penalty='l1'),
                   LogisticRegression(solver='saga', penalty='l1')),
            "SVM": (SVC(C=1.974, kernel='rbf'),
                    SVC(C=1.503, kernel='rbf')),
            "NN": (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'),
                   MLPClassifier(activation='tanh', hidden_layer_sizes=100, learning_rate='constant')),
        }
    else:
        # Previously this fell through to an UnboundLocalError on `models`.
        raise ValueError(
            "No models defined for dataset=%r with filtering=%r" % (dataset, filtering))
    return models


def _confusion_counts(clf, X, y):
    """Return ``(TN, FP, FN, TP)`` for the predictions of ``clf`` on ``X``.

    scikit-learn's confusion matrix has the true labels on the rows and the
    predicted labels on the columns, so row 0 holds the true negatives and
    false positives, and row 1 the false negatives and true positives.
    """
    cm = confusion_matrix(y, clf.predict(X))
    return cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]


def negative_recall_scorer(clf, X, y):
    """Gives the negative recall defined as the (number of true negative samples)/(total number of negative samples)"""
    tn, fp, _, _ = _confusion_counts(clf, X, y)
    return tn / (fp + tn)


def TN_scorer(clf, X, y):
    """Gives the number of samples predicted as true negatives"""
    return _confusion_counts(clf, X, y)[0]


def FN_scorer(clf, X, y):
    """Gives the number of samples predicted as false negatives (actual positives predicted negative)"""
    # cm[1, 0]: true label 1, predicted label 0
    return _confusion_counts(clf, X, y)[2]


def FP_scorer(clf, X, y):
    """Gives the number of samples predicted as false positives (actual negatives predicted positive)"""
    # cm[0, 1]: true label 0, predicted label 1
    return _confusion_counts(clf, X, y)[1]


def TP_scorer(clf, X, y):
    """Gives the number of samples predicted as true positives"""
    return _confusion_counts(clf, X, y)[3]


def _empty_results(suffixes, n_splits):
    """Create an empty dataframe with one row per model/suffix pair and one column per fold."""
    index = [model + "_" + suffix for model in _MODEL_NAMES for suffix in suffixes]
    return pd.DataFrame(columns=range(1, n_splits + 1), index=index)


def _cross_validate_model(model, features, labels, resample, scorings, cv):
    """Cross-validate ``model`` behind the resampling step and return the per-fold scores."""
    pipeline = Pipeline(steps=[('r', resample), ('m', model)])
    return cross_validate(pipeline, features.values, labels.values, scoring=scorings,
                          cv=cv, return_train_score=True, n_jobs=1)


def _store_scores(name, scores, metrics_df, counts_df):
    """Write the per-fold test scores of model ``name`` into the two result dataframes."""
    for suffix, key in _METRIC_KEYS.items():
        # metrics are rounded to 4 decimals before being stored
        metrics_df.loc[name + "_" + suffix] = list(np.around(np.array(scores[key]), 4))
    for suffix, key in _COUNT_KEYS.items():
        counts_df.loc[name + "_" + suffix] = scores[key]


if __name__ == '__main__':
    datasets = ["Dropout_1"]  # select the dataset to train on
    filtering = ["FSS", "noFSS"]  # select whether the dataset has been through the filtering step or not
    # scorings to be used for model evaluation
    scorings = {'f1': make_scorer(f1_score),
                'negative_recall': negative_recall_scorer,
                'recall': make_scorer(recall_score),
                'precision': make_scorer(precision_score),
                'TN': TN_scorer,
                'FN': FN_scorer,
                'FP': FP_scorer,
                'TP': TP_scorer}
    resample = SMOTETomek()  # the method used to balance the output classes
    n_splits = 10
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)  # the cross-validation protocol used

    for f in filtering:
        for d in datasets:
            (sin_cluster_data_features, sin_cluster_data_label,
             cluster_data_features, cluster_data_label) = output_datasets(d, f)
            models = models_input(d, f)  # models selected for training

            # dataframes to save the metrics in, for the dataset with / without cluster
            cluster_df = _empty_results(tuple(_METRIC_KEYS), n_splits)
            sin_cluster_df = cluster_df.copy(deep=True)
            # dataframes to save the confusion-matrix counts in, with / without cluster
            cluster_cm = _empty_results(tuple(_COUNT_KEYS), n_splits)
            sin_cluster_cm = cluster_cm.copy(deep=True)

            for k in models:
                # first model of the tuple: the one optimized without clusters
                sin_cluster_scores = _cross_validate_model(
                    models[k][0], sin_cluster_data_features, sin_cluster_data_label,
                    resample, scorings, cv)
                _store_scores(k, sin_cluster_scores, sin_cluster_df, sin_cluster_cm)

                # second model of the tuple: the one optimized with clusters
                cluster_scores = _cross_validate_model(
                    models[k][1], cluster_data_features, cluster_data_label,
                    resample, scorings, cv)
                _store_scores(k, cluster_scores, cluster_df, cluster_cm)

            # Export of results as csv files
            cluster_df.to_csv("Results_3_"+d+"_Cluster_"+f+".csv")
            sin_cluster_df.to_csv("Results_3_"+d+"_sin_Cluster_"+f+".csv")
            cluster_cm.to_csv("Results_3_confusion_matrix_"+d+"_Cluster_"+f+".csv")
            # NOTE(review): this file name has no underscore after
            # "confusion_matrix", unlike the clustered one above; kept as-is
            # in case downstream code relies on it - confirm and align.
            sin_cluster_cm.to_csv("Results_3_confusion_matrix"+d+"_sin_Cluster_"+f+".csv")