diff --git a/code/models/final-training.py b/code/models/final-training.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd76b7c255ce337eb0f3af669d413a2e97a46b74
--- /dev/null
+++ b/code/models/final-training.py
@@ -0,0 +1,184 @@
+# The aim of this code is to get an overview of the performance of the selected models on the filtered data sets, whether they are clustered or not, and for both outputs (dropout and relapse).
+# We train with a 10-fold cross-validation scenario and get as an output the following metrics for each fold:
+# - F1
+# - Precision
+# - Recall
+# - Negative recall (proportion of true negatives)
+# - The raw confusion-matrix counts (TN, FN, FP, TP)
+
+# Import of the libraries needed for training
+import numpy as np
+import pandas as pd
+
+from imblearn.pipeline import Pipeline
+from imblearn.combine import SMOTETomek
+from sklearn.metrics import confusion_matrix, f1_score, make_scorer, precision_score, recall_score
+from sklearn.model_selection import StratifiedKFold, cross_validate
+from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+
+def output_datasets(dataset, filtering):
+    """Returns the features and labels to train on, for both the clustered and the non-clustered version of the chosen dataset"""
+    if dataset == "Dropout_1":
+        # Import of the csv databases produced by the feature engineering R code
+        db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
+        db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
+        # Features to be selected, taken from the feature filtering step
+        features_cluster = pd.read_csv("dropout_cluster_FSS.csv", sep=",").columns.values
+        features = pd.read_csv("dropout_sin_cluster_FSS.csv", sep=",").columns.values
+    else:
+        # Import of the csv databases produced by the feature engineering R code
+        db_cluster = pd.read_csv("relapse_cluster.csv", sep=",")
+        db = pd.read_csv("relapse_sin_cluster.csv", sep=",")
+        # Features to be selected, taken from the feature filtering step
+        features_cluster = pd.read_csv("relapse_cluster_FSS.csv", sep=",").columns.values
+        features = pd.read_csv("relapse_sin_cluster_FSS.csv", sep=",").columns.values
+
+    # Creation of the feature and label sets for the dataset without clusters
+    sin_cluster_data_label = db[dataset]
+    sin_cluster_data_features = db.drop(dataset, axis=1)  # elimination of the output from the training features
+    columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
+    sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed)  # one-hot encoding of the categorical variables
+    if filtering == "FSS":  # selection of features when the filtering step is activated for this dataset
+        sin_cluster_data_features = sin_cluster_data_features.filter(features, axis=1)
+    sin_cluster_data_features.replace({False: 0, True: 1}, inplace=True)  # conversion of booleans to integers in the feature set (necessary for the newest numpy versions)
+    sin_cluster_data_label.replace({False: 0, True: 1}, inplace=True)  # same conversion for the label set
+
+    # Creation of the feature and label sets for the dataset with clusters (same steps)
+    cluster_data_label = db_cluster[dataset]
+    cluster_data_features = db_cluster.drop(dataset, axis=1)
+    columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
+    cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
+    if filtering == "FSS":
+        cluster_data_features = cluster_data_features.filter(features_cluster, axis=1)
+    cluster_data_features.replace({False: 0, True: 1}, inplace=True)
+    cluster_data_label.replace({False: 0, True: 1}, inplace=True)
+
+    return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
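+
+# A minimal usage sketch (assuming the csv files listed above sit next to this
+# script): one feature/label pair is returned per variant of the dataset.
+#   X_sin, y_sin, X_clu, y_clu = output_datasets("Dropout_1", "FSS")
+#   assert len(X_sin) == len(y_sin) and len(X_clu) == len(y_clu)  # one label per sample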
+
+def models_input(dataset, filtering):
+    """Returns a dictionary of models to train, mapping model_name to the tuple (model optimized without clusters, model optimized with clusters)"""
+    if filtering == "FSS":
+        if dataset == "Dropout_1":
+            models = {"Tree": (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')),
+                      "RF": (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)),
+                      "Boost": (AdaBoostClassifier(learning_rate=1.9061, n_estimators=62), AdaBoostClassifier(learning_rate=1.9184, n_estimators=83)),
+                      "Bag": (BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=13, warm_start=True), BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=True)),
+                      "LR": (LogisticRegression(solver='lbfgs', penalty='l2'), LogisticRegression(solver='newton-cholesky', penalty='l2')),
+                      "SVM": (SVC(C=1.6663, kernel='linear'), SVC(C=0.9894, kernel='linear')),
+                      "NN": (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'), MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive'))}
+        if dataset == "Relapse_1":
+            models = {"Tree": (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss'), DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')),
+                      "RF": (RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=158), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=242)),
+                      "Boost": (AdaBoostClassifier(learning_rate=0.994, n_estimators=117), AdaBoostClassifier(learning_rate=1.672, n_estimators=144)),
+                      "Bag": (BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=18, warm_start=False), BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=False)),
+                      "LR": (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='saga', penalty='l2')),
+                      "SVM": (SVC(C=1.511, kernel='rbf'), SVC(C=1.033, kernel='rbf')),
+                      "NN": (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'), MLPClassifier(activation='tanh', hidden_layer_sizes=67, learning_rate='constant'))}
+    if filtering == "noFSS":
+        if dataset == "Dropout_1":
+            models = {"Tree": (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
+                      "RF": (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134), RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)),
+                      "Boost": (AdaBoostClassifier(learning_rate=0.9249, n_estimators=54), AdaBoostClassifier(learning_rate=0.9984, n_estimators=91)),
+                      "Bag": (BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=11, warm_start=True), BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=16, warm_start=False)),
+                      "LR": (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='lbfgs', penalty='l2')),
+                      "SVM": (SVC(C=0.9152, kernel='linear'), SVC(C=1.3079, kernel='linear')),
+                      "NN": (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'), MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant'))}
+        if dataset == "Relapse_1":
+            models = {"Tree": (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='entropy'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
+                      "RF": (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=128), RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=131)),
+                      "Boost": (AdaBoostClassifier(learning_rate=1.259, n_estimators=127), AdaBoostClassifier(learning_rate=1.393, n_estimators=135)),
+                      "Bag": (BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=80, warm_start=False), BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=51, warm_start=False)),
+                      "LR": (LogisticRegression(solver='saga', penalty='l1'), LogisticRegression(solver='saga', penalty='l1')),
+                      "SVM": (SVC(C=1.974, kernel='rbf'), SVC(C=1.503, kernel='rbf')),
+                      "NN": (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'), MLPClassifier(activation='tanh', hidden_layer_sizes=100, learning_rate='constant'))}
+    return models
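+
+# The tuple layout mirrors the two dataset variants, so a caller unpacks by
+# position, e.g. (hypothetical variable names):
+#   rf_sin_cluster, rf_cluster = models_input("Dropout_1", "FSS")["RF"]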
+
+def negative_recall_scorer(clf, X, y):
+    """Gives the negative recall, defined as (number of true negative samples)/(total number of negative samples)"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TN_prop = cm[0, 0] / (cm[0, 1] + cm[0, 0])
+    return TN_prop
+
+def TN_scorer(clf, X, y):
+    """Gives the number of samples predicted as true negatives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TN = cm[0, 0]
+    return TN
+
+def FN_scorer(clf, X, y):
+    """Gives the number of samples predicted as false negatives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    FN = cm[1, 0]  # row 1 = actual positives, column 0 = predicted negative
+    return FN
+
+def FP_scorer(clf, X, y):
+    """Gives the number of samples predicted as false positives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    FP = cm[0, 1]  # row 0 = actual negatives, column 1 = predicted positive
+    return FP
+
+def TP_scorer(clf, X, y):
+    """Gives the number of samples predicted as true positives"""
+    y_pred = clf.predict(X)
+    cm = confusion_matrix(y, y_pred)
+    TP = cm[1, 1]
+    return TP
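+
+# For binary labels, scikit-learn's confusion_matrix is laid out with rows as the
+# actual class and columns as the predicted class:
+#   [[TN, FP],
+#    [FN, TP]]
+# which is the indexing the five scorers above rely on.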
+
+if __name__ == '__main__':
+    datasets = ["Dropout_1"]  # select the dataset(s) to train on
+    filtering = ["FSS", "noFSS"]  # select whether the dataset has been through the filtering step or not
+    scorings = {'f1': make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall': make_scorer(recall_score), 'precision': make_scorer(precision_score), 'TN': TN_scorer, 'FN': FN_scorer, 'FP': FP_scorer, 'TP': TP_scorer}  # scorings to be used for model evaluation
+    resample = SMOTETomek()  # the method used to balance the output classes
+    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # the cross-validation protocol used
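+    # Note: because the resampler is wrapped in an imblearn Pipeline below, SMOTETomek
+    # is re-fitted on the training folds only inside cross_validate; each validation
+    # fold is scored on the original class distribution, which avoids the optimistic
+    # bias of resampling before splitting.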
+    for f in filtering:
+        for d in datasets:
+            sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(d, f)
+            models = models_input(d, f)  # models selected for training
+            # Dataframe to save the fold-by-fold metrics for the cluster dataset
+            cluster_df = pd.DataFrame(columns=range(1, 11), index=['SVM_F1', 'SVM_Precision', 'SVM_Recall', 'SVM_TN-prop', 'NN_F1', 'NN_Precision', 'NN_Recall', 'NN_TN-prop',
+                                                                   'LR_F1', 'LR_Precision', 'LR_Recall', 'LR_TN-prop', 'Bag_F1', 'Bag_Precision', 'Bag_Recall', 'Bag_TN-prop',
+                                                                   'RF_F1', 'RF_Precision', 'RF_Recall', 'RF_TN-prop', 'Boost_F1', 'Boost_Precision', 'Boost_Recall', 'Boost_TN-prop',
+                                                                   'Tree_F1', 'Tree_Precision', 'Tree_Recall', 'Tree_TN-prop'])
+            sin_cluster_df = cluster_df.copy(deep=True)  # same dataframe for the dataset without clusters
+            # Dataframe to save the confusion-matrix counts for the cluster dataset
+            cluster_cm = pd.DataFrame(columns=range(1, 11), index=['SVM_TN', 'SVM_FN', 'SVM_FP', 'SVM_TP', 'NN_TN', 'NN_FN', 'NN_FP', 'NN_TP',
+                                                                   'LR_TN', 'LR_FN', 'LR_FP', 'LR_TP', 'Bag_TN', 'Bag_FN', 'Bag_FP', 'Bag_TP',
+                                                                   'RF_TN', 'RF_FN', 'RF_FP', 'RF_TP', 'Boost_TN', 'Boost_FN', 'Boost_FP', 'Boost_TP',
+                                                                   'Tree_TN', 'Tree_FN', 'Tree_FP', 'Tree_TP'])
+            sin_cluster_cm = cluster_cm.copy(deep=True)  # same dataframe for the dataset without clusters
+            for k in models:
+                model = models[k][0]  # first model of the tuple: the one optimized without clusters
+                pipeline = Pipeline(steps=[('r', resample), ('m', model)])
+                # Training of the model for the dataset without clusters
+                sin_cluster_scores = cross_validate(pipeline, sin_cluster_data_features.values, sin_cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
+                sin_cluster_df.loc[k+'_F1'] = list(np.around(np.array(sin_cluster_scores["test_f1"]), 4))  # the F1 score of each fold for the dataset without clusters
+                sin_cluster_df.loc[k+'_Precision'] = list(np.around(np.array(sin_cluster_scores["test_precision"]), 4))  # same for precision
+                sin_cluster_df.loc[k+'_Recall'] = list(np.around(np.array(sin_cluster_scores["test_recall"]), 4))  # same for recall
+                sin_cluster_df.loc[k+'_TN-prop'] = list(np.around(np.array(sin_cluster_scores["test_negative_recall"]), 4))  # same for negative recall
+                sin_cluster_cm.loc[k+'_TN'] = sin_cluster_scores["test_TN"]  # the true negative count of each fold for the dataset without clusters
+                sin_cluster_cm.loc[k+'_FN'] = sin_cluster_scores["test_FN"]  # same for false negatives
+                sin_cluster_cm.loc[k+'_FP'] = sin_cluster_scores["test_FP"]  # same for false positives
+                sin_cluster_cm.loc[k+'_TP'] = sin_cluster_scores["test_TP"]  # same for true positives
+
+                model = models[k][1]  # second model of the tuple: the one optimized with clusters
+                pipeline = Pipeline(steps=[('r', resample), ('m', model)])
+                # Training of the model for the dataset with clusters
+                cluster_scores = cross_validate(pipeline, cluster_data_features.values, cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
+                cluster_df.loc[k+'_F1'] = list(np.around(np.array(cluster_scores["test_f1"]), 4))  # the F1 score of each fold for the dataset with clusters
+                cluster_df.loc[k+'_Precision'] = list(np.around(np.array(cluster_scores["test_precision"]), 4))  # same for precision
+                cluster_df.loc[k+'_Recall'] = list(np.around(np.array(cluster_scores["test_recall"]), 4))  # same for recall
+                cluster_df.loc[k+'_TN-prop'] = list(np.around(np.array(cluster_scores["test_negative_recall"]), 4))  # same for negative recall
+                cluster_cm.loc[k+'_TN'] = cluster_scores["test_TN"]  # the true negative count of each fold for the dataset with clusters
+                cluster_cm.loc[k+'_FN'] = cluster_scores["test_FN"]  # same for false negatives
+                cluster_cm.loc[k+'_FP'] = cluster_scores["test_FP"]  # same for false positives
+                cluster_cm.loc[k+'_TP'] = cluster_scores["test_TP"]  # same for true positives
+
+            # Export of the results as csv files
+            cluster_df.to_csv("Results_3_"+d+"_Cluster_"+f+".csv")
+            sin_cluster_df.to_csv("Results_3_"+d+"_sin_Cluster_"+f+".csv")
+            cluster_cm.to_csv("Results_3_confusion_matrix_"+d+"_Cluster_"+f+".csv")
+            sin_cluster_cm.to_csv("Results_3_confusion_matrix_"+d+"_sin_Cluster_"+f+".csv")
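+
+# A possible follow-up sketch (not part of the pipeline above; the file name is
+# assumed to match one of the exports): summarize each metric row of a results
+# csv as mean and standard deviation over the 10 folds.
+#   results = pd.read_csv("Results_3_Dropout_1_Cluster_FSS.csv", index_col=0)
+#   summary = pd.DataFrame({"mean": results.mean(axis=1), "std": results.std(axis=1)})
+#   print(summary.round(4))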