Commit d8cb406e authored by Lucia Prieto's avatar Lucia Prieto

Upload final-training.py

parent a4bf40a4
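# The aim of this code is to get an overview of the performance of the selected models on the data sets (with and without feature filtering), whether they are clustered or not, and for both outputs
# We train with a 10-fold cross-validation scenario and get as an output the following metrics for each fold:
# -F1
# -Precision
# -Recall
# -Negative recall (proportion of true negatives among the negative samples)
# -Confusion-matrix counts (TN, FN, FP, TP)
# Import of libraries for training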
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
def _encode_features_and_label(table, target, selected_features=None):
    """Split *table* into one-hot encoded features and an integer label.

    Args:
        table: DataFrame holding the features and the *target* column.
        target: Name of the label column to separate from the features.
        selected_features: Optional iterable of encoded column names to keep;
            ``None`` keeps every column.

    Returns:
        A ``(features, label)`` pair.
    """
    label = table[target]
    features = table.drop(target, axis=1)  # the output must not be part of the inputs
    # Every non-numeric column (strings, booleans, ...) is one-hot encoded.
    categorical = list(features.select_dtypes(exclude='number').columns)
    # dtype=int yields 0/1 integer dummies directly, so no boolean-to-integer
    # ``replace`` pass (whose implicit downcasting is deprecated in recent
    # pandas) is required afterwards.
    features = pd.get_dummies(features, columns=categorical, dtype=int)
    if selected_features is not None:
        features = features.filter(items=selected_features, axis=1)
    if pd.api.types.is_bool_dtype(label):
        label = label.astype(int)  # True/False -> 1/0
    return features, label


def output_datasets(dataset, filtering):
    """Give the features and labels to train the models on for *dataset*.

    The tables are read from CSV files in the current working directory:
    ``dropout_*.csv`` when ``dataset == "Dropout_1"`` and ``relapse_*.csv``
    for any other value. Each table exists in a version without clusters
    (``*_sin_cluster.csv``) and a version with clusters (``*_cluster.csv``).

    Args:
        dataset: Name of the label column, which also selects the file set.
        filtering: ``"FSS"`` keeps only the columns named in the header of
            the matching ``*_FSS.csv`` file; any other value keeps every
            column (the ``*_FSS.csv`` files are then not read at all).

    Returns:
        A 4-tuple ``(sin_cluster_features, sin_cluster_label,
        cluster_features, cluster_label)``.
    """
    prefix = "dropout" if dataset == "Dropout_1" else "relapse"
    prepared = []
    # Order matters: callers unpack the no-cluster pair first.
    for variant in ("sin_cluster", "cluster"):
        table = pd.read_csv(prefix + "_" + variant + ".csv", sep=",")
        selected = None
        if filtering == "FSS":
            # Only the header of the feature-selection file is used.
            selected = pd.read_csv(prefix + "_" + variant + "_FSS.csv", sep=",").columns.values
        prepared.extend(_encode_features_and_label(table, dataset, selected))
    return tuple(prepared)
def models_input(dataset, filtering):
    """Give the dictionary of tuned models to train for a dataset/filtering pair.

    Args:
        dataset: ``"Dropout_1"`` or ``"Relapse_1"``.
        filtering: ``"FSS"`` or ``"noFSS"``.

    Returns:
        A dict mapping a model name to a 2-tuple
        ``(model optimized without clusters, model optimized with clusters)``.

    Raises:
        ValueError: If the (dataset, filtering) combination is not one of the
            four supported ones.
    """
    if filtering == "FSS" and dataset == "Dropout_1":
        return {
            "Tree": (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'),
                     DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')),
            "RF": (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111),
                   RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)),
            "Boost": (AdaBoostClassifier(learning_rate=1.9061, n_estimators=62),
                      AdaBoostClassifier(learning_rate=1.9184, n_estimators=83)),
            "Bag": (BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=13, warm_start=True),
                    BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=True)),
            "LR": (LogisticRegression(solver='lbfgs', penalty='l2'),
                   LogisticRegression(solver='newton-cholesky', penalty='l2')),
            "SVM": (SVC(C=1.6663, kernel='linear'),
                    SVC(C=0.9894, kernel='linear')),
            "NN": (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'),
                   MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive')),
        }
    if filtering == "FSS" and dataset == "Relapse_1":
        return {
            "Tree": (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss'),
                     DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')),
            "RF": (RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=158),
                   RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=242)),
            "Boost": (AdaBoostClassifier(learning_rate=0.994, n_estimators=117),
                      AdaBoostClassifier(learning_rate=1.672, n_estimators=144)),
            # NOTE(review): max_samples was the integer 1 here, which
            # scikit-learn interprets as "draw exactly one sample per base
            # estimator". The float 1.0 (whole training set, as used for the
            # Dropout_1 counterpart) is assumed to be the intended value.
            "Bag": (BaggingClassifier(max_features=0.8, max_samples=1.0, n_estimators=18, warm_start=False),
                    BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=False)),
            "LR": (LogisticRegression(solver='sag', penalty='l2'),
                   LogisticRegression(solver='saga', penalty='l2')),
            "SVM": (SVC(C=1.511, kernel='rbf'),
                    SVC(C=1.033, kernel='rbf')),
            "NN": (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'),
                   MLPClassifier(activation='tanh', hidden_layer_sizes=67, learning_rate='constant')),
        }
    if filtering == "noFSS" and dataset == "Dropout_1":
        return {
            "Tree": (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'),
                     DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
            "RF": (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134),
                   RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)),
            "Boost": (AdaBoostClassifier(learning_rate=0.9249, n_estimators=54),
                      AdaBoostClassifier(learning_rate=0.9984, n_estimators=91)),
            "Bag": (BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=11, warm_start=True),
                    BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=16, warm_start=False)),
            "LR": (LogisticRegression(solver='sag', penalty='l2'),
                   LogisticRegression(solver='lbfgs', penalty='l2')),
            "SVM": (SVC(C=0.9152, kernel='linear'),
                    SVC(C=1.3079, kernel='linear')),
            "NN": (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'),
                   MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant')),
        }
    if filtering == "noFSS" and dataset == "Relapse_1":
        return {
            "Tree": (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='entropy'),
                     DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
            "RF": (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=128),
                   RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=131)),
            "Boost": (AdaBoostClassifier(learning_rate=1.259, n_estimators=127),
                      AdaBoostClassifier(learning_rate=1.393, n_estimators=135)),
            "Bag": (BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=80, warm_start=False),
                    BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=51, warm_start=False)),
            "LR": (LogisticRegression(solver='saga', penalty='l1'),
                   LogisticRegression(solver='saga', penalty='l1')),
            "SVM": (SVC(C=1.974, kernel='rbf'),
                    SVC(C=1.503, kernel='rbf')),
            "NN": (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'),
                   MLPClassifier(activation='tanh', hidden_layer_sizes=100, learning_rate='constant')),
        }
    # Previously an unsupported combination fell through to ``return models``
    # with ``models`` never assigned; fail with an explicit message instead.
    raise ValueError(
        "No models defined for dataset=%r with filtering=%r" % (dataset, filtering)
    )
def negative_recall_scorer(clf, X, y):
    """Give the negative recall: true negatives / total number of negative samples.

    Assumes the labels are encoded as 0 (negative) and 1 (positive).

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Samples to predict on.
        y: True labels for ``X``.

    Returns:
        The proportion of negative samples predicted as negative, or NaN when
        ``y`` contains no negative sample.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when a fold (or the
    # predictions) contain a single class; rows are true labels, columns are
    # predictions, so ravel() yields tn, fp, fn, tp.
    tn, fp, _fn, _tp = confusion_matrix(y, clf.predict(X), labels=[0, 1]).ravel()
    negatives = tn + fp
    if negatives == 0:
        return float("nan")  # undefined without any negative sample
    return tn / negatives
def TN_scorer(clf, X, y):
    """Give the number of true negatives: negative samples predicted as negative.

    Assumes the labels are encoded as 0 (negative) and 1 (positive).

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Samples to predict on.
        y: True labels for ``X``.

    Returns:
        The true-negative count.
    """
    # labels=[0, 1] keeps the matrix 2x2 even if only one class is present;
    # otherwise a 1x1 matrix would make cell [0, 0] count whichever class
    # happens to be there. ravel() yields tn, fp, fn, tp.
    tn, _fp, _fn, _tp = confusion_matrix(y, clf.predict(X), labels=[0, 1]).ravel()
    return tn
def FN_scorer(clf, X, y):
    """Give the number of false negatives: positive samples predicted as negative.

    Assumes the labels are encoded as 0 (negative) and 1 (positive).

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Samples to predict on.
        y: True labels for ``X``.

    Returns:
        The false-negative count.
    """
    # scikit-learn's confusion matrix has true labels on the rows and
    # predictions on the columns, so false negatives sit in cell [1, 0]
    # (true 1, predicted 0) -- not [0, 1], which holds the false positives.
    # labels=[0, 1] keeps the matrix 2x2 even if only one class is present.
    _tn, _fp, fn, _tp = confusion_matrix(y, clf.predict(X), labels=[0, 1]).ravel()
    return fn
def FP_scorer(clf, X, y):
    """Give the number of false positives: negative samples predicted as positive.

    Assumes the labels are encoded as 0 (negative) and 1 (positive).

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Samples to predict on.
        y: True labels for ``X``.

    Returns:
        The false-positive count.
    """
    # scikit-learn's confusion matrix has true labels on the rows and
    # predictions on the columns, so false positives sit in cell [0, 1]
    # (true 0, predicted 1) -- not [1, 0], which holds the false negatives.
    # labels=[0, 1] keeps the matrix 2x2 even if only one class is present.
    _tn, fp, _fn, _tp = confusion_matrix(y, clf.predict(X), labels=[0, 1]).ravel()
    return fp
def TP_scorer(clf, X, y):
    """Give the number of true positives: positive samples predicted as positive.

    Assumes the labels are encoded as 0 (negative) and 1 (positive).

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Samples to predict on.
        y: True labels for ``X``.

    Returns:
        The true-positive count.
    """
    # labels=[0, 1] keeps the matrix 2x2 even if only one class is present;
    # without it a single-class fold gives a 1x1 matrix and indexing the
    # positive cell fails. ravel() yields tn, fp, fn, tp.
    _tn, _fp, _fn, tp = confusion_matrix(y, clf.predict(X), labels=[0, 1]).ravel()
    return tp
if __name__ == '__main__':
    datasets = ["Dropout_1"]  # select the dataset(s) to train on
    filtering = ["FSS", "noFSS"]  # select whether the dataset went through the filtering step or not
    # Scorings used for model evaluation; the plain functions follow the
    # scorer(estimator, X, y) signature.
    scorings = {'f1':make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall':make_scorer(recall_score), 'precision':make_scorer(precision_score), 'TN':TN_scorer, 'FN':FN_scorer, 'FP':FP_scorer, 'TP':TP_scorer}
    # Method used to balance the output classes.
    # NOTE(review): no random_state is given, so the resampling (and hence
    # the scores) can differ from one run to the next.
    resample = SMOTETomek()
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # cross-validation protocol

    # One column per fold, numbered from 1.
    fold_columns = range(1, cv.get_n_splits() + 1)
    # Row order of the result files (kept stable so the CSV layout does not change).
    model_order = ['SVM', 'NN', 'LR', 'Bag', 'RF', 'Boost', 'Tree']
    # (row-label suffix, key in the cross_validate output)
    metric_rows = [('F1', 'test_f1'), ('Precision', 'test_precision'),
                   ('Recall', 'test_recall'), ('TN-prop', 'test_negative_recall')]
    count_rows = [('TN', 'test_TN'), ('FN', 'test_FN'), ('FP', 'test_FP'), ('TP', 'test_TP')]

    for f in filtering:
        for d in datasets:
            sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(d, f)
            models = models_input(d, f)  # models selected for training
            # file tag -> (features, labels, position of the tuned model in the tuple)
            variants = {
                "sin_Cluster": (sin_cluster_data_features, sin_cluster_data_label, 0),
                "Cluster": (cluster_data_features, cluster_data_label, 1),
            }
            for tag, (features, labels, position) in variants.items():
                # Per-fold metrics and confusion-matrix counts for this variant.
                metrics_df = pd.DataFrame(
                    columns=fold_columns,
                    index=[name + '_' + suffix for name in model_order for suffix, _ in metric_rows])
                counts_df = pd.DataFrame(
                    columns=fold_columns,
                    index=[name + '_' + suffix for name in model_order for suffix, _ in count_rows])
                for k in models:
                    # Resampling is a pipeline step so it is fitted on the
                    # training folds only.
                    pipeline = Pipeline(steps=[('r', resample), ('m', models[k][position])])
                    # Only the test scores are stored, so train scores are not computed.
                    scores = cross_validate(pipeline, features.values, labels.values, scoring=scorings, cv=cv, return_train_score=False, n_jobs=1)
                    for suffix, key in metric_rows:
                        metrics_df.loc[k + '_' + suffix] = list(np.around(np.array(scores[key]), 4))
                    for suffix, key in count_rows:
                        counts_df.loc[k + '_' + suffix] = scores[key]
                # Save the results as csv files. The confusion-matrix file of
                # the no-cluster variant used to lack the "_" before the
                # dataset name; both variants now share the same pattern.
                metrics_df.to_csv("Results_3_" + d + "_" + tag + "_" + f + ".csv")
                counts_df.to_csv("Results_3_confusion_matrix_" + d + "_" + tag + "_" + f + ".csv")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment