Commit 208dd931 authored by Lucia Prieto

Update final-training.py

parent 5ae52883
@@ -20,27 +20,17 @@ from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
def output_datasets(dataset,filtering):
"""Gives the features and labels to train the model on depending on the dataset considered"""
if dataset == "Dropout_1":
# Import of csv database from the feature engineering R code
db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
# Features to be selected from the feature filtering step
features_cluster = (pd.read_csv("dropout_cluster_FSS.csv",sep=",")).columns.values
features = (pd.read_csv("dropout_sin_cluster_FSS.csv",sep=",")).columns.values
else:
# Import of csv database from the feature engineering R code
db_cluster = pd.read_csv("relapse_cluster.csv", sep=",")
db = pd.read_csv("relapse_sin_cluster.csv", sep=",")
# Features to be selected from the feature filtering step
features_cluster = (pd.read_csv("relapse_cluster_FSS.csv",sep=",")).columns.values
features = (pd.read_csv("relapse_sin_cluster_FSS.csv",sep=",")).columns.values
def output_datasets(filtering):
# Import of csv database from the feature engineering R code
db_cluster = pd.read_csv("data/dropout_cluster.csv", sep=",")
db = pd.read_csv("data/dropout.csv", sep=",")
# Features to be selected from the feature filtering step
features_cluster = (pd.read_csv("data/FSS/featsGR_cluster.csv",sep=",")).columns.values
features = (pd.read_csv("data/FSS/featsGR.csv",sep=",")).columns.values
# Creation of train and test sets for the dataset without cluster
sin_cluster_data_label = db[dataset]
sin_cluster_data_features = db.drop(dataset, axis=1) #removal of the target column from the feature set
sin_cluster_data_label = db["Dropout_1"]
sin_cluster_data_features = db.drop("Dropout_1", axis=1) #elimination of the output from the training set
columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed) #use of one hot encoding for categorical variables
if filtering == "FSS" : #selection of features in case the filtering is activated for the dataset
@@ -49,8 +39,8 @@ def output_datasets(dataset,filtering):
sin_cluster_data_label.replace({False: 0, True: 1}, inplace=True) #conversion of booleans to integers in the label column (necessary for newer numpy versions)
# Creation of train and test sets for the dataset with cluster (same steps)
cluster_data_label = db_cluster[dataset]
cluster_data_features = db_cluster.drop(dataset, axis=1)
cluster_data_label = db_cluster["Dropout_1"]
cluster_data_features = db_cluster.drop("Dropout_1", axis=1)
columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
if filtering == "FSS" :
@@ -60,42 +50,24 @@ def output_datasets(dataset,filtering):
return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
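The branch guarded by if filtering == "FSS" is collapsed in this hunk, so the actual column-selection line is not visible here. A minimal sketch of what such a step typically looks like, reusing the variable names from the surrounding code (features, sin_cluster_data_features); the exact selection logic is an assumption, not the committed code:

if filtering == "FSS":
    # keep only the columns retained by the feature subset selection (FSS) csv;
    # intersecting with the current columns guards against names changed by get_dummies
    kept = [c for c in features if c in sin_cluster_data_features.columns]
    sin_cluster_data_features = sin_cluster_data_features[kept]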
def models_input(dataset, filtering) :
def models_input(filtering) :
"""Gives a dictionnary of models to train with as a tuple model_name:(model optimized without cluster, model optimized with clusters)"""
if filtering == "FSS" :
if dataset == "Dropout_1" :
models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')),
"RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)),
"Boost" : (AdaBoostClassifier(learning_rate= 1.9061, n_estimators= 62),AdaBoostClassifier(learning_rate= 1.9184, n_estimators= 83)),
"Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 1.0, n_estimators= 13, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 67, warm_start= True)),
"LR" : (LogisticRegression(solver='lbfgs', penalty='l2'), LogisticRegression(solver='newton-cholesky', penalty='l2')),
"SVM" : (SVC(C=1.6663, kernel='linear'), SVC(C=0.9894, kernel='linear')),
"NN" : (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'), MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive'))}
if dataset == "Relapse_1" :
models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss'), DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')),
"RF" : (RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=158), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=242)),
"Boost" : (AdaBoostClassifier(learning_rate= 0.994, n_estimators= 117),AdaBoostClassifier(learning_rate= 1.672, n_estimators= 144)),
"Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 1, n_estimators= 18, warm_start= False), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 67, warm_start= False)),
"LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='saga', penalty='l2')),
"SVM" : (SVC(C=1.511, kernel='rbf'), SVC(C=1.033, kernel='rbf')),
"NN" : (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'), MLPClassifier(activation='tanh', hidden_layer_sizes=67, learning_rate='constant'))}
if filtering == "FSS":
models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')),
"RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)),
"Boost" : (AdaBoostClassifier(learning_rate= 1.9061, n_estimators= 62),AdaBoostClassifier(learning_rate= 1.9184, n_estimators= 83)),
"Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 1.0, n_estimators= 13, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 67, warm_start= True)),
"LR" : (LogisticRegression(solver='lbfgs', penalty='l2'), LogisticRegression(solver='newton-cholesky', penalty='l2')),
"SVM" : (SVC(C=1.6663, kernel='linear'), SVC(C=0.9894, kernel='linear')),
"NN" : (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'), MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive'))}
if filtering == "noFSS" :
if dataset == "Dropout_1" :
models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
"RF" : (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134), RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)),
"Boost" : (AdaBoostClassifier(learning_rate= 0.9249, n_estimators= 54),AdaBoostClassifier(learning_rate= 0.9984, n_estimators= 91)),
"Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 11, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 16, warm_start= False)),
"LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='lbfgs', penalty='l2')),
"SVM" : (SVC(C=0.9152, kernel='linear'), SVC(C=1.3079, kernel='linear')),
"NN" : (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'), MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant'))}
if dataset == "Relapse_1" :
models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='entropy'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
"RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=128), RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=131)),
"Boost" : (AdaBoostClassifier(learning_rate= 1.259, n_estimators= 127), AdaBoostClassifier(learning_rate= 1.393, n_estimators= 135)),
"Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 80, warm_start= False), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 51, warm_start= False)),
"LR" : (LogisticRegression(solver='saga', penalty='l1'), LogisticRegression(solver='saga', penalty='l1')),
"SVM" : (SVC(C=1.974, kernel='rbf'), SVC(C=1.503, kernel='rbf')),
"NN" : (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'), MLPClassifier(activation='tanh', hidden_layer_sizes=100, learning_rate='constant'))}
models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
"RF" : (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134), RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)),
"Boost" : (AdaBoostClassifier(learning_rate= 0.9249, n_estimators= 54),AdaBoostClassifier(learning_rate= 0.9984, n_estimators= 91)),
"Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 11, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 16, warm_start= False)),
"LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='lbfgs', penalty='l2')),
"SVM" : (SVC(C=0.9152, kernel='linear'), SVC(C=1.3079, kernel='linear')),
"NN" : (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'), MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant'))}
return models
def negative_recall_scorer(clf, X, y):
@@ -131,54 +103,52 @@ def TP_scorer(clf, X, y):
return TP
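The bodies of the custom scorers are collapsed by the diff viewer; only the signature of negative_recall_scorer and the final return of TP_scorer are visible. As an illustration only (not the committed implementation), a confusion-matrix-based scorer of this kind usually follows the pattern below, here computing negative recall (specificity):

from sklearn.metrics import confusion_matrix

def negative_recall_sketch(clf, X, y):
    # specificity: share of true negatives among all actual negatives
    y_pred = clf.predict(X)
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
    return tn / (tn + fp) if (tn + fp) else 0.0

The TN, FN, FP and TP scorers used in the scorings dictionary below presumably return the corresponding single count from the same confusion matrix.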
if __name__ == '__main__':
datasets = ["Dropout_1"] #select the dataset to train on
filtering = ["FSS","noFSS"] #select whether the dataset has been through the filtering step or not
scorings = {'f1':make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall':make_scorer(recall_score), 'precision':make_scorer(precision_score), 'TN':TN_scorer, 'FN':FN_scorer, 'FP':FP_scorer, 'TP':TP_scorer} #scorings to be used for model evaluation
resample = SMOTETomek() #the method used to balance the output classes
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) #the cross-validation protocol used
for f in filtering :
for d in datasets:
sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(d,f)
models = models_input(d,f) #models selected for training
cluster_df = pd.DataFrame(columns=range(1,11), index=['SVM_F1','SVM_Precision','SVM_Recall','SVM_TN-prop','NN_F1','NN_Precision','NN_Recall','NN_TN-prop',
'LR_F1','LR_Precision','LR_Recall','LR_TN-prop','Bag_F1','Bag_Precision','Bag_Recall','Bag_TN-prop',
'RF_F1','RF_Precision','RF_Recall','RF_TN-prop','Boost_F1','Boost_Precision','Boost_Recall','Boost_TN-prop',
'Tree_F1','Tree_Precision','Tree_Recall','Tree_TN-prop']) #dataframe to save the results in for the cluster dataset
sin_cluster_df = cluster_df.copy(deep=True) #dataframe to save the results in for the dataset without clusters
cluster_cm = pd.DataFrame(columns=range(1,11), index=['SVM_TN','SVM_FN','SVM_FP','SVM_TP','NN_TN','NN_FN','NN_FP','NN_TP',
'LR_TN','LR_FN','LR_FP','LR_TP','Bag_TN','Bag_FN','Bag_FP','Bag_TP',
'RF_TN','RF_FN','RF_FP','RF_TP','Boost_TN','Boost_FN','Boost_FP','Boost_TP',
'Tree_TN','Tree_FN','Tree_FP','Tree_TP']) #dataframe to save the confusion-matrix counts in for the cluster dataset
sin_cluster_cm = cluster_cm.copy(deep=True) #dataframe to save the confusion-matrix counts in for the dataset without clusters
for k in models :
model = models[k][0] #selection of the first model of the tuple which is the one without clusters
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
#training of the model for the dataset without clusters
sin_cluster_scores = cross_validate(pipeline, sin_cluster_data_features.values, sin_cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
sin_cluster_df.loc[k+'_F1']=list(np.around(np.array(sin_cluster_scores["test_f1"]),4)) #the F1 score for the database without cluster is stored in a dataframe
sin_cluster_df.loc[k+'_Precision']=list(np.around(np.array(sin_cluster_scores["test_precision"]),4)) #same for precision
sin_cluster_df.loc[k+'_Recall']=list(np.around(np.array(sin_cluster_scores["test_recall"]),4)) #same for recall
sin_cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(sin_cluster_scores["test_negative_recall"]),4)) #same for negative_recall
sin_cluster_cm.loc[k+'_TN']=sin_cluster_scores["test_TN"] #the number of true negative samples for the database without cluster is stored in a dataframe
sin_cluster_cm.loc[k+'_FN']=sin_cluster_scores["test_FN"] #same for false negative
sin_cluster_cm.loc[k+'_FP']=sin_cluster_scores["test_FP"] #same for false positive
sin_cluster_cm.loc[k+'_TP']=sin_cluster_scores["test_TP"] #same for true positive
for f in filtering :
sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(f)
models = models_input(f) #models selected for training
cluster_df = pd.DataFrame(columns=range(1,11), index=['SVM_F1','SVM_Precision','SVM_Recall','SVM_TN-prop','NN_F1','NN_Precision','NN_Recall','NN_TN-prop',
'LR_F1','LR_Precision','LR_Recall','LR_TN-prop','Bag_F1','Bag_Precision','Bag_Recall','Bag_TN-prop',
'RF_F1','RF_Precision','RF_Recall','RF_TN-prop','Boost_F1','Boost_Precision','Boost_Recall','Boost_TN-prop',
'Tree_F1','Tree_Precision','Tree_Recall','Tree_TN-prop']) #dataframe to save the results in for the cluster dataset
sin_cluster_df = cluster_df.copy(deep=True) #dataframe to save the results in for the dataset without clusters
cluster_cm = pd.DataFrame(columns=range(1,11), index=['SVM_TN','SVM_FN','SVM_FP','SVM_TP','NN_TN','NN_FN','NN_FP','NN_TP',
'LR_TN','LR_FN','LR_FP','LR_TP','Bag_TN','Bag_FN','Bag_FP','Bag_TP',
'RF_TN','RF_FN','RF_FP','RF_TP','Boost_TN','Boost_FN','Boost_FP','Boost_TP',
'Tree_TN','Tree_FN','Tree_FP','Tree_TP']) #dataframe to save the confusion-matrix counts in for the cluster dataset
sin_cluster_cm = cluster_cm.copy(deep=True) #dataframe to save the confusion-matrix counts in for the dataset without clusters
for k in models :
model = models[k][0] #selection of the first model of the tuple which is the one without clusters
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
#training of the model for the dataset without clusters
sin_cluster_scores = cross_validate(pipeline, sin_cluster_data_features.values, sin_cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
sin_cluster_df.loc[k+'_F1']=list(np.around(np.array(sin_cluster_scores["test_f1"]),4)) #the F1 score for the database without cluster is stored in a dataframe
sin_cluster_df.loc[k+'_Precision']=list(np.around(np.array(sin_cluster_scores["test_precision"]),4)) #same for precision
sin_cluster_df.loc[k+'_Recall']=list(np.around(np.array(sin_cluster_scores["test_recall"]),4)) #same for recall
sin_cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(sin_cluster_scores["test_negative_recall"]),4)) #same for negative_recall
sin_cluster_cm.loc[k+'_TN']=sin_cluster_scores["test_TN"] #the number of true negative samples for the database without cluster is stored in a dataframe
sin_cluster_cm.loc[k+'_FN']=sin_cluster_scores["test_FN"] #same for false negative
sin_cluster_cm.loc[k+'_FP']=sin_cluster_scores["test_FP"] #same for false positive
sin_cluster_cm.loc[k+'_TP']=sin_cluster_scores["test_TP"] #same for true positive
model = models[k][1] #selection of the second model of the tuple which is the one with clusters
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
#training of the model for the dataset with clusters
cluster_scores = cross_validate(pipeline, cluster_data_features.values, cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
cluster_df.loc[k+'_F1']=list(np.around(np.array(cluster_scores["test_f1"]),4)) #the F1 score for the database with clusters is stored in a dataframe
cluster_df.loc[k+'_Precision']=list(np.around(np.array(cluster_scores["test_precision"]),4)) #same for precision
cluster_df.loc[k+'_Recall']=list(np.around(np.array(cluster_scores["test_recall"]),4)) #same for recall
cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(cluster_scores["test_negative_recall"]),4)) #same for negative_recall
cluster_cm.loc[k+'_TN']=cluster_scores["test_TN"] #the number of true negative samples for the database with clusters is stored in a dataframe
cluster_cm.loc[k+'_FN']=cluster_scores["test_FN"] #same for false negative
cluster_cm.loc[k+'_FP']=cluster_scores["test_FP"] #same for false positive
cluster_cm.loc[k+'_TP']=cluster_scores["test_TP"] #same for true positive
#Save the results as csv files
cluster_df.to_csv("Results_3_"+d+"_Cluster_"+f+".csv")
sin_cluster_df.to_csv("Results_3_"+d+"_sin_Cluster_"+f+".csv")
cluster_cm.to_csv("Results_3_confusion_matrix_"+d+"_Cluster_"+f+".csv")
sin_cluster_cm.to_csv("Results_3_confusion_matrix"+d+"_sin_Cluster_"+f+".csv")
model = models[k][1] #selection of the second model of the tuple which is the one with clusters
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
#training of the model for the dataset with clusters
cluster_scores = cross_validate(pipeline, cluster_data_features.values, cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
cluster_df.loc[k+'_F1']=list(np.around(np.array(cluster_scores["test_f1"]),4)) #the F1 score for the database with clusters is stored in a dataframe
cluster_df.loc[k+'_Precision']=list(np.around(np.array(cluster_scores["test_precision"]),4)) #same for precision
cluster_df.loc[k+'_Recall']=list(np.around(np.array(cluster_scores["test_recall"]),4)) #same for recall
cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(cluster_scores["test_negative_recall"]),4)) #same for negative_recall
cluster_cm.loc[k+'_TN']=cluster_scores["test_TN"] #the number of true negative samples for the database with clusters is stored in a dataframe
cluster_cm.loc[k+'_FN']=cluster_scores["test_FN"] #same for false negative
cluster_cm.loc[k+'_FP']=cluster_scores["test_FP"] #same for false positive
cluster_cm.loc[k+'_TP']=cluster_scores["test_TP"] #same for true positive
#Save the results as csv files
cluster_df.to_csv("Results_3_Dropout_1_Cluster_"+f+".csv")
sin_cluster_df.to_csv("Results_3_Dropout_1_sin_Cluster_"+f+".csv")
cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_Cluster_"+f+".csv")
sin_cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_sin_Cluster_"+f+".csv")