Commit 208dd931 authored by Lucia Prieto

Update final-training.py
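Specialise the script to the Dropout_1 dataset: drop the dataset argument from output_datasets and models_input, remove the Relapse_1 model branches, read the input CSVs from the data/ directory, and hardcode Dropout_1 in the names of the result files.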

parent 5ae52883
@@ -20,27 +20,17 @@ from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 
-def output_datasets(dataset,filtering):
+def output_datasets(filtering):
     """Gives the features and labels to train the model on depending on the dataset considered"""
-    if dataset == "Dropout_1":
-        # Import of csv database from the feature engineering R code
-        db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
-        db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
-        # Features to be selected from the feature filtering step
-        features_cluster = (pd.read_csv("dropout_cluster_FSS.csv", sep=",")).columns.values
-        features = (pd.read_csv("dropout_sin_cluster_FSS.csv", sep=",")).columns.values
-    else:
-        # Import of csv database from the feature engineering R code
-        db_cluster = pd.read_csv("relapse_cluster.csv", sep=",")
-        db = pd.read_csv("relapse_sin_cluster.csv", sep=",")
-        # Features to be selected from the feature filtering step
-        features_cluster = (pd.read_csv("relapse_cluster_FSS.csv", sep=",")).columns.values
-        features = (pd.read_csv("relapse_sin_cluster_FSS.csv", sep=",")).columns.values
+    # Import of csv database from the feature engineering R code
+    db_cluster = pd.read_csv("data/dropout_cluster.csv", sep=",")
+    db = pd.read_csv("data/dropout.csv", sep=",")
+    # Features to be selected from the feature filtering step
+    features_cluster = (pd.read_csv("data/FSS/featsGR_cluster.csv", sep=",")).columns.values
+    features = (pd.read_csv("data/FSS/featsGR.csv", sep=",")).columns.values
     # Creation of train and test sets for the dataset without cluster
-    sin_cluster_data_label = db[dataset]
-    sin_cluster_data_features = db.drop(dataset, axis=1) # elimination of the output column from the training set
+    sin_cluster_data_label = db["Dropout_1"]
+    sin_cluster_data_features = db.drop("Dropout_1", axis=1) # elimination of the output column from the training set
     columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
     sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed) # one-hot encoding of categorical variables
     if filtering == "FSS" : # selection of features in case the filtering is activated for the dataset
@@ -49,8 +39,8 @@ def output_datasets(dataset,filtering):
     sin_cluster_data_label.replace({False: 0, True: 1}, inplace=True) # conversion of booleans to integers for the label set (necessary for newer numpy versions)
     # Creation of train and test sets for the dataset with cluster (same steps)
-    cluster_data_label = db_cluster[dataset]
-    cluster_data_features = db_cluster.drop(dataset, axis=1)
+    cluster_data_label = db_cluster["Dropout_1"]
+    cluster_data_features = db_cluster.drop("Dropout_1", axis=1)
     columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
     cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
     if filtering == "FSS" :
@@ -60,10 +50,9 @@ def output_datasets(dataset,filtering):
     return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
 
-def models_input(dataset, filtering) :
+def models_input(filtering) :
    """Gives a dictionary of the models to train, as model_name: (model optimized without clusters, model optimized with clusters)"""
-    if filtering == "FSS" :
-        if dataset == "Dropout_1" :
+    if filtering == "FSS":
             models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')),
                       "RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)),
                       "Boost" : (AdaBoostClassifier(learning_rate=1.9061, n_estimators=62), AdaBoostClassifier(learning_rate=1.9184, n_estimators=83)),
@@ -71,16 +60,7 @@ def models_input(dataset, filtering) :
                       "LR" : (LogisticRegression(solver='lbfgs', penalty='l2'), LogisticRegression(solver='newton-cholesky', penalty='l2')),
                       "SVM" : (SVC(C=1.6663, kernel='linear'), SVC(C=0.9894, kernel='linear')),
                       "NN" : (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'), MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive'))}
-        if dataset == "Relapse_1" :
-            models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss'), DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')),
-                      "RF" : (RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=158), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=242)),
-                      "Boost" : (AdaBoostClassifier(learning_rate=0.994, n_estimators=117), AdaBoostClassifier(learning_rate=1.672, n_estimators=144)),
-                      "Bag" : (BaggingClassifier(max_features=0.8, max_samples=1, n_estimators=18, warm_start=False), BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=67, warm_start=False)),
-                      "LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='saga', penalty='l2')),
-                      "SVM" : (SVC(C=1.511, kernel='rbf'), SVC(C=1.033, kernel='rbf')),
-                      "NN" : (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'), MLPClassifier(activation='tanh', hidden_layer_sizes=67, learning_rate='constant'))}
     if filtering == "noFSS" :
-        if dataset == "Dropout_1" :
             models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
                       "RF" : (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134), RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)),
                       "Boost" : (AdaBoostClassifier(learning_rate=0.9249, n_estimators=54), AdaBoostClassifier(learning_rate=0.9984, n_estimators=91)),
@@ -88,14 +68,6 @@ def models_input(dataset, filtering) :
                       "LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='lbfgs', penalty='l2')),
                       "SVM" : (SVC(C=0.9152, kernel='linear'), SVC(C=1.3079, kernel='linear')),
                       "NN" : (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'), MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant'))}
-        if dataset == "Relapse_1" :
-            models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='entropy'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
-                      "RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=128), RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=131)),
-                      "Boost" : (AdaBoostClassifier(learning_rate=1.259, n_estimators=127), AdaBoostClassifier(learning_rate=1.393, n_estimators=135)),
-                      "Bag" : (BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=80, warm_start=False), BaggingClassifier(max_features=0.8, max_samples=0.8, n_estimators=51, warm_start=False)),
-                      "LR" : (LogisticRegression(solver='saga', penalty='l1'), LogisticRegression(solver='saga', penalty='l1')),
-                      "SVM" : (SVC(C=1.974, kernel='rbf'), SVC(C=1.503, kernel='rbf')),
-                      "NN" : (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'), MLPClassifier(activation='tanh', hidden_layer_sizes=100, learning_rate='constant'))}
     return models
 
 def negative_recall_scorer(clf, X, y):
@@ -131,15 +103,13 @@ def TP_scorer(clf, X, y):
     return TP
 
 if __name__ == '__main__':
-    datasets = ["Dropout_1"] # select the dataset to train on
     filtering = ["FSS", "noFSS"] # select whether the dataset has been through the filtering step or not
     scorings = {'f1': make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall': make_scorer(recall_score), 'precision': make_scorer(precision_score), 'TN': TN_scorer, 'FN': FN_scorer, 'FP': FP_scorer, 'TP': TP_scorer} # scorings to be used for model evaluation
     resample = SMOTETomek() # the method used to balance the output classes
     cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # the cross-validation protocol used
     for f in filtering :
-        for d in datasets:
-            sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(d,f)
-            models = models_input(d,f) # models selected for training
+            sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(f)
+            models = models_input(f) # models selected for training
             cluster_df = pd.DataFrame(columns=range(1,11), index=['SVM_F1','SVM_Precision','SVM_Recall','SVM_TN-prop','NN_F1','NN_Precision','NN_Recall','NN_TN-prop',
                                                                   'LR_F1','LR_Precision','LR_Recall','LR_TN-prop','Bag_F1','Bag_Precision','Bag_Recall','Bag_TN-prop',
                                                                   'RF_F1','RF_Precision','RF_Recall','RF_TN-prop','Boost_F1','Boost_Precision','Boost_Recall','Boost_TN-prop',
@@ -178,7 +148,7 @@ if __name__ == '__main__':
                 cluster_cm.loc[k+'_TP'] = cluster_scores["test_TP"] # same for true positives
             # Export of the results as csv files
-            cluster_df.to_csv("Results_3_"+d+"_Cluster_"+f+".csv")
-            sin_cluster_df.to_csv("Results_3_"+d+"_sin_Cluster_"+f+".csv")
-            cluster_cm.to_csv("Results_3_confusion_matrix_"+d+"_Cluster_"+f+".csv")
-            sin_cluster_cm.to_csv("Results_3_confusion_matrix"+d+"_sin_Cluster_"+f+".csv")
+            cluster_df.to_csv("Results_3_Dropout_1_Cluster_"+f+".csv")
+            sin_cluster_df.to_csv("Results_3_Dropout_1_sin_Cluster_"+f+".csv")
+            cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_Cluster_"+f+".csv")
+            sin_cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_sin_Cluster_"+f+".csv")
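The loop body that produces cluster_scores is also elided. Given the visible scorings dict, resample = SMOTETomek() and the StratifiedKFold cv, one standard way to wire them together, so that resampling is applied only to the training folds, is an imbalanced-learn pipeline passed to cross_validate. This is a sketch under those assumptions; the committed loop may differ:

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

for k in models:
    # models[k] is (model optimized without clusters, model optimized with clusters), per models_input
    pipe = Pipeline([("resample", SMOTETomek()), ("model", models[k][1])])
    # cross_validate returns one array per scorer, e.g. cluster_scores["test_TP"], as read into cluster_cm above
    cluster_scores = cross_validate(pipe, cluster_data_features, cluster_data_label, cv=cv, scoring=scorings)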