diff --git a/code/models/final-training.py b/code/models/final-training.py
index cd76b7c255ce337eb0f3af669d413a2e97a46b74..8e89172cb0e41847e07489a2ef5e7f69235ee101 100644
--- a/code/models/final-training.py
+++ b/code/models/final-training.py
@@ -20,27 +20,17 @@ from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 
-def output_datasets(dataset,filtering):
-    """Gives the features and labels to train the model on depending on the dataset considered"""
-    if dataset == "Dropout_1":
-        # Import of csv database from the feature engineering R code
-        db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
-        db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
-        # Features to be selected from the feature filtering step
-        features_cluster = (pd.read_csv("dropout_cluster_FSS.csv",sep=",")).columns.values
-        features = (pd.read_csv("dropout_sin_cluster_FSS.csv",sep=",")).columns.values
-
-    else:
-        # Import of csv database from the feature engineering R code
-        db_cluster = pd.read_csv("relapse_cluster.csv", sep=",")
-        db = pd.read_csv("relapse_sin_cluster.csv", sep=",")
-        # Features to be selected from the feature filtering step
-        features_cluster = (pd.read_csv("relapse_cluster_FSS.csv",sep=",")).columns.values
-        features = (pd.read_csv("relapse_sin_cluster_FSS.csv",sep=",")).columns.values
+def output_datasets(filtering):
+    # Import of csv database from the feature engineering R code
+    db_cluster = pd.read_csv("data/dropout_cluster.csv", sep=",")
+    db = pd.read_csv("data/dropout.csv", sep=",")
+    # Features to be selected from the feature filtering step
+    features_cluster = (pd.read_csv("data/FSS/featsGR_cluster.csv",sep=",")).columns.values
+    features = (pd.read_csv("data/FSS/featsGR.csv",sep=",")).columns.values
 
     # Creation of train and test sets for the dataset without cluster
-    sin_cluster_data_label = db[dataset]
-    sin_cluster_data_features = db.drop(dataset, axis=1) #elimination of the output from the training set
+    sin_cluster_data_label = db["Dropout_1"]
+    sin_cluster_data_features = db.drop("Dropout_1", axis=1) #elimination of the output from the training set
     columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
     sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed) #use of one hot encoding for categorical variables
     if filtering == "FSS" : #selection of features in case the filtering is activated for the dataset
@@ -49,8 +39,8 @@ def output_datasets(dataset,filtering):
     sin_cluster_data_label.replace({False: 0, True: 1}, inplace=True) #convertion of boolean by integers for the features set (necessary for newest numpy versions)
 
     # Creation of train and test sets for the dataset with cluster (same steps)
-    cluster_data_label = db_cluster[dataset]
-    cluster_data_features = db_cluster.drop(dataset, axis=1)
+    cluster_data_label = db_cluster["Dropout_1"]
+    cluster_data_features = db_cluster.drop("Dropout_1", axis=1)
     columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
     cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
     if filtering == "FSS" :
@@ -60,42 +50,24 @@
     return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
 
 
-def models_input(dataset, filtering) :
+def models_input(filtering) :
     """Gives a dictionnary of models to train with as a tuple model_name:(model optimized without cluster, model optimized with clusters)"""
-    if filtering == "FSS" :
-        if dataset == "Dropout_1" :
-            models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')),
-                      "RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)),
-                      "Boost" : (AdaBoostClassifier(learning_rate= 1.9061, n_estimators= 62),AdaBoostClassifier(learning_rate= 1.9184, n_estimators= 83)),
-                      "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 1.0, n_estimators= 13, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 67, warm_start= True)),
-                      "LR" : (LogisticRegression(solver='lbfgs', penalty='l2'), LogisticRegression(solver='newton-cholesky', penalty='l2')),
-                      "SVM" : (SVC(C=1.6663, kernel='linear'), SVC(C=0.9894, kernel='linear')),
-                      "NN" : (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'), MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive'))}
-        if dataset == "Relapse_1" :
-            models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss'), DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini')),
-                      "RF" : (RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=158), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=242)),
-                      "Boost" : (AdaBoostClassifier(learning_rate= 0.994, n_estimators= 117),AdaBoostClassifier(learning_rate= 1.672, n_estimators= 144)),
-                      "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 1, n_estimators= 18, warm_start= False), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 67, warm_start= False)),
-                      "LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='saga', penalty='l2')),
-                      "SVM" : (SVC(C=1.511, kernel='rbf'), SVC(C=1.033, kernel='rbf')),
-                      "NN" : (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'), MLPClassifier(activation='tanh', hidden_layer_sizes=67, learning_rate='constant'))}
+    if filtering == "FSS":
+        models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')),
+                  "RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)),
+                  "Boost" : (AdaBoostClassifier(learning_rate= 1.9061, n_estimators= 62),AdaBoostClassifier(learning_rate= 1.9184, n_estimators= 83)),
+                  "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 1.0, n_estimators= 13, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 67, warm_start= True)),
+                  "LR" : (LogisticRegression(solver='lbfgs', penalty='l2'), LogisticRegression(solver='newton-cholesky', penalty='l2')),
+                  "SVM" : (SVC(C=1.6663, kernel='linear'), SVC(C=0.9894, kernel='linear')),
+                  "NN" : (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'), MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive'))}
     if filtering == "noFSS" :
-        if dataset == "Dropout_1" :
-            models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
-                      "RF" : (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134), RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)),
-                      "Boost" : (AdaBoostClassifier(learning_rate= 0.9249, n_estimators= 54),AdaBoostClassifier(learning_rate= 0.9984, n_estimators= 91)),
-                      "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 11, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 16, warm_start= False)),
-                      "LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='lbfgs', penalty='l2')),
-                      "SVM" : (SVC(C=0.9152, kernel='linear'), SVC(C=1.3079, kernel='linear')),
-                      "NN" : (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'), MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant'))}
-        if dataset == "Relapse_1" :
-            models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='entropy'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
-                      "RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=128), RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=131)),
-                      "Boost" : (AdaBoostClassifier(learning_rate= 1.259, n_estimators= 127), AdaBoostClassifier(learning_rate= 1.393, n_estimators= 135)),
-                      "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 80, warm_start= False), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 51, warm_start= False)),
-                      "LR" : (LogisticRegression(solver='saga', penalty='l1'), LogisticRegression(solver='saga', penalty='l1')),
-                      "SVM" : (SVC(C=1.974, kernel='rbf'), SVC(C=1.503, kernel='rbf')),
-                      "NN" : (MLPClassifier(activation='relu', hidden_layer_sizes=125, learning_rate='invscaling'), MLPClassifier(activation='tanh', hidden_layer_sizes=100, learning_rate='constant'))}
+        models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')),
+                  "RF" : (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134), RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)),
+                  "Boost" : (AdaBoostClassifier(learning_rate= 0.9249, n_estimators= 54),AdaBoostClassifier(learning_rate= 0.9984, n_estimators= 91)),
+                  "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 11, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 16, warm_start= False)),
+                  "LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='lbfgs', penalty='l2')),
+                  "SVM" : (SVC(C=0.9152, kernel='linear'), SVC(C=1.3079, kernel='linear')),
+                  "NN" : (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'), MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant'))}
     return models
 
 def negative_recall_scorer(clf, X, y):
@@ -131,54 +103,52 @@ def TP_scorer(clf, X, y):
     return TP
 
 if __name__ == '__main__':
-    datasets = ["Dropout_1"] #select the dataset to train on
     filtering = ["FSS","noFSS"] #select whether the dataset has been through the filtering step or not
     scorings = {'f1':make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall':make_scorer(recall_score), 'precision':make_scorer(precision_score), 'TN':TN_scorer, 'FN':FN_scorer, 'FP':FP_scorer, 'TP':TP_scorer} #scorings to be used for model evaluation
     resample = SMOTETomek() #the method used to balance the output classes
     cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) #the cross-validation protocole used
-    for f in filtering :
-        for d in datasets:
-            sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(d,f)
-            models = models_input(d,f) #models selected for training
-            cluster_df = pd.DataFrame(columns=range(1,11), index=['SVM_F1','SVM_Precision','SVM_Recall','SVM_TN-prop','NN_F1','NN_Precision','NN_Recall','NN_TN-prop',
-                                                                  'LR_F1','LR_Precision','LR_Recall','LR_TN-prop','Bag_F1','Bag_Precision','Bag_Recall','Bag_TN-prop',
-                                                                  'RF_F1','RF_Precision','RF_Recall','RF_TN-prop','Boost_F1','Boost_Precision','Boost_Recall','Boost_TN-prop',
-                                                                  'Tree_F1','Tree_Precision','Tree_Recall','Tree_TN-prop']) #dataframe to save the results in for the cluster dataset
-            sin_cluster_df = cluster_df.copy(deep=True) #dataframe to save the results in for the cluster dataset
-            cluster_cm = pd.DataFrame(columns=range(1,11), index=['SVM_TN','SVM_FN','SVM_FP','SVM_TP','NN_TN','NN_FN','NN_FP','NN_TP',
-                                                                  'LR_TN','LR_FN','LR_FP','LR_TP','Bag_TN','Bag_FN','Bag_FP','Bag_TP',
-                                                                  'RF_TN','RF_FN','RF_FP','RF_TP','Boost_TN','Boost_FN','Boost_FP','Boost_TP',
-                                                                  'Tree_TN','Tree_FN','Tree_FP','Tree_TP']) #dataframe to save the results in for the cluster dataset
-            sin_cluster_cm = cluster_cm.copy(deep=True) #dataframe to save the results in for the cluster dataset
-            for k in models :
-                model = models[k][0] #selection of the first model of the tuple which is the one without clusters
-                pipeline = Pipeline(steps=[('r', resample), ('m', model)])
-                #training of the model for the dataset without clusters
-                sin_cluster_scores = cross_validate(pipeline, sin_cluster_data_features.values, sin_cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
-                sin_cluster_df.loc[k+'_F1']=list(np.around(np.array(sin_cluster_scores["test_f1"]),4)) #the F1 score for the database without cluster is stored in a dataframe
-                sin_cluster_df.loc[k+'_Precision']=list(np.around(np.array(sin_cluster_scores["test_precision"]),4)) #same for precision
-                sin_cluster_df.loc[k+'_Recall']=list(np.around(np.array(sin_cluster_scores["test_recall"]),4)) #same for recall
-                sin_cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(sin_cluster_scores["test_negative_recall"]),4)) #same for negative_recall
-                sin_cluster_cm.loc[k+'_TN']=sin_cluster_scores["test_TN"] #the number of true negative samples for the database without cluster is stored in a dataframe
-                sin_cluster_cm.loc[k+'_FN']=sin_cluster_scores["test_FN"] #same for false negative
-                sin_cluster_cm.loc[k+'_FP']=sin_cluster_scores["test_FP"] #same for false positive
-                sin_cluster_cm.loc[k+'_TP']=sin_cluster_scores["test_TP"] #same for true positive
+    for f in filtering :
+        sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(f)
+        models = models_input(f) #models selected for training
+        cluster_df = pd.DataFrame(columns=range(1,11), index=['SVM_F1','SVM_Precision','SVM_Recall','SVM_TN-prop','NN_F1','NN_Precision','NN_Recall','NN_TN-prop',
+                                                              'LR_F1','LR_Precision','LR_Recall','LR_TN-prop','Bag_F1','Bag_Precision','Bag_Recall','Bag_TN-prop',
+                                                              'RF_F1','RF_Precision','RF_Recall','RF_TN-prop','Boost_F1','Boost_Precision','Boost_Recall','Boost_TN-prop',
+                                                              'Tree_F1','Tree_Precision','Tree_Recall','Tree_TN-prop']) #dataframe to save the results in for the cluster dataset
+        sin_cluster_df = cluster_df.copy(deep=True) #dataframe to save the results in for the dataset without cluster
+        cluster_cm = pd.DataFrame(columns=range(1,11), index=['SVM_TN','SVM_FN','SVM_FP','SVM_TP','NN_TN','NN_FN','NN_FP','NN_TP',
+                                                              'LR_TN','LR_FN','LR_FP','LR_TP','Bag_TN','Bag_FN','Bag_FP','Bag_TP',
+                                                              'RF_TN','RF_FN','RF_FP','RF_TP','Boost_TN','Boost_FN','Boost_FP','Boost_TP',
+                                                              'Tree_TN','Tree_FN','Tree_FP','Tree_TP']) #dataframe to save the confusion matrix counts in for the cluster dataset
+        sin_cluster_cm = cluster_cm.copy(deep=True) #dataframe to save the confusion matrix counts in for the dataset without cluster
+        for k in models :
+            model = models[k][0] #selection of the first model of the tuple which is the one without clusters
+            pipeline = Pipeline(steps=[('r', resample), ('m', model)])
+            #training of the model for the dataset without clusters
+            sin_cluster_scores = cross_validate(pipeline, sin_cluster_data_features.values, sin_cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
+            sin_cluster_df.loc[k+'_F1']=list(np.around(np.array(sin_cluster_scores["test_f1"]),4)) #the F1 score for the database without cluster is stored in a dataframe
+            sin_cluster_df.loc[k+'_Precision']=list(np.around(np.array(sin_cluster_scores["test_precision"]),4)) #same for precision
+            sin_cluster_df.loc[k+'_Recall']=list(np.around(np.array(sin_cluster_scores["test_recall"]),4)) #same for recall
+            sin_cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(sin_cluster_scores["test_negative_recall"]),4)) #same for negative_recall
+            sin_cluster_cm.loc[k+'_TN']=sin_cluster_scores["test_TN"] #the number of true negative samples for the database without cluster is stored in a dataframe
+            sin_cluster_cm.loc[k+'_FN']=sin_cluster_scores["test_FN"] #same for false negative
+            sin_cluster_cm.loc[k+'_FP']=sin_cluster_scores["test_FP"] #same for false positive
+            sin_cluster_cm.loc[k+'_TP']=sin_cluster_scores["test_TP"] #same for true positive
 
-                model = models[k][1] #selection of the second model of the tuple which is the one with clusters
-                pipeline = Pipeline(steps=[('r', resample), ('m', model)])
-                #training of the model for the dataset with clusters
-                cluster_scores = cross_validate(pipeline, cluster_data_features.values, cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
-                cluster_df.loc[k+'_F1']=list(np.around(np.array(cluster_scores["test_f1"]),4)) #the F1 score for the database without cluster is stored in a dataframe
-                cluster_df.loc[k+'_Precision']=list(np.around(np.array(cluster_scores["test_precision"]),4)) #same for precision
-                cluster_df.loc[k+'_Recall']=list(np.around(np.array(cluster_scores["test_recall"]),4)) #same for recall
-                cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(cluster_scores["test_negative_recall"]),4)) #same for negative_recall
-                cluster_cm.loc[k+'_TN']=cluster_scores["test_TN"] #the number of true negative samples for the database without cluster is stored in a dataframe
-                cluster_cm.loc[k+'_FN']=cluster_scores["test_FN"] #same for false negative
-                cluster_cm.loc[k+'_FP']=cluster_scores["test_FP"] #same for false positive
-                cluster_cm.loc[k+'_TP']=cluster_scores["test_TP"] #same for true positive
-
-            #Download of results as csv files
-            cluster_df.to_csv("Results_3_"+d+"_Cluster_"+f+".csv")
-            sin_cluster_df.to_csv("Results_3_"+d+"_sin_Cluster_"+f+".csv")
-            cluster_cm.to_csv("Results_3_confusion_matrix_"+d+"_Cluster_"+f+".csv")
-            sin_cluster_cm.to_csv("Results_3_confusion_matrix"+d+"_sin_Cluster_"+f+".csv")
+            model = models[k][1] #selection of the second model of the tuple which is the one with clusters
+            pipeline = Pipeline(steps=[('r', resample), ('m', model)])
+            #training of the model for the dataset with clusters
+            cluster_scores = cross_validate(pipeline, cluster_data_features.values, cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
+            cluster_df.loc[k+'_F1']=list(np.around(np.array(cluster_scores["test_f1"]),4)) #the F1 score for the database with cluster is stored in a dataframe
+            cluster_df.loc[k+'_Precision']=list(np.around(np.array(cluster_scores["test_precision"]),4)) #same for precision
+            cluster_df.loc[k+'_Recall']=list(np.around(np.array(cluster_scores["test_recall"]),4)) #same for recall
+            cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(cluster_scores["test_negative_recall"]),4)) #same for negative_recall
+            cluster_cm.loc[k+'_TN']=cluster_scores["test_TN"] #the number of true negative samples for the database with cluster is stored in a dataframe
+            cluster_cm.loc[k+'_FN']=cluster_scores["test_FN"] #same for false negative
+            cluster_cm.loc[k+'_FP']=cluster_scores["test_FP"] #same for false positive
+            cluster_cm.loc[k+'_TP']=cluster_scores["test_TP"] #same for true positive
+
+        #Download of results as csv files
+        cluster_df.to_csv("Results_3_Dropout_1_Cluster_"+f+".csv")
+        sin_cluster_df.to_csv("Results_3_Dropout_1_sin_Cluster_"+f+".csv")
+        cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_Cluster_"+f+".csv")
+        sin_cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_sin_Cluster_"+f+".csv")