# The aim of this code is to get an overview of the performance of the selected models on the
# filtered dataset, whether it is clustered or not, and for both outputs.
# We train in a 10-fold cross-validation scenario and output the following metrics for each fold:
# - F1
# - Precision
# - Recall
# - Negative recall (proportion of true negatives)
# - Confusion-matrix counts (TN, FN, FP, TP)

# Imports of the libraries used for training and evaluation
import numpy as np
import pandas as pd

from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.metrics import confusion_matrix, f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def output_datasets(filtering): 
    # Import of the csv databases produced by the feature-engineering R code
    db_cluster = pd.read_csv("data/dropout_cluster.csv", sep=",")
    db = pd.read_csv("data/dropout.csv", sep=",")
    # Features to be selected from the feature filtering step
    features_cluster = (pd.read_csv("data/FSS/featsGR_cluster.csv",sep=",")).columns.values
    features = (pd.read_csv("data/FSS/featsGR.csv",sep=",")).columns.values

    # Creation of the feature and label sets for the dataset without clusters
    sin_cluster_data_label = db["Dropout_1"]
    sin_cluster_data_features = db.drop("Dropout_1", axis=1) #removal of the output column from the feature set
    columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
    sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed) #one-hot encoding of the categorical variables
    if filtering == "FSS": #keep only the selected features when filtering is activated for the dataset
        sin_cluster_data_features = sin_cluster_data_features.filter(features, axis=1)
    sin_cluster_data_features.replace({False: 0, True: 1}, inplace=True) #conversion of booleans to integers in the feature set (necessary for recent numpy versions)
    sin_cluster_data_label.replace({False: 0, True: 1}, inplace=True) #same conversion for the label set

    # Creation of the feature and label sets for the dataset with clusters (same steps)
    cluster_data_label = db_cluster["Dropout_1"]
    cluster_data_features = db_cluster.drop("Dropout_1", axis=1)
    columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
    cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
    if filtering == "FSS" :
        cluster_data_features = cluster_data_features.filter(features_cluster, axis=1)
    cluster_data_features.replace({False: 0, True: 1}, inplace=True)
    cluster_data_label.replace({False: 0, True: 1}, inplace=True)

    return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
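
# Note on the expected inputs (an assumption based on the code above): the dropout csv files must
# contain a "Dropout_1" column with boolean or 0/1 values, while the FSS csv files are only read
# for their column names, which define the selected features.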

def models_input(filtering):
    """Gives a dictionnary of models to train with as a tuple model_name:(model optimized without cluster, model optimized with clusters)"""
    if filtering == "FSS":
        models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')), 
                   "RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)), 
                   "Boost" : (AdaBoostClassifier(learning_rate= 1.9061, n_estimators= 62),AdaBoostClassifier(learning_rate= 1.9184, n_estimators= 83)), 
                   "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 1.0, n_estimators= 13, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 67, warm_start= True)), 
                   "LR" : (LogisticRegression(solver='lbfgs', penalty='l2'), LogisticRegression(solver='newton-cholesky', penalty='l2')), 
                   "SVM" : (SVC(C=1.6663, kernel='linear'), SVC(C=0.9894, kernel='linear')),
                   "NN" : (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'), MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive'))}
    if filtering == "noFSS" :
        models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')), 
                   "RF" : (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134), RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)), 
                   "Boost" : (AdaBoostClassifier(learning_rate= 0.9249, n_estimators= 54),AdaBoostClassifier(learning_rate= 0.9984, n_estimators= 91)), 
                   "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 11, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 16, warm_start= False)), 
                   "LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='lbfgs', penalty='l2')), 
                   "SVM" : (SVC(C=0.9152, kernel='linear'), SVC(C=1.3079, kernel='linear')),
                   "NN" : (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'), MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant'))}
    return models
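
# Example of intended use (illustrative): models_input("FSS")["RF"] returns the tuple of tuned
# random forests, index 0 for the dataset without clusters and index 1 for the dataset with clusters.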

def negative_recall_scorer(clf, X, y):
    """Gives the negative recall defined as (number of true negative samples)/(total number of negative samples)"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred) #sklearn convention: rows are true labels, columns are predictions, i.e. [[TN, FP], [FN, TP]]
    TN_prop = cm[0,0]/(cm[0,1]+cm[0,0])
    return TN_prop

def TN_scorer(clf, X, y):
    """Gives the number of true negative samples"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    TN = cm[0,0]
    return TN

def FN_scorer(clf, X, y):
    """Gives the number of false negative samples"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    FN = cm[1,0] #row 1, column 0 holds the false negatives (positives predicted as negatives)
    return FN

def FP_scorer(clf, X, y):
    """Gives the number of false positive samples"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    FP = cm[0,1] #row 0, column 1 holds the false positives (negatives predicted as positives)
    return FP

def TP_scorer(clf, X, y):
    """Gives the number of true positive samples"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    TP = cm[1,1]
    return TP
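
# Quick sanity check of the confusion-matrix layout used above (illustrative only):
# confusion_matrix([0, 0, 1, 1], [0, 1, 0, 1]) returns [[1, 1], [1, 1]], i.e.
# one TN (0 predicted 0), one FP (0 predicted 1), one FN (1 predicted 0) and one TP (1 predicted 1).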

if __name__ == '__main__':
    filtering = ["FSS","noFSS"] #select whether the dataset has been through the filtering step or not 
    scorings = {'f1':make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall':make_scorer(recall_score), 'precision':make_scorer(precision_score), 'TN':TN_scorer, 'FN':FN_scorer, 'FP':FP_scorer, 'TP':TP_scorer} #scorings to be used for model evaluation
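    # Possible extension (a sketch, not used below): AUROC and log-loss could be added through
    # sklearn's predefined scorers, assuming every model can produce probability estimates
    # (the SVC would need probability=True for 'neg_log_loss'):
    # from sklearn.metrics import get_scorer
    # scorings.update({'auroc': get_scorer('roc_auc'), 'logloss': get_scorer('neg_log_loss')})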
    resample = SMOTETomek() #the method used to balance the output classes
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) #the cross-validation protocol used
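    # Because the SMOTETomek resampling is wrapped in an imblearn Pipeline below, it is refit on
    # the training folds only inside cross_validate, so each held-out fold is scored unresampled.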
    for f in filtering:
        sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(f)
        models = models_input(f) #models selected for training
        cluster_df = pd.DataFrame(columns=range(1,11), index=['SVM_F1','SVM_Precision','SVM_Recall','SVM_TN-prop','NN_F1','NN_Precision','NN_Recall','NN_TN-prop',
                                                          'LR_F1','LR_Precision','LR_Recall','LR_TN-prop','Bag_F1','Bag_Precision','Bag_Recall','Bag_TN-prop',
                                                          'RF_F1','RF_Precision','RF_Recall','RF_TN-prop','Boost_F1','Boost_Precision','Boost_Recall','Boost_TN-prop',
                                                          'Tree_F1','Tree_Precision','Tree_Recall','Tree_TN-prop']) #dataframe to store the metrics for the cluster dataset
        sin_cluster_df = cluster_df.copy(deep=True) #dataframe to store the metrics for the dataset without clusters
        cluster_cm = pd.DataFrame(columns=range(1,11), index=['SVM_TN','SVM_FN','SVM_FP','SVM_TP','NN_TN','NN_FN','NN_FP','NN_TP',
                                                          'LR_TN','LR_FN','LR_FP','LR_TP','Bag_TN','Bag_FN','Bag_FP','Bag_TP',
                                                          'RF_TN','RF_FN','RF_FP','RF_TP','Boost_TN','Boost_FN','Boost_FP','Boost_TP',
                                                          'Tree_TN','Tree_FN','Tree_FP','Tree_TP']) #dataframe to store the confusion-matrix counts for the cluster dataset
        sin_cluster_cm = cluster_cm.copy(deep=True) #dataframe to store the confusion-matrix counts for the dataset without clusters
        for k in models:
            model = models[k][0] #selection of the first model of the tuple which is the one without clusters
            pipeline = Pipeline(steps=[('r', resample), ('m', model)])
            #training of the model for the dataset without clusters 
            sin_cluster_scores = cross_validate(pipeline, sin_cluster_data_features.values, sin_cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
            sin_cluster_df.loc[k+'_F1']=list(np.around(np.array(sin_cluster_scores["test_f1"]),4)) #the F1 score for the database without cluster is stored in a dataframe 
            sin_cluster_df.loc[k+'_Precision']=list(np.around(np.array(sin_cluster_scores["test_precision"]),4)) #same for precision
            sin_cluster_df.loc[k+'_Recall']=list(np.around(np.array(sin_cluster_scores["test_recall"]),4)) #same for recall 
            sin_cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(sin_cluster_scores["test_negative_recall"]),4)) #same for negative_recall
            sin_cluster_cm.loc[k+'_TN']=sin_cluster_scores["test_TN"] #the number of true negative samples for the database without cluster is stored in a dataframe 
            sin_cluster_cm.loc[k+'_FN']=sin_cluster_scores["test_FN"] #same for false negative
            sin_cluster_cm.loc[k+'_FP']=sin_cluster_scores["test_FP"] #same for false positive
            sin_cluster_cm.loc[k+'_TP']=sin_cluster_scores["test_TP"] #same for true positive

            model = models[k][1] #selection of the second model of the tuple which is the one with clusters
            pipeline = Pipeline(steps=[('r', resample), ('m', model)])
            #training of the model for the dataset with clusters
            cluster_scores = cross_validate(pipeline, cluster_data_features.values, cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
            cluster_df.loc[k+'_F1']=list(np.around(np.array(cluster_scores["test_f1"]),4)) #the F1 score for the database with clusters is stored in a dataframe
            cluster_df.loc[k+'_Precision']=list(np.around(np.array(cluster_scores["test_precision"]),4)) #same for precision
            cluster_df.loc[k+'_Recall']=list(np.around(np.array(cluster_scores["test_recall"]),4)) #same for recall 
            cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(cluster_scores["test_negative_recall"]),4)) #same for negative_recall
            cluster_cm.loc[k+'_TN']=cluster_scores["test_TN"] #the number of true negative samples for the database with clusters is stored in a dataframe
            cluster_cm.loc[k+'_FN']=cluster_scores["test_FN"] #same for false negative
            cluster_cm.loc[k+'_FP']=cluster_scores["test_FP"] #same for false positive
            cluster_cm.loc[k+'_TP']=cluster_scores["test_TP"] #same for true positive
    
        #Export of the results as csv files
        cluster_df.to_csv("Results_3_Dropout_1_Cluster_"+f+".csv")
        sin_cluster_df.to_csv("Results_3_Dropout_1_sin_Cluster_"+f+".csv")
        cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_Cluster_"+f+".csv")
        sin_cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_sin_Cluster_"+f+".csv")