# The aim of this code is to get an overview of the performance of the selected models on the
# filtered dataset, whether it is clustered or not, and for both outputs.
# We train in a 10-fold cross-validation scenario and output the following metrics for each fold:
# - F1
# - Precision
# - Recall
# - Negative recall (proportion of true negatives)
# - Confusion-matrix counts (TN, FN, FP, TP)

# Imports of the libraries used for training and evaluation
import numpy as np
import pandas as pd

from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.metrics import confusion_matrix, f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def output_datasets(filtering): 
    # Import of the csv databases produced by the feature-engineering R code
    db_cluster = pd.read_csv("data/dropout_cluster.csv", sep=",")
    db = pd.read_csv("data/dropout.csv", sep=",")
    # Features to be selected from the feature filtering step
    features_cluster = (pd.read_csv("data/FSS/featsGR_cluster.csv",sep=",")).columns.values
    features = (pd.read_csv("data/FSS/featsGR.csv",sep=",")).columns.values

    # Creation of the feature and label sets for the dataset without clusters
    sin_cluster_data_label = db["Dropout_1"]
    sin_cluster_data_features = db.drop("Dropout_1", axis=1) #removal of the output column from the feature set
    columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
    sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed) #one-hot encoding of the categorical variables
    if filtering == "FSS": #keep only the selected features when filtering is activated for the dataset
        sin_cluster_data_features = sin_cluster_data_features.filter(features, axis=1)
    sin_cluster_data_features.replace({False: 0, True: 1}, inplace=True) #conversion of booleans to integers in the feature set (necessary for recent numpy versions)
    sin_cluster_data_label.replace({False: 0, True: 1}, inplace=True) #same conversion for the label set

    # Creation of the feature and label sets for the dataset with clusters (same steps)
    cluster_data_label = db_cluster["Dropout_1"]
    cluster_data_features = db_cluster.drop("Dropout_1", axis=1)
    columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
    cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
    if filtering == "FSS" :
        cluster_data_features = cluster_data_features.filter(features_cluster, axis=1)
    cluster_data_features.replace({False: 0, True: 1}, inplace=True)
    cluster_data_label.replace({False: 0, True: 1}, inplace=True)

    return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
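
# Note on the expected inputs (an assumption based on the code above): the dropout csv files must
# contain a "Dropout_1" column with boolean or 0/1 values, while the FSS csv files are only read
# for their column names, which define the selected features.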

def models_input(filtering):
    """Gives a dictionnary of models to train with as a tuple model_name:(model optimized without cluster, model optimized with clusters)"""
    if filtering == "FSS":
        models = {"Tree" : (DecisionTreeClassifier(splitter='best', max_features='sqrt', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='log2', criterion='log_loss')), 
                   "RF" : (RandomForestClassifier(criterion='entropy', max_features='sqrt', n_estimators=111), RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=194)), 
                   "Boost" : (AdaBoostClassifier(learning_rate= 1.9061, n_estimators= 62),AdaBoostClassifier(learning_rate= 1.9184, n_estimators= 83)), 
                   "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 1.0, n_estimators= 13, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 67, warm_start= True)), 
                   "LR" : (LogisticRegression(solver='lbfgs', penalty='l2'), LogisticRegression(solver='newton-cholesky', penalty='l2')), 
                   "SVM" : (SVC(C=1.6663, kernel='linear'), SVC(C=0.9894, kernel='linear')),
                   "NN" : (MLPClassifier(activation='logistic', hidden_layer_sizes=116, learning_rate='invscaling'), MLPClassifier(activation='identity', hidden_layer_sizes=94, learning_rate='adaptive'))}
    if filtering == "noFSS" :
        models = {"Tree" : (DecisionTreeClassifier(splitter='random', max_features='log2', criterion='gini'), DecisionTreeClassifier(splitter='random', max_features='sqrt', criterion='entropy')), 
                   "RF" : (RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=134), RandomForestClassifier(criterion='entropy', max_features='log2', n_estimators=237)), 
                   "Boost" : (AdaBoostClassifier(learning_rate= 0.9249, n_estimators= 54),AdaBoostClassifier(learning_rate= 0.9984, n_estimators= 91)), 
                   "Bag" :(BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 11, warm_start= True), BaggingClassifier(max_features= 0.8, max_samples= 0.8, n_estimators= 16, warm_start= False)), 
                   "LR" : (LogisticRegression(solver='sag', penalty='l2'), LogisticRegression(solver='lbfgs', penalty='l2')), 
                   "SVM" : (SVC(C=0.9152, kernel='linear'), SVC(C=1.3079, kernel='linear')),
                   "NN" : (MLPClassifier(activation='identity', hidden_layer_sizes=114, learning_rate='constant'), MLPClassifier(activation='identity', hidden_layer_sizes=71, learning_rate='constant'))}
    return models
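
# Example of intended use (illustrative): models_input("FSS")["RF"] returns the tuple of tuned
# random forests, index 0 for the dataset without clusters and index 1 for the dataset with clusters.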

def negative_recall_scorer(clf, X, y):
    """Gives the negative recall defined as (number of true negative samples)/(total number of negative samples)"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred) #sklearn convention: rows are true labels, columns are predictions, i.e. [[TN, FP], [FN, TP]]
    TN_prop = cm[0,0]/(cm[0,1]+cm[0,0])
    return TN_prop

def TN_scorer(clf, X, y):
    """Gives the number of true negative samples"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    TN = cm[0,0]
    return TN

def FN_scorer(clf, X, y):
    """Gives the number of false negative samples"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    FN = cm[1,0] #row 1, column 0 holds the false negatives (positives predicted as negatives)
    return FN

def FP_scorer(clf, X, y):
    """Gives the number of false positive samples"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    FP = cm[0,1] #row 0, column 1 holds the false positives (negatives predicted as positives)
    return FP

def TP_scorer(clf, X, y):
    """Gives the number of true positive samples"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    TP = cm[1,1]
    return TP
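
# Quick sanity check of the confusion-matrix layout used above (illustrative only):
# confusion_matrix([0, 0, 1, 1], [0, 1, 0, 1]) returns [[1, 1], [1, 1]], i.e.
# one TN (0 predicted 0), one FP (0 predicted 1), one FN (1 predicted 0) and one TP (1 predicted 1).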

if __name__ == '__main__':
    filtering = ["FSS","noFSS"] #select whether the dataset has been through the filtering step or not 
    scorings = {'f1':make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall':make_scorer(recall_score), 'precision':make_scorer(precision_score), 'TN':TN_scorer, 'FN':FN_scorer, 'FP':FP_scorer, 'TP':TP_scorer} #scorings to be used for model evaluation
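    # Possible extension (a sketch, not used below): AUROC and log-loss could be added through
    # sklearn's predefined scorers, assuming every model can produce probability estimates
    # (the SVC would need probability=True for 'neg_log_loss'):
    # from sklearn.metrics import get_scorer
    # scorings.update({'auroc': get_scorer('roc_auc'), 'logloss': get_scorer('neg_log_loss')})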
    resample = SMOTETomek() #the method used to balance the output classes
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) #the cross-validation protocol used
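    # Because the SMOTETomek resampling is wrapped in an imblearn Pipeline below, it is refit on
    # the training folds only inside cross_validate, so each held-out fold is scored unresampled.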
    for f in filtering:
        sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(f)
        models = models_input(f) #models selected for training
        cluster_df = pd.DataFrame(columns=range(1,11), index=['SVM_F1','SVM_Precision','SVM_Recall','SVM_TN-prop','NN_F1','NN_Precision','NN_Recall','NN_TN-prop',
                                                          'LR_F1','LR_Precision','LR_Recall','LR_TN-prop','Bag_F1','Bag_Precision','Bag_Recall','Bag_TN-prop',
                                                          'RF_F1','RF_Precision','RF_Recall','RF_TN-prop','Boost_F1','Boost_Precision','Boost_Recall','Boost_TN-prop',
                                                          'Tree_F1','Tree_Precision','Tree_Recall','Tree_TN-prop']) #dataframe to store the metrics for the cluster dataset
        sin_cluster_df = cluster_df.copy(deep=True) #dataframe to store the metrics for the dataset without clusters
        cluster_cm = pd.DataFrame(columns=range(1,11), index=['SVM_TN','SVM_FN','SVM_FP','SVM_TP','NN_TN','NN_FN','NN_FP','NN_TP',
                                                          'LR_TN','LR_FN','LR_FP','LR_TP','Bag_TN','Bag_FN','Bag_FP','Bag_TP',
                                                          'RF_TN','RF_FN','RF_FP','RF_TP','Boost_TN','Boost_FN','Boost_FP','Boost_TP',
                                                          'Tree_TN','Tree_FN','Tree_FP','Tree_TP']) #dataframe to store the confusion-matrix counts for the cluster dataset
        sin_cluster_cm = cluster_cm.copy(deep=True) #dataframe to store the confusion-matrix counts for the dataset without clusters
        for k in models:
            model = models[k][0] #selection of the first model of the tuple which is the one without clusters
            pipeline = Pipeline(steps=[('r', resample), ('m', model)])
            #training of the model for the dataset without clusters 
            sin_cluster_scores = cross_validate(pipeline, sin_cluster_data_features.values, sin_cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
            sin_cluster_df.loc[k+'_F1']=list(np.around(np.array(sin_cluster_scores["test_f1"]),4)) #the F1 score for the database without cluster is stored in a dataframe 
            sin_cluster_df.loc[k+'_Precision']=list(np.around(np.array(sin_cluster_scores["test_precision"]),4)) #same for precision
            sin_cluster_df.loc[k+'_Recall']=list(np.around(np.array(sin_cluster_scores["test_recall"]),4)) #same for recall 
            sin_cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(sin_cluster_scores["test_negative_recall"]),4)) #same for negative_recall
            sin_cluster_cm.loc[k+'_TN']=sin_cluster_scores["test_TN"] #the number of true negative samples for the database without cluster is stored in a dataframe 
            sin_cluster_cm.loc[k+'_FN']=sin_cluster_scores["test_FN"] #same for false negative
            sin_cluster_cm.loc[k+'_FP']=sin_cluster_scores["test_FP"] #same for false positive
            sin_cluster_cm.loc[k+'_TP']=sin_cluster_scores["test_TP"] #same for true positive

            model = models[k][1] #selection of the second model of the tuple which is the one with clusters
            pipeline = Pipeline(steps=[('r', resample), ('m', model)])
            #training of the model for the dataset with clusters
            cluster_scores = cross_validate(pipeline, cluster_data_features.values, cluster_data_label.values, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
            cluster_df.loc[k+'_F1']=list(np.around(np.array(cluster_scores["test_f1"]),4)) #the F1 score for the database with clusters is stored in a dataframe
            cluster_df.loc[k+'_Precision']=list(np.around(np.array(cluster_scores["test_precision"]),4)) #same for precision
            cluster_df.loc[k+'_Recall']=list(np.around(np.array(cluster_scores["test_recall"]),4)) #same for recall 
            cluster_df.loc[k+'_TN-prop']=list(np.around(np.array(cluster_scores["test_negative_recall"]),4)) #same for negative_recall
            cluster_cm.loc[k+'_TN']=cluster_scores["test_TN"] #the number of true negative samples for the database with clusters is stored in a dataframe
            cluster_cm.loc[k+'_FN']=cluster_scores["test_FN"] #same for false negative
            cluster_cm.loc[k+'_FP']=cluster_scores["test_FP"] #same for false positive
            cluster_cm.loc[k+'_TP']=cluster_scores["test_TP"] #same for true positive
    
        #Export of the results as csv files
        cluster_df.to_csv("Results_3_Dropout_1_Cluster_"+f+".csv")
        sin_cluster_df.to_csv("Results_3_Dropout_1_sin_Cluster_"+f+".csv")
        cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_Cluster_"+f+".csv")
        sin_cluster_cm.to_csv("Results_3_confusion_matrix_Dropout_1_sin_Cluster_"+f+".csv")