hyperparameters-fitting.py 6.61 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
# The aim of this code is to get an overview of the performance of the selected models on the filtered data set, whether they are clustered or not, and for both outputs
# We train with a 10-folds scenario and get as an output the following metrics for each fold :
# -AUROC
# -F1
# -LogLoss

# Import of the libraries used for training
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from scipy.stats import uniform
from scipy.stats import randint
from sklearn.linear_model import  LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

22 23 24 25 26 27 28 29 30 31 32 33 34
def _booleans_to_int(data):
    """Return *data* with boolean values stored as integers (False -> 0, True -> 1).

    Accepts a DataFrame (only its boolean columns are converted) or a Series.
    An explicit ``astype`` is used instead of ``replace({False: 0, True: 1})``
    so the resulting dtype does not depend on pandas' implicit downcasting.
    """
    if isinstance(data, pd.DataFrame):
        bool_columns = data.select_dtypes(include="bool").columns
        if len(bool_columns) == 0:
            return data
        return data.astype({column: int for column in bool_columns})
    if pd.api.types.is_bool_dtype(data):
        return data.astype(int)
    return data


def _read_feature_names(path):
    """Return the column names found in the header of the CSV file at *path*."""
    # Only the header is needed, so no data row is parsed.
    return pd.read_csv(path, sep=",", nrows=0).columns.values


def _prepare_dataset(path, selected_features=None, target="Dropout_1"):
    """Load one CSV dataset and split it into encoded features and labels.

    Parameters
    ----------
    path : str
        Location of the CSV file to load.
    selected_features : sequence of str, optional
        When given, only the encoded columns whose names appear in this
        sequence are kept; names that do not exist are ignored.
    target : str
        Name of the output column.

    Returns
    -------
    (features, labels) : (DataFrame, Series)
        One-hot encoded features and the target column, with boolean values
        converted to integers.
    """
    dataset = pd.read_csv(path, sep=",")
    labels = _booleans_to_int(dataset[target])
    # The output column must not be part of the training features.
    features = dataset.drop(target, axis=1)
    # Every non-numeric column is treated as categorical and one-hot encoded.
    categorical_columns = features.select_dtypes(exclude="number").columns.values
    features = pd.get_dummies(features, columns=categorical_columns)
    if selected_features is not None:
        features = features.filter(items=list(selected_features), axis=1)
    return _booleans_to_int(features), labels


def output_datasets(filtering):
    """Build the feature/label sets for the datasets without and with cluster.

    Reads ``data/dropout.csv`` and ``data/dropout_cluster.csv``, uses
    ``Dropout_1`` as the label, one-hot encodes the non-numeric columns and
    converts boolean values to integers.

    Parameters
    ----------
    filtering : str
        When equal to ``"FSS"``, the features of each dataset are restricted
        to the column names listed in the header of ``data/FSS/featsGR.csv``
        (dataset without cluster) and ``data/FSS/featsGR_cluster.csv``
        (dataset with cluster). Any other value keeps all the features, and
        the FSS files are then not read at all.

    Returns
    -------
    tuple
        ``(sin_cluster_data_features, sin_cluster_data_label,
        cluster_data_features, cluster_data_label)``.
    """
    features = None
    features_cluster = None
    if filtering == "FSS":
        # Features to be selected from the feature filtering step
        features = _read_feature_names("data/FSS/featsGR.csv")
        features_cluster = _read_feature_names("data/FSS/featsGR_cluster.csv")

    # Dataset without cluster
    sin_cluster_data_features, sin_cluster_data_label = _prepare_dataset(
        "data/dropout.csv", features
    )
    # Dataset with cluster (same steps)
    cluster_data_features, cluster_data_label = _prepare_dataset(
        "data/dropout_cluster.csv", features_cluster
    )

    return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label

def main():
    """Run a randomized hyper-parameter search for every model and dataset.

    For each filtering mode ("FSS" and "noFSS") and for each model, the model
    is wrapped in a pipeline that first resamples the training data with
    SMOTETomek, then a RandomizedSearchCV is fitted on the dataset without
    cluster and on the dataset with cluster. The best parameters and the best
    cross-validated score are printed and written to one CSV file per dataset
    and filtering mode.
    """
    filtering = ["FSS", "noFSS"]  # whether the dataset goes through the feature filtering step or not

    # NOTE(review): the file header lists AUROC, F1 and LogLoss, and an unused
    # `scorings = ("roc_auc", "f1", "neg_log_loss")` tuple used to be defined
    # here, but the search has always optimised 'precision'. The scoring is
    # left unchanged -- confirm which metric is intended.

    # Models selected for training
    models = {
        "Tree": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(n_estimators=50),
        "Boosting": AdaBoostClassifier(),
        "Bagging": BaggingClassifier(),
        "LR": LogisticRegression(max_iter=1000),
        "SVM": SVC(probability=True),
        "NN": MLPClassifier(max_iter=500),
    }

    # Search spaces; the 'm__' prefix targets the model step of the pipeline.
    # scipy's uniform(loc, scale) samples from [loc, loc + scale], so
    # uniform(0.8, 0.4) covers the range 0.8-1.2.
    hyperparameters = {
        "Tree": {
            'm__splitter': ['best', 'random'],
            'm__max_features': ['sqrt', 'log2'],
            'm__criterion': ['gini', 'entropy', 'log_loss'],
        },
        "RF": {
            'm__n_estimators': randint(100, 250),
            'm__max_features': ['sqrt', 'log2'],
            'm__criterion': ['gini', 'entropy'],
        },
        "Bagging": {
            'm__n_estimators': randint(10, 100),
            'm__max_samples': [0.8, 1.0],
            'm__max_features': [0.8, 1.0],
            'm__warm_start': [True, False],
        },
        "Boosting": {
            'm__n_estimators': randint(50, 150),
            'm__learning_rate': uniform(0.8, 0.4),
        },
        # Not every solver supports every penalty, so the space is split into
        # sub-spaces that only contain valid combinations (a list of dicts is
        # accepted by RandomizedSearchCV). 'elasticnet' needs an l1_ratio.
        "LR": [
            {
                'm__penalty': ['l2', None],
                'm__solver': ['lbfgs', 'sag', 'saga', 'newton-cholesky'],
            },
            {
                'm__penalty': ['l1'],
                'm__solver': ['saga'],
            },
            {
                'm__penalty': ['elasticnet'],
                'm__solver': ['saga'],
                'm__l1_ratio': uniform(0, 1),
            },
        ],
        "SVM": {
            'm__C': uniform(0.8, 0.4),
            'm__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        },
        "NN": {
            'm__activation': ['identity', 'logistic', 'tanh', 'relu'],
            'm__hidden_layer_sizes': randint(50, 150),
            'm__learning_rate': ['constant', 'invscaling', 'adaptive'],
        },
    }

    resample = SMOTETomek()  # the method used to balance the output classes
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # the cross-validation protocol used

    # Row order of the result files
    result_rows = ['SVM', 'NN', 'LR', 'Bagging', 'RF', 'Boosting', 'Tree']

    for selection in filtering:
        sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(selection)
        # Dataframe to save the results in for the cluster dataset
        cluster_params = pd.DataFrame(index=result_rows, columns=['Parameters', 'Score'])
        # Dataframe to save the results in for the dataset without cluster
        sin_cluster_params = cluster_params.copy(deep=True)

        # Each model is searched on the dataset without cluster first, then
        # on the dataset with cluster.
        runs = (
            (sin_cluster_data_features, sin_cluster_data_label, sin_cluster_params),
            (cluster_data_features, cluster_data_label, cluster_params),
        )

        for name, model in models.items():
            pipeline = Pipeline(steps=[('r', resample), ('m', model)])
            search = RandomizedSearchCV(
                pipeline,
                param_distributions=hyperparameters[name],
                cv=cv,
                n_jobs=1,
                scoring='precision',
            )
            for data_features, data_label, results in runs:
                search.fit(data_features.values, data_label.values)
                print(search.best_params_)
                print(search.best_score_)
                results.at[name, 'Parameters'] = search.best_params_
                results.at[name, 'Score'] = round(search.best_score_, 4)

        # Save the results as csv files
        cluster_params.to_csv(f"Results_2_Dropout_1_Cluster_{selection}.csv")
        sin_cluster_params.to_csv(f"Results_2_Dropout_1_sin_Cluster_{selection}.csv")


if __name__ == '__main__':
    main()