# The aim of this code is to get an overview of the performance of the selected models on
# the filtered dataset, whether it is clustered or not, and for both outputs.
# Models are tuned with a randomized hyperparameter search under 10-fold cross-validation,
# scored with the following metrics:
# - AUROC
# - F1
# - LogLoss

# Imports of the libraries used for training
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from scipy.stats import uniform
from scipy.stats import randint
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def output_datasets(dataset, filtering):
    """Return the features and labels to train the models on, for the requested dataset,
    both with and without the cluster variable."""
    if dataset == "Dropout_1":
        # Import of the csv databases produced by the feature engineering R code
        db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
        db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
        # Features selected by the feature filtering step
        features_cluster = pd.read_csv("dropout_cluster_FSS.csv", sep=",").columns.values
        features = pd.read_csv("dropout_sin_cluster_FSS.csv", sep=",").columns.values
    else:
        # Import of the csv databases produced by the feature engineering R code
        db_cluster = pd.read_csv("relapse_cluster.csv", sep=",")
        db = pd.read_csv("relapse_sin_cluster.csv", sep=",")
        # Features selected by the feature filtering step
        features_cluster = pd.read_csv("relapse_cluster_FSS.csv", sep=",").columns.values
        features = pd.read_csv("relapse_sin_cluster_FSS.csv", sep=",").columns.values

    # Build the feature matrix and label vector for the dataset without clusters
    sin_cluster_data_label = db[dataset]
    sin_cluster_data_features = db.drop(dataset, axis=1)  # remove the output from the training set
    columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
    sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed)  # one-hot encode categorical variables
    if filtering == "FSS":  # keep only the selected features when filtering is activated
        sin_cluster_data_features = sin_cluster_data_features.filter(features, axis=1)
    sin_cluster_data_features = sin_cluster_data_features.replace({False: 0, True: 1})  # convert booleans to integers in the feature set (necessary for newer numpy versions)
    sin_cluster_data_label = sin_cluster_data_label.replace({False: 0, True: 1})  # convert booleans to integers in the label set (necessary for newer numpy versions)

    # Build the feature matrix and label vector for the dataset with clusters (same steps)
    cluster_data_label = db_cluster[dataset]
    cluster_data_features = db_cluster.drop(dataset, axis=1)
    columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
    cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
    if filtering == "FSS":
        cluster_data_features = cluster_data_features.filter(features_cluster, axis=1)
    cluster_data_features = cluster_data_features.replace({False: 0, True: 1})
    cluster_data_label = cluster_data_label.replace({False: 0, True: 1})

    return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
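
# The header promises one AUROC / F1 / LogLoss value per fold. A minimal sketch of how
# they could be recovered for an already-tuned pipeline is given below; the helper name
# per_fold_metrics is illustrative and is not used when producing the saved results.
# Note that reusing the same folds as the hyperparameter search gives an optimistic
# estimate; fresh splits would be needed for an unbiased one.
def per_fold_metrics(best_pipeline, features, label):
    """Illustrative sketch: recompute the per-fold metrics (AUROC, F1, LogLoss) of a
    fitted pipeline, with the same 10-fold protocol as the search in the main block."""
    from sklearn.model_selection import cross_validate
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    scores = cross_validate(best_pipeline, features.values, label.values,
                            cv=folds, scoring=("roc_auc", "f1", "neg_log_loss"))
    # One row per fold; the sign of neg_log_loss is flipped to report the actual log loss
    return pd.DataFrame({"AUROC": scores["test_roc_auc"],
                         "F1": scores["test_f1"],
                         "LogLoss": -scores["test_neg_log_loss"]})
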
if __name__ == '__main__':
    datasets = ["Dropout_1"]  # dataset(s) to train on
    filtering = ["FSS", "noFSS"]  # whether the dataset has been through the filtering step or not
    scorings = ("roc_auc", "f1", "neg_log_loss")  # scorings to be used for model evaluation
    models = {"Tree": DecisionTreeClassifier(),
              "RF": RandomForestClassifier(n_estimators=50),
              "Boosting": AdaBoostClassifier(),
              "Bagging": BaggingClassifier(),
              "LR": LogisticRegression(max_iter=1000),
              "SVM": SVC(probability=True),
              "NN": MLPClassifier(max_iter=500)}  # models selected for training
    # Search spaces for the randomized search; keys are prefixed with 'm__' to reach the
    # model step of the pipeline
    hyperparameters = {"Tree": {'m__splitter': ['best', 'random'],
                                'm__max_features': ['sqrt', 'log2'],
                                'm__criterion': ['gini', 'entropy', 'log_loss']},
                       "RF": {'m__n_estimators': randint(100, 250),
                              'm__max_features': ['sqrt', 'log2'],
                              'm__criterion': ['gini', 'entropy']},
                       "Bagging": {'m__n_estimators': randint(10, 100),
                                   'm__max_samples': [0.8, 1.0],
                                   'm__max_features': [0.8, 1.0],
                                   'm__warm_start': [True, False]},
                       "Boosting": {'m__n_estimators': randint(50, 150),
                                    'm__learning_rate': uniform(0.8, 1.2)},  # uniform on [0.8, 2.0]
                       # note: some penalty/solver pairs are incompatible; failed fits are
                       # scored as NaN under RandomizedSearchCV's default error_score
                       "LR": {'m__penalty': ['l1', 'l2', 'elasticnet', None],
                              'm__solver': ['lbfgs', 'sag', 'saga', 'newton-cholesky']},
                       "SVM": {'m__C': uniform(0.8, 1.2),
                               'm__kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
                       "NN": {'m__activation': ['identity', 'logistic', 'tanh', 'relu'],
                              'm__hidden_layer_sizes': randint(50, 150),
                              'm__learning_rate': ['constant', 'invscaling', 'adaptive']}}
    resample = SMOTETomek()  # the method used to balance the output classes
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # the cross-validation protocol used
    for f in filtering:
        for d in datasets:
            sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(d, f)
            cluster_params = pd.DataFrame(index=['SVM', 'NN', 'LR', 'Bagging', 'RF', 'Boosting', 'Tree'],
                                          columns=['Parameters', 'Score'])  # dataframe to save the results in for the cluster dataset
            sin_cluster_params = cluster_params.copy(deep=True)  # dataframe to save the results in for the dataset without clusters
            for k in models:
                model = models[k]
                parameters = hyperparameters[k]
                pipeline = Pipeline(steps=[('r', resample), ('m', model)])
                # Evaluate all three scorings on every fold; refit on AUROC so that
                # best_params_ and best_score_ refer to the AUROC metric
                search = RandomizedSearchCV(pipeline, param_distributions=parameters, cv=cv,
                                            n_jobs=1, scoring=scorings, refit='roc_auc')
                search.fit(sin_cluster_data_features.values, sin_cluster_data_label.values)
                print(search.best_params_)
                print(search.best_score_)
                sin_cluster_params.at[k, 'Parameters'] = search.best_params_
                sin_cluster_params.at[k, 'Score'] = round(search.best_score_, 4)
                search.fit(cluster_data_features.values, cluster_data_label.values)
                print(search.best_params_)
                print(search.best_score_)
                cluster_params.at[k, 'Parameters'] = search.best_params_
                cluster_params.at[k, 'Score'] = round(search.best_score_, 4)
            # Save the results as csv files
            cluster_params.to_csv("Results_2_" + d + "_Cluster_" + f + ".csv")
            sin_cluster_params.to_csv("Results_2_" + d + "_sin_Cluster_" + f + ".csv")
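
# Example use of the per_fold_metrics sketch defined above, after a search has been
# fitted inside the loop (illustrative only, not executed by this script):
#
#     fold_scores = per_fold_metrics(search.best_estimator_,
#                                    sin_cluster_data_features, sin_cluster_data_label)
#     print(fold_scores)          # one AUROC / F1 / LogLoss value per fold
#     print(fold_scores.mean())   # averaged over the 10 folds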