# Provenance: recovered from git patch f7d56894eb866eadb7976635049223cce82c6e92
# ("[PATCH] Upload hyperparameters-fitting.py", Lucia Prieto,
# Thu, 27 Jul 2023 14:52:47 +0000), which creates
# code/models/hyperparameters-fitting.py.
"""Randomized hyperparameter search for the selected classifiers.

For every combination of output (``datasets``) and feature-filtering mode
(``filtering``) the script tunes each model twice -- once on the data set
without the cluster feature ("sin cluster") and once on the data set with
it -- using ``RandomizedSearchCV`` under a stratified 10-fold
cross-validation, with SMOTETomek resampling as the first pipeline step.
The best parameters and the best mean cross-validated score of every model
are written to ``Results_2_<dataset>_[sin_]Cluster_<filtering>.csv``.

NOTE(review): the original header announced AUROC, F1 and LogLoss, and a
``scorings = ("roc_auc", "f1", "neg_log_loss")`` tuple was defined but never
used; the search has always optimised ``precision``.  That behaviour is kept
(see ``SEARCH_SCORING``) -- confirm it is the intended metric.
"""

import pandas as pd
from scipy.stats import randint, uniform

# Seed shared by the CV splitter, the resampler and the randomized search so
# that two runs of the script explore the same candidates on the same folds.
RANDOM_STATE = 1

# Metric optimised by the randomized search.
SEARCH_SCORING = "precision"

# Row order of the result tables (kept identical to the original CSV layout).
RESULT_INDEX = ["SVM", "NN", "LR", "Bagging", "RF", "Boosting", "Tree"]


def _bool_columns_to_int(frame):
    """Return *frame* with every boolean column cast to integer 0/1."""
    bool_cols = frame.select_dtypes(include="bool").columns
    if len(bool_cols) == 0:
        return frame
    return frame.astype({col: int for col in bool_cols})


def _split_features_label(db, dataset, selected_features=None):
    """Split *db* into one-hot encoded features and the *dataset* label.

    Parameters
    ----------
    db : pandas.DataFrame
        Table containing the output column *dataset* and the predictors.
    dataset : str
        Name of the output column.
    selected_features : sequence of str or None
        When given, only these (already one-hot encoded) columns are kept;
        names absent from the encoded table are silently ignored, as
        ``DataFrame.filter`` does.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Features and label, with booleans converted to integers.
    """
    label = db[dataset]
    features = db.drop(dataset, axis=1)  # the output must not be a predictor

    # One-hot encode every non-numeric column.
    categorical = features.select_dtypes(exclude="number").columns.values
    features = pd.get_dummies(features, columns=categorical)

    if selected_features is not None:
        features = features.filter(selected_features, axis=1)

    # Explicit casts instead of ``replace({False: 0, True: 1}, inplace=True)``:
    # no in-place mutation of a column taken from ``db`` and no reliance on
    # the deprecated implicit downcasting of ``replace``.
    features = _bool_columns_to_int(features)
    if pd.api.types.is_bool_dtype(label):
        label = label.astype(int)
    return features, label


def output_datasets(dataset: str, filtering: str):
    """Give the features and labels to train the models on.

    The CSV files are read from the current working directory.  They are
    named ``dropout_*`` when *dataset* is ``"Dropout_1"`` and ``relapse_*``
    for any other value.

    Parameters
    ----------
    dataset : str
        Name of the output column, e.g. ``"Dropout_1"``.
    filtering : str
        ``"FSS"`` keeps only the columns listed in the header of the
        matching ``*_FSS.csv`` file; any other value keeps all columns (the
        ``*_FSS.csv`` files are then not read at all).

    Returns
    -------
    tuple
        ``(sin_cluster_features, sin_cluster_label, cluster_features,
        cluster_label)``.
    """
    prefix = "dropout" if dataset == "Dropout_1" else "relapse"

    # Databases produced by the feature engineering R code.
    db_cluster = pd.read_csv(f"{prefix}_cluster.csv", sep=",")
    db = pd.read_csv(f"{prefix}_sin_cluster.csv", sep=",")

    features_cluster = None
    features = None
    if filtering == "FSS":
        # Only the header of the feature-selection files is used.
        features_cluster = pd.read_csv(
            f"{prefix}_cluster_FSS.csv", sep=","
        ).columns.values
        features = pd.read_csv(
            f"{prefix}_sin_cluster_FSS.csv", sep=","
        ).columns.values

    sin_cluster_data_features, sin_cluster_data_label = _split_features_label(
        db, dataset, features
    )
    cluster_data_features, cluster_data_label = _split_features_label(
        db_cluster, dataset, features_cluster
    )
    return (
        sin_cluster_data_features,
        sin_cluster_data_label,
        cluster_data_features,
        cluster_data_label,
    )


def _build_models():
    """Return the estimators to tune, keyed by their short name."""
    # Imported here so that the data-preparation code above stays importable
    # without the training stack installed.
    from sklearn.ensemble import (
        AdaBoostClassifier,
        BaggingClassifier,
        RandomForestClassifier,
    )
    from sklearn.linear_model import LogisticRegression
    from sklearn.neural_network import MLPClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    return {
        "Tree": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(n_estimators=50),
        "Boosting": AdaBoostClassifier(),
        "Bagging": BaggingClassifier(),
        "LR": LogisticRegression(max_iter=1000),
        "SVM": SVC(probability=True),
        "NN": MLPClassifier(max_iter=500),
    }


def _build_search_spaces():
    """Return the randomized-search space of every model.

    Keys are prefixed with ``m__`` because the estimator is the pipeline
    step named ``"m"``.
    """
    return {
        "Tree": {
            "m__splitter": ["best", "random"],
            "m__max_features": ["sqrt", "log2"],
            "m__criterion": ["gini", "entropy", "log_loss"],
        },
        "RF": {
            "m__n_estimators": randint(100, 250),
            "m__max_features": ["sqrt", "log2"],
            "m__criterion": ["gini", "entropy"],
        },
        "Bagging": {
            "m__n_estimators": randint(10, 100),
            "m__max_samples": [0.8, 1.0],
            "m__max_features": [0.8, 1.0],
            "m__warm_start": [True, False],
        },
        # NOTE(review): scipy's uniform(loc, scale) samples from
        # [loc, loc + scale], so uniform(0.8, 1.2) is [0.8, 2.0].  Use
        # uniform(0.8, 0.4) if [0.8, 1.2] was intended (also for "SVM").
        "Boosting": {
            "m__n_estimators": randint(50, 150),
            "m__learning_rate": uniform(0.8, 1.2),
        },
        # A single penalty x solver product contains combinations that
        # LogisticRegression rejects (e.g. lbfgs + l1, or elasticnet without
        # l1_ratio), which wastes search iterations on failed fits.  The
        # space is therefore split into valid sub-spaces.
        "LR": [
            {
                "m__solver": ["lbfgs", "sag", "newton-cholesky"],
                "m__penalty": ["l2", None],
            },
            {"m__solver": ["saga"], "m__penalty": ["l1", "l2", None]},
            {
                "m__solver": ["saga"],
                "m__penalty": ["elasticnet"],
                "m__l1_ratio": uniform(0, 1),
            },
        ],
        "SVM": {
            "m__C": uniform(0.8, 1.2),
            "m__kernel": ["linear", "poly", "rbf", "sigmoid"],
        },
        "NN": {
            "m__activation": ["identity", "logistic", "tanh", "relu"],
            "m__hidden_layer_sizes": randint(50, 150),
            "m__learning_rate": ["constant", "invscaling", "adaptive"],
        },
    }


def main():
    """Tune every model on every data set variant and save the results."""
    from imblearn.combine import SMOTETomek
    from imblearn.pipeline import Pipeline
    from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

    datasets = ["Dropout_1"]  # select the dataset(s) to train on
    filtering = ["FSS", "noFSS"]  # with / without the feature filtering step

    models = _build_models()
    hyperparameters = _build_search_spaces()

    # Method used to balance the output classes.
    resample = SMOTETomek(random_state=RANDOM_STATE)
    # Cross-validation protocol.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

    for f in filtering:
        for d in datasets:
            (
                sin_cluster_data_features,
                sin_cluster_data_label,
                cluster_data_features,
                cluster_data_label,
            ) = output_datasets(d, f)

            # Result table for the data set with the cluster feature ...
            cluster_params = pd.DataFrame(
                index=RESULT_INDEX, columns=["Parameters", "Score"]
            )
            # ... and for the data set without it.
            sin_cluster_params = cluster_params.copy(deep=True)

            # Same order as the original script: "sin cluster" first.
            runs = (
                (sin_cluster_data_features, sin_cluster_data_label, sin_cluster_params),
                (cluster_data_features, cluster_data_label, cluster_params),
            )

            for name, model in models.items():
                pipeline = Pipeline(steps=[("r", resample), ("m", model)])
                search = RandomizedSearchCV(
                    pipeline,
                    param_distributions=hyperparameters[name],
                    cv=cv,
                    n_jobs=1,
                    scoring=SEARCH_SCORING,
                    random_state=RANDOM_STATE,
                )
                for data_features, data_label, results in runs:
                    search.fit(data_features.values, data_label.values)
                    print(search.best_params_)
                    print(search.best_score_)
                    results.at[name, "Parameters"] = search.best_params_
                    results.at[name, "Score"] = round(search.best_score_, 4)

            # Save the results as csv files.
            cluster_params.to_csv(f"Results_2_{d}_Cluster_{f}.csv")
            sin_cluster_params.to_csv(f"Results_2_{d}_sin_Cluster_{f}.csv")


if __name__ == "__main__":
    main()