Commit f7d56894 authored by Lucia Prieto's avatar Lucia Prieto

Upload hyperparameters-fitting.py

parent d8cb406e
# The aim of this code is to get an overview of the performance of the selected models on the filtered data set, whether it is clustered or not, and for both outputs
# We train with a 10-folds scenario and get as an output the following metrics for each fold :
# -AUROC
# -F1
# -LogLoss
# Import of databases for training
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from scipy.stats import uniform
from scipy.stats import randint
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
def _encode_features_and_label(frame, target, selected_columns=None):
    """Split *frame* into one-hot encoded features and its label column.

    Args:
        frame: DataFrame holding both the features and the ``target`` column.
        target: name of the output column; it is removed from the features.
        selected_columns: optional list-like of feature names to keep after
            encoding. Names missing from the encoded frame are ignored by
            ``DataFrame.filter``. ``None`` keeps every column.

    Returns:
        ``(features, label)`` where booleans have been turned into 0/1
        integers.
    """
    label = frame[target]
    features = frame.drop(columns=target)  # the output must not be part of the training features
    categorical = list(features.select_dtypes(exclude='number').columns)
    # dtype=int yields 0/1 integer dummies directly, so no frame-wide
    # True/False replacement (and its dtype down-casting) is needed afterwards.
    features = pd.get_dummies(features, columns=categorical, dtype=int)
    if selected_columns is not None:
        features = features.filter(items=selected_columns, axis=1)
    if label.dtype == bool:
        # A plain cast keeps an integer dtype regardless of how `replace`
        # handles dtype inference in the installed pandas version.
        label = label.astype(int)
    else:
        # Non in-place, so the frame the label was taken from is left alone.
        label = label.replace({False: 0, True: 1})
    return features, label


def output_datasets(dataset, filtering):
    """Give the features and labels to train the models on for one dataset.

    Args:
        dataset: name of the output column. ``"Dropout_1"`` loads the
            ``dropout_*`` csv files; any other value loads the ``relapse_*``
            csv files (and must be a column of those files).
        filtering: ``"FSS"`` keeps only the features named in the header of
            the matching ``*_FSS.csv`` file; any other value keeps all
            features.

    Returns:
        A 4-tuple ``(sin_cluster_features, sin_cluster_label,
        cluster_features, cluster_label)``: the one-hot encoded feature frame
        and the label series for the dataset without and with clusters.
    """
    prefix = "dropout" if dataset == "Dropout_1" else "relapse"
    # Import of csv databases from the feature engineering R code
    db_cluster = pd.read_csv(prefix + "_cluster.csv", sep=",")
    db = pd.read_csv(prefix + "_sin_cluster.csv", sep=",")

    features_cluster = None
    features = None
    if filtering == "FSS":
        # Only the header of the feature-filtering output is needed, so no
        # data rows are parsed; the files are not touched when filtering is off.
        features_cluster = pd.read_csv(prefix + "_cluster_FSS.csv", sep=",", nrows=0).columns.values
        features = pd.read_csv(prefix + "_sin_cluster_FSS.csv", sep=",", nrows=0).columns.values

    sin_cluster_data_features, sin_cluster_data_label = _encode_features_and_label(db, dataset, features)
    cluster_data_features, cluster_data_label = _encode_features_and_label(db_cluster, dataset, features_cluster)
    return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
if __name__ == '__main__':
    # Randomised hyperparameter search for every model, run on the dataset
    # without clusters and on the dataset with clusters, for each filtering
    # mode. The best parameters and best score per model are saved as csv.
    datasets = ["Dropout_1"]  # select the dataset to train on
    filtering = ["FSS", "noFSS"]  # select whether the dataset has been through the filtering step or not
    # NOTE(review): `scorings` is never passed to the search below, which
    # optimises 'precision' only - confirm which metric is intended.
    scorings = ("roc_auc", "f1", "neg_log_loss")  # scorings to be used for model evaluation

    # Models selected for training
    models = {
        "Tree": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(n_estimators=50),
        "Boosting": AdaBoostClassifier(),
        "Bagging": BaggingClassifier(),
        "LR": LogisticRegression(max_iter=1000),
        "SVM": SVC(probability=True),
        "NN": MLPClassifier(max_iter=500),
    }

    # Search spaces; the 'm__' prefix routes each parameter to the 'm' step
    # of the pipeline built below.
    hyperparameters = {
        "Tree": {
            'm__splitter': ['best', 'random'],
            'm__max_features': ['sqrt', 'log2'],
            'm__criterion': ['gini', 'entropy', 'log_loss'],
        },
        "RF": {
            'm__n_estimators': randint(100, 250),
            'm__max_features': ['sqrt', 'log2'],
            'm__criterion': ['gini', 'entropy'],
        },
        "Bagging": {
            'm__n_estimators': randint(10, 100),
            'm__max_samples': [0.8, 1.0],
            'm__max_features': [0.8, 1.0],
            'm__warm_start': [True, False],
        },
        "Boosting": {
            'm__n_estimators': randint(50, 150),
            # scipy's uniform(loc, scale) samples from [loc, loc + scale]:
            # loc=0.8, scale=0.4 gives the range [0.8, 1.2].
            'm__learning_rate': uniform(loc=0.8, scale=0.4),
        },
        # Split into sub-spaces so that only solver/penalty pairs accepted by
        # LogisticRegression are sampled: 'l1' and 'elasticnet' need 'saga'
        # here, and 'elasticnet' additionally needs an l1_ratio.
        "LR": [
            {'m__solver': ['lbfgs', 'sag', 'newton-cholesky'], 'm__penalty': ['l2', None]},
            {'m__solver': ['saga'], 'm__penalty': ['l1', 'l2', None]},
            {'m__solver': ['saga'], 'm__penalty': ['elasticnet'], 'm__l1_ratio': uniform(loc=0.0, scale=1.0)},
        ],
        "SVM": {
            'm__C': uniform(loc=0.8, scale=0.4),  # samples from [0.8, 1.2]
            'm__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        },
        "NN": {
            'm__activation': ['identity', 'logistic', 'tanh', 'relu'],
            'm__hidden_layer_sizes': randint(50, 150),
            'm__learning_rate': ['constant', 'invscaling', 'adaptive'],
        },
    }

    resample = SMOTETomek()  # the method used to balance the output classes
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # the cross-validation protocol used

    for f in filtering:
        for d in datasets:
            (sin_cluster_data_features, sin_cluster_data_label,
             cluster_data_features, cluster_data_label) = output_datasets(d, f)
            # dataframe to save the results in for the cluster dataset
            cluster_params = pd.DataFrame(
                index=['SVM', 'NN', 'LR', 'Bagging', 'RF', 'Boosting', 'Tree'],
                columns=['Parameters', 'Score'],
            )
            # dataframe to save the results in for the dataset without cluster
            sin_cluster_params = cluster_params.copy(deep=True)

            # (features, labels, results frame) for each variant, searched in
            # the same order as before: without cluster first, then with.
            runs = (
                (sin_cluster_data_features, sin_cluster_data_label, sin_cluster_params),
                (cluster_data_features, cluster_data_label, cluster_params),
            )
            for k, model in models.items():
                parameters = hyperparameters[k]
                # Resampling is a pipeline step so it is only applied to the
                # training part of each cross-validation split.
                pipeline = Pipeline(steps=[('r', resample), ('m', model)])
                search = RandomizedSearchCV(pipeline, param_distributions=parameters, cv=cv, n_jobs=1, scoring='precision')
                for data_features, data_label, results in runs:
                    search.fit(data_features.values, data_label.values)
                    print(search.best_params_)
                    print(search.best_score_)
                    results.at[k, 'Parameters'] = search.best_params_
                    results.at[k, 'Score'] = round(search.best_score_, 4)

            # Download of results as csv files
            cluster_params.to_csv("Results_2_" + d + "_Cluster_" + f + ".csv")
            sin_cluster_params.to_csv("Results_2_" + d + "_sin_Cluster_" + f + ".csv")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment