diff --git a/code/models/hyperparameters-fitting.py b/code/models/hyperparameters-fitting.py
index de0009c4ecd5bde12a9ca07fe8bd8d310106507f..1829bf3244a1f17c3fd4a417eb666e9fad3e3906 100644
--- a/code/models/hyperparameters-fitting.py
+++ b/code/models/hyperparameters-fitting.py
@@ -19,27 +19,23 @@ from sklearn.neural_network import MLPClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 
-def output_datasets(dataset,filtering):
-    """Gives the features and labels to train the model on depending on the dataset considered"""
-    if dataset == "Dropout_1":
-        # Import of csv database from the feature engineering R code
-        db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
-        db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
-        # Features to be selected from the feature filtering step
-        features_cluster = (pd.read_csv("dropout_cluster_FSS.csv",sep=",")).columns.values
-        features = (pd.read_csv("dropout_sin_cluster_FSS.csv",sep=",")).columns.values
-
-    else:
-        # Import of csv database from the feature engineering R code
-        db_cluster = pd.read_csv("relapse_cluster.csv", sep=",")
-        db = pd.read_csv("relapse_sin_cluster.csv", sep=",")
-        # Features to be selected from the feature filtering step
-        features_cluster = (pd.read_csv("relapse_cluster_FSS.csv",sep=",")).columns.values
-        features = (pd.read_csv("relapse_sin_cluster_FSS.csv",sep=",")).columns.values
+def output_datasets(filtering):
+    # Import of csv database from the feature engineering R code
+    db_cluster = pd.read_csv("data/dropout_cluster.csv", sep=",")
+    db = pd.read_csv("data/dropout.csv", sep=",")
+    # Features to be selected from the feature filtering step
+    features_cluster = (pd.read_csv("data/FSS/featsGR_cluster.csv",sep=",")).columns.values
+    features = (pd.read_csv("data/FSS/featsGR.csv",sep=",")).columns.values
+    # # Import of csv database from the feature engineering R code
+    # db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
+    # db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
+    # # Features to be selected from the feature filtering step
+    # features_cluster = (pd.read_csv("dropout_cluster_FSS.csv",sep=",")).columns.values
+    # features = (pd.read_csv("dropout_sin_cluster_FSS.csv",sep=",")).columns.values
 
     # Creation of train and test sets for the dataset without cluster
-    sin_cluster_data_label = db[dataset]
-    sin_cluster_data_features = db.drop(dataset, axis=1) #elimination of the output from the training set
+    sin_cluster_data_label = db["Dropout_1"]
+    sin_cluster_data_features = db.drop("Dropout_1", axis=1) #elimination of the output from the training set
     columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
     sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed) #use of one hot encoding for categorical variables
     if filtering == "FSS" : #selection of features in case the filtering is activated for the dataset
@@ -48,8 +44,8 @@ def output_datasets(dataset,filtering):
     sin_cluster_data_label.replace({False: 0, True: 1}, inplace=True) #convertion of boolean by integers for the features set (necessary for newest numpy versions)
 
     # Creation of train and test sets for the dataset with cluster (same steps)
-    cluster_data_label = db_cluster[dataset]
-    cluster_data_features = db_cluster.drop(dataset, axis=1)
+    cluster_data_label = db_cluster["Dropout_1"]
+    cluster_data_features = db_cluster.drop("Dropout_1", axis=1)
     columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
     cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
     if filtering == "FSS" :
@@ -60,7 +56,6 @@ def output_datasets(dataset,filtering):
     return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
 
 
 if __name__ == '__main__':
-    datasets = ["Dropout_1"] #select the dataset to train on
     filtering = ["FSS","noFSS"] #select whether the dataset has been through the filtering step or not
     scorings = ("roc_auc","f1","neg_log_loss") #scorings to be used for model evaluation
@@ -77,28 +72,27 @@ if __name__ == '__main__':
     cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) #the cross-validation protocole used
 
     for f in filtering :
-        for d in datasets:
-            sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(d,f)
-            cluster_params = pd.DataFrame(index=['SVM','NN','LR','Bagging','RF','Boosting','Tree'], columns=['Parameters','Score']) #dataframe to save the results in for the cluster dataset
-            sin_cluster_params = cluster_params.copy(deep=True) #dataframe to save the results in for the cluster dataset
-            for k in models :
-                model = models[k]
-                parameters = hyperparameters[k]
-                pipeline = Pipeline(steps=[('r', resample), ('m', model)])
-                search = RandomizedSearchCV(pipeline, param_distributions=parameters, cv=cv, n_jobs=1, scoring='precision')
+        sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(f)
+        cluster_params = pd.DataFrame(index=['SVM','NN','LR','Bagging','RF','Boosting','Tree'], columns=['Parameters','Score']) #dataframe to save the results in for the cluster dataset
+        sin_cluster_params = cluster_params.copy(deep=True) #dataframe to save the results in for the dataset without cluster
+        for k in models :
+            model = models[k]
+            parameters = hyperparameters[k]
+            pipeline = Pipeline(steps=[('r', resample), ('m', model)])
+            search = RandomizedSearchCV(pipeline, param_distributions=parameters, cv=cv, n_jobs=1, scoring='precision')
 
-                search.fit(sin_cluster_data_features.values, sin_cluster_data_label.values)
-                print(search.best_params_)
-                print(search.best_score_)
-                sin_cluster_params.at[k,'Parameters']=search.best_params_
-                sin_cluster_params.at[k,'Score']=round(search.best_score_,4)
+            search.fit(sin_cluster_data_features.values, sin_cluster_data_label.values)
+            print(search.best_params_)
+            print(search.best_score_)
+            sin_cluster_params.at[k,'Parameters']=search.best_params_
+            sin_cluster_params.at[k,'Score']=round(search.best_score_,4)
 
-                search.fit(cluster_data_features.values, cluster_data_label.values)
-                print(search.best_params_)
-                print(search.best_score_)
-                cluster_params.at[k,'Parameters']=search.best_params_
-                cluster_params.at[k,'Score']=round(search.best_score_,4)
-
-            #Download of results as csv files
-            cluster_params.to_csv("Results_2_"+d+"_Cluster_"+f+".csv")
-            sin_cluster_params.to_csv("Results_2_"+d+"_sin_Cluster_"+f+".csv")
+            search.fit(cluster_data_features.values, cluster_data_label.values)
+            print(search.best_params_)
+            print(search.best_score_)
+            cluster_params.at[k,'Parameters']=search.best_params_
+            cluster_params.at[k,'Score']=round(search.best_score_,4)
+
+        #Save results as csv files
+        cluster_params.to_csv("Results_2_Dropout_1_Cluster_"+f+".csv")
+        sin_cluster_params.to_csv("Results_2_Dropout_1_sin_Cluster_"+f+".csv")
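
Review note on the search setup: each model is tuned with `RandomizedSearchCV` over a two-step `Pipeline` whose resampling step `'r'` (the `resample` object, defined outside these hunks) is refit on each training fold before the model step `'m'`, so the rebalancing never leaks into validation folds. Below is a minimal, self-contained sketch of that pattern. The `Pipeline` import is also outside the hunks; scikit-learn's own `Pipeline` rejects resamplers (they lack `transform`), so imbalanced-learn's is assumed here, and SMOTE, the toy data, and the grid are purely illustrative.

```python
# Minimal sketch of the RandomizedSearchCV-over-resampling-Pipeline pattern.
# Assumptions (not from the diff): SMOTE as the 'r' step, a synthetic
# imbalanced dataset, and a toy max_depth grid for the 'm' step.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # applies resampling to training folds only
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=1)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
pipeline = Pipeline(steps=[('r', SMOTE(random_state=1)),
                           ('m', DecisionTreeClassifier(random_state=1))])
# Grid keys are prefixed with the step name ('m__'); the script's
# `hyperparameters` dict must be keyed the same way for its 'm' step.
search = RandomizedSearchCV(pipeline,
                            param_distributions={'m__max_depth': [2, 4, 8, None]},
                            n_iter=4, cv=cv, n_jobs=1,
                            scoring='precision', random_state=1)
search.fit(X, y)
print(search.best_params_, round(search.best_score_, 4))
```

One discrepancy worth flagging while this code is being touched: the `scorings` tuple declares `roc_auc`/`f1`/`neg_log_loss` as the evaluation metrics, but the search itself is run with `scoring='precision'`, so the tuple is currently unused here.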