Commit 811d1c8a authored by Lucia Prieto

Update hyperparameters-fitting.py

parent 208dd931
......@@ -19,27 +19,23 @@ from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
def output_datasets(dataset,filtering):
"""Gives the features and labels to train the model on depending on the dataset considered"""
if dataset == "Dropout_1":
# Import of csv database from the feature engineering R code
db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
# Features to be selected from the feature filtering step
features_cluster = (pd.read_csv("dropout_cluster_FSS.csv",sep=",")).columns.values
features = (pd.read_csv("dropout_sin_cluster_FSS.csv",sep=",")).columns.values
else:
# Import of csv database from the feature engineering R code
db_cluster = pd.read_csv("relapse_cluster.csv", sep=",")
db = pd.read_csv("relapse_sin_cluster.csv", sep=",")
# Features to be selected from the feature filtering step
features_cluster = (pd.read_csv("relapse_cluster_FSS.csv",sep=",")).columns.values
features = (pd.read_csv("relapse_sin_cluster_FSS.csv",sep=",")).columns.values
def output_datasets(filtering):
# Import of csv database from the feature engineering R code
db_cluster = pd.read_csv("data/dropout_cluster.csv", sep=",")
db = pd.read_csv("data/dropout.csv", sep=",")
# Features to be selected from the feature filtering step
features_cluster = (pd.read_csv("data/FSS/featsGR_cluster.csv",sep=",")).columns.values
features = (pd.read_csv("data/FSS/featsGR.csv",sep=",")).columns.values
# # Import of csv database from the feature engineering R code
# db_cluster = pd.read_csv("dropout_cluster.csv", sep=",")
# db = pd.read_csv("dropout_sin_cluster.csv", sep=",")
# # Features to be selected from the feature filtering step
# features_cluster = (pd.read_csv("dropout_cluster_FSS.csv",sep=",")).columns.values
# features = (pd.read_csv("dropout_sin_cluster_FSS.csv",sep=",")).columns.values
# Creation of train and test sets for the dataset without cluster
sin_cluster_data_label = db[dataset]
sin_cluster_data_features = db.drop(dataset, axis=1) #elimination of the output from the training set
sin_cluster_data_label = db["Dropout_1"]
sin_cluster_data_features = db.drop("Dropout_1", axis=1) #elimination of the output from the training set
columns_to_be_changed = sin_cluster_data_features.select_dtypes(exclude='number').columns.values
sin_cluster_data_features = pd.get_dummies(sin_cluster_data_features, columns=columns_to_be_changed) #use of one hot encoding for categorical variables
if filtering == "FSS" : #selection of features in case the filtering is activated for the dataset
......@@ -48,8 +44,8 @@ def output_datasets(dataset,filtering):
sin_cluster_data_label.replace({False: 0, True: 1}, inplace=True) #conversion of booleans to integers for the label set (necessary for newest numpy versions)
# Creation of train and test sets for the dataset with cluster (same steps)
cluster_data_label = db_cluster[dataset]
cluster_data_features = db_cluster.drop(dataset, axis=1)
cluster_data_label = db_cluster["Dropout_1"]
cluster_data_features = db_cluster.drop("Dropout_1", axis=1)
columns_to_be_changed = cluster_data_features.select_dtypes(exclude='number').columns.values
cluster_data_features = pd.get_dummies(cluster_data_features, columns=columns_to_be_changed)
if filtering == "FSS" :
......@@ -60,7 +56,6 @@ def output_datasets(dataset,filtering):
return sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label
if __name__ == '__main__':
datasets = ["Dropout_1"] #select the dataset to train on
filtering = ["FSS","noFSS"] #select whether the dataset has been through the filtering step or not
scorings = ("roc_auc","f1","neg_log_loss") #scorings to be used for model evaluation
......@@ -77,28 +72,27 @@ if __name__ == '__main__':
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) #the cross-validation protocol used
for f in filtering :
for d in datasets:
sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(d,f)
cluster_params = pd.DataFrame(index=['SVM','NN','LR','Bagging','RF','Boosting','Tree'], columns=['Parameters','Score']) #dataframe to save the results in for the cluster dataset
sin_cluster_params = cluster_params.copy(deep=True) #dataframe to save the results in for the dataset without cluster
for k in models :
model = models[k]
parameters = hyperparameters[k]
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
search = RandomizedSearchCV(pipeline, param_distributions=parameters, cv=cv, n_jobs=1, scoring='precision')
sin_cluster_data_features, sin_cluster_data_label, cluster_data_features, cluster_data_label = output_datasets(f)
cluster_params = pd.DataFrame(index=['SVM','NN','LR','Bagging','RF','Boosting','Tree'], columns=['Parameters','Score']) #dataframe to save the results in for the cluster dataset
sin_cluster_params = cluster_params.copy(deep=True) #dataframe to save the results in for the dataset without cluster
for k in models :
model = models[k]
parameters = hyperparameters[k]
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
search = RandomizedSearchCV(pipeline, param_distributions=parameters, cv=cv, n_jobs=1, scoring='precision')
search.fit(sin_cluster_data_features.values, sin_cluster_data_label.values)
print(search.best_params_)
print(search.best_score_)
sin_cluster_params.at[k,'Parameters']=search.best_params_
sin_cluster_params.at[k,'Score']=round(search.best_score_,4)
search.fit(sin_cluster_data_features.values, sin_cluster_data_label.values)
print(search.best_params_)
print(search.best_score_)
sin_cluster_params.at[k,'Parameters']=search.best_params_
sin_cluster_params.at[k,'Score']=round(search.best_score_,4)
search.fit(cluster_data_features.values, cluster_data_label.values)
print(search.best_params_)
print(search.best_score_)
cluster_params.at[k,'Parameters']=search.best_params_
cluster_params.at[k,'Score']=round(search.best_score_,4)
#Download of results as csv files
cluster_params.to_csv("Results_2_"+d+"_Cluster_"+f+".csv")
sin_cluster_params.to_csv("Results_2_"+d+"_sin_Cluster_"+f+".csv")
search.fit(cluster_data_features.values, cluster_data_label.values)
print(search.best_params_)
print(search.best_score_)
cluster_params.at[k,'Parameters']=search.best_params_
cluster_params.at[k,'Score']=round(search.best_score_,4)
#Download of results as csv files
cluster_params.to_csv("Results_2_Dropout_1_Cluster_"+f+".csv")
sin_cluster_params.to_csv("Results_2_Dropout_1_sin_Cluster_"+f+".csv")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment