# The aim of this code is to get an overview of the performance of the selected models on the filtered data set, whether it is clustered or not, and for both outputs
# We train with a 10-fold cross-validation scenario and output the following metrics for each fold:
sin_cluster_data_features.replace({False:0,True:1},inplace=True)# convert booleans to integers in the feature set (required by newer numpy versions)
sin_cluster_data_label.replace({False:0,True:1},inplace=True)# convert booleans to integers in the label set (required by newer numpy versions)
# Creation of train and test sets for the dataset with cluster (same steps)
def negative_recall_scorer(clf, X, y):
    """Gives the negative recall (specificity), defined as
    (number of true negative samples) / (total number of negative samples).

    NOTE(review): the original `def` line was lost in this chunk; the name is
    reconstructed from the `scorings` dict below — confirm against the full file.
    """
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    # sklearn convention: cm[i, j] = samples of true class i predicted as class j,
    # so row 0 holds the actual negatives: cm[0, 0] = TN, cm[0, 1] = FP.
    TN_prop = cm[0, 0] / (cm[0, 1] + cm[0, 0])
    return TN_prop
def TN_scorer(clf, X, y):
    """Gives the number of samples predicted as true negatives."""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    # sklearn convention: cm[0, 0] = actual negative, predicted negative.
    TN = cm[0, 0]
    return TN
def FN_scorer(clf, X, y):
    """Gives the number of samples predicted as false negatives."""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    # BUG FIX: in sklearn's convention cm[i, j] = true class i predicted as j,
    # so the false-negative count (actual positive, predicted negative) is
    # cm[1, 0]. The original read cm[0, 1], which is the false-POSITIVE count.
    FN = cm[1, 0]
    return FN
def FP_scorer(clf, X, y):
    """Gives the number of samples predicted as false positives."""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    # BUG FIX: in sklearn's convention cm[i, j] = true class i predicted as j,
    # so the false-positive count (actual negative, predicted positive) is
    # cm[0, 1]. The original read cm[1, 0], which is the false-NEGATIVE count.
    FP = cm[0, 1]
    return FP
def TP_scorer(clf, X, y):
    """Gives the number of samples predicted as true positives."""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    # sklearn convention: cm[1, 1] = actual positive, predicted positive.
    TP = cm[1, 1]
    return TP
if __name__ == '__main__':
    datasets = ["Dropout_1"]  # select the dataset to train on
    filtering = ["FSS", "noFSS"]  # whether the dataset has been through the filtering step or not
    # Scorers used for model evaluation during cross-validation
    # (see the *_scorer functions defined above).
    scorings = {'f1': make_scorer(f1_score),
                'negative_recall': negative_recall_scorer,
                'recall': make_scorer(recall_score),
                'precision': make_scorer(precision_score),
                'TN': TN_scorer, 'FN': FN_scorer, 'FP': FP_scorer, 'TP': TP_scorer}
    resample = SMOTETomek()  # method used to balance the output classes
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # 10-fold cross-validation protocol
    # NOTE(review): the loop over the models (which defines `k` and produces
    # `sin_cluster_scores`, presumably via cross_validate) is missing from this
    # chunk; the statements below assume it — confirm against the full file.
    sin_cluster_df.loc[k + '_F1'] = list(np.around(np.array(sin_cluster_scores["test_f1"]), 4))  # F1 score per fold for the no-cluster database
    sin_cluster_df.loc[k + '_Precision'] = list(np.around(np.array(sin_cluster_scores["test_precision"]), 4))  # same for precision
    sin_cluster_df.loc[k + '_Recall'] = list(np.around(np.array(sin_cluster_scores["test_recall"]), 4))  # same for recall
    sin_cluster_df.loc[k + '_TN-prop'] = list(np.around(np.array(sin_cluster_scores["test_negative_recall"]), 4))  # same for negative recall
    sin_cluster_cm.loc[k + '_TN'] = sin_cluster_scores["test_TN"]  # true-negative counts per fold for the no-cluster database
    sin_cluster_cm.loc[k + '_FN'] = sin_cluster_scores["test_FN"]  # same for false negatives
    sin_cluster_cm.loc[k + '_FP'] = sin_cluster_scores["test_FP"]  # same for false positives
    sin_cluster_cm.loc[k + '_TP'] = sin_cluster_scores["test_TP"]  # same for true positives
    model = models[k][1]  # second element of the tuple: the model used with clusters