Commit f26ac8cd authored by Joaquin Torres's avatar Joaquin Torres

working on loop for model evaluation

parent d72df2cb
......@@ -23,10 +23,41 @@ from sklearn.tree import DecisionTreeClassifier
# --------------------------------------------------------------------------------------------------------
if __name__ == "__main__":
def negative_recall_scorer(clf, X, y):
    """Return the negative recall (specificity): TN / (TN + FP).

    That is, the fraction of truly negative samples that the classifier
    predicted as negative. Row 0 of sklearn's confusion matrix holds the
    actual-negative samples (cm[0,0]=TN, cm[0,1]=FP).
    """
    predictions = clf.predict(X)
    matrix = confusion_matrix(y, predictions)
    true_neg = matrix[0, 0]
    false_pos = matrix[0, 1]
    return true_neg / (false_pos + true_neg)
def TN_scorer(clf, X, y):
    """Return the count of true negatives (actual 0, predicted 0)."""
    matrix = confusion_matrix(y, clf.predict(X))
    return matrix[0, 0]
def FN_scorer(clf, X, y):
    """Return the count of false negatives (actual positive, predicted negative).

    Bug fix: sklearn's confusion_matrix is laid out as C[i, j] = samples of
    true class i predicted as class j, so false negatives live at cm[1, 0].
    The previous version returned cm[0, 1], which is the false-POSITIVE count,
    silently swapping the FN and FP metrics in every report built on them.
    """
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    FN = cm[1, 0]  # actual 1, predicted 0
    return FN
def FP_scorer(clf, X, y):
    """Return the count of false positives (actual negative, predicted positive).

    Bug fix: in sklearn's confusion_matrix (C[i, j] = true class i, predicted
    class j) false positives are at cm[0, 1]. The previous version returned
    cm[1, 0], which is the false-NEGATIVE count — the mirror of the swap in
    FN_scorer.
    """
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    FP = cm[0, 1]  # actual 0, predicted 1
    return FP
def TP_scorer(clf, X, y):
    """Return the count of true positives (actual 1, predicted 1)."""
    matrix = confusion_matrix(y, clf.predict(X))
    return matrix[1, 1]
def read_data():
import numpy as np
# Reading training data
# --------------------------------------------------------------------------------------------------------
# Load test data
X_test_pre = np.load('gen_train_data/data/output/pre/X_test_pre.npy', allow_pickle=True)
y_test_pre = np.load('gen_train_data/data/output/pre/y_test_pre.npy', allow_pickle=True)
......@@ -50,31 +81,77 @@ if __name__ == "__main__":
y_train_under_pre = np.load('gen_train_data/data/output/pre/y_train_under_pre.npy', allow_pickle=True)
X_train_under_post = np.load('gen_train_data/data/output/post/X_train_under_post.npy', allow_pickle=True)
y_train_under_post = np.load('gen_train_data/data/output/post/y_train_under_post.npy', allow_pickle=True)
# --------------------------------------------------------------------------------------------------------
data_dic = {
"X_test_pre": X_test_pre,
"y_test_pre": y_test_pre,
"X_test_post": X_test_post,
"y_test_post": y_test_post,
"X_train_pre": X_train_pre,
"y_train_pre": y_train_pre,
"X_train_post": X_train_post,
"y_train_post": y_train_post,
"X_train_over_pre": X_train_over_pre,
"y_train_over_pre": y_train_over_pre,
"X_train_over_post": X_train_over_post,
"y_train_over_post": y_train_over_post,
"X_train_under_pre": X_train_under_pre,
"y_train_under_pre": y_train_under_pre,
"X_train_under_post": X_train_under_post,
"y_train_under_post": y_train_under_post,
}
return data_dic
if __name__ == "__main__":
# Reading training data
data_dic = read_data()
# Defining the models to train
# --------------------------------------------------------------------------------------------------------
# 1. No class weight
models_1 = {"DT" : DecisionTreeClassifier(),
"RF" : RandomForestClassifier(),
"Bagging" : BaggingClassifier(),
"AB" : AdaBoostClassifier(),
"XGB": XGBClassifier(),
"LR" : LogisticRegression(),
"ElNet" : LogisticRegression(penalty='elasticnet'),
"SVM" : SVC(),
"MLP" : MLPClassifier(),
# "RF" : RandomForestClassifier(),
# "Bagging" : BaggingClassifier(),
# "AB" : AdaBoostClassifier(),
# "XGB": XGBClassifier(),
# "LR" : LogisticRegression(),
# "ElNet" : LogisticRegression(penalty='elasticnet'),
# "SVM" : SVC(),
# "MLP" : MLPClassifier(),
}
# 2. Class weight
models_2 = {"DT" : DecisionTreeClassifier(class_weight='balanced'),
"RF" : RandomForestClassifier(class_weight='balanced'),
"Bagging" : BaggingClassifier(), # <-
"AB" : AdaBoostClassifier(), # <-
"XGB": XGBClassifier(), # <-
"LR" : LogisticRegression(class_weight='balanced'),
"ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'),
"SVM" : SVC(class_weight='balanced'),
"MLP" : MLPClassifier(), # <-
# "RF" : RandomForestClassifier(class_weight='balanced'),
# "Bagging" : BaggingClassifier(), # <-
# "AB" : AdaBoostClassifier(), # <-
# "XGB": XGBClassifier(), # <-
# "LR" : LogisticRegression(class_weight='balanced'),
# "ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'),
# "SVM" : SVC(class_weight='balanced'),
# "MLP" : MLPClassifier(), # <-
}
# --------------------------------------------------------------------------------------------------------
# Setup
# --------------------------------------------------------------------------------------------------------
# Scorings to use for model evaluation
scorings = {'f1':make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall':make_scorer(recall_score), 'precision':make_scorer(precision_score), 'TN':TN_scorer, 'FN':FN_scorer, 'FP':FP_scorer, 'TP':TP_scorer}
# Defining cross-validation protocol
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
# --------------------------------------------------------------------------------------------------------
for i, group in enumerate(['pre', 'post']):
for j, method in enumerate(['', '', 'over_', 'under_']):
# Get dataset based on group and method
X = data_dic['X_train_' + method + group]
y = data_dic['y_train_' + method + group]
# Use group of models with class weight if needed
models = models_2 if j == 2 else models_1
# Create df to keep track of each group-method for all its models
results = pd.DataFrame()
for model_name, model in models.items():
cv_results = cross_validate(model, X, y, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment