COMPARA / covid_analysis · Commit 553cf866
Authored May 08, 2024 by Joaquin Torres
Basic structure for testing models
Parent: bb7d8177
1 changed file with 241 additions and 0 deletions

models/test_models.py · new file (0 → 100644) · +241 -0
"""
Evaluating optimized models with test data
"""
# Libraries
# --------------------------------------------------------------------------------------------------------
import
pandas
as
pd
import
numpy
as
np
from
xgboost
import
XGBClassifier
from
sklearn.metrics
import
confusion_matrix
from
sklearn.metrics
import
f1_score
,
make_scorer
,
precision_score
,
recall_score
,
accuracy_score
from
sklearn.ensemble
import
RandomForestClassifier
,
BaggingClassifier
,
AdaBoostClassifier
from
sklearn.neural_network
import
MLPClassifier
from
sklearn.svm
import
SVC
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.tree
import
DecisionTreeClassifier
# --------------------------------------------------------------------------------------------------------

# Reading test data
# --------------------------------------------------------------------------------------------------------
def read_test_data():
    # Load test data
    X_test_pre = np.load('../gen_train_data/data/output/pre/X_test_pre.npy', allow_pickle=True)
    y_test_pre = np.load('../gen_train_data/data/output/pre/y_test_pre.npy', allow_pickle=True)
    X_test_post = np.load('../gen_train_data/data/output/post/X_test_post.npy', allow_pickle=True)
    y_test_post = np.load('../gen_train_data/data/output/post/y_test_post.npy', allow_pickle=True)

    data_dic = {
        "X_test_pre": X_test_pre,
        "y_test_pre": y_test_pre,
        "X_test_post": X_test_post,
        "y_test_post": y_test_post,
    }
    return data_dic
# --------------------------------------------------------------------------------------------------------

# Returning tuned models for each situation
# --------------------------------------------------------------------------------------------------------
def get_tuned_models(group_id, method_id):
    # 1. PRE
    if group_id == 0:
        # 1.1) Trained with original dataset
        if method_id == 0:
            tuned_models = {
                "DT": DecisionTreeClassifier(),
                "RF": RandomForestClassifier(),
                "Bagging": BaggingClassifier(),
                "AB": AdaBoostClassifier(),
                "XGB": XGBClassifier(),
                "LR": LogisticRegression(max_iter=1000),
                # elasticnet penalty requires the saga solver and an l1_ratio; 0.5 is a placeholder until tuned values are filled in
                "ElNet": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5),
                "SVM": SVC(probability=True),
                "MLP": MLPClassifier(max_iter=500)
            }
        # 1.2) Trained with original dataset and cost-sensitive learning
        elif method_id == 1:
            tuned_models = {
                "DT": DecisionTreeClassifier(),
                "RF": RandomForestClassifier(),
                "Bagging": BaggingClassifier(),
                "AB": AdaBoostClassifier(),
                "XGB": XGBClassifier(),
                "LR": LogisticRegression(max_iter=1000),
                "ElNet": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5),
                "SVM": SVC(probability=True),
                "MLP": MLPClassifier(max_iter=500)
            }
        # 1.3) Trained with oversampled training dataset
        elif method_id == 2:
            tuned_models = {
                "DT": DecisionTreeClassifier(),
                "RF": RandomForestClassifier(),
                "Bagging": BaggingClassifier(),
                "AB": AdaBoostClassifier(),
                "XGB": XGBClassifier(),
                "LR": LogisticRegression(max_iter=1000),
                "ElNet": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5),
                "SVM": SVC(probability=True),
                "MLP": MLPClassifier(max_iter=500)
            }
        # 1.4) Trained with undersampled training dataset
        elif method_id == 3:
            tuned_models = {
                "DT": DecisionTreeClassifier(),
                "RF": RandomForestClassifier(),
                "Bagging": BaggingClassifier(),
                "AB": AdaBoostClassifier(),
                "XGB": XGBClassifier(),
                "LR": LogisticRegression(max_iter=1000),
                "ElNet": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5),
                "SVM": SVC(probability=True),
                "MLP": MLPClassifier(max_iter=500)
            }
    # 2. POST
    else:
        # 2.1) Trained with original dataset
        if method_id == 0:
            tuned_models = {
                "DT": DecisionTreeClassifier(),
                "RF": RandomForestClassifier(),
                "Bagging": BaggingClassifier(),
                "AB": AdaBoostClassifier(),
                "XGB": XGBClassifier(),
                "LR": LogisticRegression(max_iter=1000),
                "ElNet": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5),
                "SVM": SVC(probability=True),
                "MLP": MLPClassifier(max_iter=500)
            }
        # 2.2) Trained with original dataset and cost-sensitive learning
        elif method_id == 1:
            tuned_models = {
                "DT": DecisionTreeClassifier(),
                "RF": RandomForestClassifier(),
                "Bagging": BaggingClassifier(),
                "AB": AdaBoostClassifier(),
                "XGB": XGBClassifier(),
                "LR": LogisticRegression(max_iter=1000),
                "ElNet": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5),
                "SVM": SVC(probability=True),
                "MLP": MLPClassifier(max_iter=500)
            }
        # 2.3) Trained with oversampled training dataset
        elif method_id == 2:
            tuned_models = {
                "DT": DecisionTreeClassifier(),
                "RF": RandomForestClassifier(),
                "Bagging": BaggingClassifier(),
                "AB": AdaBoostClassifier(),
                "XGB": XGBClassifier(),
                "LR": LogisticRegression(max_iter=1000),
                "ElNet": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5),
                "SVM": SVC(probability=True),
                "MLP": MLPClassifier(max_iter=500)
            }
        # 2.4) Trained with undersampled training dataset
        elif method_id == 3:
            tuned_models = {
                "DT": DecisionTreeClassifier(),
                "RF": RandomForestClassifier(),
                "Bagging": BaggingClassifier(),
                "AB": AdaBoostClassifier(),
                "XGB": XGBClassifier(),
                "LR": LogisticRegression(max_iter=1000),
                "ElNet": LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.5),
                "SVM": SVC(probability=True),
                "MLP": MLPClassifier(max_iter=500)
            }
    return tuned_models
# --------------------------------------------------------------------------------------------------------

# Scorers
# --------------------------------------------------------------------------------------------------------
def TN_scorer(clf, X, y):
    """Gives the number of samples predicted as true negatives"""
    y_pred = clf.predict(X)
    # sklearn's confusion matrix is laid out as [[TN, FP], [FN, TP]]
    cm = confusion_matrix(y, y_pred)
    TN = cm[0, 0]
    return TN

def FN_scorer(clf, X, y):
    """Gives the number of samples predicted as false negatives"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    FN = cm[1, 0]
    return FN

def FP_scorer(clf, X, y):
    """Gives the number of samples predicted as false positives"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    FP = cm[0, 1]
    return FP

def TP_scorer(clf, X, y):
    """Gives the number of samples predicted as true positives"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    TP = cm[1, 1]
    return TP

def negative_recall_scorer(clf, X, y):
    """Gives the negative recall, defined as (number of true negative samples) / (total number of negative samples)"""
    y_pred = clf.predict(X)
    cm = confusion_matrix(y, y_pred)
    TN_prop = cm[0, 0] / (cm[0, 1] + cm[0, 0])
    return TN_prop
# --------------------------------------------------------------------------------------------------------

if __name__ == "__main__":
    # Reading testing data
    data_dic = read_test_data()

    # Setup
    # --------------------------------------------------------------------------------------------------------
    # Scorings to use for model evaluation
    scorings = {
        'F1': make_scorer(f1_score),
        'NREC': negative_recall_scorer,
        'REC': make_scorer(recall_score),
        'PREC': make_scorer(precision_score),
        'ACC': make_scorer(accuracy_score),
        'TN': TN_scorer,
        'FN': FN_scorer,
        'FP': FP_scorer,
        'TP': TP_scorer
        # AUROC and AUPRC (plot?)
    }
    method_names = {
        0: "ORIG",
        1: "ORIG_CW",
        2: "OVER",
        3: "UNDER"
    }
    # --------------------------------------------------------------------------------------------------------

    # Evaluating performance using test dataset
    # --------------------------------------------------------------------------------------------------------
    scores_sheets = {}  # To store score dfs as sheets in the same Excel file
    for i, group in enumerate(['pre', 'post']):
        # Get test dataset based on group (data_dic keys use an underscore, e.g. 'X_test_pre')
        X = data_dic['X_test_' + group]
        y = data_dic['y_test_' + group]
        for j, method in enumerate(['', '', 'over_', 'under_']):
            # Get tuned models for this group and method
            # NOTE: these are unfitted placeholder estimators; they must be fitted (or replaced by fitted models) before scoring
            models = get_tuned_models(group_id=i, method_id=j)
            # Scores df
            scores_df = pd.DataFrame(index=models.keys(), columns=scorings.keys())
            # Evaluate each model
            for model_name, model in models.items():
                # At each of the scores of interest
                for score_name, scorer in scorings.items():
                    score_value = scorer(model, X, y)
                    scores_df.at[model_name, score_name] = score_value
            # Store the DataFrame in the dictionary with a unique key for each sheet
            sheet_name = f"{group}_{method_names[j]}"
            scores_sheets[sheet_name] = scores_df
    # Write results to Excel file
    with pd.ExcelWriter('./training_models/output/testing_tuned_models.xlsx') as writer:
        for sheet_name, data in scores_sheets.items():
            data.to_excel(writer, sheet_name=sheet_name)
# --------------------------------------------------------------------------------------------------------