Commit cb2c95a1 (COMPARA / covid_analysis)
Authored May 06, 2024 by Joaquin Torres
Parent: 48f79f60

Testing CV flow with DT and just PREC and REC

Showing 2 changed files with 27 additions and 37 deletions
training_models/output/cross_val_res.xlsx   +0 / -0
training_models/train_models.py             +27 / -37

training_models/output/cross_val_res.xlsx   0 → 100644
File added
training_models/train_models.py

@@ -22,39 +22,6 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 # --------------------------------------------------------------------------------------------------------
-def negative_recall_scorer(clf, X, y):
-    """Gives the negative recall defined as the (number of true_negative_samples)/(total number of negative samples)"""
-    y_pred = clf.predict(X)
-    cm = confusion_matrix(y, y_pred)
-    TN_prop = cm[0, 0] / (cm[0, 1] + cm[0, 0])
-    return TN_prop
-
-def TN_scorer(clf, X, y):
-    """Gives the number of samples predicted as true negatives"""
-    y_pred = clf.predict(X)
-    cm = confusion_matrix(y, y_pred)
-    TN = cm[0, 0]
-    return TN
-
-def FN_scorer(clf, X, y):
-    """Gives the number of samples predicted as false negatives"""
-    y_pred = clf.predict(X)
-    cm = confusion_matrix(y, y_pred)
-    FN = cm[0, 1]
-    return FN
-
-def FP_scorer(clf, X, y):
-    """Gives the number of samples predicted as false positive"""
-    y_pred = clf.predict(X)
-    cm = confusion_matrix(y, y_pred)
-    FP = cm[1, 0]
-    return FP
-
-def TP_scorer(clf, X, y):
-    """Gives the number of samples predicted as true positive"""
-    y_pred = clf.predict(X)
-    cm = confusion_matrix(y, y_pred)
-    TP = cm[1, 1]
-    return TP
-
 def read_data():
     import numpy as np
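Side note on the helpers removed above: they follow scikit-learn's callable-scorer protocol, where any callable with signature (estimator, X, y) returning a number can be used as a scoring entry. Under confusion_matrix's convention (rows are true labels, columns are predictions), cm[0, 1] is the false-positive count and cm[1, 0] the false-negative count, so the deleted FN/FP docstrings appear to be swapped. A more compact equivalent via make_scorer, as a sketch (the _cm_count helper is hypothetical, not part of this repo):

    # Sketch: confusion-matrix cell counts as scorers via make_scorer.
    from sklearn.metrics import confusion_matrix, make_scorer

    def _cm_count(y_true, y_pred, row, col):
        """Return one cell of the 2x2 confusion matrix (rows: true, cols: predicted)."""
        return confusion_matrix(y_true, y_pred)[row, col]

    count_scorers = {
        'TN': make_scorer(_cm_count, row=0, col=0),
        'FP': make_scorer(_cm_count, row=0, col=1),  # true 0, predicted 1
        'FN': make_scorer(_cm_count, row=1, col=0),  # true 1, predicted 0
        'TP': make_scorer(_cm_count, row=1, col=1),
    }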
@@ -138,11 +105,22 @@ if __name__ == "__main__":
     # Setup
     # --------------------------------------------------------------------------------------------------------
     # Scorings to use for model evaluation
-    scorings = {'f1': make_scorer(f1_score), 'negative_recall': negative_recall_scorer, 'recall': make_scorer(recall_score), 'precision': make_scorer(precision_score), 'TN': TN_scorer, 'FN': FN_scorer, 'FP': FP_scorer, 'TP': TP_scorer}
+    scorings = {'recall': make_scorer(recall_score), 'precision': make_scorer(precision_score)}
     # Defining cross-validation protocol
     cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
+    result_cols = [f"{model}_{metric}" for model in models_1.keys() for metric in ['PREC', 'REC']]
     method_names = {0: "ORIG", 1: "ORIG_CW", 2: "OVER", 3: "UNDER"}
     # --------------------------------------------------------------------------------------------------------
     # Evaluating performance through cross validation and exporting results
     # --------------------------------------------------------------------------------------------------------
     # Store each df as a sheet in an excel file
     sheets_dict = {}
     for i, group in enumerate(['pre', 'post']):
         for j, method in enumerate(['', '', 'over_', 'under_']):
             # Get dataset based on group and method
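For context on the hunk below: when cross_validate is given a dict of scorers like scorings above, the returned dict holds one array per scorer under 'test_<name>' (plus 'train_<name>' when return_train_score=True), each of length n_splits. A minimal sketch on toy data (the dataset and model here are stand-ins, not the repo's):

    # Sketch: dict-of-scorers with cross_validate; keys become test_/train_<name>.
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import make_scorer, precision_score, recall_score
    from sklearn.model_selection import StratifiedKFold, cross_validate

    X, y = make_classification(n_samples=200, random_state=1)
    scorings = {'recall': make_scorer(recall_score), 'precision': make_scorer(precision_score)}
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    res = cross_validate(LogisticRegression(max_iter=1000), X, y,
                         scoring=scorings, cv=cv, return_train_score=True)
    print(np.around(res['test_precision'], 4))  # one precision score per fold
    print(np.around(res['test_recall'], 4))     # one recall score per fold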
@@ -150,8 +128,20 @@ if __name__ == "__main__":
             y = data_dic['y_train_' + method + group]
             # Use group of models with class weight if needed
             models = models_2 if j == 2 else models_1
-            # Create df to keep track of each group-method for all its models
-            results = pd.DataFrame()
+            # Save results in dataframe (10 columns since 10-fold cv)
+            res_df = pd.DataFrame(columns=range(1, 11), index=result_cols)
             for model_name, model in models.items():
-                cv_results = cross_validate(model, X, y, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
+                cv_scores = cross_validate(model, X, y, scoring=scorings, cv=cv, return_train_score=True, n_jobs=1)
+                res_df.loc[model_name + '_PREC'] = list(np.around(np.array(cv_scores["test_precision"]), 4))
+                res_df.loc[model_name + '_REC'] = list(np.around(np.array(cv_scores["test_recall"]), 4))
+            # Store the DataFrame in the dictionary with a unique key for each sheet
+            sheet_name = f"{group}_{method_names[j]}"
+            sheets_dict[sheet_name] = res_df
+    # Write results to Excel file
+    with pd.ExcelWriter('./training_models/output/cross_val_res.xlsx') as writer:
+        for sheet_name, data in sheets_dict.items():
+            data.to_excel(writer, sheet_name=sheet_name)
     # --------------------------------------------------------------------------------------------------------
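The workbook written above ends up with one sheet per group-method pair (pre/post crossed with ORIG, ORIG_CW, OVER, UNDER). To inspect it afterwards, pandas can load every sheet in one call; a small sketch assuming the output path from the diff:

    # Sketch: read all sheets of the exported workbook into a dict of DataFrames.
    import pandas as pd

    sheets = pd.read_excel('./training_models/output/cross_val_res.xlsx',
                           sheet_name=None, index_col=0)  # sheet_name=None -> all sheets
    for name, df in sheets.items():
        print(name, df.shape)  # rows: <model>_PREC / <model>_REC, columns: folds 1..10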