COMPARA / covid_analysis / Commits

Commit f128a515, authored May 23, 2024 by Joaquin Torres
Testing PR curve

Parent: 9a51e5c3
Showing 3 changed files with 14062 additions and 1080 deletions (+14062, -1080)
model_selection/cv_metric_gen.py (+56, -44)
model_selection/output_cv_metrics/curves/pre_ORIG.svg (+14006, -1036)
model_selection/output_cv_metrics/metrics.xlsx (+0, -0)
model_selection/cv_metric_gen.py
...
...
```diff
@@ -16,7 +16,7 @@ from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.model_selection import StratifiedKFold, cross_validate
-from sklearn.metrics import RocCurveDisplay, roc_curve, auc
+from sklearn.metrics import RocCurveDisplay, auc
+from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
 import matplotlib.pyplot as plt
 import ast # String to dictionary
```
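The new `PrecisionRecallDisplay` import mirrors the `RocCurveDisplay` already in use: both classes can draw their curve for a fitted estimator directly onto a supplied matplotlib axis. Below is a minimal sketch of that pattern, kept separate from the repository code; the synthetic dataset and the `LogisticRegression` model are illustrative assumptions only.

```python
# Sketch: ROC and Precision-Recall curves side by side for one fitted classifier.
# Synthetic data and LogisticRegression are assumptions, not part of covid_analysis.
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, weights=[0.8, 0.2], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)

# One row, two columns: ROC on the left, PR on the right
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(10, 4))
RocCurveDisplay.from_estimator(clf, X_te, y_te, ax=ax_roc, name="LogReg")
PrecisionRecallDisplay.from_estimator(clf, X_te, y_te, ax=ax_pr, name="LogReg")
ax_roc.set_title("ROC curve")
ax_pr.set_title("Precision-Recall curve")
plt.tight_layout()
plt.show()
```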
...
...
```diff
@@ -185,49 +185,64 @@ if __name__ == "__main__":
         # Scores df -> one column per cv split, one row for each model-metric
         scores_df = pd.DataFrame(columns=range(1, 11), index=[f"{model_name}_{metric_name}" for model_name in models.keys() for metric_name in scorings.keys()])
         # Create a figure for all models in this group-method
-        fig, axes = plt.subplots(len(models), 1, figsize=(10, 8*len(models)))
+        fig, axes = plt.subplots(len(models), 2, figsize=(10, 8*len(models)))
         if len(models) == 1:  # Adjustment if there's only one model (axes indexing issue)
             axes = [axes]
         # Metric generation for each model
         for model_idx, (model_name, model) in enumerate(models.items()):
             if model_name == 'XGB':
                 print(f"{group}-{method_names[j]}-{model_name}")
                 # Retrieve cv scores for our metrics of interest
                 scores = cross_validate(model, X_train, y_train, scoring=scorings, cv=cv, return_train_score=True, n_jobs=10)
                 # Save results of each fold
                 for metric_name in scorings.keys():
                     scores_df.loc[model_name + f'_{metric_name}'] = list(np.around(np.array(scores[f"test_{metric_name}"]), 4))
-                # ---------- Generate ROC curves ----------
+                # ---------------------------------------- Generate curves ----------------------------------------
                 mean_fpr = np.linspace(0, 1, 100)
                 tprs, aucs = [], []
+                mean_recall = np.linspace(0, 1, 100)
+                precisions, pr_aucs = [], []
                 cmap = plt.get_cmap('tab10')  # Colormap
-                # Loop through each fold in the cross-validation (redoing cv for simplicity)
+                # Loop through each fold in the cross-validation
                 for fold_idx, (train, test) in enumerate(cv.split(X_train, y_train)):
                     # Fit the model on the training data
                     model.fit(X_train[train], y_train[train])
-                    # Use RocCurveDisplay to generate the ROC curve
+                    # Generate ROC curve for the fold
                     roc_display = RocCurveDisplay.from_estimator(model, X_train[test], y_train[test], name=f"ROC fold {fold_idx}", alpha=0.6, lw=2,
-                                                                 ax=axes[model_idx], color=cmap(fold_idx % 10))
-                    # Interpolate the true positive rates to get a smooth curve
+                                                                 ax=axes[model_idx][0], color=cmap(fold_idx % 10))
                     interp_tpr = np.interp(mean_fpr, roc_display.fpr, roc_display.tpr)
                     interp_tpr[0] = 0.0
                     # Append the interpolated TPR and AUC for this fold
                     tprs.append(interp_tpr)
                     aucs.append(roc_display.roc_auc)
-                    # Plot the diagonal line representing random guessing
-                    axes[model_idx].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
-                # Compute the mean of the TPRs
+                    # Generate Precision-Recall curve for the fold
+                    pr_display = PrecisionRecallDisplay.from_estimator(model, X_train[test], y_train[test], name=f"PR fold {fold_idx}", alpha=0.6, lw=2,
+                                                                       ax=axes[model_idx][1], color=cmap(fold_idx % 10))
+                    interp_precision = np.interp(mean_recall, pr_display.recall[::-1], pr_display.precision[::-1])
+                    precisions.append(interp_precision)
+                    pr_aucs.append(pr_display.average_precision)
+                # Plot diagonal line for random guessing in ROC curve
+                axes[model_idx][0].plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8, label='Random guessing')
+                # Compute mean ROC curve
                 mean_tpr = np.mean(tprs, axis=0)
                 mean_tpr[-1] = 1.0
-                mean_auc = auc(mean_fpr, mean_tpr)  # Calculate the mean AUC
-                # Plot the mean ROC curve with a thicker line and distinct color
-                axes[model_idx].plot(mean_fpr, mean_tpr, color='b', lw=4, label=r'Mean ROC (AUC = %0.2f)' % mean_auc, alpha=.8)
-                # Set plot limits and title
-                axes[model_idx].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"ROC Curve - {model_name} ({group}-{method_names[j]})")
-                axes[model_idx].legend(loc="lower right")
-                # ---------- END ROC curves Generation ----------
+                mean_auc = auc(mean_fpr, mean_tpr)
+                axes[model_idx][0].plot(mean_fpr, mean_tpr, color='b', lw=4, label=r'Mean ROC (AUC = %0.2f)' % mean_auc, alpha=.8)
+                # Set ROC plot limits and title
+                axes[model_idx][0].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"ROC Curve - {model_name} ({group}-{method_names[j]})")
+                axes[model_idx][0].legend(loc="lower right")
+                # Compute mean Precision-Recall curve
+                mean_precision = np.mean(precisions, axis=0)
+                mean_pr_auc = np.mean(pr_aucs)
+                axes[model_idx][1].plot(mean_recall, mean_precision, color='b', lw=4, label=r'Mean PR (AUC = %0.2f)' % mean_pr_auc, alpha=.8)
+                # # Plot baseline precision (proportion of positive samples)
+                # baseline = np.sum(y_train) / len(y_train)
+                # axes[model_idx][1].plot([0, 1], [baseline, baseline], linestyle='--', lw=2, color='r', alpha=.8, label='Baseline')
+                # Set Precision-Recall plot limits and title
+                axes[model_idx][1].set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=f"Precision-Recall Curve - {model_name} ({group}-{method_names[j]})")
+                axes[model_idx][1].legend(loc="lower right")
+                # ---------------------------------------- End Generate Curves ----------------------------------------
         # Store the DataFrame in the dictionary with a unique key for each sheet
         sheet_name = f"{group}_{method_names[j]}"
         scores_sheets[sheet_name] = scores_df
```
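The per-fold curves above are averaged by resampling each fold's curve onto a common grid (`mean_fpr`, `mean_recall`) and taking the point-wise mean; the `[::-1]` reversal is needed because recall comes back in decreasing order while `np.interp` expects increasing x-values. Below is a self-contained sketch of that interpolation trick using the plain `roc_curve`/`precision_recall_curve` functions instead of the Display objects; the synthetic data and model choice are illustrative assumptions.

```python
# Sketch: averaging ROC and PR curves across CV folds via interpolation.
# Synthetic data and LogisticRegression are assumptions for illustration.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, precision_recall_curve, roc_curve
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=600, weights=[0.7, 0.3], random_state=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

mean_fpr = np.linspace(0, 1, 100)      # common FPR grid for ROC
mean_recall = np.linspace(0, 1, 100)   # common recall grid for PR
tprs, precisions = [], []

for train, test in cv.split(X, y):
    model = LogisticRegression(max_iter=1000).fit(X[train], y[train])
    proba = model.predict_proba(X[test])[:, 1]

    # ROC: fpr is increasing, so it can be fed to np.interp directly
    fpr, tpr, _ = roc_curve(y[test], proba)
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)

    # PR: recall is reported in decreasing order, so reverse both arrays
    # before interpolating (the [::-1] seen in the commit above)
    precision, recall, _ = precision_recall_curve(y[test], proba)
    precisions.append(np.interp(mean_recall, recall[::-1], precision[::-1]))

# Point-wise means over folds give one smooth curve per model
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
print("Mean ROC AUC:", auc(mean_fpr, mean_tpr))
print("Mean precision near recall 0.5:", np.mean(precisions, axis=0)[50])
```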
...
...
```diff
@@ -240,6 +255,3 @@ if __name__ == "__main__":
     for sheet_name, data in scores_sheets.items():
         data.to_excel(writer, sheet_name=sheet_name)
     print("Successful cv metric generation for tuned models")
\ No newline at end of file
```
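The final hunk writes one sheet per group-method combination into a single workbook, which is how `model_selection/output_cv_metrics/metrics.xlsx` in this commit is produced. A hedged sketch of that `pandas.ExcelWriter` pattern follows; the output path, sheet names, and score values are made-up placeholders.

```python
# Sketch: export several score DataFrames to one Excel workbook, one sheet each.
# The file name, sheet names, and values are illustrative assumptions.
import pandas as pd

scores_sheets = {
    "groupA_method1": pd.DataFrame({1: [0.91, 0.88], 2: [0.90, 0.87]},
                                   index=["XGB_roc_auc", "XGB_precision"]),
    "groupA_method2": pd.DataFrame({1: [0.89, 0.85], 2: [0.88, 0.84]},
                                   index=["XGB_roc_auc", "XGB_precision"]),
}

# One workbook, one sheet per group-method combination (requires openpyxl)
with pd.ExcelWriter("metrics.xlsx") as writer:
    for sheet_name, data in scores_sheets.items():
        data.to_excel(writer, sheet_name=sheet_name)
print("Successful cv metric generation for tuned models")
```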
model_selection/output_cv_metrics/curves/pre_ORIG.svg

This diff is collapsed.
model_selection/output_cv_metrics/metrics.xlsx

No preview for this file type.