Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
C
covid_analysis
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
COMPARA
covid_analysis
Commits
b7ae7c60
Commit
b7ae7c60
authored
May 08, 2024
by
Joaquin Torres
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixed minor issues, renaming
parent
553cf866
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
20 additions
and
164 deletions
+20
-164
explicability/shap_vals.py
explicability/shap_vals.py
+13
-0
model_selection/hyperparam_tuning.py
model_selection/hyperparam_tuning.py
+7
-7
model_selection/output/hyperparam.xlsx
model_selection/output/hyperparam.xlsx
+0
-0
model_selection/test_models.py
model_selection/test_models.py
+0
-0
models/eval_models.py
models/eval_models.py
+0
-147
models/output/cross_val_res.xlsx
models/output/cross_val_res.xlsx
+0
-0
models/shap.py
models/shap.py
+0
-10
No files found.
explicability/shap_vals.py
0 → 100644
View file @
b7ae7c60
# Libraries
# --------------------------------------------------------------------------------------------------------
import
pandas
as
pd
import
numpy
as
np
from
xgboost
import
XGBClassifier
from
sklearn.metrics
import
confusion_matrix
from
sklearn.metrics
import
f1_score
,
make_scorer
,
precision_score
,
recall_score
,
accuracy_score
from
sklearn.ensemble
import
RandomForestClassifier
,
BaggingClassifier
,
AdaBoostClassifier
from
sklearn.neural_network
import
MLPClassifier
from
sklearn.svm
import
SVC
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.tree
import
DecisionTreeClassifier
# --------------------------------------------------------------------------------------------------------
\ No newline at end of file
model
s
/hyperparam_tuning.py
→
model
_selection
/hyperparam_tuning.py
View file @
b7ae7c60
...
@@ -86,10 +86,10 @@ if __name__ == "__main__":
...
@@ -86,10 +86,10 @@ if __name__ == "__main__":
models_1
=
{
"DT"
:
DecisionTreeClassifier
(),
models_1
=
{
"DT"
:
DecisionTreeClassifier
(),
"RF"
:
RandomForestClassifier
(),
"RF"
:
RandomForestClassifier
(),
"Bagging"
:
BaggingClassifier
(),
"Bagging"
:
BaggingClassifier
(),
"AB"
:
AdaBoostClassifier
(),
"AB"
:
AdaBoostClassifier
(
algorithm
=
'SAMME'
),
"XGB"
:
XGBClassifier
(),
"XGB"
:
XGBClassifier
(),
"LR"
:
LogisticRegression
(
max_iter
=
1000
),
"LR"
:
LogisticRegression
(
max_iter
=
1000
),
"ElNet"
:
LogisticRegression
(
max_iter
=
1000
,
penalty
=
'elasticnet'
),
#
"ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet'),
"SVM"
:
SVC
(
probability
=
True
),
"SVM"
:
SVC
(
probability
=
True
),
"MLP"
:
MLPClassifier
(
max_iter
=
500
)
"MLP"
:
MLPClassifier
(
max_iter
=
500
)
}
}
...
@@ -98,12 +98,12 @@ if __name__ == "__main__":
...
@@ -98,12 +98,12 @@ if __name__ == "__main__":
models_2
=
{
"DT"
:
DecisionTreeClassifier
(
class_weight
=
'balanced'
),
models_2
=
{
"DT"
:
DecisionTreeClassifier
(
class_weight
=
'balanced'
),
"RF"
:
RandomForestClassifier
(
class_weight
=
'balanced'
),
"RF"
:
RandomForestClassifier
(
class_weight
=
'balanced'
),
"Bagging"
:
BaggingClassifier
(
estimator
=
DecisionTreeClassifier
(
class_weight
=
'balanced'
)),
"Bagging"
:
BaggingClassifier
(
estimator
=
DecisionTreeClassifier
(
class_weight
=
'balanced'
)),
"AB"
:
AdaBoostClassifier
(
estimator
=
DecisionTreeClassifier
(
class_weight
=
'balanced'
)),
"AB"
:
AdaBoostClassifier
(
estimator
=
DecisionTreeClassifier
(
class_weight
=
'balanced'
)
,
algorithm
=
'SAMME'
),
# "XGB": XGBClassifier(), # <-
# "XGB": XGBClassifier(), # <-
"LR"
:
LogisticRegression
(
max_iter
=
1000
,
class_weight
=
'balanced'
),
"LR"
:
LogisticRegression
(
max_iter
=
1000
,
class_weight
=
'balanced'
),
"ElNet"
:
LogisticRegression
(
max_iter
=
1000
,
penalty
=
'elasticnet'
,
class_weight
=
'balanced'
),
#
"ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'),
"SVM"
:
SVC
(
probability
=
True
,
class_weight
=
'balanced'
),
"SVM"
:
SVC
(
probability
=
True
,
class_weight
=
'balanced'
),
# "MLP" : MLPClassifier(max_iter=500)
# "MLP" : MLPClassifier(max_iter=500)
# <-
}
}
# Hyperparameter tuning setup
# Hyperparameter tuning setup
...
@@ -124,9 +124,9 @@ if __name__ == "__main__":
...
@@ -124,9 +124,9 @@ if __name__ == "__main__":
"XGB"
:
{
'n_estimators'
:
randint
(
100
,
1000
),
"XGB"
:
{
'n_estimators'
:
randint
(
100
,
1000
),
'max_depth'
:
randint
(
3
,
10
),
'max_depth'
:
randint
(
3
,
10
),
'learning_rate'
:
uniform
(
0.01
,
0.3
)},
'learning_rate'
:
uniform
(
0.01
,
0.3
)},
"LR"
:
{
'penalty'
:
[
'l1'
,
'l2'
,
None
],
"LR"
:
{
'penalty'
:
[
'l1'
,
'l2'
,
'elasticnet'
,
None
],
'solver'
:
[
'lbfgs'
,
'sag'
,
'saga'
]},
'solver'
:
[
'lbfgs'
,
'sag'
,
'saga'
]},
"EL
"
:
{
'solver'
:
[
'lbfgs'
,
'sag'
,
'saga'
]},
# "ElNet
": {'solver': ['lbfgs', 'sag', 'saga']},
"SVM"
:
{
'C'
:
uniform
(
0.8
,
1.2
),
"SVM"
:
{
'C'
:
uniform
(
0.8
,
1.2
),
'kernel'
:
[
'linear'
,
'poly'
,
'rbf'
,
'sigmoid'
]},
'kernel'
:
[
'linear'
,
'poly'
,
'rbf'
,
'sigmoid'
]},
"MLP"
:
{
'activation'
:
[
'identity'
,
'logistic'
,
'tanh'
,
'relu'
],
"MLP"
:
{
'activation'
:
[
'identity'
,
'logistic'
,
'tanh'
,
'relu'
],
...
...
model
s
/output/hyperparam.xlsx
→
model
_selection
/output/hyperparam.xlsx
View file @
b7ae7c60
File moved
model
s
/test_models.py
→
model
_selection
/test_models.py
View file @
b7ae7c60
File moved
models/eval_models.py
deleted
100644 → 0
View file @
553cf866
"""
Selecting best models through cross validation and hyperparameter tunning
for each method:
1. Original training dataset
2. Original training dataset - Cost sensitive
3. Oversampling
4. Undersampling
"""
# Libraries
# --------------------------------------------------------------------------------------------------------
import
pandas
as
pd
import
numpy
as
np
from
xgboost
import
XGBClassifier
from
sklearn.metrics
import
confusion_matrix
from
sklearn.metrics
import
f1_score
,
make_scorer
,
precision_score
,
recall_score
from
sklearn.model_selection
import
StratifiedKFold
,
cross_validate
from
sklearn.ensemble
import
RandomForestClassifier
,
BaggingClassifier
,
AdaBoostClassifier
from
sklearn.neural_network
import
MLPClassifier
from
sklearn.svm
import
SVC
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.tree
import
DecisionTreeClassifier
# --------------------------------------------------------------------------------------------------------
def read_data():
    """Load every train/test split from disk into a single dictionary.

    For each group ('pre', 'post') this reads the X/y arrays of four
    dataset variants: the test set, the original training set, the
    oversampled training set, and the undersampled training set.

    Returns:
        dict: maps names such as 'X_test_pre', 'y_train_over_post', ...
        to the corresponding array loaded from
        'gen_train_data/data/output/<group>/<name>.npy'.

    Note: the original implementation repeated sixteen near-identical
    np.load calls (and re-imported numpy locally); this version builds
    the same dictionary, with the same keys and paths, in a loop.
    """
    base = 'gen_train_data/data/output'
    data_dic = {}
    for group in ['pre', 'post']:
        # '' stands for the original (unresampled) training data
        for split in ['test', 'train', 'train_over', 'train_under']:
            for prefix in ['X', 'y']:
                name = f'{prefix}_{split}_{group}'
                # allow_pickle=True: arrays were saved with object dtype
                data_dic[name] = np.load(f'{base}/{group}/{name}.npy',
                                         allow_pickle=True)
    return data_dic
if __name__ == "__main__":
    # Reading training data
    data_dic = read_data()

    # Defining the models to train
    # --------------------------------------------------------------------------------------------------------
    # 1. No class weight
    models_1 = {
        #"DT" : DecisionTreeClassifier(),
        "RF": RandomForestClassifier(n_estimators=50),
        # "Bagging" : BaggingClassifier(),
        # "AB" : AdaBoostClassifier(),
        # "XGB": XGBClassifier(),
        # "LR" : LogisticRegression(max_iter=1000),
        # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet'),
        # "SVM" : SVC(probability=True),
        # "MLP" : MLPClassifier(max_iter=500),
    }
    # 2. Class weight
    models_2 = {
        #"DT" : DecisionTreeClassifier(class_weight='balanced'),
        "RF": RandomForestClassifier(n_estimators=50, class_weight='balanced'),
        # "Bagging" : BaggingClassifier(), # <-
        # "AB" : AdaBoostClassifier(), # <-
        # "XGB": XGBClassifier(), # <-
        # "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'),
        # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'),
        # "SVM" : SVC(probability=True, class_weight='balanced'),
        # "MLP" : MLPClassifier(max_iter=500), # <-
    }
    # --------------------------------------------------------------------------------------------------------

    # Setup
    # --------------------------------------------------------------------------------------------------------
    # Scorings to use for model evaluation
    scorings = {'recall': make_scorer(recall_score),
                'precision': make_scorer(precision_score)}
    # Defining cross-validation protocol
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    # One row per (model, metric) pair in the per-method result sheets
    result_cols = [f"{model}_{metric}"
                   for model in models_1.keys()
                   for metric in ['PREC', 'REC']]
    # j-th entry of the method loop below maps to these sheet-name suffixes
    method_names = {0: "ORIG", 1: "ORIG_CW", 2: "OVER", 3: "UNDER"}
    # --------------------------------------------------------------------------------------------------------

    # Evaluating performance through cross validation and exporting results
    # --------------------------------------------------------------------------------------------------------
    # Store each df as a sheet in an excel file
    sheets_dict = {}
    for group in ['pre', 'post']:
        # methods: j=0 original, j=1 original + class weights, j=2 oversampled, j=3 undersampled
        for j, method in enumerate(['', '', 'over_', 'under_']):
            # Get dataset based on group and method
            X = data_dic['X_train_' + method + group]
            y = data_dic['y_train_' + method + group]
            # BUG FIX: the class-weighted models belong to the ORIG_CW run
            # (j == 1); the original code tested j == 2, which applied class
            # weights to the OVERsampled run and never to ORIG_CW.
            models = models_2 if j == 1 else models_1
            # Save results in dataframe (10 columns since 10-fold cv)
            res_df = pd.DataFrame(columns=range(1, 11), index=result_cols)
            for model_name, model in models.items():
                cv_scores = cross_validate(model, X, y, scoring=scorings,
                                           cv=cv, return_train_score=True,
                                           n_jobs=1)
                res_df.loc[model_name + '_PREC'] = list(np.around(np.array(cv_scores["test_precision"]), 4))
                res_df.loc[model_name + '_REC'] = list(np.around(np.array(cv_scores["test_recall"]), 4))
            # Store the DataFrame in the dictionary with a unique key for each sheet
            sheet_name = f"{group}_{method_names[j]}"
            sheets_dict[sheet_name] = res_df
    # Write results to Excel file
    with pd.ExcelWriter('./training_models/output/cross_val_res.xlsx') as writer:
        for sheet_name, data in sheets_dict.items():
            data.to_excel(writer, sheet_name=sheet_name)
    # --------------------------------------------------------------------------------------------------------
models/output/cross_val_res.xlsx
deleted
100644 → 0
View file @
553cf866
File deleted
models/shap.py
deleted
100644 → 0
View file @
553cf866
# Libraries
# --------------------------------------------------------------------------------------------------------
import shap
import numpy as np
# --------------------------------------------------------------------------------------------------------
# Load test data
X_test_pre = np.load('../gen_train_data/data/output/pre/X_test_pre.npy',
                     allow_pickle=True)
# BUG FIX: np.load returns a plain numpy ndarray, which has no `.columns`
# attribute (that is the pandas DataFrame API) -- the original
# `print(list(X_test_pre.columns.values))` raised AttributeError on every run.
# Feature names are not stored in a .npy file, so report the array's shape
# (n_samples, n_features) instead.
print(X_test_pre.shape)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment