Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
C
covid_analysis
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
COMPARA
covid_analysis
Commits
a1ce917a
Commit
a1ce917a
authored
May 07, 2024
by
Joaquin Torres
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
crossval + hyperparam tested on DF
parent
9d246651
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
192 additions
and
10 deletions
+192
-10
training_models/eval_models.py
training_models/eval_models.py
+10
-10
training_models/hyperparam_tuning.py
training_models/hyperparam_tuning.py
+182
-0
training_models/output/cross_val_res.xlsx
training_models/output/cross_val_res.xlsx
+0
-0
training_models/output/hyperparam.xlsx
training_models/output/hyperparam.xlsx
+0
-0
No files found.
training_models/
train
_models.py
→
training_models/
eval
_models.py
View file @
a1ce917a
...
...
@@ -79,26 +79,26 @@ if __name__ == "__main__":
# --------------------------------------------------------------------------------------------------------
# 1. No class weight
models_1
=
{
#"DT" : DecisionTreeClassifier(),
"RF"
:
RandomForestClassifier
(),
"RF"
:
RandomForestClassifier
(
n_estimators
=
50
),
# "Bagging" : BaggingClassifier(),
# "AB" : AdaBoostClassifier(),
# "XGB": XGBClassifier(),
# "LR" : LogisticRegression(),
# "ElNet" : LogisticRegression(penalty='elasticnet'),
# "SVM" : SVC(),
# "MLP" : MLPClassifier(),
# "LR" : LogisticRegression(
max_iter=1000
),
# "ElNet" : LogisticRegression(
max_iter=1000,
penalty='elasticnet'),
# "SVM" : SVC(
probability=True
),
# "MLP" : MLPClassifier(
max_iter=500
),
}
# 2. Class weight
models_2
=
{
#"DT" : DecisionTreeClassifier(class_weight='balanced'),
"RF"
:
RandomForestClassifier
(
class_weight
=
'balanced'
),
"RF"
:
RandomForestClassifier
(
n_estimators
=
50
,
class_weight
=
'balanced'
),
# "Bagging" : BaggingClassifier(), # <-
# "AB" : AdaBoostClassifier(), # <-
# "XGB": XGBClassifier(), # <-
# "LR" : LogisticRegression(class_weight='balanced'),
# "ElNet" : LogisticRegression(penalty='elasticnet', class_weight='balanced'),
# "SVM" : SVC(class_weight='balanced'),
# "MLP" : MLPClassifier(), # <-
# "LR" : LogisticRegression(
max_iter=1000,
class_weight='balanced'),
# "ElNet" : LogisticRegression(
max_iter=1000,
penalty='elasticnet', class_weight='balanced'),
# "SVM" : SVC(
probability=True,
class_weight='balanced'),
# "MLP" : MLPClassifier(
max_iter=500
), # <-
}
# --------------------------------------------------------------------------------------------------------
...
...
training_models/hyperparam_tuning.py
0 → 100644
View file @
a1ce917a
"""
Selecting best models through cross-validation and hyperparameter tuning
for each method:
1. Original training dataset
2. Original training dataset - Cost sensitive
3. Oversampling
4. Undersampling
"""
# Libraries
# --------------------------------------------------------------------------------------------------------
import
pandas
as
pd
import
numpy
as
np
from
xgboost
import
XGBClassifier
from
sklearn.metrics
import
confusion_matrix
from
sklearn.metrics
import
f1_score
,
make_scorer
,
precision_score
,
recall_score
from
sklearn.model_selection
import
StratifiedKFold
,
cross_validate
from
sklearn.ensemble
import
RandomForestClassifier
,
BaggingClassifier
,
AdaBoostClassifier
from
sklearn.neural_network
import
MLPClassifier
from
sklearn.svm
import
SVC
from
sklearn.linear_model
import
LogisticRegression
from
sklearn.tree
import
DecisionTreeClassifier
from
scipy.stats
import
randint
,
uniform
from
sklearn.model_selection
import
RandomizedSearchCV
# --------------------------------------------------------------------------------------------------------
# Function to read datasets
# --------------------------------------------------------------------------------------------------------
def read_data():
    """Load every train/test array used for tuning into one dictionary.

    Files are read from ``./gen_train_data/data/output/<group>/`` (relative to
    the current working directory), where ``<group>`` is ``pre`` or ``post``.
    For each group the test split, the original training split, the
    oversampled training split and the undersampled training split are
    loaded, both features (``X``) and labels (``y``).

    Returns:
        dict: maps a name such as ``"X_train_over_pre"`` to the array loaded
        from the ``.npy`` file of the same name.
    """
    base_dir = './gen_train_data/data/output'
    # Nesting order (split -> group -> X/y) fixes the key order of the result.
    splits = ('test', 'train', 'train_over', 'train_under')
    groups = ('pre', 'post')

    data_dic = {}
    for split in splits:
        for group in groups:
            for axis in ('X', 'y'):
                key = f'{axis}_{split}_{group}'
                # NOTE: allow_pickle=True lets np.load unpickle object arrays;
                # only use it on trusted files.
                data_dic[key] = np.load(
                    f'{base_dir}/{group}/{key}.npy', allow_pickle=True
                )
    return data_dic
# --------------------------------------------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: for every (group, method) combination, run a
    # randomized hyperparameter search for each enabled model and export the
    # best parameters and scores to an Excel workbook (one sheet per
    # combination).

    # Reading training data
    data_dic = read_data()

    # Defining the models to train
    # --------------------------------------------------------------------------------------------------------
    # 1. No class weight
    models_1 = {
        "DT": DecisionTreeClassifier(),
        # "RF" : RandomForestClassifier(n_estimators=50),
        # "Bagging" : BaggingClassifier(),
        # "AB" : AdaBoostClassifier(),
        # "XGB": XGBClassifier(),
        # "LR" : LogisticRegression(max_iter=1000),
        # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet'),
        # "SVM" : SVC(probability=True),
        # "MLP" : MLPClassifier(max_iter=500),
    }
    # 2. Class weight
    models_2 = {
        "DT": DecisionTreeClassifier(class_weight='balanced'),
        # "RF" : RandomForestClassifier(n_estimators=50, class_weight='balanced'),
        # "Bagging" : BaggingClassifier(), # <-
        # "AB" : AdaBoostClassifier(), # <-
        # "XGB": XGBClassifier(), # <-
        # "LR" : LogisticRegression(max_iter=1000, class_weight='balanced'),
        # "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'),
        # "SVM" : SVC(probability=True, class_weight='balanced'),
        # "MLP" : MLPClassifier(max_iter=500), # <-
    }

    # Hyperparameter tuning setup
    # --------------------------------------------------------------------------------------------------------
    # Keys must match the model names used in models_1 / models_2, because the
    # search space is looked up with hyperparameters[model_name].
    # Reminder: scipy.stats.uniform(loc, scale) samples from [loc, loc + scale].
    hyperparameters = {
        "DT": {
            'splitter': ['best', 'random'],
            'max_features': ['sqrt', 'log2'],
            'criterion': ['gini', 'entropy', 'log_loss'],
        },
        "RF": {
            'n_estimators': randint(100, 250),
            'max_features': ['sqrt', 'log2'],
            'criterion': ['gini', 'entropy'],
        },
        "Bagging": {
            'n_estimators': randint(10, 100),
            'max_samples': [0.8, 1.0],
            'max_features': [0.8, 1.0],
            'warm_start': [True, False],
        },
        "AB": {
            'n_estimators': randint(50, 150),
            # NOTE(review): this samples from [0.8, 2.0]; confirm that the
            # upper bound is intended (uniform(0.8, 0.4) would give [0.8, 1.2]).
            'learning_rate': uniform(0.8, 1.2),
        },
        "XGB": {
            'n_estimators': randint(100, 1000),
            'max_depth': randint(3, 10),
            'learning_rate': uniform(0.01, 0.3),
        },
        "LR": {
            # NOTE(review): 'l1' is not supported by the 'lbfgs' and 'sag'
            # solvers, so those sampled combinations fail to fit.
            'penalty': ['l1', 'l2', None],
            'solver': ['lbfgs', 'sag', 'saga'],
        },
        # Was keyed "EL", which raised KeyError for the "ElNet" model name.
        # The elastic-net penalty is only supported by the 'saga' solver and
        # needs an explicit l1_ratio in [0, 1].
        "ElNet": {
            'solver': ['saga'],
            'l1_ratio': uniform(0, 1),
        },
        "SVM": {
            # NOTE(review): this samples from [0.8, 2.0]; confirm the range.
            'C': uniform(0.8, 1.2),
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        },
        "MLP": {
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'hidden_layer_sizes': randint(50, 150),
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
        },
    }
    # --------------------------------------------------------------------------------------------------------

    # Cross-validation setup
    # --------------------------------------------------------------------------------------------------------
    # Defining cross-validation protocol
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    # Index of each method -> label used in the Excel sheet names
    method_names = {
        0: "ORIG",
        1: "ORIG_CW",
        2: "OVER",
        3: "UNDER",
    }
    # Index of each method -> infix of the dataset key in data_dic. ORIG and
    # ORIG_CW share the original training data and differ only in the models.
    method_prefixes = ['', '', 'over_', 'under_']
    # Index of the cost-sensitive method (ORIG_CW in method_names)
    class_weight_method = 1
    # --------------------------------------------------------------------------------------------------------

    # Hyperparameter tuning loop and exporting results
    # --------------------------------------------------------------------------------------------------------
    # Store each df as a sheet in an excel file
    sheets_dict = {}
    for i, group in enumerate(['pre', 'post']):
        for j, method in enumerate(method_prefixes):
            # Unique running counter over all (group, method) combinations;
            # the previous i+j repeated values across groups.
            print(f"ITERATION {i * len(method_prefixes) + j}")
            # Get dataset based on group and method
            X = data_dic['X_train_' + method + group]
            y = data_dic['y_train_' + method + group]
            # Use the class-weighted models only for the cost-sensitive method.
            # The previous check (j == 2) applied them to the oversampled data
            # and left ORIG_CW identical to ORIG.
            models = models_2 if j == class_weight_method else models_1
            # Save results: params and best score for each of the models of this method and group
            hyperparam_df = pd.DataFrame(
                index=list(models), columns=['Parameters', 'Score']
            )
            for model_name, model in models.items():
                # Find optimal hyperparams for curr model
                params = hyperparameters[model_name]
                search = RandomizedSearchCV(
                    model,
                    param_distributions=params,
                    cv=cv,
                    n_jobs=1,
                    scoring='precision',
                )
                search.fit(X, y)
                hyperparam_df.at[model_name, 'Parameters'] = search.best_params_
                hyperparam_df.at[model_name, 'Score'] = round(search.best_score_, 4)

            # Store the DataFrame in the dictionary with a unique key for each sheet
            sheet_name = f"{group}_{method_names[j]}"
            sheets_dict[sheet_name] = hyperparam_df

    # Write results to Excel file
    with pd.ExcelWriter('./training_models/output/hyperparam.xlsx') as writer:
        for sheet_name, data in sheets_dict.items():
            data.to_excel(writer, sheet_name=sheet_name)
# --------------------------------------------------------------------------------------------------------
training_models/output/cross_val_res.xlsx
View file @
a1ce917a
No preview for this file type
training_models/output/hyperparam.xlsx
0 → 100644
View file @
a1ce917a
File added
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment