Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
C
covid_analysis
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
COMPARA
covid_analysis
Commits
c556b024
Commit
c556b024
authored
May 09, 2024
by
Joaquin Torres
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
minor fixes
parent
b7ae7c60
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
23 deletions
+12
-23
model_selection/hyperparam_tuning.py
model_selection/hyperparam_tuning.py
+12
-23
No files found.
model_selection/hyperparam_tuning.py
View file @
c556b024
...
...
@@ -12,9 +12,7 @@
import
pandas
as
pd
import
numpy
as
np
from
xgboost
import
XGBClassifier
from
sklearn.metrics
import
confusion_matrix
from
sklearn.metrics
import
f1_score
,
make_scorer
,
precision_score
,
recall_score
from
sklearn.model_selection
import
StratifiedKFold
,
cross_validate
from
sklearn.model_selection
import
StratifiedKFold
from
sklearn.ensemble
import
RandomForestClassifier
,
BaggingClassifier
,
AdaBoostClassifier
from
sklearn.neural_network
import
MLPClassifier
from
sklearn.svm
import
SVC
...
...
@@ -24,17 +22,11 @@ from scipy.stats import randint, uniform
from
sklearn.model_selection
import
RandomizedSearchCV
# --------------------------------------------------------------------------------------------------------
# Function to read datasets
# Function to read
training
datasets
# --------------------------------------------------------------------------------------------------------
def
read_data
():
import
numpy
as
np
# Load test data
X_test_pre
=
np
.
load
(
'../gen_train_data/data/output/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'../gen_train_data/data/output/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
X_test_post
=
np
.
load
(
'../gen_train_data/data/output/post/X_test_post.npy'
,
allow_pickle
=
True
)
y_test_post
=
np
.
load
(
'../gen_train_data/data/output/post/y_test_post.npy'
,
allow_pickle
=
True
)
# Load ORIGINAL training data
X_train_pre
=
np
.
load
(
'../gen_train_data/data/output/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/data/output/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
...
...
@@ -54,10 +46,6 @@ def read_data():
y_train_under_post
=
np
.
load
(
'../gen_train_data/data/output/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
data_dic
=
{
"X_test_pre"
:
X_test_pre
,
"y_test_pre"
:
y_test_pre
,
"X_test_post"
:
X_test_post
,
"y_test_post"
:
y_test_post
,
"X_train_pre"
:
X_train_pre
,
"y_train_pre"
:
y_train_pre
,
"X_train_post"
:
X_train_post
,
...
...
@@ -83,28 +71,29 @@ if __name__ == "__main__":
# Defining the models to train
# --------------------------------------------------------------------------------------------------------
# 1. No class weight
models_
1
=
{
"DT"
:
DecisionTreeClassifier
(),
models_
simple
=
{
"DT"
:
DecisionTreeClassifier
(),
"RF"
:
RandomForestClassifier
(),
"Bagging"
:
BaggingClassifier
(),
"AB"
:
AdaBoostClassifier
(
algorithm
=
'SAMME'
),
"XGB"
:
XGBClassifier
(),
"LR"
:
LogisticRegression
(
max_iter
=
1000
),
# "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet'),
"SVM"
:
SVC
(
probability
=
True
),
"MLP"
:
MLPClassifier
(
max_iter
=
500
)
# "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet')
}
# 2. Class weight: cost-sensitive learning
models_
2
=
{
"DT"
:
DecisionTreeClassifier
(
class_weight
=
'balanced'
),
models_
CS
=
{
"DT"
:
DecisionTreeClassifier
(
class_weight
=
'balanced'
),
"RF"
:
RandomForestClassifier
(
class_weight
=
'balanced'
),
"Bagging"
:
BaggingClassifier
(
estimator
=
DecisionTreeClassifier
(
class_weight
=
'balanced'
)),
"AB"
:
AdaBoostClassifier
(
estimator
=
DecisionTreeClassifier
(
class_weight
=
'balanced'
),
algorithm
=
'SAMME'
),
# "XGB": XGBClassifier(), # <-
"LR"
:
LogisticRegression
(
max_iter
=
1000
,
class_weight
=
'balanced'
),
# "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'),
"SVM"
:
SVC
(
probability
=
True
,
class_weight
=
'balanced'
),
# "ElNet" : LogisticRegression(max_iter=1000, penalty='elasticnet', class_weight='balanced'),
# "XGB": XGBClassifier(), # <-
# "MLP" : MLPClassifier(max_iter=500) # <-
}
# --------------------------------------------------------------------------------------------------------
# Hyperparameter tuning setup
# --------------------------------------------------------------------------------------------------------
...
...
@@ -126,12 +115,12 @@ if __name__ == "__main__":
'learning_rate'
:
uniform
(
0.01
,
0.3
)},
"LR"
:
{
'penalty'
:
[
'l1'
,
'l2'
,
'elasticnet'
,
None
],
'solver'
:
[
'lbfgs'
,
'sag'
,
'saga'
]},
# "ElNet": {'solver': ['lbfgs', 'sag', 'saga']},
"SVM"
:
{
'C'
:
uniform
(
0.8
,
1.2
),
'kernel'
:
[
'linear'
,
'poly'
,
'rbf'
,
'sigmoid'
]},
"MLP"
:
{
'activation'
:
[
'identity'
,
'logistic'
,
'tanh'
,
'relu'
],
'hidden_layer_sizes'
:
randint
(
50
,
150
),
'learning_rate'
:
[
'constant'
,
'invscaling'
,
'adaptive'
]}
# "ElNet": {'solver': ['lbfgs', 'sag', 'saga']},
}
# --------------------------------------------------------------------------------------------------------
...
...
@@ -151,20 +140,20 @@ if __name__ == "__main__":
# --------------------------------------------------------------------------------------------------------
# Store each df as a sheet in an excel file
sheets_dict
=
{}
for
i
,
group
in
enumerate
([
'pre'
,
'post'
]):
for
i
,
group
in
enumerate
([
'pre'
]):
for
j
,
method
in
enumerate
([
''
,
''
,
'over_'
,
'under_'
]):
# Get dataset based on group and method
X
=
data_dic
[
'X_train_'
+
method
+
group
]
y
=
data_dic
[
'y_train_'
+
method
+
group
]
# Use group of models with class weight if needed
models
=
models_
2
if
j
==
2
else
models_1
models
=
models_
CS
if
j
==
2
else
models_simple
# Save results: params and best score for each of the models of this method and group
hyperparam_df
=
pd
.
DataFrame
(
index
=
list
(
models
.
keys
()),
columns
=
[
'Parameters'
,
'Score'
])
for
model_name
,
model
in
models
.
items
():
print
(
f
"{group}-{method}-{model_name}
\n\n
"
)
# Find optimal hyperparams for curr model
params
=
hyperparameters
[
model_name
]
search
=
RandomizedSearchCV
(
model
,
param_distributions
=
params
,
cv
=
cv
,
n_jobs
=
1
,
scoring
=
'precision'
)
search
=
RandomizedSearchCV
(
model
,
param_distributions
=
params
,
cv
=
cv
,
n_jobs
=
3
,
scoring
=
'precision'
)
search
.
fit
(
X
,
y
)
hyperparam_df
.
at
[
model_name
,
'Parameters'
]
=
search
.
best_params_
hyperparam_df
.
at
[
model_name
,
'Score'
]
=
round
(
search
.
best_score_
,
4
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment