COMPARA / covid_analysis · Commits

Commit 2d351899, authored May 14, 2024 by Joaquin Torres
Parent: 6674b724

Commit message: automated reading of the parameters for tuned models

Showing 1 changed file, model_selection/test_models.py, with 45 additions and 100 deletions (+45 / -100).
model_selection/test_models.py
@@ -18,6 +18,7 @@ from sklearn.metrics import RocCurveDisplay, roc_curve
 from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
 import matplotlib.pyplot as plt
 from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
+import ast # String to dictionary
 # --------------------------------------------------------------------------------------------------------
 # Reading test data
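The new ast import exists so that hyperparameter dictionaries stored as strings in a spreadsheet can be parsed back into Python dicts. A minimal sketch of that conversion (the example string is illustrative, not taken from the project's spreadsheet):

    import ast

    # literal_eval safely parses Python literals (dicts, lists, strings, numbers)
    # without executing arbitrary code, unlike eval()
    params = ast.literal_eval("{'splitter': 'best', 'max_features': 'sqrt'}")
    print(params['splitter'])  # -> best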
@@ -71,103 +72,47 @@ def read_test_data():
 # Returning tuned models for each situation
 # --------------------------------------------------------------------------------------------------------
-def get_tuned_models(group_id, method_id):
-    # 1. PRE
-    if group_id == 0:
-        # 1.1) Trained with original dataset
-        if method_id == 0:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'entropy'}),
-                "RF": RandomForestClassifier(**{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 123}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 13, 'warm_start': False}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.8473150336970519, 'n_estimators': 96, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.21528982071549305, 'max_depth': 6, 'n_estimators': 804}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.051871311397777, 'kernel': 'linear', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'identity', 'hidden_layer_sizes': 78, 'learning_rate': 'constant', 'max_iter': 500})
-            }
-        # 1.2) Trained with original dataset and cost-sensitive learning
-        elif method_id == 1:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'log2', 'criterion': 'entropy', 'class_weight': 'balanced'}),
-                "RF": RandomForestClassifier(**{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 238, 'class_weight': 'balanced'}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 22, 'warm_start': False, 'estimator': DecisionTreeClassifier(class_weight='balanced')}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.7136783954287846, 'n_estimators': 99, 'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(class_weight='balanced')}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000, 'class_weight': 'balanced'}),
-                "SVM": SVC(**{'C': 1.480857958217729, 'kernel': 'linear', 'max_iter': 1000, 'class_weight': 'balanced', 'probability': True}),
-            }
-        # 1.3) Trained with oversampled training dataset
-        elif method_id == 2:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'log_loss'}),
-                "RF": RandomForestClassifier(**{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 121}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 22, 'warm_start': True}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.4640913091426446, 'n_estimators': 145, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.19621698151985992, 'max_depth': 7, 'n_estimators': 840}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.590799972846728, 'kernel': 'poly', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'relu', 'hidden_layer_sizes': 112, 'learning_rate': 'constant', 'max_iter': 500})
-            }
-        # 1.4) Trained with undersampled training dataset
-        elif method_id == 3:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'log_loss'}),
-                "RF": RandomForestClassifier(**{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 148}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 24, 'warm_start': True}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.7970533619575801, 'n_estimators': 122, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.13148624656904934, 'max_depth': 9, 'n_estimators': 723}),
-                "LR": LogisticRegression(**{'solver': 'sag', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.383651513577477, 'kernel': 'poly', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'relu', 'hidden_layer_sizes': 89, 'learning_rate': 'invscaling', 'max_iter': 500})
-            }
-    # 2. POST
-    else:
-        # 2.1) Trained with original dataset
-        if method_id == 0:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'log_loss'}),
-                "RF": RandomForestClassifier(**{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 120}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 38, 'warm_start': True}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.9069394544838472, 'n_estimators': 121, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.24787889985627387, 'max_depth': 4, 'n_estimators': 956}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2'}),
-                "SVM": SVC(**{'C': 1.7965537393241109, 'kernel': 'linear', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'relu', 'hidden_layer_sizes': 147, 'learning_rate': 'invscaling', 'max_iter': 500})
-            }
-        # 2.2) Trained with original dataset and cost-sensitive learning
-        elif method_id == 1:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'gini', 'class_weight': 'balanced'}),
-                "RF": RandomForestClassifier(**{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 138, 'class_weight': 'balanced'}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 66, 'warm_start': True, 'estimator': DecisionTreeClassifier(class_weight='balanced')}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.92541653518023, 'n_estimators': 114, 'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(class_weight='balanced')}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000, 'class_weight': 'balanced'}),
-                "SVM": SVC(**{'C': 0.8395104850983046, 'kernel': 'linear', 'max_iter': 1000, 'class_weight': 'balanced', 'probability': True})
-            }
-        # 2.3) Trained with oversampled training dataset
-        elif method_id == 2:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'log2', 'criterion': 'entropy'}),
-                "RF": RandomForestClassifier(**{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 118}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 56, 'warm_start': False}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.5933610622176648, 'n_estimators': 114, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.059934879882855396, 'max_depth': 9, 'n_estimators': 660}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.2237930722499044, 'kernel': 'poly', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'identity', 'hidden_layer_sizes': 134, 'learning_rate': 'invscaling', 'max_iter': 500})
-            }
-        # 2.4) Trained with undersampled training dataset
-        elif method_id == 3:
-            tuned_models = {
-                "DT": DecisionTreeClassifier(**{'splitter': 'best', 'max_features': 'log2', 'criterion': 'log_loss'}),
-                "RF": RandomForestClassifier(**{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 151}),
-                "Bagging": BaggingClassifier(**{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 20, 'warm_start': False}),
-                "AB": AdaBoostClassifier(**{'learning_rate': 1.6523810056317618, 'n_estimators': 89, 'algorithm': 'SAMME'}),
-                "XGB": XGBClassifier(**{'learning_rate': 0.18430397856234193, 'max_depth': 4, 'n_estimators': 956}),
-                "LR": LogisticRegression(**{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000}),
-                "SVM": SVC(**{'C': 1.1807459108651588, 'kernel': 'linear', 'max_iter': 1000, 'probability': True}),
-                "MLP": MLPClassifier(**{'activation': 'identity', 'hidden_layer_sizes': 55, 'learning_rate': 'constant', 'max_iter': 500})
-            }
-    return tuned_models
+def get_tuned_models(group_str, method_str):
+    # Read sheet corresponding to group and method with tuned models and their hyperparam
+    tuned_models_df = pd.read_excel("./output_hyperparam/hyperparamers.xlsx", sheet_name=f"{group_str}_{method_str}")
+    # Mapping from model abbreviations to sklearn model classes
+    model_mapping = {
+        'DT': DecisionTreeClassifier,
+        'RF': RandomForestClassifier,
+        'Bagging': BaggingClassifier,
+        'AB': AdaBoostClassifier,
+        'XGB': XGBClassifier,
+        'LR': LogisticRegression,
+        'SVM': SVC,
+        'MLP': MLPClassifier
+    }
+    tuned_models = {}
+    # Iterate through each row of the DataFrame
+    for index, row in tuned_models_df.iterrows():
+        model_name = row[0]
+        # Read dictionary
+        parameters = ast.literal_eval(row['Parameters'])
+        # Add extra parameters
+        if model_name == 'AB':
+            parameters['algorithm'] = 'SAMME'
+        elif model_name == 'LR':
+            parameters['max_iter'] = 1000
+        elif model_name == 'SVM':
+            parameters['max_iter'] = 1000
+            parameters['probability'] = True
+        elif model_name == "MLP":
+            parameters['max_iter'] = 500
+        # Add class_weight argument for cost-sensitive learning method
+        if 'CW' in method_str:
+            if model_name == 'Bagging' or model_name == 'AB':
+                parameters['estimator'] = DecisionTreeClassifier(class_weight='balanced')
+            else:
+                parameters['class_weight'] = 'balanced'
+        # Fetch class
+        model_class = model_mapping[model_name]
+        # Initialize model
+        tuned_models[model_name] = model_class(**parameters)
+    return tuned_models
 # --------------------------------------------------------------------------------------------------------
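For reference, a minimal sketch of the spreadsheet layout the rewritten function appears to expect, and of a call site. Only the column name 'Parameters', the workbook path, and the sheet-name pattern f"{group_str}_{method_str}" come from the code above; the sheet name 'pre_ORIG', the first-column header, and the rows shown are assumptions for illustration:

    # Hypothetical sheet "pre_ORIG" in ./output_hyperparam/hyperparamers.xlsx:
    #
    #   Model   Parameters
    #   DT      {'splitter': 'best', 'max_features': 'sqrt', 'criterion': 'entropy'}
    #   RF      {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 123}
    #
    # Each row becomes one ready-to-fit estimator:
    models = get_tuned_models('pre', 'ORIG')  # sheet name is an assumption
    models['DT'].fit(X_train, y_train)        # standard sklearn usage

This replaces the eight hardcoded hyperparameter dictionaries with a single data-driven loop, keeping the testing script in sync with whatever the tuning stage writes to the workbook.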
@@ -242,12 +187,12 @@ if __name__ == "__main__":
         X_test = data_dic['X_test_' + group]
         y_test = data_dic['y_test_' + group]
         for j, method in enumerate(['', '', 'over_', 'under_']):
-            print(f"{group}-{method}")
+            print(f"{group}-{method_names[j]}")
             # Get train dataset based on group and method
             X_train = data_dic['X_train_' + method + group]
             y_train = data_dic['y_train_' + method + group]
             # Get tuned models for this group and method
-            models = get_tuned_models(group_id=i, method_id=j)
+            models = get_tuned_models(group, method_names[j])
             # Scores df
             scores_df = pd.DataFrame(index=models.keys(), columns=scorings.keys())
             # Create a figure for all models in this group-method
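scores_df is indexed by model abbreviation with one column per metric. A hedged sketch of how such a table could be filled in the evaluation loop that follows, assuming scorings maps metric names to sklearn scorer callables (inferred from scorings.keys() above; the actual loop body is not shown in this diff):

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        for score_name, scorer in scorings.items():
            # sklearn scorers follow the signature scorer(estimator, X, y)
            scores_df.at[model_name, score_name] = scorer(model, X_test, y_test)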
@@ -292,6 +237,6 @@ if __name__ == "__main__":
     with pd.ExcelWriter('./test_results/testing_tuned_models.xlsx') as writer:
         for sheet_name, data in scores_sheets.items():
             data.to_excel(writer, sheet_name=sheet_name)
 # --------------------------------------------------------------------------------------------------------
 # --------------------------------------------------------------------------------------------------------