Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
C
covid_analysis
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
COMPARA
covid_analysis
Commits
070552ff
Commit
070552ff
authored
Jul 10, 2024
by
Joaquin Torres
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Paths updated
parent
39e40062
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
114 additions
and
114 deletions
+114
-114
explainability/compute_shap_inter_vals.py
explainability/compute_shap_inter_vals.py
+7
-7
explainability/compute_shap_vals.py
explainability/compute_shap_vals.py
+7
-7
explainability/shap_plots.ipynb
explainability/shap_plots.ipynb
+18
-18
gen_train_data/gen_train_data.ipynb
gen_train_data/gen_train_data.ipynb
+18
-18
model_selection/cv_metric_distr.py
model_selection/cv_metric_distr.py
+2
-2
model_selection/cv_metric_gen.py
model_selection/cv_metric_gen.py
+15
-15
model_selection/fit_final_models.py
model_selection/fit_final_models.py
+15
-15
model_selection/hyperparam_tuning.py
model_selection/hyperparam_tuning.py
+13
-13
model_selection/test_models.py
model_selection/test_models.py
+19
-19
No files found.
explainability/compute_shap_inter_vals.py
View file @
070552ff
...
...
@@ -23,10 +23,10 @@ from sklearn.tree import DecisionTreeClassifier
# --------------------------------------------------------------------------------------------------------
def
read_test_data
(
attribute_names
):
# Load test data
X_test_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
X_test_post
=
np
.
load
(
'../gen_train_data/
output
/post/X_test_post.npy'
,
allow_pickle
=
True
)
y_test_post
=
np
.
load
(
'../gen_train_data/
output
/post/y_test_post.npy'
,
allow_pickle
=
True
)
X_test_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
X_test_post
=
np
.
load
(
'../gen_train_data/
results
/post/X_test_post.npy'
,
allow_pickle
=
True
)
y_test_post
=
np
.
load
(
'../gen_train_data/
results
/post/y_test_post.npy'
,
allow_pickle
=
True
)
# Type conversion needed
data_dic
=
{
...
...
@@ -43,7 +43,7 @@ if __name__ == "__main__":
# Setup
# --------------------------------------------------------------------------------------------------------
# Retrieve attribute names in order
attribute_names
=
list
(
np
.
load
(
'../EDA/
output
/feature_names/all_features.npy'
,
allow_pickle
=
True
))
attribute_names
=
list
(
np
.
load
(
'../EDA/
results
/feature_names/all_features.npy'
,
allow_pickle
=
True
))
# Reading data
data_dic
=
read_test_data
(
attribute_names
)
method_names
=
{
...
...
@@ -70,7 +70,7 @@ if __name__ == "__main__":
print
(
f
"{group}-{method_names[j]}"
)
method_name
=
method_names
[
j
]
model_name
=
model_choices
[
method_name
]
model_path
=
f
"../model_selection/
output
/fitted_models/{group}_{method_names[j]}_{model_name}.pkl"
model_path
=
f
"../model_selection/
results
/fitted_models/{group}_{method_names[j]}_{model_name}.pkl"
# Load the fitted model from disk
with
open
(
model_path
,
'rb'
)
as
file
:
fitted_model
=
pickle
.
load
(
file
)
...
...
@@ -84,5 +84,5 @@ if __name__ == "__main__":
shap_interaction_values
=
explainer
.
shap_interaction_values
(
X_test
)
# ---------------------------------------------------------------------------------------------------------
# Save results
np
.
save
(
f
"./
output
/shap_inter_values/{group}_{method_names[j]}"
,
shap_interaction_values
)
np
.
save
(
f
"./
results
/shap_inter_values/{group}_{method_names[j]}"
,
shap_interaction_values
)
# --------------------------------------------------------------------------------------------------------
\ No newline at end of file
explainability/compute_shap_vals.py
View file @
070552ff
...
...
@@ -23,10 +23,10 @@ from sklearn.tree import DecisionTreeClassifier
# --------------------------------------------------------------------------------------------------------
def
read_test_data
(
attribute_names
):
# Load test data
X_test_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
X_test_post
=
np
.
load
(
'../gen_train_data/
output
/post/X_test_post.npy'
,
allow_pickle
=
True
)
y_test_post
=
np
.
load
(
'../gen_train_data/
output
/post/y_test_post.npy'
,
allow_pickle
=
True
)
X_test_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
X_test_post
=
np
.
load
(
'../gen_train_data/
results
/post/X_test_post.npy'
,
allow_pickle
=
True
)
y_test_post
=
np
.
load
(
'../gen_train_data/
results
/post/y_test_post.npy'
,
allow_pickle
=
True
)
# Type conversion needed
data_dic
=
{
...
...
@@ -43,7 +43,7 @@ if __name__ == "__main__":
# Setup
# --------------------------------------------------------------------------------------------------------
# Retrieve attribute names in order
attribute_names
=
list
(
np
.
load
(
'../EDA/
output
/feature_names/all_features.npy'
,
allow_pickle
=
True
))
attribute_names
=
list
(
np
.
load
(
'../EDA/
results
/feature_names/all_features.npy'
,
allow_pickle
=
True
))
# Reading data
data_dic
=
read_test_data
(
attribute_names
)
method_names
=
{
...
...
@@ -70,7 +70,7 @@ if __name__ == "__main__":
print
(
f
"{group}-{method_names[j]}"
)
method_name
=
method_names
[
j
]
model_name
=
model_choices
[
method_name
]
model_path
=
f
"../model_selection/
output
/fitted_models/{group}_{method_names[j]}_{model_name}.pkl"
model_path
=
f
"../model_selection/
results
/fitted_models/{group}_{method_names[j]}_{model_name}.pkl"
# Load the fitted model from disk
with
open
(
model_path
,
'rb'
)
as
file
:
fitted_model
=
pickle
.
load
(
file
)
...
...
@@ -84,5 +84,5 @@ if __name__ == "__main__":
shap_vals
=
explainer
.
shap_values
(
X_test
,
check_additivity
=
True
)
# Change to true for final results
# ---------------------------------------------------------------------------------------------------------
# Save results
np
.
save
(
f
"./
output
/shap_values/{group}_{method_names[j]}"
,
shap_vals
)
np
.
save
(
f
"./
results
/shap_values/{group}_{method_names[j]}"
,
shap_vals
)
# --------------------------------------------------------------------------------------------------------
\ No newline at end of file
explainability/shap_plots.ipynb
View file @
070552ff
...
...
@@ -45,13 +45,13 @@
"outputs": [],
"source": [
"# Retrieve attribute names in order\n",
"attribute_names = attribute_names = list(np.load('../EDA/
output
/feature_names/all_features.npy', allow_pickle=True))\n",
"attribute_names = attribute_names = list(np.load('../EDA/
results
/feature_names/all_features.npy', allow_pickle=True))\n",
"\n",
"# Load test data\n",
"X_test_pre = np.load('../gen_train_data/
output
/pre/X_test_pre.npy', allow_pickle=True)\n",
"y_test_pre = np.load('../gen_train_data/
output
/pre/y_test_pre.npy', allow_pickle=True)\n",
"X_test_post = np.load('../gen_train_data/
output
/post/X_test_post.npy', allow_pickle=True)\n",
"y_test_post = np.load('../gen_train_data/
output
/post/y_test_post.npy', allow_pickle=True)\n",
"X_test_pre = np.load('../gen_train_data/
results
/pre/X_test_pre.npy', allow_pickle=True)\n",
"y_test_pre = np.load('../gen_train_data/
results
/pre/y_test_pre.npy', allow_pickle=True)\n",
"X_test_post = np.load('../gen_train_data/
results
/post/X_test_post.npy', allow_pickle=True)\n",
"y_test_post = np.load('../gen_train_data/
results
/post/y_test_post.npy', allow_pickle=True)\n",
"\n",
"# Type conversion needed \n",
"data_dic = {\n",
...
...
@@ -82,8 +82,8 @@
"}\n",
"\n",
"# Load names of social and individual attributes\n",
"soc_var_names = np.load('../EDA/
output
/feature_names/social_factors.npy', allow_pickle=True)\n",
"ind_var_names = np.load('../EDA/
output
/feature_names/individual_factors.npy', allow_pickle=True)"
"soc_var_names = np.load('../EDA/
results
/feature_names/social_factors.npy', allow_pickle=True)\n",
"ind_var_names = np.load('../EDA/
results
/feature_names/individual_factors.npy', allow_pickle=True)"
]
},
{
...
...
@@ -106,7 +106,7 @@
" X_test = data_dic['X_test_' + group]\n",
" y_test = data_dic['y_test_' + group]\n",
" model_name = model_choices[method_name]\n",
" shap_vals = np.load(f'./
output
/shap_values/{group}_{method_name}.npy')\n",
" shap_vals = np.load(f'./
results
/shap_values/{group}_{method_name}.npy')\n",
" ax = plt.subplot(2,1,i+1) # 2 rows (pre - post) 1 column\n",
" # show = False to modify plot before showing\n",
" shap.summary_plot(shap_vals, X_test, max_display=len(attribute_names), show=False)\n",
...
...
@@ -129,7 +129,7 @@
"plt.suptitle(f'SHAP Summary Plots PRE vs POST - Pipeline: Oversampling - Model: {model_name}\\n\\n')\n",
"plt.subplots_adjust(wspace=1)\n",
"plt.tight_layout()\n",
"plt.savefig(f'./
output
/plots/shap_summary/{method_name}_{model_name}.svg', format='svg', dpi=1250)\n",
"plt.savefig(f'./
results
/plots/shap_summary/{method_name}_{model_name}.svg', format='svg', dpi=1250)\n",
"plt.show()"
]
},
...
...
@@ -145,7 +145,7 @@
" X_test = data_dic['X_test_' + group]\n",
" y_test = data_dic['y_test_' + group]\n",
" model_name = model_choices[method_name]\n",
" shap_vals = np.load(f'./
output
/shap_values/{group}_{method_name}.npy')\n",
" shap_vals = np.load(f'./
results
/shap_values/{group}_{method_name}.npy')\n",
" shap_vals = shap_vals[:,:,1] # Select shap values for positive class\n",
" ax = plt.subplot(2,1,i+1)\n",
" shap.summary_plot(shap_vals, X_test, max_display=len(attribute_names), show=False)\n",
...
...
@@ -166,7 +166,7 @@
"plt.suptitle(f'SHAP Summary Plots PRE vs POST - Pipeline: Original with Class Weight - Model: {model_name}\\n\\n')\n",
"plt.subplots_adjust(wspace=1)\n",
"plt.tight_layout()\n",
"plt.savefig(f'./
output
/plots/shap_summary/{method_name}_{model_name}.svg', format='svg', dpi=1250)\n",
"plt.savefig(f'./
results
/plots/shap_summary/{method_name}_{model_name}.svg', format='svg', dpi=1250)\n",
"plt.show()"
]
},
...
...
@@ -203,7 +203,7 @@
" y_test = data_dic['y_test_' + group]\n",
" model_name = model_choices[method_name]\n",
"\n",
" shap_inter_vals = np.load(f'./
output
/shap_inter_values/{group}_{method_name}.npy')\n",
" shap_inter_vals = np.load(f'./
results
/shap_inter_values/{group}_{method_name}.npy')\n",
" if method_name == 'ORIG_CW':\n",
" shap_inter_vals = shap_inter_vals[:,:,:,1] # Take info about positive class\n",
"\n",
...
...
@@ -255,7 +255,7 @@
" # plt.suptitle(f'Simplified Example SHAP Summary Interaction Plot\\n', fontsize=15, fontweight='bold', x=0.5, y=0.95, ha='center')\n",
" plt.suptitle(f'SHAP Summary Interaction Plot - {method_name} - {str.upper(group)}\\n', fontsize=20, fontweight='bold') #, x=0.5, y=0.95, ha='center'\n",
" plt.tight_layout()\n",
" plt.savefig(f'./
output
/plots/shap_inter_summary/{str.upper(group)}_{method_name}_{model_name}.svg', format='svg', dpi=700)\n",
" plt.savefig(f'./
results
/plots/shap_inter_summary/{str.upper(group)}_{method_name}_{model_name}.svg', format='svg', dpi=700)\n",
" # plt.show()"
]
},
...
...
@@ -300,8 +300,8 @@
"outputs": [],
"source": [
"# Load array of shap inter matrices for pre and post for the chosen method\n",
"shap_inter_vals_pre = np.load(f'./
output
/shap_inter_values/pre_{method_name}.npy')\n",
"shap_inter_vals_post = np.load(f'./
output
/shap_inter_values/post_{method_name}.npy')\n",
"shap_inter_vals_pre = np.load(f'./
results
/shap_inter_values/pre_{method_name}.npy')\n",
"shap_inter_vals_post = np.load(f'./
results
/shap_inter_values/post_{method_name}.npy')\n",
"if method_name == 'ORIG_CW':\n",
" shap_inter_vals_pre = shap_inter_vals_pre[:,:,:,1] # Take info about positive class\n",
" shap_inter_vals_post = shap_inter_vals_post[:,:,:,1]\n",
...
...
@@ -367,7 +367,7 @@
" cbar = ax.collections[0].colorbar\n",
" cbar.set_label('Interaction POST - Interaction PRE', labelpad=15, rotation=270, verticalalignment='bottom')\n",
"\n",
" plt.savefig(f'./
output
/plots/heatmaps_interactions/DIST_{method_name}.svg', format='svg', dpi=600)\n",
" plt.savefig(f'./
results
/plots/heatmaps_interactions/DIST_{method_name}.svg', format='svg', dpi=600)\n",
" \n",
" plt.show()"
]
...
...
@@ -426,7 +426,7 @@
" interactions_df['SHAP Inter Variation PRE-POST'].abs().sort_values(ascending=False).index)\n",
"\n",
"# Export to Excel\n",
"sorted_interactions_df.to_excel(f'./
output
/inter_variation_{method_name}.xlsx', index=False)\n",
"sorted_interactions_df.to_excel(f'./
results
/inter_variation_{method_name}.xlsx', index=False)\n",
"\n",
"print(\"Excel file has been created.\")"
]
...
...
@@ -448,7 +448,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.
9.5
"
"version": "3.
12.2
"
}
},
"nbformat": 4,
...
...
gen_train_data/gen_train_data.ipynb
View file @
070552ff
...
...
@@ -36,8 +36,8 @@
"outputs": [],
"source": [
"# Load clean datasets\n",
"df_pre = pd.read_csv('../EDA/
output
/datasets/pre_dataset.csv')\n",
"df_post = pd.read_csv('../EDA/
output
/datasets/post_dataset.csv')"
"df_pre = pd.read_csv('../EDA/
results
/datasets/pre_dataset.csv')\n",
"df_post = pd.read_csv('../EDA/
results
/datasets/post_dataset.csv')"
]
},
{
...
...
@@ -90,10 +90,10 @@
"outputs": [],
"source": [
"# Save test data\n",
"np.save('./
output
/pre/X_test_pre.npy', X_test_pre)\n",
"np.save('./
output
/pre/y_test_pre.npy', y_test_pre)\n",
"np.save('./
output
/post/X_test_post.npy', X_test_post)\n",
"np.save('./
output
/post/y_test_post.npy', y_test_post)"
"np.save('./
results
/pre/X_test_pre.npy', X_test_pre)\n",
"np.save('./
results
/pre/y_test_pre.npy', y_test_pre)\n",
"np.save('./
results
/post/X_test_post.npy', X_test_post)\n",
"np.save('./
results
/post/y_test_post.npy', y_test_post)"
]
},
{
...
...
@@ -103,10 +103,10 @@
"outputs": [],
"source": [
"# Save ORIGINAL training data\n",
"np.save('./
output
/pre/X_train_pre.npy', X_train_pre)\n",
"np.save('./
output
/pre/y_train_pre.npy', y_train_pre)\n",
"np.save('./
output
/post/X_train_post.npy', X_train_post)\n",
"np.save('./
output
/post/y_train_post.npy', y_train_post)"
"np.save('./
results
/pre/X_train_pre.npy', X_train_pre)\n",
"np.save('./
results
/pre/y_train_pre.npy', y_train_pre)\n",
"np.save('./
results
/post/X_train_post.npy', X_train_post)\n",
"np.save('./
results
/post/y_train_post.npy', y_train_post)"
]
},
{
...
...
@@ -135,10 +135,10 @@
"outputs": [],
"source": [
"# Save oversampled training data\n",
"np.save('./
output
/pre/X_train_over_pre.npy', X_train_over_pre)\n",
"np.save('./
output
/pre/y_train_over_pre.npy', y_train_over_pre)\n",
"np.save('./
output
/post/X_train_over_post.npy', X_train_over_post)\n",
"np.save('./
output
/post/y_train_over_post.npy', y_train_over_post)"
"np.save('./
results
/pre/X_train_over_pre.npy', X_train_over_pre)\n",
"np.save('./
results
/pre/y_train_over_pre.npy', y_train_over_pre)\n",
"np.save('./
results
/post/X_train_over_post.npy', X_train_over_post)\n",
"np.save('./
results
/post/y_train_over_post.npy', y_train_over_post)"
]
},
{
...
...
@@ -167,10 +167,10 @@
"outputs": [],
"source": [
"# Save undersampled training data\n",
"np.save('./
output
/pre/X_train_under_pre.npy', X_train_under_pre)\n",
"np.save('./
output
/pre/y_train_under_pre.npy', y_train_under_pre)\n",
"np.save('./
output
/post/X_train_under_post.npy', X_train_under_post)\n",
"np.save('./
output
/post/y_train_under_post.npy', y_train_under_post)"
"np.save('./
results
/pre/X_train_under_pre.npy', X_train_under_pre)\n",
"np.save('./
results
/pre/y_train_under_pre.npy', y_train_under_pre)\n",
"np.save('./
results
/post/X_train_under_post.npy', X_train_under_post)\n",
"np.save('./
results
/post/y_train_under_post.npy', y_train_under_post)"
]
}
],
...
...
model_selection/cv_metric_distr.py
View file @
070552ff
...
...
@@ -21,7 +21,7 @@ if __name__ == "__main__":
for
group
in
[
'pre'
,
'post'
]:
for
method
in
[
'_ORIG'
,
'_ORIG_CW'
,
'_OVER'
,
'_UNDER'
]:
# Read CV metrics sheet for current group and method
df
=
pd
.
read_excel
(
'./
output_
cv_metrics/metrics.xlsx'
,
sheet_name
=
group
+
method
)
df
=
pd
.
read_excel
(
'./
results/
cv_metrics/metrics.xlsx'
,
sheet_name
=
group
+
method
)
# Model names based on cost-sensitive training or not
if
method
==
'_ORIG_CW'
:
model_names
=
model_names_cs
...
...
@@ -47,7 +47,7 @@ if __name__ == "__main__":
if
metric_name
in
[
'F1'
,
'PREC'
,
'REC'
,
'ACC'
,
'AUROC'
,
'AUPRC'
]:
ax
.
set_ylim
(
0
,
1
)
plt
.
tight_layout
()
fig
.
savefig
(
f
'./
output
/cv_metrics/distributions/{group}{method}.svg'
,
format
=
'svg'
,
dpi
=
600
)
fig
.
savefig
(
f
'./
results
/cv_metrics/distributions/{group}{method}.svg'
,
format
=
'svg'
,
dpi
=
600
)
plt
.
close
(
fig
)
print
(
"Succesful distribution plots generation"
)
...
...
model_selection/cv_metric_gen.py
View file @
070552ff
...
...
@@ -34,22 +34,22 @@ import ast # String to dictionary
def
read_data
():
# Load ORIGINAL training data
X_train_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
X_train_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_post.npy'
,
allow_pickle
=
True
)
y_train_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_post.npy'
,
allow_pickle
=
True
)
X_train_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
X_train_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_post.npy'
,
allow_pickle
=
True
)
y_train_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_post.npy'
,
allow_pickle
=
True
)
# Load oversampled training data
X_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_over_pre.npy'
,
allow_pickle
=
True
)
y_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_over_pre.npy'
,
allow_pickle
=
True
)
X_train_over_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_over_post.npy'
,
allow_pickle
=
True
)
y_train_over_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_over_post.npy'
,
allow_pickle
=
True
)
X_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_over_pre.npy'
,
allow_pickle
=
True
)
y_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_over_pre.npy'
,
allow_pickle
=
True
)
X_train_over_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_over_post.npy'
,
allow_pickle
=
True
)
y_train_over_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_over_post.npy'
,
allow_pickle
=
True
)
# Load undersampled training data
X_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
X_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
data_dic
=
{
"X_train_pre"
:
X_train_pre
,
...
...
@@ -73,7 +73,7 @@ def read_data():
# --------------------------------------------------------------------------------------------------------
def
get_tuned_models
(
group_str
,
method_str
):
# Read sheet corresponding to group and method with tuned models and their hyperparam
tuned_models_df
=
pd
.
read_excel
(
"./
output
_hyperparam/hyperparamers.xlsx"
,
sheet_name
=
f
"{group_str}_{method_str}"
)
tuned_models_df
=
pd
.
read_excel
(
"./
results
_hyperparam/hyperparamers.xlsx"
,
sheet_name
=
f
"{group_str}_{method_str}"
)
# Mapping from model abbreviations to sklearn model classes
model_mapping
=
{
'DT'
:
DecisionTreeClassifier
,
...
...
@@ -290,10 +290,10 @@ if __name__ == "__main__":
scores_sheets
[
sheet_name
]
=
scores_df
# Adjust layout and save figure
plt
.
tight_layout
()
plt
.
savefig
(
f
'./
output
/cv_metrics/curves/{group}_{method_names[j]}.svg'
,
format
=
'svg'
,
dpi
=
500
)
plt
.
savefig
(
f
'./
results
/cv_metrics/curves/{group}_{method_names[j]}.svg'
,
format
=
'svg'
,
dpi
=
500
)
plt
.
close
(
fig
)
# Write results to Excel file
with
pd
.
ExcelWriter
(
'./
output
./cv_metrics/metrics.xlsx'
)
as
writer
:
with
pd
.
ExcelWriter
(
'./
results
./cv_metrics/metrics.xlsx'
)
as
writer
:
for
sheet_name
,
data
in
scores_sheets
.
items
():
data
.
to_excel
(
writer
,
sheet_name
=
sheet_name
)
print
(
"Successful cv metric generation for tuned models"
)
...
...
model_selection/fit_final_models.py
View file @
070552ff
...
...
@@ -24,22 +24,22 @@ import ast # String to dictionary
# --------------------------------------------------------------------------------------------------------
def
read_training_data
(
attribute_names
):
# Load ORIGINAL training data
X_train_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
X_train_post
=
np
.
load
(
'../gen_train_data/
output
/post/X_train_post.npy'
,
allow_pickle
=
True
)
y_train_post
=
np
.
load
(
'../gen_train_data/
output
/post/y_train_post.npy'
,
allow_pickle
=
True
)
X_train_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
X_train_post
=
np
.
load
(
'../gen_train_data/
results
/post/X_train_post.npy'
,
allow_pickle
=
True
)
y_train_post
=
np
.
load
(
'../gen_train_data/
results
/post/y_train_post.npy'
,
allow_pickle
=
True
)
# Load oversampled training data
X_train_over_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/X_train_over_pre.npy'
,
allow_pickle
=
True
)
y_train_over_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/y_train_over_pre.npy'
,
allow_pickle
=
True
)
X_train_over_post
=
np
.
load
(
'../gen_train_data/
output
/post/X_train_over_post.npy'
,
allow_pickle
=
True
)
y_train_over_post
=
np
.
load
(
'../gen_train_data/
output
/post/y_train_over_post.npy'
,
allow_pickle
=
True
)
X_train_over_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/X_train_over_pre.npy'
,
allow_pickle
=
True
)
y_train_over_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/y_train_over_pre.npy'
,
allow_pickle
=
True
)
X_train_over_post
=
np
.
load
(
'../gen_train_data/
results
/post/X_train_over_post.npy'
,
allow_pickle
=
True
)
y_train_over_post
=
np
.
load
(
'../gen_train_data/
results
/post/y_train_over_post.npy'
,
allow_pickle
=
True
)
# Load undersampled training data
X_train_under_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/X_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'../gen_train_data/
output
/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'../gen_train_data/
output
/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'../gen_train_data/
output
/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
X_train_under_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/X_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'../gen_train_data/
results
/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'../gen_train_data/
results
/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'../gen_train_data/
results
/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
# Type conversion needed
data_dic
=
{
...
...
@@ -63,7 +63,7 @@ def read_training_data(attribute_names):
# --------------------------------------------------------------------------------------------------------
def
get_chosen_model
(
group_str
,
method_str
,
model_name
):
# Read sheet corresponding to group and method with tuned models and their hyperparameters
tuned_models_df
=
pd
.
read_excel
(
"../model_selection/
output
/hyperparam/hyperparamers.xlsx"
,
sheet_name
=
f
"{group_str}_{method_str}"
)
tuned_models_df
=
pd
.
read_excel
(
"../model_selection/
results
/hyperparam/hyperparamers.xlsx"
,
sheet_name
=
f
"{group_str}_{method_str}"
)
tuned_models_df
.
columns
=
[
'Model'
,
'Best Parameters'
]
# Define the mapping from model abbreviations to sklearn model classes
...
...
@@ -117,7 +117,7 @@ if __name__ == "__main__":
# Setup
# --------------------------------------------------------------------------------------------------------
# Retrieve attribute names in order
attribute_names
=
list
(
np
.
load
(
'../EDA/
output
/feature_names/all_features.npy'
,
allow_pickle
=
True
))
attribute_names
=
list
(
np
.
load
(
'../EDA/
results
/feature_names/all_features.npy'
,
allow_pickle
=
True
))
# Reading data
data_dic
=
read_training_data
(
attribute_names
)
method_names
=
{
...
...
@@ -147,7 +147,7 @@ if __name__ == "__main__":
model
,
is_tree
=
get_chosen_model
(
group_str
=
group
,
method_str
=
method_name
,
model_name
=
model_choices
[
method_name
])
fitted_model
=
model
.
fit
(
X_train
,
y_train
)
# Define the file path where you want to save the model
model_save_path
=
f
"./
output
/fitted_models/{group}_{method_names[j]}_{model_choices[method_name]}.pkl"
model_save_path
=
f
"./
results
/fitted_models/{group}_{method_names[j]}_{model_choices[method_name]}.pkl"
# Save the model to disk
with
open
(
model_save_path
,
'wb'
)
as
f
:
pickle
.
dump
(
fitted_model
,
f
)
...
...
model_selection/hyperparam_tuning.py
View file @
070552ff
...
...
@@ -32,22 +32,22 @@ from scipy.stats import randint, uniform
def
read_data
():
# Load ORIGINAL training data
X_train_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
X_train_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_post.npy'
,
allow_pickle
=
True
)
y_train_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_post.npy'
,
allow_pickle
=
True
)
X_train_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
X_train_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_post.npy'
,
allow_pickle
=
True
)
y_train_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_post.npy'
,
allow_pickle
=
True
)
# Load oversampled training data
X_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_over_pre.npy'
,
allow_pickle
=
True
)
y_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_over_pre.npy'
,
allow_pickle
=
True
)
X_train_over_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_over_post.npy'
,
allow_pickle
=
True
)
y_train_over_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_over_post.npy'
,
allow_pickle
=
True
)
X_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_over_pre.npy'
,
allow_pickle
=
True
)
y_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_over_pre.npy'
,
allow_pickle
=
True
)
X_train_over_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_over_post.npy'
,
allow_pickle
=
True
)
y_train_over_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_over_post.npy'
,
allow_pickle
=
True
)
# Load undersampled training data
X_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
X_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
data_dic
=
{
"X_train_pre"
:
X_train_pre
,
...
...
@@ -163,7 +163,7 @@ if __name__ == "__main__":
sheets_dict
[
sheet_name
]
=
hyperparam_df
# Write results to Excel file
with
pd
.
ExcelWriter
(
'./
output
/hyperparam/hyperparamers.xlsx'
)
as
writer
:
with
pd
.
ExcelWriter
(
'./
results
/hyperparam/hyperparamers.xlsx'
)
as
writer
:
for
sheet_name
,
data
in
sheets_dict
.
items
():
data
.
to_excel
(
writer
,
sheet_name
=
sheet_name
)
...
...
model_selection/test_models.py
View file @
070552ff
...
...
@@ -33,28 +33,28 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable # Custom color bar for c
# --------------------------------------------------------------------------------------------------------
def
read_data
():
# Load test data
X_test_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
X_test_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_test_post.npy'
,
allow_pickle
=
True
)
y_test_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_test_post.npy'
,
allow_pickle
=
True
)
X_test_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_test_pre.npy'
,
allow_pickle
=
True
)
y_test_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_test_pre.npy'
,
allow_pickle
=
True
)
X_test_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_test_post.npy'
,
allow_pickle
=
True
)
y_test_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_test_post.npy'
,
allow_pickle
=
True
)
# Load ORIGINAL training data
X_train_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
X_train_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_post.npy'
,
allow_pickle
=
True
)
y_train_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_post.npy'
,
allow_pickle
=
True
)
X_train_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_pre.npy'
,
allow_pickle
=
True
)
y_train_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_pre.npy'
,
allow_pickle
=
True
)
X_train_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_post.npy'
,
allow_pickle
=
True
)
y_train_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_post.npy'
,
allow_pickle
=
True
)
# Load oversampled training data
X_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_over_pre.npy'
,
allow_pickle
=
True
)
y_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_over_pre.npy'
,
allow_pickle
=
True
)
X_train_over_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_over_post.npy'
,
allow_pickle
=
True
)
y_train_over_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_over_post.npy'
,
allow_pickle
=
True
)
X_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_over_pre.npy'
,
allow_pickle
=
True
)
y_train_over_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_over_pre.npy'
,
allow_pickle
=
True
)
X_train_over_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_over_post.npy'
,
allow_pickle
=
True
)
y_train_over_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_over_post.npy'
,
allow_pickle
=
True
)
# Load undersampled training data
X_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/X_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
output
/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'../gen_train_data/data/
output
/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
X_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/X_train_under_pre.npy'
,
allow_pickle
=
True
)
y_train_under_pre
=
np
.
load
(
'../gen_train_data/data/
results
/pre/y_train_under_pre.npy'
,
allow_pickle
=
True
)
X_train_under_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/X_train_under_post.npy'
,
allow_pickle
=
True
)
y_train_under_post
=
np
.
load
(
'../gen_train_data/data/
results
/post/y_train_under_post.npy'
,
allow_pickle
=
True
)
data_dic
=
{
"X_test_pre"
:
X_test_pre
,
...
...
@@ -83,7 +83,7 @@ def read_data():
def
get_tuned_models
(
group_str
,
method_str
):
# Read sheet corresponding to group and method with tuned models and their hyperparam
tuned_models_df
=
pd
.
read_excel
(
"./
output
/hyperparam/hyperparamers.xlsx"
,
sheet_name
=
f
"{group_str}_{method_str}"
)
tuned_models_df
=
pd
.
read_excel
(
"./
results
/hyperparam/hyperparamers.xlsx"
,
sheet_name
=
f
"{group_str}_{method_str}"
)
# Mapping from model abbreviations to sklearn model classes
model_mapping
=
{
'DT'
:
DecisionTreeClassifier
,
...
...
@@ -280,13 +280,13 @@ if __name__ == "__main__":
# ----------------------------------------------------------
# Adjust layout and save/show figure
plt
.
tight_layout
()
plt
.
savefig
(
f
'./
output
/testing/plots/{group}_{method_names[j]}.svg'
,
format
=
'svg'
,
dpi
=
500
)
plt
.
savefig
(
f
'./
results
/testing/plots/{group}_{method_names[j]}.svg'
,
format
=
'svg'
,
dpi
=
500
)
plt
.
close
(
fig
)
# Store the DataFrame in the dictionary with a unique key for each sheet
sheet_name
=
f
"{group}_{method_names[j]}"
scores_sheets
[
sheet_name
]
=
scores_df
# Write results to Excel file
with
pd
.
ExcelWriter
(
'./
output
/testing/testing_tuned_models.xlsx'
)
as
writer
:
with
pd
.
ExcelWriter
(
'./
results
/testing/testing_tuned_models.xlsx'
)
as
writer
:
for
sheet_name
,
data
in
scores_sheets
.
items
():
data
.
to_excel
(
writer
,
sheet_name
=
sheet_name
)
print
(
"Successful evaluation with test dataset"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment