Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
C
covid_analysis
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
COMPARA
covid_analysis
Commits
d769c473
Commit
d769c473
authored
Jun 28, 2024
by
Joaquin Torres
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Cleaning
parent
0bbb8d6a
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
33599 deletions
+22
-33599
EDA/EDA.ipynb
EDA/EDA.ipynb
+1
-1
gen_train_data/gen_train_data.ipynb
gen_train_data/gen_train_data.ipynb
+21
-58
gen_train_data/input/post_dataset.csv
gen_train_data/input/post_dataset.csv
+0
-10678
gen_train_data/input/pre_dataset.csv
gen_train_data/input/pre_dataset.csv
+0
-22862
No files found.
EDA/EDA.ipynb
View file @
d769c473
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"cell_type": "markdown",
"metadata": {},
"metadata": {},
"source": [
"source": [
"
_Exploratory Data Analysis_
\\\n",
"
**Exploratory Data Analysis**
\\\n",
"_Author: Joaquín Torres Bravo_"
"_Author: Joaquín Torres Bravo_"
]
]
},
},
...
...
gen_train_data/gen_train_data.ipynb
View file @
d769c473
...
@@ -4,8 +4,8 @@
...
@@ -4,8 +4,8 @@
"cell_type": "markdown",
"cell_type": "markdown",
"metadata": {},
"metadata": {},
"source": [
"source": [
"
## Training Data Generation
\n",
"
**Training Data Generation** \\
\n",
"
By Joaquín Torres, May 2024
"
"
_Author: Joaquín Torres Bravo_
"
]
]
},
},
{
{
...
@@ -17,11 +17,10 @@
...
@@ -17,11 +17,10 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
1
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
"# Libraries\n",
"import pandas as pd\n",
"import pandas as pd\n",
"import numpy as np\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.model_selection import train_test_split\n",
...
@@ -31,57 +30,31 @@
...
@@ -31,57 +30,31 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
2
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
"# Load clean datasets\n",
"# Load clean datasets\n",
"df_pre = pd.read_csv('.
/input
/pre_dataset.csv')\n",
"df_pre = pd.read_csv('.
./EDA/output/datasets
/pre_dataset.csv')\n",
"df_post = pd.read_csv('.
/input
/post_dataset.csv')"
"df_post = pd.read_csv('.
./EDA/output/datasets
/post_dataset.csv')"
]
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
3
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"(22861, 39)\n",
"(10677, 39)\n",
"(22861,)\n",
"(10677,)\n",
"['Ed_Not Complete primary school' 'Ed_Primary education'\n",
" 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n",
" 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n",
" 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n",
" 'Alterations_early_childhood_develop_REDEF'\n",
" 'SocInc_Live with families or friends' 'SocInc_live alone'\n",
" 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n",
" 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n",
" 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n",
" 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n",
" 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n",
" 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n",
" 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n",
" 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n",
" 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n"
]
}
],
"source": [
"source": [
"# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n",
"# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n",
"X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"
Situacion_tratamiento_REDEF\"].to_numpy(), df_pre.Situacion_tratamiento_REDEF
\n",
"X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"
Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome
\n",
"X_post, y_post = df_post.loc[:, df_post.columns != \"
Situacion_tratamiento_REDEF\"].to_numpy(), df_post.Situacion_tratamiento_REDEF
\n",
"X_post, y_post = df_post.loc[:, df_post.columns != \"
Treatment_Outcome\"].to_numpy(), df_post.Treatment_Outcome
\n",
"feat = np.delete(df_pre.columns.to_numpy(),-1) # Get labels and remove target \n",
"feat = np.delete(df_pre.columns.to_numpy(),-1) # Get labels and remove target \n",
"\n",
"\n",
"print(X_pre.shape)\n",
"print(X_pre.shape)\n",
"print(X_post.shape)\n",
"print(X_post.shape)\n",
"print(y_pre.shape)\n",
"print(y_pre.shape)\n",
"print(y_post.shape)\n",
"print(y_post.shape)\n",
"print(
(feat)
)"
"print(
feat
)"
]
]
},
},
{
{
...
@@ -93,18 +66,18 @@
...
@@ -93,18 +66,18 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
4
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
"# ORIGINAL\n",
"#
1.
ORIGINAL\n",
"X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n",
"X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n",
"X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
"X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
]
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
5
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
...
@@ -117,7 +90,7 @@
...
@@ -117,7 +90,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
6
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
...
@@ -130,11 +103,11 @@
...
@@ -130,11 +103,11 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
7
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
"# OVERSAMPLED training data\n",
"#
2.
OVERSAMPLED training data\n",
"smote_tomek = SMOTETomek()\n",
"smote_tomek = SMOTETomek()\n",
"X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
"X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
...
@@ -142,7 +115,7 @@
...
@@ -142,7 +115,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
9
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
...
@@ -155,11 +128,11 @@
...
@@ -155,11 +128,11 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
10
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
"# UNDERSAMPLING: TOMEK-LINKS \n",
"#
3.
UNDERSAMPLING: TOMEK-LINKS \n",
"tomek = TomekLinks()\n",
"tomek = TomekLinks()\n",
"X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
"X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
...
@@ -167,7 +140,7 @@
...
@@ -167,7 +140,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count":
14
,
"execution_count":
null
,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
...
@@ -177,16 +150,6 @@
...
@@ -177,16 +150,6 @@
"np.save('./output/post/X_train_under_post.npy', X_train_under_post)\n",
"np.save('./output/post/X_train_under_post.npy', X_train_under_post)\n",
"np.save('./output/post/y_train_under_post.npy', y_train_under_post)"
"np.save('./output/post/y_train_under_post.npy', y_train_under_post)"
]
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Save features\n",
"np.save('./output/attributes.npy', feat)"
]
}
}
],
],
"metadata": {
"metadata": {
...
...
gen_train_data/input/post_dataset.csv
deleted
100644 → 0
View file @
0bbb8d6a
This diff is collapsed.
Click to expand it.
gen_train_data/input/pre_dataset.csv
deleted
100644 → 0
View file @
0bbb8d6a
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment