Commit d769c473 authored by Joaquin Torres

Cleaning

parent 0bbb8d6a
......@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"_Exploratory Data Analysis_ \\\n",
"**Exploratory Data Analysis** \\\n",
"_Author: Joaquín Torres Bravo_"
]
},
......
......@@ -4,8 +4,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training Data Generation\n",
"By Joaquín Torres, May 2024"
"**Training Data Generation** \\\n",
"_Author: Joaquín Torres Bravo_"
]
},
{
......@@ -17,11 +17,10 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
......@@ -31,57 +30,31 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load clean datasets\n",
"df_pre = pd.read_csv('./input/pre_dataset.csv')\n",
"df_post = pd.read_csv('./input/post_dataset.csv')"
"df_pre = pd.read_csv('../EDA/output/datasets/pre_dataset.csv')\n",
"df_post = pd.read_csv('../EDA/output/datasets/post_dataset.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(22861, 39)\n",
"(10677, 39)\n",
"(22861,)\n",
"(10677,)\n",
"['Ed_Not Complete primary school' 'Ed_Primary education'\n",
" 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n",
" 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n",
" 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n",
" 'Alterations_early_childhood_develop_REDEF'\n",
" 'SocInc_Live with families or friends' 'SocInc_live alone'\n",
" 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n",
" 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n",
" 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n",
" 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n",
" 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n",
" 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n",
" 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n",
" 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n",
" 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n"
]
}
],
"outputs": [],
"source": [
"# Create a NumPy feature matrix (X) that excludes the target column, and a pandas Series holding the target (y)\n",
"X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_pre.Situacion_tratamiento_REDEF\n",
"X_post, y_post = df_post.loc[:, df_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_post.Situacion_tratamiento_REDEF\n",
"X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome\n",
"X_post, y_post = df_post.loc[:, df_post.columns != \"Treatment_Outcome\"].to_numpy(), df_post.Treatment_Outcome\n",
"# Feature names: every column except the target. Dropping the target by name (y_pre.name)\n",
"# keeps feat aligned with the columns of X_pre even if the target is not the last column.\n",
"feat = df_pre.columns.drop(y_pre.name).to_numpy()\n",
"\n",
"print(X_pre.shape)\n",
"print(X_post.shape)\n",
"print(y_pre.shape)\n",
"print(y_post.shape)\n",
"print((feat))"
"print(feat)"
]
},
{
......@@ -93,18 +66,18 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ORIGINAL\n",
"# 1. ORIGINAL\n",
"X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n",
"X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -117,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -130,11 +103,11 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# OVERSAMPLED training data\n",
"# 2. OVERSAMPLED training data\n",
"# Seed the resampler so the oversampled training sets are reproducible across runs,\n",
"# consistent with the seeded train/test split (random_state=42).\n",
"smote_tomek = SMOTETomek(random_state=42)\n",
"X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
......@@ -142,7 +115,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -155,11 +128,11 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# UNDERSAMPLING: TOMEK-LINKS \n",
"# 3. UNDERSAMPLING: TOMEK-LINKS \n",
"tomek = TomekLinks()\n",
"X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
......@@ -167,7 +140,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -177,16 +150,6 @@
"np.save('./output/post/X_train_under_post.npy', X_train_under_post)\n",
"np.save('./output/post/y_train_under_post.npy', y_train_under_post)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Save features\n",
"np.save('./output/attributes.npy', feat)"
]
}
],
"metadata": {
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment