Commit d769c473 authored by Joaquin Torres's avatar Joaquin Torres

Cleaning

parent 0bbb8d6a
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"_Exploratory Data Analysis_ \\\n", "**Exploratory Data Analysis** \\\n",
"_Author: Joaquín Torres Bravo_" "_Author: Joaquín Torres Bravo_"
] ]
}, },
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Training Data Generation\n", "**Training Data Generation** \\\n",
"By Joaquín Torres, May 2024" "_Author: Joaquín Torres Bravo_"
] ]
}, },
{ {
...@@ -17,11 +17,10 @@ ...@@ -17,11 +17,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Libraries\n",
"import pandas as pd\n", "import pandas as pd\n",
"import numpy as np\n", "import numpy as np\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
...@@ -31,57 +30,31 @@ ...@@ -31,57 +30,31 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Load clean datasets\n", "# Load clean datasets\n",
"df_pre = pd.read_csv('./input/pre_dataset.csv')\n", "df_pre = pd.read_csv('../EDA/output/datasets/pre_dataset.csv')\n",
"df_post = pd.read_csv('./input/post_dataset.csv')" "df_post = pd.read_csv('../EDA/output/datasets/post_dataset.csv')"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"(22861, 39)\n",
"(10677, 39)\n",
"(22861,)\n",
"(10677,)\n",
"['Ed_Not Complete primary school' 'Ed_Primary education'\n",
" 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n",
" 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n",
" 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n",
" 'Alterations_early_childhood_develop_REDEF'\n",
" 'SocInc_Live with families or friends' 'SocInc_live alone'\n",
" 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n",
" 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n",
" 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n",
" 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n",
" 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n",
" 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n",
" 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n",
" 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n",
" 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n"
]
}
],
"source": [ "source": [
"# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n", "# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n",
"X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_pre.Situacion_tratamiento_REDEF\n", "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome\n",
"X_post, y_post = df_post.loc[:, df_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_post.Situacion_tratamiento_REDEF\n", "X_post, y_post = df_post.loc[:, df_post.columns != \"Treatment_Outcome\"].to_numpy(), df_post.Treatment_Outcome\n",
"feat = np.delete(df_pre.columns.to_numpy(),-1) # Get labels and remove target \n", "feat = np.delete(df_pre.columns.to_numpy(),-1) # Get labels and remove target \n",
"\n", "\n",
"print(X_pre.shape)\n", "print(X_pre.shape)\n",
"print(X_post.shape)\n", "print(X_post.shape)\n",
"print(y_pre.shape)\n", "print(y_pre.shape)\n",
"print(y_post.shape)\n", "print(y_post.shape)\n",
"print((feat))" "print(feat)"
] ]
}, },
{ {
...@@ -93,18 +66,18 @@ ...@@ -93,18 +66,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# ORIGINAL\n", "# 1. ORIGINAL\n",
"X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n", "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n",
"X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)" "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -117,7 +90,7 @@ ...@@ -117,7 +90,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -130,11 +103,11 @@ ...@@ -130,11 +103,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# OVERSAMPLED training data\n", "# 2. OVERSAMPLED training data\n",
"smote_tomek = SMOTETomek()\n", "smote_tomek = SMOTETomek()\n",
"X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n", "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)" "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
...@@ -142,7 +115,7 @@ ...@@ -142,7 +115,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -155,11 +128,11 @@ ...@@ -155,11 +128,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# UNDERSAMPLING: TOMEK-LINKS \n", "# 3. UNDERSAMPLING: TOMEK-LINKS \n",
"tomek = TomekLinks()\n", "tomek = TomekLinks()\n",
"X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n", "X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
"X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)" "X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
...@@ -167,7 +140,7 @@ ...@@ -167,7 +140,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
...@@ -177,16 +150,6 @@ ...@@ -177,16 +150,6 @@
"np.save('./output/post/X_train_under_post.npy', X_train_under_post)\n", "np.save('./output/post/X_train_under_post.npy', X_train_under_post)\n",
"np.save('./output/post/y_train_under_post.npy', y_train_under_post)" "np.save('./output/post/y_train_under_post.npy', y_train_under_post)"
] ]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Save features\n",
"np.save('./output/attributes.npy', feat)"
]
} }
], ],
"metadata": { "metadata": {
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment