Cleaning

d769c473 · Joaquin Torres · 0bbb8d6a · d769c473 · d769c473 · 0bbb8d6a
Commit d769c473 authored Jun 28, 2024 by Joaquin Torres
4 changed files
--- a/EDA/EDA.ipynb
+++ b/EDA/EDA.ipynb
@@ -4,7 +4,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "_Exploratory Data Analysis_ \\\n",
+    "**Exploratory Data Analysis** \\\n",
    "_Author: Joaquín Torres Bravo_"
   ]
  },

--- a/gen_train_data/gen_train_data.ipynb
+++ b/gen_train_data/gen_train_data.ipynb
@@ -4,8 +4,8 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Training Data Generation\n",
-    "By Joaquín Torres, May 2024"
+    "**Training Data Generation** \\\n",
+    "_Author: Joaquín Torres Bravo_"
   ]
  },
  {
@@ -17,11 +17,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
@@ -31,57 +30,31 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load clean datasets\n",
-    "df_pre = pd.read_csv('./input/pre_dataset.csv')\n",
-    "df_post = pd.read_csv('./input/post_dataset.csv')"
+    "df_pre = pd.read_csv('../EDA/output/datasets/pre_dataset.csv')\n",
+    "df_post = pd.read_csv('../EDA/output/datasets/post_dataset.csv')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(22861, 39)\n",
-      "(10677, 39)\n",
-      "(22861,)\n",
-      "(10677,)\n",
-      "['Ed_Not Complete primary school' 'Ed_Primary education'\n",
-      " 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n",
-      " 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n",
-      " 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n",
-      " 'Alterations_early_childhood_develop_REDEF'\n",
-      " 'SocInc_Live with families or friends' 'SocInc_live alone'\n",
-      " 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n",
-      " 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n",
-      " 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n",
-      " 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n",
-      " 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n",
-      " 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n",
-      " 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n",
-      " 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n",
-      " 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Creating a numpy matrix (X) without the target variable and a list with the target variable (y) \n",
-    "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_pre.Situacion_tratamiento_REDEF\n",
-    "X_post, y_post = df_post.loc[:, df_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), df_post.Situacion_tratamiento_REDEF\n",
-    "feat = np.delete(df_pre.columns.to_numpy(),-1) # Get labels and remove target \n",
+    "X_pre, y_pre = df_pre.loc[:, df_pre.columns != \"Treatment_Outcome\"].to_numpy(), df_pre.Treatment_Outcome\n",
+    "X_post, y_post = df_post.loc[:, df_post.columns != \"Treatment_Outcome\"].to_numpy(), df_post.Treatment_Outcome\n",
+    "feat = df_pre.columns[df_pre.columns != \"Treatment_Outcome\"].to_numpy() # Feature labels: drop the target by name (same mask as X) so labels stay aligned with X's columns\n",
    "\n",
    "print(X_pre.shape)\n",
    "print(X_post.shape)\n",
    "print(y_pre.shape)\n",
    "print(y_post.shape)\n",
-    "print((feat))"
+    "print(feat)"
   ]
  },
  {
@@ -93,18 +66,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# ORIGINAL\n",
+    "# 1. ORIGINAL\n",
    "X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre, y_pre, test_size=0.1, random_state=42) #90-10 split\n",
    "X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(X_post, y_post, test_size=0.1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -117,7 +90,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -130,11 +103,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# OVERSAMPLED training data\n",
-    "smote_tomek = SMOTETomek()\n",
+    "# 2. OVERSAMPLED training data\n",
+    "smote_tomek = SMOTETomek(random_state=42) # Same fixed seed as the train/test split, so the saved resampled sets are reproducible\n",
    "X_train_over_pre, y_train_over_pre = smote_tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "X_train_over_post, y_train_over_post = smote_tomek.fit_resample(X_train_post, y_train_post)"
@@ -142,7 +115,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -155,11 +128,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# UNDERSAMPLING: TOMEK-LINKS \n",
+    "# 3. UNDERSAMPLING: TOMEK-LINKS \n",
    "tomek = TomekLinks()\n",
    "X_train_under_pre, y_train_under_pre = tomek.fit_resample(X_train_pre, y_train_pre)\n",
    "X_train_under_post, y_train_under_post = tomek.fit_resample(X_train_post, y_train_post)"
@@ -167,7 +140,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -177,16 +150,6 @@
    "np.save('./output/post/X_train_under_post.npy', X_train_under_post)\n",
    "np.save('./output/post/y_train_under_post.npy', y_train_under_post)"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Save features\n",
-    "np.save('./output/attributes.npy', feat)"
-   ]
  }
 ],
 "metadata": {

--- a/gen_train_data/input/post_dataset.csv
+++ b/gen_train_data/input/post_dataset.csv
--- a/gen_train_data/input/pre_dataset.csv
+++ b/gen_train_data/input/pre_dataset.csv