Minor fixes

7df4155b · Joaquin Torres · 2a1e1b03 · 7df4155b
Commit 7df4155b authored Jun 28, 2024 by Joaquin Torres
Hide whitespace changes
Inline Side-by-side

Showing with 424 additions and 32 deletions

EDA/EDA.ipynb EDA/EDA.ipynb +424 -32

No files found.
--- a/EDA/EDA.ipynb
+++ b/EDA/EDA.ipynb
@@ -17,7 +17,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -41,11 +41,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
-    "bd_all = pd.read_spss('./input/17_abril.sav')\n",
+    "bd_all = pd.read_spss('./input/data.sav')\n",
    "\n",
    "# Filter the dataset to work only with alcohol patients\n",
    "bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n",
@@ -56,9 +56,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\2495984927.py:18: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  conj_post['Group'] = 'Post'\n",
+      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\2495984927.py:19: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  conj_pre['Group'] = 'Pre'\n"
+     ]
+    }
+   ],
   "source": [
    "# Pre-pandemic\n",
    "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia'].copy()  # independent copy so adding the 'Group' column does not raise SettingWithCopyWarning\n",
@@ -84,9 +103,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PRE: 22861\n",
+      "\tALTA: 2792\n",
+      "\tABANDONO: 20069\n",
+      "POST: 10677\n",
+      "\tALTA: 1882\n",
+      "\tABANDONO: 8795\n"
+     ]
+    }
+   ],
   "source": [
    "# Printing size of different datasets\n",
    "print(f\"PRE: {len(conj_pre)}\")\n",
@@ -100,9 +132,286 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PRE\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 22861 entries, 0 to 85164\n",
+      "Data columns (total 35 columns):\n",
+      " #   Column                               Non-Null Count  Dtype   \n",
+      "---  ------                               --------------  -----   \n",
+      " 0   CODPROYECTO                          22861 non-null  float64 \n",
+      " 1   Education                            22861 non-null  object  \n",
+      " 2   Social_protection                    22861 non-null  object  \n",
+      " 3   Job_insecurity                       22861 non-null  object  \n",
+      " 4   Housing                              22861 non-null  object  \n",
+      " 5   Alterations_early_childhood_develop  22861 non-null  object  \n",
+      " 6   Social_inclusion                     22861 non-null  object  \n",
+      " 7   Risk_stigma                          21606 non-null  category\n",
+      " 8   Structural_conflic                   22861 non-null  float64 \n",
+      " 9   Age                                  22852 non-null  float64 \n",
+      " 10  Sex                                  22861 non-null  object  \n",
+      " 11  NumHijos                             21647 non-null  float64 \n",
+      " 12  Smoking                              22861 non-null  object  \n",
+      " 13  Biological_vulnerability             22861 non-null  object  \n",
+      " 14  Alcohol_DxCIE                        22861 non-null  object  \n",
+      " 15  Opiaceos_DxCIE                       22861 non-null  object  \n",
+      " 16  Cannabis_DXCIE                       22861 non-null  object  \n",
+      " 17  BZD_DxCIE                            22861 non-null  object  \n",
+      " 18  Cocaina_DxCIE                        22861 non-null  object  \n",
+      " 19  Alucinogenos_DXCIE                   22861 non-null  object  \n",
+      " 20  Tabaco_DXCIE                         22861 non-null  object  \n",
+      " 21  FrecuenciaConsumo30Dias              22861 non-null  object  \n",
+      " 22  Años_consumo_droga                   22342 non-null  float64 \n",
+      " 23  OtrosDx_Psiquiatrico                 22861 non-null  object  \n",
+      " 24  Tx_previos                           22861 non-null  object  \n",
+      " 25  Adherencia_tto_recalc                22861 non-null  float64 \n",
+      " 26  Tiempo_tx                            22861 non-null  float64 \n",
+      " 27  Readmisiones_estudios                22861 non-null  object  \n",
+      " 28  Situacion_tratamiento                22861 non-null  object  \n",
+      " 29  Periodos_COVID                       22861 non-null  object  \n",
+      " 30  Pandemia_inicio_fin_tratamiento      22861 non-null  object  \n",
+      " 31  Nreadmision                          22861 non-null  float64 \n",
+      " 32  Readmisiones_PRECOVID                22861 non-null  float64 \n",
+      " 33  Readmisiones_COVID                   22861 non-null  float64 \n",
+      " 34  Group                                22861 non-null  object  \n",
+      "dtypes: category(1), float64(10), object(24)\n",
+      "memory usage: 6.1+ MB\n",
+      "None\n",
+      "-------------------------------\n",
+      "PRE-ABANDONO\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 20069 entries, 0 to 85164\n",
+      "Data columns (total 34 columns):\n",
+      " #   Column                               Non-Null Count  Dtype   \n",
+      "---  ------                               --------------  -----   \n",
+      " 0   CODPROYECTO                          20069 non-null  float64 \n",
+      " 1   Education                            20069 non-null  object  \n",
+      " 2   Social_protection                    20069 non-null  object  \n",
+      " 3   Job_insecurity                       20069 non-null  object  \n",
+      " 4   Housing                              20069 non-null  object  \n",
+      " 5   Alterations_early_childhood_develop  20069 non-null  object  \n",
+      " 6   Social_inclusion                     20069 non-null  object  \n",
+      " 7   Risk_stigma                          18919 non-null  category\n",
+      " 8   Structural_conflic                   20069 non-null  float64 \n",
+      " 9   Age                                  20061 non-null  float64 \n",
+      " 10  Sex                                  20069 non-null  object  \n",
+      " 11  NumHijos                             18958 non-null  float64 \n",
+      " 12  Smoking                              20069 non-null  object  \n",
+      " 13  Biological_vulnerability             20069 non-null  object  \n",
+      " 14  Alcohol_DxCIE                        20069 non-null  object  \n",
+      " 15  Opiaceos_DxCIE                       20069 non-null  object  \n",
+      " 16  Cannabis_DXCIE                       20069 non-null  object  \n",
+      " 17  BZD_DxCIE                            20069 non-null  object  \n",
+      " 18  Cocaina_DxCIE                        20069 non-null  object  \n",
+      " 19  Alucinogenos_DXCIE                   20069 non-null  object  \n",
+      " 20  Tabaco_DXCIE                         20069 non-null  object  \n",
+      " 21  FrecuenciaConsumo30Dias              20069 non-null  object  \n",
+      " 22  Años_consumo_droga                   19609 non-null  float64 \n",
+      " 23  OtrosDx_Psiquiatrico                 20069 non-null  object  \n",
+      " 24  Tx_previos                           20069 non-null  object  \n",
+      " 25  Adherencia_tto_recalc                20069 non-null  float64 \n",
+      " 26  Tiempo_tx                            20069 non-null  float64 \n",
+      " 27  Readmisiones_estudios                20069 non-null  object  \n",
+      " 28  Situacion_tratamiento                20069 non-null  object  \n",
+      " 29  Periodos_COVID                       20069 non-null  object  \n",
+      " 30  Pandemia_inicio_fin_tratamiento      20069 non-null  object  \n",
+      " 31  Nreadmision                          20069 non-null  float64 \n",
+      " 32  Readmisiones_PRECOVID                20069 non-null  float64 \n",
+      " 33  Readmisiones_COVID                   20069 non-null  float64 \n",
+      "dtypes: category(1), float64(10), object(23)\n",
+      "memory usage: 5.2+ MB\n",
+      "None\n",
+      "-------------------------------\n",
+      "PRE-ALTA\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 2792 entries, 23 to 85159\n",
+      "Data columns (total 34 columns):\n",
+      " #   Column                               Non-Null Count  Dtype   \n",
+      "---  ------                               --------------  -----   \n",
+      " 0   CODPROYECTO                          2792 non-null   float64 \n",
+      " 1   Education                            2792 non-null   object  \n",
+      " 2   Social_protection                    2792 non-null   object  \n",
+      " 3   Job_insecurity                       2792 non-null   object  \n",
+      " 4   Housing                              2792 non-null   object  \n",
+      " 5   Alterations_early_childhood_develop  2792 non-null   object  \n",
+      " 6   Social_inclusion                     2792 non-null   object  \n",
+      " 7   Risk_stigma                          2687 non-null   category\n",
+      " 8   Structural_conflic                   2792 non-null   float64 \n",
+      " 9   Age                                  2791 non-null   float64 \n",
+      " 10  Sex                                  2792 non-null   object  \n",
+      " 11  NumHijos                             2689 non-null   float64 \n",
+      " 12  Smoking                              2792 non-null   object  \n",
+      " 13  Biological_vulnerability             2792 non-null   object  \n",
+      " 14  Alcohol_DxCIE                        2792 non-null   object  \n",
+      " 15  Opiaceos_DxCIE                       2792 non-null   object  \n",
+      " 16  Cannabis_DXCIE                       2792 non-null   object  \n",
+      " 17  BZD_DxCIE                            2792 non-null   object  \n",
+      " 18  Cocaina_DxCIE                        2792 non-null   object  \n",
+      " 19  Alucinogenos_DXCIE                   2792 non-null   object  \n",
+      " 20  Tabaco_DXCIE                         2792 non-null   object  \n",
+      " 21  FrecuenciaConsumo30Dias              2792 non-null   object  \n",
+      " 22  Años_consumo_droga                   2733 non-null   float64 \n",
+      " 23  OtrosDx_Psiquiatrico                 2792 non-null   object  \n",
+      " 24  Tx_previos                           2792 non-null   object  \n",
+      " 25  Adherencia_tto_recalc                2792 non-null   float64 \n",
+      " 26  Tiempo_tx                            2792 non-null   float64 \n",
+      " 27  Readmisiones_estudios                2792 non-null   object  \n",
+      " 28  Situacion_tratamiento                2792 non-null   object  \n",
+      " 29  Periodos_COVID                       2792 non-null   object  \n",
+      " 30  Pandemia_inicio_fin_tratamiento      2792 non-null   object  \n",
+      " 31  Nreadmision                          2792 non-null   float64 \n",
+      " 32  Readmisiones_PRECOVID                2792 non-null   float64 \n",
+      " 33  Readmisiones_COVID                   2792 non-null   float64 \n",
+      "dtypes: category(1), float64(10), object(23)\n",
+      "memory usage: 744.5+ KB\n",
+      "None\n",
+      "-------------------------------\n",
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "POST\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 10677 entries, 11 to 85156\n",
+      "Data columns (total 35 columns):\n",
+      " #   Column                               Non-Null Count  Dtype   \n",
+      "---  ------                               --------------  -----   \n",
+      " 0   CODPROYECTO                          10677 non-null  float64 \n",
+      " 1   Education                            10677 non-null  object  \n",
+      " 2   Social_protection                    10677 non-null  object  \n",
+      " 3   Job_insecurity                       10677 non-null  object  \n",
+      " 4   Housing                              10677 non-null  object  \n",
+      " 5   Alterations_early_childhood_develop  10677 non-null  object  \n",
+      " 6   Social_inclusion                     10677 non-null  object  \n",
+      " 7   Risk_stigma                          10085 non-null  category\n",
+      " 8   Structural_conflic                   10677 non-null  float64 \n",
+      " 9   Age                                  10676 non-null  float64 \n",
+      " 10  Sex                                  10677 non-null  object  \n",
+      " 11  NumHijos                             10103 non-null  float64 \n",
+      " 12  Smoking                              10677 non-null  object  \n",
+      " 13  Biological_vulnerability             10677 non-null  object  \n",
+      " 14  Alcohol_DxCIE                        10677 non-null  object  \n",
+      " 15  Opiaceos_DxCIE                       10677 non-null  object  \n",
+      " 16  Cannabis_DXCIE                       10677 non-null  object  \n",
+      " 17  BZD_DxCIE                            10677 non-null  object  \n",
+      " 18  Cocaina_DxCIE                        10677 non-null  object  \n",
+      " 19  Alucinogenos_DXCIE                   10677 non-null  object  \n",
+      " 20  Tabaco_DXCIE                         10677 non-null  object  \n",
+      " 21  FrecuenciaConsumo30Dias              10677 non-null  object  \n",
+      " 22  Años_consumo_droga                   10478 non-null  float64 \n",
+      " 23  OtrosDx_Psiquiatrico                 10677 non-null  object  \n",
+      " 24  Tx_previos                           10677 non-null  object  \n",
+      " 25  Adherencia_tto_recalc                10677 non-null  float64 \n",
+      " 26  Tiempo_tx                            10677 non-null  float64 \n",
+      " 27  Readmisiones_estudios                10677 non-null  object  \n",
+      " 28  Situacion_tratamiento                10677 non-null  object  \n",
+      " 29  Periodos_COVID                       10677 non-null  object  \n",
+      " 30  Pandemia_inicio_fin_tratamiento      10677 non-null  object  \n",
+      " 31  Nreadmision                          10677 non-null  float64 \n",
+      " 32  Readmisiones_PRECOVID                10677 non-null  float64 \n",
+      " 33  Readmisiones_COVID                   10677 non-null  float64 \n",
+      " 34  Group                                10677 non-null  object  \n",
+      "dtypes: category(1), float64(10), object(24)\n",
+      "memory usage: 2.9+ MB\n",
+      "None\n",
+      "-------------------------------\n",
+      "POST-ABANDONO\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 8795 entries, 11 to 85156\n",
+      "Data columns (total 34 columns):\n",
+      " #   Column                               Non-Null Count  Dtype   \n",
+      "---  ------                               --------------  -----   \n",
+      " 0   CODPROYECTO                          8795 non-null   float64 \n",
+      " 1   Education                            8795 non-null   object  \n",
+      " 2   Social_protection                    8795 non-null   object  \n",
+      " 3   Job_insecurity                       8795 non-null   object  \n",
+      " 4   Housing                              8795 non-null   object  \n",
+      " 5   Alterations_early_childhood_develop  8795 non-null   object  \n",
+      " 6   Social_inclusion                     8795 non-null   object  \n",
+      " 7   Risk_stigma                          8308 non-null   category\n",
+      " 8   Structural_conflic                   8795 non-null   float64 \n",
+      " 9   Age                                  8794 non-null   float64 \n",
+      " 10  Sex                                  8795 non-null   object  \n",
+      " 11  NumHijos                             8325 non-null   float64 \n",
+      " 12  Smoking                              8795 non-null   object  \n",
+      " 13  Biological_vulnerability             8795 non-null   object  \n",
+      " 14  Alcohol_DxCIE                        8795 non-null   object  \n",
+      " 15  Opiaceos_DxCIE                       8795 non-null   object  \n",
+      " 16  Cannabis_DXCIE                       8795 non-null   object  \n",
+      " 17  BZD_DxCIE                            8795 non-null   object  \n",
+      " 18  Cocaina_DxCIE                        8795 non-null   object  \n",
+      " 19  Alucinogenos_DXCIE                   8795 non-null   object  \n",
+      " 20  Tabaco_DXCIE                         8795 non-null   object  \n",
+      " 21  FrecuenciaConsumo30Dias              8795 non-null   object  \n",
+      " 22  Años_consumo_droga                   8627 non-null   float64 \n",
+      " 23  OtrosDx_Psiquiatrico                 8795 non-null   object  \n",
+      " 24  Tx_previos                           8795 non-null   object  \n",
+      " 25  Adherencia_tto_recalc                8795 non-null   float64 \n",
+      " 26  Tiempo_tx                            8795 non-null   float64 \n",
+      " 27  Readmisiones_estudios                8795 non-null   object  \n",
+      " 28  Situacion_tratamiento                8795 non-null   object  \n",
+      " 29  Periodos_COVID                       8795 non-null   object  \n",
+      " 30  Pandemia_inicio_fin_tratamiento      8795 non-null   object  \n",
+      " 31  Nreadmision                          8795 non-null   float64 \n",
+      " 32  Readmisiones_PRECOVID                8795 non-null   float64 \n",
+      " 33  Readmisiones_COVID                   8795 non-null   float64 \n",
+      "dtypes: category(1), float64(10), object(23)\n",
+      "memory usage: 2.3+ MB\n",
+      "None\n",
+      "-------------------------------\n",
+      "POST-ALTA\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 1882 entries, 258 to 85149\n",
+      "Data columns (total 34 columns):\n",
+      " #   Column                               Non-Null Count  Dtype   \n",
+      "---  ------                               --------------  -----   \n",
+      " 0   CODPROYECTO                          1882 non-null   float64 \n",
+      " 1   Education                            1882 non-null   object  \n",
+      " 2   Social_protection                    1882 non-null   object  \n",
+      " 3   Job_insecurity                       1882 non-null   object  \n",
+      " 4   Housing                              1882 non-null   object  \n",
+      " 5   Alterations_early_childhood_develop  1882 non-null   object  \n",
+      " 6   Social_inclusion                     1882 non-null   object  \n",
+      " 7   Risk_stigma                          1777 non-null   category\n",
+      " 8   Structural_conflic                   1882 non-null   float64 \n",
+      " 9   Age                                  1882 non-null   float64 \n",
+      " 10  Sex                                  1882 non-null   object  \n",
+      " 11  NumHijos                             1778 non-null   float64 \n",
+      " 12  Smoking                              1882 non-null   object  \n",
+      " 13  Biological_vulnerability             1882 non-null   object  \n",
+      " 14  Alcohol_DxCIE                        1882 non-null   object  \n",
+      " 15  Opiaceos_DxCIE                       1882 non-null   object  \n",
+      " 16  Cannabis_DXCIE                       1882 non-null   object  \n",
+      " 17  BZD_DxCIE                            1882 non-null   object  \n",
+      " 18  Cocaina_DxCIE                        1882 non-null   object  \n",
+      " 19  Alucinogenos_DXCIE                   1882 non-null   object  \n",
+      " 20  Tabaco_DXCIE                         1882 non-null   object  \n",
+      " 21  FrecuenciaConsumo30Dias              1882 non-null   object  \n",
+      " 22  Años_consumo_droga                   1851 non-null   float64 \n",
+      " 23  OtrosDx_Psiquiatrico                 1882 non-null   object  \n",
+      " 24  Tx_previos                           1882 non-null   object  \n",
+      " 25  Adherencia_tto_recalc                1882 non-null   float64 \n",
+      " 26  Tiempo_tx                            1882 non-null   float64 \n",
+      " 27  Readmisiones_estudios                1882 non-null   object  \n",
+      " 28  Situacion_tratamiento                1882 non-null   object  \n",
+      " 29  Periodos_COVID                       1882 non-null   object  \n",
+      " 30  Pandemia_inicio_fin_tratamiento      1882 non-null   object  \n",
+      " 31  Nreadmision                          1882 non-null   float64 \n",
+      " 32  Readmisiones_PRECOVID                1882 non-null   float64 \n",
+      " 33  Readmisiones_COVID                   1882 non-null   float64 \n",
+      "dtypes: category(1), float64(10), object(23)\n",
+      "memory usage: 501.9+ KB\n",
+      "None\n",
+      "-------------------------------\n"
+     ]
+    }
+   ],
   "source": [
    "print(\"PRE\")\n",
    "conj_pre.info()  # info() writes the report itself and returns None; wrapping it in print() emitted a stray 'None' line\n",
@@ -136,9 +445,36 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n",
+      "['Live with families or friends' 'live alone' 'live in institutions']\n",
+      "['No alterations (first exposure at 11 or more years)'\n",
+      " 'Alterations (first exposure before 11 years old)' '9']\n",
+      "['No alterations (first exposure at 11 or more years)'\n",
+      " 'Alterations (first exposure before 11 years old)']\n",
+      "[NaN, 'Yes', 'No']\n",
+      "Categories (3, object): [99.0, 'No', 'Yes']\n",
+      "[NaN, 'Yes', 'No']\n",
+      "Categories (2, object): ['No', 'Yes']\n",
+      "[nan  0.  1.  2.  3.  4.  5.  8. 10.  6. 11. 12.  9.  7. 99. 14. 15.]\n",
+      "[nan  0.  1.  2.  3.  4.  5.  8. 10.  6. 11. 12.  9.  7. 14. 15.]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\1003504044.py:14: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n",
+      "  bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n"
+     ]
+    }
+   ],
   "source": [
    "# 9.0 represents unknown according to Variables.docx \n",
    "print(bd['Social_inclusion'].unique())\n",
@@ -164,9 +500,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total missing values Age: 10\n",
+      "Total missing values Años_consumo_droga: 718\n",
+      "Total missing values Risk_stigma: 1847\n",
+      "Total missing values NumHijos: 1788\n",
+      "\tCONJUNTO PREPANDEMIA\n",
+      "\t\tMissing values Age: 9\n",
+      "\t\tMissing values Años_consumo_droga: 519\n",
+      "\t\tMissing values Risk_stigma: 1255\n",
+      "\t\tMissing values NumHijos: 1214\n",
+      "\tCONJUNTO POSTPANDEMIA\n",
+      "\t\tMissing values Age: 1\n",
+      "\t\tMissing values Años_consumo_droga: 199\n",
+      "\t\tMissing values Risk_stigma: 592\n",
+      "\t\tMissing values NumHijos: 574\n"
+     ]
+    }
+   ],
   "source": [
    "print(f\"Total missing values Age: {bd['Age'].isnull().sum()}\")\n",
    "print(f\"Total missing values Años_consumo_droga: {bd['Años_consumo_droga'].isnull().sum()}\")\n",
@@ -188,9 +545,44 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  bd['Age'].fillna(age_mode, inplace=True)\n",
+      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n",
+      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n",
+      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+      "\n",
+      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+      "\n",
+      "\n",
+      "  bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n"
+     ]
+    }
+   ],
   "source": [
    "age_mode = bd['Age'].mode()[0]\n",
    "bd['Age'] = bd['Age'].fillna(age_mode)  # assign the result back: chained inplace fillna is deprecated and stops working in pandas 3.0\n",
@@ -481,7 +873,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -512,7 +904,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -589,7 +981,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -622,7 +1014,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -662,7 +1054,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -681,7 +1073,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -691,7 +1083,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -748,14 +1140,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export feature names\n",
-    "np.save('./output/feature_names/feature_names.npy', corr_cols)\n",
-    "np.save('./output/feature_names/soc_vars_names.npy', soc_vars_enc)\n",
-    "np.save('./output/feature_names/ind_vars_names.npy', ind_vars_enc)"
+    "np.save('./output/feature_names/all_features.npy', corr_cols)\n",
+    "np.save('./output/feature_names/social_factors.npy', soc_vars_enc)\n",
+    "np.save('./output/feature_names/individual_factors.npy', ind_vars_enc)"
   ]
  },
  {