EDA.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### EDA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n",
    "\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import f_classif\n",
    "from sklearn.feature_selection import mutual_info_classif"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Preparing Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Reading and filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the full SPSS file (path is relative to the notebook's working directory)\n",
    "bd_all = pd.read_spss('17_abril.sav')\n",
    "\n",
    "# Keep only alcohol patients whose 'Situacion_tratamiento' is 'Abandono' or 'Alta terapéutica'\n",
    "alcohol_mask = bd_all['Alcohol_DxCIE'] == 'Sí'\n",
    "outcome_mask = bd_all['Situacion_tratamiento'].isin(['Abandono', 'Alta terapéutica'])\n",
    "\n",
    "# .copy() makes bd an independent DataFrame: the cleaning cells further down assign to\n",
    "# bd's columns, and that must act on bd itself rather than on a slice of bd_all\n",
    "bd = bd_all[alcohol_mask & outcome_mask].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Defining sets of patients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19000\\2495984927.py:18: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  conj_post['Group'] = 'Post'\n",
      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19000\\2495984927.py:19: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  conj_pre['Group'] = 'Pre'\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): these subsets are snapshots of bd taken at this point; values replaced in bd\n",
    "# by later cells are not propagated to them unless the subsets are derived again\n",
    "pandemic_period = bd['Pandemia_inicio_fin_tratamiento']\n",
    "\n",
    "# Pre-pandemic (.copy() so that adding 'Group' below writes to an independent frame\n",
    "# instead of raising SettingWithCopyWarning)\n",
    "conj_pre = bd[pandemic_period == 'Inicio y fin prepandemia'].copy()\n",
    "# Pre-pandemic abandono\n",
    "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
    "# Pre-pandemic alta\n",
    "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
    "\n",
    "# Post-pandemic\n",
    "# Merging last two classes to balance sets\n",
    "conj_post = bd[pandemic_period.isin(['Inicio prepandemia y fin en pandemia',\n",
    "                                     'inicio y fin en pandemia'])].copy()\n",
    "# Post-pandemic abandono\n",
    "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n",
    "# Post-pandemic alta\n",
    "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']\n",
    "\n",
    "# Tag each set with its group and concatenate them. Useful for plots.\n",
    "# 'Group' is added after the abandono/alta subsets are derived, so those do not carry the column\n",
    "conj_post['Group'] = 'Post'\n",
    "conj_pre['Group'] = 'Pre'\n",
    "combined_pre_post = pd.concat([conj_post, conj_pre])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PRE: 22861\n",
      "\tALTA: 2792\n",
      "\tABANDONO: 20069\n",
      "POST: 10677\n",
      "\tALTA: 1882\n",
      "\tABANDONO: 8795\n"
     ]
    }
   ],
   "source": [
    "# Printing size of different datasets\n",
    "print(f\"PRE: {len(conj_pre)}\")\n",
    "print(f\"\\tALTA: {len(pre_alta)}\")\n",
    "print(f\"\\tABANDONO: {len(pre_abandono)}\")\n",
    "\n",
    "print(f\"POST: {len(conj_post)}\")\n",
    "print(f\"\\tALTA: {len(post_alta)}\")\n",
    "print(f\"\\tABANDONO: {len(post_abandono)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### First Steps"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Inspecting the dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PRE\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 22861 entries, 0 to 85164\n",
      "Data columns (total 35 columns):\n",
      " #   Column                               Non-Null Count  Dtype   \n",
      "---  ------                               --------------  -----   \n",
      " 0   CODPROYECTO                          22861 non-null  float64 \n",
      " 1   Education                            22861 non-null  object  \n",
      " 2   Social_protection                    22861 non-null  object  \n",
      " 3   Job_insecurity                       22861 non-null  object  \n",
      " 4   Housing                              22861 non-null  object  \n",
      " 5   Alterations_early_childhood_develop  22861 non-null  object  \n",
      " 6   Social_inclusion                     22861 non-null  object  \n",
      " 7   Risk_stigma                          21606 non-null  category\n",
      " 8   Structural_conflic                   22861 non-null  float64 \n",
      " 9   Age                                  22852 non-null  float64 \n",
      " 10  Sex                                  22861 non-null  object  \n",
      " 11  NumHijos                             21647 non-null  float64 \n",
      " 12  Smoking                              22861 non-null  object  \n",
      " 13  Biological_vulnerability             22861 non-null  object  \n",
      " 14  Alcohol_DxCIE                        22861 non-null  object  \n",
      " 15  Opiaceos_DxCIE                       22861 non-null  object  \n",
      " 16  Cannabis_DXCIE                       22861 non-null  object  \n",
      " 17  BZD_DxCIE                            22861 non-null  object  \n",
      " 18  Cocaina_DxCIE                        22861 non-null  object  \n",
      " 19  Alucinogenos_DXCIE                   22861 non-null  object  \n",
      " 20  Tabaco_DXCIE                         22861 non-null  object  \n",
      " 21  FrecuenciaConsumo30Dias              22861 non-null  object  \n",
      " 22  Años_consumo_droga                   22342 non-null  float64 \n",
      " 23  OtrosDx_Psiquiatrico                 22861 non-null  object  \n",
      " 24  Tx_previos                           22861 non-null  object  \n",
      " 25  Adherencia_tto_recalc                22861 non-null  float64 \n",
      " 26  Tiempo_tx                            22861 non-null  float64 \n",
      " 27  Readmisiones_estudios                22861 non-null  object  \n",
      " 28  Situacion_tratamiento                22861 non-null  object  \n",
      " 29  Periodos_COVID                       22861 non-null  object  \n",
      " 30  Pandemia_inicio_fin_tratamiento      22861 non-null  object  \n",
      " 31  Nreadmision                          22861 non-null  float64 \n",
      " 32  Readmisiones_PRECOVID                22861 non-null  float64 \n",
      " 33  Readmisiones_COVID                   22861 non-null  float64 \n",
      " 34  Group                                22861 non-null  object  \n",
      "dtypes: category(1), float64(10), object(24)\n",
      "memory usage: 6.1+ MB\n",
      "None\n",
      "-------------------------------\n",
      "PRE-ABANDONO\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 20069 entries, 0 to 85164\n",
      "Data columns (total 34 columns):\n",
      " #   Column                               Non-Null Count  Dtype   \n",
      "---  ------                               --------------  -----   \n",
      " 0   CODPROYECTO                          20069 non-null  float64 \n",
      " 1   Education                            20069 non-null  object  \n",
      " 2   Social_protection                    20069 non-null  object  \n",
      " 3   Job_insecurity                       20069 non-null  object  \n",
      " 4   Housing                              20069 non-null  object  \n",
      " 5   Alterations_early_childhood_develop  20069 non-null  object  \n",
      " 6   Social_inclusion                     20069 non-null  object  \n",
      " 7   Risk_stigma                          18919 non-null  category\n",
      " 8   Structural_conflic                   20069 non-null  float64 \n",
      " 9   Age                                  20061 non-null  float64 \n",
      " 10  Sex                                  20069 non-null  object  \n",
      " 11  NumHijos                             18958 non-null  float64 \n",
      " 12  Smoking                              20069 non-null  object  \n",
      " 13  Biological_vulnerability             20069 non-null  object  \n",
      " 14  Alcohol_DxCIE                        20069 non-null  object  \n",
      " 15  Opiaceos_DxCIE                       20069 non-null  object  \n",
      " 16  Cannabis_DXCIE                       20069 non-null  object  \n",
      " 17  BZD_DxCIE                            20069 non-null  object  \n",
      " 18  Cocaina_DxCIE                        20069 non-null  object  \n",
      " 19  Alucinogenos_DXCIE                   20069 non-null  object  \n",
      " 20  Tabaco_DXCIE                         20069 non-null  object  \n",
      " 21  FrecuenciaConsumo30Dias              20069 non-null  object  \n",
      " 22  Años_consumo_droga                   19609 non-null  float64 \n",
      " 23  OtrosDx_Psiquiatrico                 20069 non-null  object  \n",
      " 24  Tx_previos                           20069 non-null  object  \n",
      " 25  Adherencia_tto_recalc                20069 non-null  float64 \n",
      " 26  Tiempo_tx                            20069 non-null  float64 \n",
      " 27  Readmisiones_estudios                20069 non-null  object  \n",
      " 28  Situacion_tratamiento                20069 non-null  object  \n",
      " 29  Periodos_COVID                       20069 non-null  object  \n",
      " 30  Pandemia_inicio_fin_tratamiento      20069 non-null  object  \n",
      " 31  Nreadmision                          20069 non-null  float64 \n",
      " 32  Readmisiones_PRECOVID                20069 non-null  float64 \n",
      " 33  Readmisiones_COVID                   20069 non-null  float64 \n",
      "dtypes: category(1), float64(10), object(23)\n",
      "memory usage: 5.2+ MB\n",
      "None\n",
      "-------------------------------\n",
      "PRE-ALTA\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 2792 entries, 23 to 85159\n",
      "Data columns (total 34 columns):\n",
      " #   Column                               Non-Null Count  Dtype   \n",
      "---  ------                               --------------  -----   \n",
      " 0   CODPROYECTO                          2792 non-null   float64 \n",
      " 1   Education                            2792 non-null   object  \n",
      " 2   Social_protection                    2792 non-null   object  \n",
      " 3   Job_insecurity                       2792 non-null   object  \n",
      " 4   Housing                              2792 non-null   object  \n",
      " 5   Alterations_early_childhood_develop  2792 non-null   object  \n",
      " 6   Social_inclusion                     2792 non-null   object  \n",
      " 7   Risk_stigma                          2687 non-null   category\n",
      " 8   Structural_conflic                   2792 non-null   float64 \n",
      " 9   Age                                  2791 non-null   float64 \n",
      " 10  Sex                                  2792 non-null   object  \n",
      " 11  NumHijos                             2689 non-null   float64 \n",
      " 12  Smoking                              2792 non-null   object  \n",
      " 13  Biological_vulnerability             2792 non-null   object  \n",
      " 14  Alcohol_DxCIE                        2792 non-null   object  \n",
      " 15  Opiaceos_DxCIE                       2792 non-null   object  \n",
      " 16  Cannabis_DXCIE                       2792 non-null   object  \n",
      " 17  BZD_DxCIE                            2792 non-null   object  \n",
      " 18  Cocaina_DxCIE                        2792 non-null   object  \n",
      " 19  Alucinogenos_DXCIE                   2792 non-null   object  \n",
      " 20  Tabaco_DXCIE                         2792 non-null   object  \n",
      " 21  FrecuenciaConsumo30Dias              2792 non-null   object  \n",
      " 22  Años_consumo_droga                   2733 non-null   float64 \n",
      " 23  OtrosDx_Psiquiatrico                 2792 non-null   object  \n",
      " 24  Tx_previos                           2792 non-null   object  \n",
      " 25  Adherencia_tto_recalc                2792 non-null   float64 \n",
      " 26  Tiempo_tx                            2792 non-null   float64 \n",
      " 27  Readmisiones_estudios                2792 non-null   object  \n",
      " 28  Situacion_tratamiento                2792 non-null   object  \n",
      " 29  Periodos_COVID                       2792 non-null   object  \n",
      " 30  Pandemia_inicio_fin_tratamiento      2792 non-null   object  \n",
      " 31  Nreadmision                          2792 non-null   float64 \n",
      " 32  Readmisiones_PRECOVID                2792 non-null   float64 \n",
      " 33  Readmisiones_COVID                   2792 non-null   float64 \n",
      "dtypes: category(1), float64(10), object(23)\n",
      "memory usage: 744.5+ KB\n",
      "None\n",
      "-------------------------------\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "POST\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 10677 entries, 11 to 85156\n",
      "Data columns (total 35 columns):\n",
      " #   Column                               Non-Null Count  Dtype   \n",
      "---  ------                               --------------  -----   \n",
      " 0   CODPROYECTO                          10677 non-null  float64 \n",
      " 1   Education                            10677 non-null  object  \n",
      " 2   Social_protection                    10677 non-null  object  \n",
      " 3   Job_insecurity                       10677 non-null  object  \n",
      " 4   Housing                              10677 non-null  object  \n",
      " 5   Alterations_early_childhood_develop  10677 non-null  object  \n",
      " 6   Social_inclusion                     10677 non-null  object  \n",
      " 7   Risk_stigma                          10085 non-null  category\n",
      " 8   Structural_conflic                   10677 non-null  float64 \n",
      " 9   Age                                  10676 non-null  float64 \n",
      " 10  Sex                                  10677 non-null  object  \n",
      " 11  NumHijos                             10103 non-null  float64 \n",
      " 12  Smoking                              10677 non-null  object  \n",
      " 13  Biological_vulnerability             10677 non-null  object  \n",
      " 14  Alcohol_DxCIE                        10677 non-null  object  \n",
      " 15  Opiaceos_DxCIE                       10677 non-null  object  \n",
      " 16  Cannabis_DXCIE                       10677 non-null  object  \n",
      " 17  BZD_DxCIE                            10677 non-null  object  \n",
      " 18  Cocaina_DxCIE                        10677 non-null  object  \n",
      " 19  Alucinogenos_DXCIE                   10677 non-null  object  \n",
      " 20  Tabaco_DXCIE                         10677 non-null  object  \n",
      " 21  FrecuenciaConsumo30Dias              10677 non-null  object  \n",
      " 22  Años_consumo_droga                   10478 non-null  float64 \n",
      " 23  OtrosDx_Psiquiatrico                 10677 non-null  object  \n",
      " 24  Tx_previos                           10677 non-null  object  \n",
      " 25  Adherencia_tto_recalc                10677 non-null  float64 \n",
      " 26  Tiempo_tx                            10677 non-null  float64 \n",
      " 27  Readmisiones_estudios                10677 non-null  object  \n",
      " 28  Situacion_tratamiento                10677 non-null  object  \n",
      " 29  Periodos_COVID                       10677 non-null  object  \n",
      " 30  Pandemia_inicio_fin_tratamiento      10677 non-null  object  \n",
      " 31  Nreadmision                          10677 non-null  float64 \n",
      " 32  Readmisiones_PRECOVID                10677 non-null  float64 \n",
      " 33  Readmisiones_COVID                   10677 non-null  float64 \n",
      " 34  Group                                10677 non-null  object  \n",
      "dtypes: category(1), float64(10), object(24)\n",
      "memory usage: 2.9+ MB\n",
      "None\n",
      "-------------------------------\n",
      "POST-ABANDONO\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 8795 entries, 11 to 85156\n",
      "Data columns (total 34 columns):\n",
      " #   Column                               Non-Null Count  Dtype   \n",
      "---  ------                               --------------  -----   \n",
      " 0   CODPROYECTO                          8795 non-null   float64 \n",
      " 1   Education                            8795 non-null   object  \n",
      " 2   Social_protection                    8795 non-null   object  \n",
      " 3   Job_insecurity                       8795 non-null   object  \n",
      " 4   Housing                              8795 non-null   object  \n",
      " 5   Alterations_early_childhood_develop  8795 non-null   object  \n",
      " 6   Social_inclusion                     8795 non-null   object  \n",
      " 7   Risk_stigma                          8308 non-null   category\n",
      " 8   Structural_conflic                   8795 non-null   float64 \n",
      " 9   Age                                  8794 non-null   float64 \n",
      " 10  Sex                                  8795 non-null   object  \n",
      " 11  NumHijos                             8325 non-null   float64 \n",
      " 12  Smoking                              8795 non-null   object  \n",
      " 13  Biological_vulnerability             8795 non-null   object  \n",
      " 14  Alcohol_DxCIE                        8795 non-null   object  \n",
      " 15  Opiaceos_DxCIE                       8795 non-null   object  \n",
      " 16  Cannabis_DXCIE                       8795 non-null   object  \n",
      " 17  BZD_DxCIE                            8795 non-null   object  \n",
      " 18  Cocaina_DxCIE                        8795 non-null   object  \n",
      " 19  Alucinogenos_DXCIE                   8795 non-null   object  \n",
      " 20  Tabaco_DXCIE                         8795 non-null   object  \n",
      " 21  FrecuenciaConsumo30Dias              8795 non-null   object  \n",
      " 22  Años_consumo_droga                   8627 non-null   float64 \n",
      " 23  OtrosDx_Psiquiatrico                 8795 non-null   object  \n",
      " 24  Tx_previos                           8795 non-null   object  \n",
      " 25  Adherencia_tto_recalc                8795 non-null   float64 \n",
      " 26  Tiempo_tx                            8795 non-null   float64 \n",
      " 27  Readmisiones_estudios                8795 non-null   object  \n",
      " 28  Situacion_tratamiento                8795 non-null   object  \n",
      " 29  Periodos_COVID                       8795 non-null   object  \n",
      " 30  Pandemia_inicio_fin_tratamiento      8795 non-null   object  \n",
      " 31  Nreadmision                          8795 non-null   float64 \n",
      " 32  Readmisiones_PRECOVID                8795 non-null   float64 \n",
      " 33  Readmisiones_COVID                   8795 non-null   float64 \n",
      "dtypes: category(1), float64(10), object(23)\n",
      "memory usage: 2.3+ MB\n",
      "None\n",
      "-------------------------------\n",
      "POST-ALTA\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 1882 entries, 258 to 85149\n",
      "Data columns (total 34 columns):\n",
      " #   Column                               Non-Null Count  Dtype   \n",
      "---  ------                               --------------  -----   \n",
      " 0   CODPROYECTO                          1882 non-null   float64 \n",
      " 1   Education                            1882 non-null   object  \n",
      " 2   Social_protection                    1882 non-null   object  \n",
      " 3   Job_insecurity                       1882 non-null   object  \n",
      " 4   Housing                              1882 non-null   object  \n",
      " 5   Alterations_early_childhood_develop  1882 non-null   object  \n",
      " 6   Social_inclusion                     1882 non-null   object  \n",
      " 7   Risk_stigma                          1777 non-null   category\n",
      " 8   Structural_conflic                   1882 non-null   float64 \n",
      " 9   Age                                  1882 non-null   float64 \n",
      " 10  Sex                                  1882 non-null   object  \n",
      " 11  NumHijos                             1778 non-null   float64 \n",
      " 12  Smoking                              1882 non-null   object  \n",
      " 13  Biological_vulnerability             1882 non-null   object  \n",
      " 14  Alcohol_DxCIE                        1882 non-null   object  \n",
      " 15  Opiaceos_DxCIE                       1882 non-null   object  \n",
      " 16  Cannabis_DXCIE                       1882 non-null   object  \n",
      " 17  BZD_DxCIE                            1882 non-null   object  \n",
      " 18  Cocaina_DxCIE                        1882 non-null   object  \n",
      " 19  Alucinogenos_DXCIE                   1882 non-null   object  \n",
      " 20  Tabaco_DXCIE                         1882 non-null   object  \n",
      " 21  FrecuenciaConsumo30Dias              1882 non-null   object  \n",
      " 22  Años_consumo_droga                   1851 non-null   float64 \n",
      " 23  OtrosDx_Psiquiatrico                 1882 non-null   object  \n",
      " 24  Tx_previos                           1882 non-null   object  \n",
      " 25  Adherencia_tto_recalc                1882 non-null   float64 \n",
      " 26  Tiempo_tx                            1882 non-null   float64 \n",
      " 27  Readmisiones_estudios                1882 non-null   object  \n",
      " 28  Situacion_tratamiento                1882 non-null   object  \n",
      " 29  Periodos_COVID                       1882 non-null   object  \n",
      " 30  Pandemia_inicio_fin_tratamiento      1882 non-null   object  \n",
      " 31  Nreadmision                          1882 non-null   float64 \n",
      " 32  Readmisiones_PRECOVID                1882 non-null   float64 \n",
      " 33  Readmisiones_COVID                   1882 non-null   float64 \n",
      "dtypes: category(1), float64(10), object(23)\n",
      "memory usage: 501.9+ KB\n",
      "None\n",
      "-------------------------------\n"
     ]
    }
   ],
   "source": [
    "# DataFrame.info() prints its report itself and returns None, so it is called directly:\n",
    "# wrapping it in print() would add a stray 'None' line after every report\n",
    "info_groups = [\n",
    "    [(\"PRE\", conj_pre), (\"PRE-ABANDONO\", pre_abandono), (\"PRE-ALTA\", pre_alta)],\n",
    "    [(\"POST\", conj_post), (\"POST-ABANDONO\", post_abandono), (\"POST-ALTA\", post_alta)],\n",
    "]\n",
    "for group_idx, info_group in enumerate(info_groups):\n",
    "    if group_idx > 0:\n",
    "        # Blank lines separating the pre-pandemic reports from the post-pandemic ones\n",
    "        print(\"\\n\\n\\n\")\n",
    "    for set_name, subset in info_group:\n",
    "        print(set_name)\n",
    "        subset.info()\n",
    "        print(\"-------------------------------\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Replacing unknown values with the mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n",
      "['Live with families or friends' 'live alone' 'live in institutions']\n"
     ]
    }
   ],
   "source": [
    "# 9.0 represents unknown according to Variables.docx \n",
    "print(bd['Social_inclusion'].unique())\n",
    "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n",
    "# print(mode_soc_inc)\n",
    "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n",
    "print(bd['Social_inclusion'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['No alterations (first exposure at 11 or more years)'\n",
      " 'Alterations (first exposure before 11 years old)' '9']\n",
      "['No alterations (first exposure at 11 or more years)'\n",
      " 'Alterations (first exposure before 11 years old)']\n"
     ]
    }
   ],
   "source": [
    "# '9' is treated as the unknown code for this variable\n",
    "alt_dev = bd['Alterations_early_childhood_develop']\n",
    "print(alt_dev.unique())\n",
    "# Take the mode over the known values only, so the unknown code can never be\n",
    "# selected as its own replacement\n",
    "mode_alt = alt_dev[alt_dev != '9'].mode()[0]\n",
    "bd['Alterations_early_childhood_develop'] = alt_dev.replace('9', mode_alt)\n",
    "print(bd['Alterations_early_childhood_develop'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NaN, 'Yes', 'No']\n",
      "Categories (3, object): [99.0, 'No', 'Yes']\n",
      "[NaN, 'Yes', 'No']\n",
      "Categories (2, object): ['No', 'Yes']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19000\\1073322024.py:3: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n",
      "  bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n"
     ]
    }
   ],
   "source": [
    "print(bd['Risk_stigma'].unique())\n",
    "# 99.0 is treated as the unknown code. Series.replace on a categorical column is deprecated\n",
    "# when it changes the categories, so the substitution is done on an object-dtype copy and\n",
    "# the result is converted back to 'category' (which also drops the 99.0 category)\n",
    "risk_stigma_obj = bd['Risk_stigma'].astype(object)\n",
    "is_known = risk_stigma_obj != 99.0\n",
    "# Mode over the known values only, so the unknown code can never be its own replacement\n",
    "mode_stigma = risk_stigma_obj[is_known].mode()[0]\n",
    "# Missing values compare as known here and stay missing; they are imputed in a later cell\n",
    "bd['Risk_stigma'] = risk_stigma_obj.where(is_known, mode_stigma).astype('category')\n",
    "print(bd['Risk_stigma'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nan  0.  1.  2.  3.  4.  5.  8. 10.  6. 11. 12.  9.  7. 99. 14. 15.]\n",
      "[nan  0.  1.  2.  3.  4.  5.  8. 10.  6. 11. 12.  9.  7. 14. 15.]\n"
     ]
    }
   ],
   "source": [
    "# 99.0 is treated as the unknown code for this variable\n",
    "num_hijos = bd['NumHijos']\n",
    "print(num_hijos.unique())\n",
    "# Take the mode over the known values only, so the unknown code can never be\n",
    "# selected as its own replacement\n",
    "mode_hijos = num_hijos[num_hijos != 99.0].mode()[0]\n",
    "bd['NumHijos'] = num_hijos.replace(99.0, mode_hijos)\n",
    "print(bd['NumHijos'].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Quantifying Null Values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns whose missing values are counted, in reporting order\n",
    "cols_with_nulls = ['Age', 'Años_consumo_droga', 'Risk_stigma', 'NumHijos']\n",
    "\n",
    "# Counts over the whole filtered dataset\n",
    "for col_name in cols_with_nulls:\n",
    "    print(f\"Total missing values {col_name}: {bd[col_name].isnull().sum()}\")\n",
    "\n",
    "# Counts within the pre-pandemic and post-pandemic sets\n",
    "period_sets = ((\"CONJUNTO PREPANDEMIA\", conj_pre), (\"CONJUNTO POSTPANDEMIA\", conj_post))\n",
    "for heading, period_set in period_sets:\n",
    "    print(f\"\\t{heading}\")\n",
    "    for col_name in cols_with_nulls:\n",
    "        print(f\"\\t\\tMissing values {col_name}: {period_set[col_name].isnull().sum()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Replacing missing values with mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19000\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  bd['Age'].fillna(age_mode, inplace=True)\n",
      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19000\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n",
      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19000\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n",
      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19000\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
      "\n",
      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
      "\n",
      "\n",
      "  bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n"
     ]
    }
   ],
   "source": [
    "# Mode (most frequent value) of every column that still has missing values\n",
    "age_mode = bd['Age'].mode()[0]\n",
    "años_consumo_mode = bd['Años_consumo_droga'].mode()[0]\n",
    "risk_stigma_mode = bd['Risk_stigma'].mode()[0]\n",
    "num_hijos_mode = bd['NumHijos'].mode()[0]\n",
    "\n",
    "# Fill the gaps with one DataFrame-level fillna and rebind bd.\n",
    "# bd[col].fillna(value, inplace=True) is chained assignment: pandas warns that it acts on an\n",
    "# intermediate object and will never update bd from pandas 3.0 on.\n",
    "bd = bd.fillna({\n",
    "    'Age': age_mode,\n",
    "    'Años_consumo_droga': años_consumo_mode,\n",
    "    'Risk_stigma': risk_stigma_mode,\n",
    "    'NumHijos': num_hijos_mode,\n",
    "})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Distribution of variables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Classifying variables into numerical and discrete/categorical "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n",
    "        'Alterations_early_childhood_develop', 'Social_inclusion',\n",
    "        'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n",
    "        'Opiaceos_DxCIE', 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE',\n",
    "        'Alucinogenos_DXCIE', 'Tabaco_DXCIE', 'FrecuenciaConsumo30Dias',\n",
    "        'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n",
    "        ]\n",
    "\n",
    "num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Distribution of discrete attributes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Count plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (outcome, period) label of every row, used to split each bar. It is the same for all\n",
    "# attributes, so it is built once instead of on every loop iteration.\n",
    "outcome_period = combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1)\n",
    "outcome_period_order = [('Abandono', 'Pre'), ('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')]\n",
    "\n",
    "fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.25)\n",
    "\n",
    "for i, disc_att in enumerate(disc_atts):\n",
    "    ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=outcome_period,\n",
    "                       hue_order=outcome_period_order, ax=axs[i])\n",
    "    ax.set_title(disc_att, fontsize=16, fontweight='bold')\n",
    "    ax.get_legend().set_title('Groups')\n",
    "\n",
    "    # Adding count annotations; only patches labelled '_nolegend_' are annotated,\n",
    "    # which skips the patches that belong to the legend\n",
    "    for p in ax.patches:\n",
    "        if p.get_label() == '_nolegend_':\n",
    "            ax.annotate(format(p.get_height(), '.0f'),\n",
    "                        (p.get_x() + p.get_width() / 2., p.get_height()),\n",
    "                        ha='center', va='center',\n",
    "                        xytext=(0, 9),\n",
    "                        textcoords='offset points')\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
    "plt.savefig('./EDA_plots/countplots.svg', dpi=600, bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Normalized count plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to plot countplot normalized within each treatment outcome\n",
    "def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n",
    "    \"\"\"\n",
    "        Draw the distribution of `disc_att` as percentages within each 'Situacion_tratamiento' subset.\n",
    "\n",
    "        The plot is drawn on the module-level `axs` grid at row `i`, column `group - 2`\n",
    "        (so group 2 -> column 0, group 3 -> column 1; group 1 resolves to column -1, the last one).\n",
    "\n",
    "        i: row of the subplot grid\n",
    "        group: 1 (all), 2 (pre), 3 (post)\n",
    "        disc_att: name of the discrete column to plot\n",
    "\n",
    "        Raises ValueError if group is not 1, 2 or 3.\n",
    "    \"\"\"\n",
    "\n",
    "    # Define data to work with based on group\n",
    "    if group == 1:\n",
    "        df = bd\n",
    "    elif group == 2:\n",
    "        df = conj_pre\n",
    "    elif group == 3:\n",
    "        df = conj_post\n",
    "    else:\n",
    "        # Without this guard an unknown code only failed later with an unbound `df`\n",
    "        raise ValueError(f'group must be 1 (all), 2 (pre) or 3 (post), got {group!r}')\n",
    "\n",
    "    # GOAL: find percentage of each possible category within the total of its situacion_tto subset\n",
    "    # Group data by 'Situacion_tratamiento' and the attribute and count occurrences\n",
    "    grouped_counts = df.groupby(['Situacion_tratamiento', disc_att]).size().reset_index(name='count')\n",
    "    # Calculate total count for each 'Situacion_tratamiento' group\n",
    "    total_counts = df.groupby('Situacion_tratamiento')[disc_att].count()\n",
    "    # Divide each count by its corresponding total count and calculate percentage\n",
    "    grouped_counts['percentage'] = grouped_counts.apply(lambda row: row['count'] / total_counts[row['Situacion_tratamiento']] * 100, axis=1)\n",
    "\n",
    "    # Follow the same order in plot as in computations\n",
    "    col_order = grouped_counts[grouped_counts['Situacion_tratamiento'] == 'Abandono'][disc_att].tolist()\n",
    "\n",
    "    # Create countplot and split each bar into two based on the value of sit_tto\n",
    "    ax = sns.countplot(x=disc_att, hue='Situacion_tratamiento', data=df, order=col_order, ax=axs[i, group-2])\n",
    "\n",
    "    # Adjust y-axis to represent percentages out of the total count\n",
    "    ax.set_ylim(0, 100)\n",
    "\n",
    "    percentages = grouped_counts['percentage']\n",
    "    # The k-th bar is paired with the k-th row of grouped_counts.\n",
    "    # A separate index name is used so the `i` parameter is not shadowed.\n",
    "    for patch_idx, p in enumerate(ax.patches):\n",
    "        # Skip going over the legend values\n",
    "        if p.get_label() == '_nolegend_':\n",
    "            # Set height to corresponding percentage and annotate result\n",
    "            height = percentages.iloc[patch_idx]\n",
    "            p.set_height(height)\n",
    "            ax.annotate(f'{height:.2f}%', (p.get_x() + p.get_width() / 2., height),\n",
    "                        ha='center', va='bottom', fontsize=6, color='black', xytext=(0, 5),\n",
    "                        textcoords='offset points')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(len(disc_atts), 2, figsize=(15, 7*len(disc_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
    "\n",
    "# Grid column -> (group code passed to plot_count_perc_norm, panel title).\n",
    "# The ALL group (code 1) is not drawn: the grid only has a PRE and a POST column.\n",
    "panels = [(2, '\\nPRE'), (3, '\\nPOST')]\n",
    "\n",
    "for row, disc_att in enumerate(disc_atts):\n",
    "    for col, (group_code, panel_title) in enumerate(panels):\n",
    "        plot_count_perc_norm(row, group_code, disc_att)\n",
    "        panel = axs[row, col]\n",
    "        panel.set_title(panel_title)\n",
    "        panel.set_xlabel(disc_att, fontweight='bold')\n",
    "        panel.set_ylabel('% of total within its Sit_tto group')\n",
    "        panel.tick_params(axis='x', rotation=90)\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
    "plt.savefig('./EDA_plots/norm_countplots.svg', dpi=600, bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Distribution of numeric attributes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Summary statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(bd[num_atts].describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Boxplots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(len(num_atts), 1, figsize=(12, 5*len(num_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
    "\n",
    "# One panel per numeric attribute: boxes split by period (y) and treatment outcome (hue).\n",
    "# np.ravel gives a flat sequence of axes whether subplots returned one Axes or an array.\n",
    "for num_att, panel in zip(num_atts, np.ravel(axs)):\n",
    "    sns.boxplot(data=combined_pre_post, x=num_att, y='Group', hue='Situacion_tratamiento', ax=panel)\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
    "plt.savefig('./EDA_plots/boxplots.svg', dpi=600, bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Histograms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(len(num_atts), 3, figsize=(15, 6*len(num_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
    "\n",
    "# Grid column -> (patient set, tag shown in the panel title)\n",
    "patient_sets = [(bd, 'ALL'), (conj_pre, 'PRE'), (conj_post, 'POST')]\n",
    "\n",
    "for row, num_att in enumerate(num_atts):\n",
    "    for col, (patients, tag) in enumerate(patient_sets):\n",
    "        panel = axs[row, col]\n",
    "        # common_norm=False: each treatment outcome is normalised on its own\n",
    "        sns.histplot(data=patients, x=num_att, bins=15, hue='Situacion_tratamiento',\n",
    "                     stat='probability', common_norm=False, kde=True,\n",
    "                     line_kws={'lw': 5}, alpha=0.4, ax=panel)\n",
    "        panel.set_title(f'\\nDistr. of {num_att}  - {tag}')\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
    "plt.savefig('./EDA_plots/histograms.svg', dpi=600, bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Correlation Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Turning binary variables into 0/1 values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recode every binary variable as 0/1 in a new '<column>_REDEF' column.\n",
    "\n",
    "# 'Alterations_early_childhood_develop'\n",
    "alterations_mapping = {\n",
    "    'No alterations (first exposure at 11 or more years)' : 0,\n",
    "    'Alterations (first exposure before 11 years old)': 1,\n",
    "}\n",
    "\n",
    "# Coding shared by the plain No/Sí columns\n",
    "no_si_mapping = {'No': 0, 'Sí': 1}\n",
    "\n",
    "# Source column -> value mapping, listed in the order the *_REDEF columns are added to bd\n",
    "binary_mappings = {\n",
    "    'Alterations_early_childhood_develop': alterations_mapping,\n",
    "    'Social_protection': no_si_mapping,\n",
    "    # 'Risk_stigma' is mapped from 'No'/'Yes' rather than 'No'/'Sí'\n",
    "    'Risk_stigma': {'No': 0, 'Yes': 1},\n",
    "    'Sex': {'Hombre': 0, 'Mujer': 1},\n",
    "    'Smoking': no_si_mapping,\n",
    "    'Biological_vulnerability': no_si_mapping,\n",
    "    # 'Droga_DxCIE'\n",
    "    'Opiaceos_DxCIE': no_si_mapping,\n",
    "    'Cannabis_DXCIE': no_si_mapping,\n",
    "    'BZD_DxCIE': no_si_mapping,\n",
    "    'Cocaina_DxCIE': no_si_mapping,\n",
    "    'Alucinogenos_DXCIE': no_si_mapping,\n",
    "    'Tabaco_DXCIE': no_si_mapping,\n",
    "    'OtrosDx_Psiquiatrico': no_si_mapping,\n",
    "    'Tx_previos': no_si_mapping,\n",
    "    # 'Situacion_tratamiento' (!!!!!) - important to define properly: 'Abandono' is coded as 1\n",
    "    'Situacion_tratamiento': {'Abandono': 1, 'Alta terapéutica': 0},\n",
    "}\n",
    "\n",
    "for source_col, value_mapping in binary_mappings.items():\n",
    "    bd[source_col + '_REDEF'] = bd[source_col].map(value_mapping)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Defining groups of variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n",
    "            'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n",
    "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n",
    "            'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n",
    "            'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n",
    "target_var = 'Situacion_tratamiento'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns that are already numeric and we don't need to redefine \n",
    "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# res_vars = ['Tiempo_tx', 'Readmisiones_estudios', 'Periodos_COVID', 'Pandemia_inicio_fin_tratamiento', \n",
    "#            'Nreadmision', 'Readmisiones_PRECOVID', 'Readmisiones_COVID']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### One-hot encode categorical variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify columns to one hot encode; empty list otherwise\n",
    "one_hot_vars = ['Education', 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias']\n",
    "\n",
    "# Prefix given to the dummy columns of each variable\n",
    "one_hots_vars_prefix = {\n",
    "    'Education': 'Ed',\n",
    "    'Job_insecurity': 'JobIn',\n",
    "    'Housing': 'Hous',\n",
    "    'Social_inclusion': 'SocInc',\n",
    "    'FrecuenciaConsumo30Dias': 'Frec30',\n",
    "}\n",
    "\n",
    "# Original variable -> names of its dummy columns\n",
    "one_hot_cols_dic = {}\n",
    "encoded_frames = []\n",
    "\n",
    "for one_hot_var in one_hot_vars:\n",
    "    # Create one hot encoding version of attribute and remember the new column names\n",
    "    encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n",
    "    one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()\n",
    "    encoded_frames.append(encoded_var)\n",
    "\n",
    "# Dummy columns left in bd by a previous run of this cell; dropping them first keeps the\n",
    "# cell idempotent instead of appending duplicated columns on every re-run\n",
    "stale_cols = [col for cols in one_hot_cols_dic.values() for col in cols if col in bd.columns]\n",
    "\n",
    "# Concatenate all new columns to the main df in one go (a concat per variable copies bd each time)\n",
    "bd = pd.concat([bd.drop(columns=stale_cols)] + encoded_frames, axis=1)\n",
    "\n",
    "# print(one_hot_cols_dic['FrecuenciaConsumo30Dias'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Defining final version of columns of interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _encoded_columns(var_names):\n",
    "    \"\"\"Return, in order, the column name(s) that stand for each original variable in bd.\"\"\"\n",
    "    encoded = []\n",
    "    for var_name in var_names:\n",
    "        if var_name in no_redef_cols:\n",
    "            # Already numeric: used as is\n",
    "            encoded.append(var_name)\n",
    "        elif var_name in one_hot_vars:\n",
    "            # One-hot encoded: all of its dummy columns\n",
    "            encoded.extend(one_hot_cols_dic[var_name])\n",
    "        else:\n",
    "            # Redefined through a mapping\n",
    "            encoded.append(var_name + '_REDEF')\n",
    "    return encoded\n",
    "\n",
    "soc_vars_enc = _encoded_columns(social_vars)\n",
    "ind_vars_enc = _encoded_columns(ind_vars)\n",
    "\n",
    "# Final version of columns we need to use for correlation analysis\n",
    "corr_cols = soc_vars_enc + ind_vars_enc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Update main data frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row masks for the two study periods\n",
    "is_pre = bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia'\n",
    "# Post-pandemic: merging last two classes to balance sets\n",
    "is_post = bd['Pandemia_inicio_fin_tratamiento'].isin(\n",
    "    ['Inicio prepandemia y fin en pandemia', 'inicio y fin en pandemia']\n",
    ")\n",
    "\n",
    "# Pre-pandemic: whole set, then abandono and alta\n",
    "conj_pre = bd[is_pre]\n",
    "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
    "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
    "\n",
    "# Post-pandemic: whole set, then abandono and alta\n",
    "conj_post = bd[is_post]\n",
    "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n",
    "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Building correlation matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', 'Risk_stigma_REDEF']\n",
    "cont_vars = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_corr_matrix(df, cols):\n",
    "    \"\"\"\n",
    "        Build the correlation matrix of `cols` over the rows of `df`.\n",
    "\n",
    "        Only the lower triangle (row index > column index) is computed; every other entry stays 0.\n",
    "        The measure depends on the pair, using the module-level `binary_vars` and `cont_vars`:\n",
    "        both binary -> tetrachoric, both continuous -> Spearman, any other pair -> point-biserial.\n",
    "    \"\"\"\n",
    "    names = list(cols)\n",
    "    size = len(names)\n",
    "    corr_matrix = np.zeros((size, size))\n",
    "\n",
    "    for row in range(size):\n",
    "        var_i = names[row]\n",
    "        # Columns strictly left of the diagonal\n",
    "        for col in range(row):\n",
    "            var_j = names[col]\n",
    "\n",
    "            if var_i in binary_vars and var_j in binary_vars:\n",
    "                corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n",
    "            elif var_i in cont_vars and var_j in cont_vars:\n",
    "                # pandas is used here because pypair's continuous_continuous returned nan sometimes\n",
    "                corr = df[var_i].corr(df[var_j], method='spearman')\n",
    "            else:\n",
    "                # Mixed pair: whichever variable is not in binary_vars is passed as the continuous one\n",
    "                bin_var, cont_var = (var_i, var_j) if var_i in binary_vars else (var_j, var_i)\n",
    "                corr = binary_continuous(df[bin_var], df[cont_var], measure='point_biserial')\n",
    "\n",
    "            corr_matrix[row, col] = corr\n",
    "\n",
    "    return corr_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_heatmap(sit_tto: int, group:int) -> np.ndarray:\n",
    "    \"\"\"\n",
    "        Draw the correlation heatmap of one patient subset and return its correlation matrix.\n",
    "\n",
    "        No axes are passed to seaborn, so the heatmap goes to the currently active axes;\n",
    "        select the target subplot before calling.\n",
    "\n",
    "        sit_tto: 1 (include it as another var), 2 (only abandono), 3 (only alta)\n",
    "        group: 1 (all alcohol patients), 2 (pre), 3 (post)\n",
    "\n",
    "        Returns the matrix built by get_corr_matrix for the selected rows and columns.\n",
    "        Raises ValueError if sit_tto or group is not 1, 2 or 3.\n",
    "    \"\"\"\n",
    "\n",
    "    # Without this guard an unknown code only failed later with an unbound variable\n",
    "    if sit_tto not in (1, 2, 3) or group not in (1, 2, 3):\n",
    "        raise ValueError(f'sit_tto and group must be 1, 2 or 3 (got sit_tto={sit_tto!r}, group={group!r})')\n",
    "\n",
    "    # Define columns based on sit_tto arg\n",
    "    if sit_tto == 1:\n",
    "        # Include target as another variable\n",
    "        cols = [target_var + '_REDEF'] + corr_cols\n",
    "    else:\n",
    "        cols = corr_cols\n",
    "\n",
    "    # Title plot and select data based on group\n",
    "    if group == 1:\n",
    "        plot_title, group_df = 'Correl Matrix - ALL', bd\n",
    "    elif group == 2:\n",
    "        plot_title, group_df = 'Correl Matrix - PRE', conj_pre\n",
    "    else:\n",
    "        plot_title, group_df = 'Correl Matrix - POST', conj_post\n",
    "\n",
    "    # Keep a single treatment outcome when requested and complete the title\n",
    "    if sit_tto == 2:\n",
    "        group_df = group_df[group_df['Situacion_tratamiento'] == 'Abandono']\n",
    "        plot_title += ' - ABANDONO'\n",
    "    elif sit_tto == 3:\n",
    "        group_df = group_df[group_df['Situacion_tratamiento'] == 'Alta terapéutica']\n",
    "        plot_title += ' - ALTA'\n",
    "\n",
    "    bd_ca = group_df[cols]\n",
    "    corr_matrix = get_corr_matrix(bd_ca, cols)\n",
    "\n",
    "    # Create a mask for the upper triangle (diagonal included)\n",
    "    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n",
    "\n",
    "    # Create heatmap correlation matrix\n",
    "    dataplot = sns.heatmap(corr_matrix, mask=mask, xticklabels=cols, yticklabels=cols, cmap='coolwarm',\n",
    "                           vmin=-1, vmax=1, annot=True, fmt='.2f', annot_kws={'size': 4})\n",
    "\n",
    "    # Group ind vs social vars by color on both axes\n",
    "    tick_labels = dataplot.axes.xaxis.get_ticklabels() + dataplot.axes.yaxis.get_ticklabels()\n",
    "    for tick_label in tick_labels:\n",
    "        if tick_label.get_text() in ind_vars_enc:\n",
    "            tick_label.set_color('green')\n",
    "        elif tick_label.get_text() in soc_vars_enc:\n",
    "            tick_label.set_color('purple')\n",
    "\n",
    "    # Add legend and place it in lower left\n",
    "    plt.legend(handles=[\n",
    "        plt.Line2D([0], [0], marker='o', color='w', label='Social Factors', markerfacecolor='purple', markersize=10),\n",
    "        plt.Line2D([0], [0], marker='o', color='w', label='Individual Factors', markerfacecolor='green', markersize=10)\n",
    "    ], bbox_to_anchor=(-0.1, -0.1), fontsize = 20)\n",
    "\n",
    "    plt.title('\\n\\n' + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})\n",
    "\n",
    "    return corr_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots(3, 3, figsize=(50, 50))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=2)\n",
    "corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n",
    "\n",
    "# One grid row per 'Situacion_tratamiento' option, one column per group (ALL, PRE, POST)\n",
    "for sit_tto in (1, 2, 3):\n",
    "    matrices = {}\n",
    "    for group in (1, 2, 3):\n",
    "        # Activate the subplot the heatmap will be drawn on\n",
    "        plt.subplot(3, 3, 3*(sit_tto-1) + group)\n",
    "        matrices[group] = plot_heatmap(sit_tto, group)\n",
    "\n",
    "    # Only the PRE and POST matrices are kept for the later comparison\n",
    "    corr_matrix_pre, corr_matrix_post = matrices[2], matrices[3]\n",
    "    corr_mats.append((corr_matrix_pre, corr_matrix_post))\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save the figure in SVG format in the \"./EDA_plots\" folder\n",
    "plt.savefig('./EDA_plots/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Finding significant differences between PRE and POST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_diff(sit_tto: int, m_pre, m_post, top_n: int = 100) -> None:\n",
    "    \"\"\"\n",
    "        Print the variable pairs whose correlation differs most between two matrices.\n",
    "\n",
    "        sit_tto: 1 if the matrices include the target as first variable, any other value if they\n",
    "                 only cover corr_cols (must match how the matrices were built)\n",
    "        m_pre, m_post: correlation matrices to compare, indexed as m[i][j]\n",
    "        top_n: number of pairs to print, largest absolute difference first (default 100)\n",
    "\n",
    "        Pairs whose difference is NaN are left out of the ranking.\n",
    "    \"\"\"\n",
    "\n",
    "    if sit_tto == 1:\n",
    "        cols = [target_var + '_REDEF'] + corr_cols\n",
    "    else:\n",
    "        cols = corr_cols\n",
    "\n",
    "    diff_list = []  # List to store tuples of (difference, variable_i, variable_j, val_pre, val_post)\n",
    "\n",
    "    # Go through the lower triangle only (j < i), mirroring the i > j condition under\n",
    "    # which get_corr_matrix writes its values\n",
    "    for i, var_i in enumerate(cols):\n",
    "        for j in range(i):\n",
    "            val_pre = m_pre[i][j]\n",
    "            val_post = m_post[i][j]\n",
    "            diff = abs(val_pre - val_post)\n",
    "            # NaN cannot be ordered, so it would leave the sort below in an arbitrary order\n",
    "            if np.isnan(diff):\n",
    "                continue\n",
    "            diff_list.append((diff, var_i, cols[j], val_pre, val_post))\n",
    "\n",
    "    # Sort the list based on the difference value in descending order\n",
    "    diff_list.sort(key=lambda x: x[0], reverse=True)\n",
    "\n",
    "    def _colored(var_name):\n",
    "        # Individual vars in green, the rest in purple; always reset the color afterwards\n",
    "        tone = colors.GREEN if var_name in ind_vars_enc else colors.PURPLE\n",
    "        return tone + var_name + colors.RESET\n",
    "\n",
    "    # Print the sorted list\n",
    "    for diff, var_i, var_j, val_pre, val_post in diff_list[:top_n]:\n",
    "        print(f'{_colored(var_i)} & {_colored(var_j)} --> Diff: {diff:.2f} (PRE: {val_pre:.2f}; POST: {val_post:.2f})')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class colors:\n",
    "    RED = '\\033[91m'\n",
    "    GREEN = '\\033[92m'\n",
    "    YELLOW = '\\033[93m'\n",
    "    BLUE = '\\033[94m'\n",
    "    PURPLE = '\\033[95m'\n",
    "    CYAN = '\\033[96m'\n",
    "    WHITE = '\\033[97m'\n",
    "    RESET = '\\033[0m'\n",
    "\n",
    "# Print colored text\n",
    "print(colors.RED + \"This is red text.\" + colors.RESET)\n",
    "print(colors.GREEN + \"This is green text.\" + colors.RESET)\n",
    "print(colors.BLUE + \"This is blue text.\" + colors.RESET)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "keep"
    ]
   },
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 1: NO FILTERING------\")\n",
    "find_diff(1, corr_mats[0][0], corr_mats[0][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "keep"
    ]
   },
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 2: ABANDONO-----\")\n",
    "find_diff(2, corr_mats[1][0], corr_mats[1][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "keep"
    ]
   },
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 3: ALTA-----\")\n",
    "find_diff(3, corr_mats[2][0], corr_mats[2][1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Feature Analysis and Selection"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Building final datasets to work with"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 33538 entries, 0 to 85164\n",
      "Data columns (total 45 columns):\n",
      " #   Column                                     Non-Null Count  Dtype   \n",
      "---  ------                                     --------------  -----   \n",
      " 0   Ed_Not Complete primary school             33538 non-null  bool    \n",
      " 1   Ed_Primary education                       33538 non-null  bool    \n",
      " 2   Ed_Secondary Education                     33538 non-null  bool    \n",
      " 3   Ed_Secondary more technical education      33538 non-null  bool    \n",
      " 4   Ed_Tertiary                                33538 non-null  bool    \n",
      " 5   Ed_Unknowledge                             33538 non-null  bool    \n",
      " 6   Social_protection_REDEF                    33538 non-null  int64   \n",
      " 7   JobIn_Non-stable                           33538 non-null  bool    \n",
      " 8   JobIn_Stable                               33538 non-null  bool    \n",
      " 9   JobIn_Unemployed                           33538 non-null  bool    \n",
      " 10  JobIn_unkwnodledge                         33538 non-null  bool    \n",
      " 11  Hous_Institutional                         33538 non-null  bool    \n",
      " 12  Hous_Stable                                33538 non-null  bool    \n",
      " 13  Hous_Unstable                              33538 non-null  bool    \n",
      " 14  Hous_unknowledge                           33538 non-null  bool    \n",
      " 15  Alterations_early_childhood_develop_REDEF  33538 non-null  int64   \n",
      " 16  SocInc_Live with families or friends       33538 non-null  bool    \n",
      " 17  SocInc_live alone                          33538 non-null  bool    \n",
      " 18  SocInc_live in institutions                33538 non-null  bool    \n",
      " 19  Risk_stigma_REDEF                          33538 non-null  category\n",
      " 20  Structural_conflic                         33538 non-null  float64 \n",
      " 21  Age                                        33538 non-null  float64 \n",
      " 22  Sex_REDEF                                  33538 non-null  int64   \n",
      " 23  NumHijos                                   33538 non-null  float64 \n",
      " 24  Smoking_REDEF                              33538 non-null  int64   \n",
      " 25  Biological_vulnerability_REDEF             33538 non-null  int64   \n",
      " 26  Opiaceos_DxCIE_REDEF                       33538 non-null  int64   \n",
      " 27  Cannabis_DXCIE_REDEF                       33538 non-null  int64   \n",
      " 28  BZD_DxCIE_REDEF                            33538 non-null  int64   \n",
      " 29  Cocaina_DxCIE_REDEF                        33538 non-null  int64   \n",
      " 30  Alucinogenos_DXCIE_REDEF                   33538 non-null  int64   \n",
      " 31  Tabaco_DXCIE_REDEF                         33538 non-null  int64   \n",
      " 32  Frec30_1 día/semana                        33538 non-null  bool    \n",
      " 33  Frec30_2-3 días‎/semana                    33538 non-null  bool    \n",
      " 34  Frec30_4-6 días/semana                     33538 non-null  bool    \n",
      " 35  Frec30_Desconocido                         33538 non-null  bool    \n",
      " 36  Frec30_Menos de 1 día‎/semana              33538 non-null  bool    \n",
      " 37  Frec30_No consumio                         33538 non-null  bool    \n",
      " 38  Frec30_Todos los días                      33538 non-null  bool    \n",
      " 39  Años_consumo_droga                         33538 non-null  float64 \n",
      " 40  OtrosDx_Psiquiatrico_REDEF                 33538 non-null  int64   \n",
      " 41  Tx_previos_REDEF                           33538 non-null  int64   \n",
      " 42  Adherencia_tto_recalc                      33538 non-null  float64 \n",
      " 43  Pandemia_inicio_fin_tratamiento            33538 non-null  object  \n",
      " 44  Situacion_tratamiento_REDEF                33538 non-null  int64   \n",
      "dtypes: bool(24), category(1), float64(5), int64(14), object(1)\n",
      "memory usage: 6.2+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Work with columns of interest: the columns listed in corr_cols, the pandemic-period\n",
    "# label (used further down to split the data into PRE/POST sets) and the `_REDEF` target column.\n",
    "cols_of_interest = corr_cols + ['Pandemia_inicio_fin_tratamiento'] + [target_var + \"_REDEF\"]\n",
    "temp_bd = bd[cols_of_interest]\n",
    "# info() writes its report to stdout itself and returns None, so it is called directly;\n",
    "# wrapping it in print() only appended a stray 'None' line to the output.\n",
    "# Original note: NaN values already dealt with (replaced by mode) - TODO confirm this is acceptable.\n",
    "temp_bd.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the columns that stand for an 'unknown' category; they are not wanted in the analysis\n",
    "unknown_cols = [\n",
    "    'Ed_Unknowledge',\n",
    "    'JobIn_unkwnodledge',\n",
    "    'Hous_unknowledge',\n",
    "    'Frec30_Desconocido',\n",
    "]\n",
    "temp_bd = temp_bd.drop(unknown_cols, axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 33538 entries, 0 to 85164\n",
      "Data columns (total 41 columns):\n",
      " #   Column                                     Non-Null Count  Dtype   \n",
      "---  ------                                     --------------  -----   \n",
      " 0   Ed_Not Complete primary school             33538 non-null  bool    \n",
      " 1   Ed_Primary education                       33538 non-null  bool    \n",
      " 2   Ed_Secondary Education                     33538 non-null  bool    \n",
      " 3   Ed_Secondary more technical education      33538 non-null  bool    \n",
      " 4   Ed_Tertiary                                33538 non-null  bool    \n",
      " 5   Social_protection_REDEF                    33538 non-null  int64   \n",
      " 6   JobIn_Non-stable                           33538 non-null  bool    \n",
      " 7   JobIn_Stable                               33538 non-null  bool    \n",
      " 8   JobIn_Unemployed                           33538 non-null  bool    \n",
      " 9   Hous_Institutional                         33538 non-null  bool    \n",
      " 10  Hous_Stable                                33538 non-null  bool    \n",
      " 11  Hous_Unstable                              33538 non-null  bool    \n",
      " 12  Alterations_early_childhood_develop_REDEF  33538 non-null  int64   \n",
      " 13  SocInc_Live with families or friends       33538 non-null  bool    \n",
      " 14  SocInc_live alone                          33538 non-null  bool    \n",
      " 15  SocInc_live in institutions                33538 non-null  bool    \n",
      " 16  Risk_stigma_REDEF                          33538 non-null  category\n",
      " 17  Structural_conflic                         33538 non-null  float64 \n",
      " 18  Age                                        33538 non-null  float64 \n",
      " 19  Sex_REDEF                                  33538 non-null  int64   \n",
      " 20  NumHijos                                   33538 non-null  float64 \n",
      " 21  Smoking_REDEF                              33538 non-null  int64   \n",
      " 22  Biological_vulnerability_REDEF             33538 non-null  int64   \n",
      " 23  Opiaceos_DxCIE_REDEF                       33538 non-null  int64   \n",
      " 24  Cannabis_DXCIE_REDEF                       33538 non-null  int64   \n",
      " 25  BZD_DxCIE_REDEF                            33538 non-null  int64   \n",
      " 26  Cocaina_DxCIE_REDEF                        33538 non-null  int64   \n",
      " 27  Alucinogenos_DXCIE_REDEF                   33538 non-null  int64   \n",
      " 28  Tabaco_DXCIE_REDEF                         33538 non-null  int64   \n",
      " 29  Frec30_1 día/semana                        33538 non-null  bool    \n",
      " 30  Frec30_2-3 días‎/semana                    33538 non-null  bool    \n",
      " 31  Frec30_4-6 días/semana                     33538 non-null  bool    \n",
      " 32  Frec30_Menos de 1 día‎/semana              33538 non-null  bool    \n",
      " 33  Frec30_No consumio                         33538 non-null  bool    \n",
      " 34  Frec30_Todos los días                      33538 non-null  bool    \n",
      " 35  Años_consumo_droga                         33538 non-null  float64 \n",
      " 36  OtrosDx_Psiquiatrico_REDEF                 33538 non-null  int64   \n",
      " 37  Tx_previos_REDEF                           33538 non-null  int64   \n",
      " 38  Adherencia_tto_recalc                      33538 non-null  float64 \n",
      " 39  Pandemia_inicio_fin_tratamiento            33538 non-null  object  \n",
      " 40  Situacion_tratamiento_REDEF                33538 non-null  int64   \n",
      "dtypes: bool(20), category(1), float64(5), int64(14), object(1)\n",
      "memory usage: 6.0+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Summary of temp_bd after dropping the 'unknown' columns.\n",
    "# info() prints the report itself and returns None, so it is not wrapped in print()\n",
    "# (doing so only added a stray 'None' line to the output).\n",
    "temp_bd.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split temp_bd by the pandemic-period label, then discard that label from both sets\n",
    "period = temp_bd['Pandemia_inicio_fin_tratamiento']\n",
    "\n",
    "# conj_pre: treatment started and ended before the pandemic\n",
    "started_and_ended_before = period == 'Inicio y fin prepandemia'\n",
    "conj_pre = temp_bd[started_and_ended_before].drop(columns=['Pandemia_inicio_fin_tratamiento'])\n",
    "\n",
    "# conj_post: treatment ended during the pandemic, whether it started before or during it\n",
    "# (the two labels differ in capitalisation, so each one is matched verbatim)\n",
    "started_before_ended_during = period == 'Inicio prepandemia y fin en pandemia'\n",
    "started_and_ended_during = period == 'inicio y fin en pandemia'\n",
    "conj_post = temp_bd[started_before_ended_during | started_and_ended_during].drop(\n",
    "    columns=['Pandemia_inicio_fin_tratamiento']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 22861 entries, 0 to 85164\n",
      "Data columns (total 40 columns):\n",
      " #   Column                                     Non-Null Count  Dtype   \n",
      "---  ------                                     --------------  -----   \n",
      " 0   Ed_Not Complete primary school             22861 non-null  bool    \n",
      " 1   Ed_Primary education                       22861 non-null  bool    \n",
      " 2   Ed_Secondary Education                     22861 non-null  bool    \n",
      " 3   Ed_Secondary more technical education      22861 non-null  bool    \n",
      " 4   Ed_Tertiary                                22861 non-null  bool    \n",
      " 5   Social_protection_REDEF                    22861 non-null  int64   \n",
      " 6   JobIn_Non-stable                           22861 non-null  bool    \n",
      " 7   JobIn_Stable                               22861 non-null  bool    \n",
      " 8   JobIn_Unemployed                           22861 non-null  bool    \n",
      " 9   Hous_Institutional                         22861 non-null  bool    \n",
      " 10  Hous_Stable                                22861 non-null  bool    \n",
      " 11  Hous_Unstable                              22861 non-null  bool    \n",
      " 12  Alterations_early_childhood_develop_REDEF  22861 non-null  int64   \n",
      " 13  SocInc_Live with families or friends       22861 non-null  bool    \n",
      " 14  SocInc_live alone                          22861 non-null  bool    \n",
      " 15  SocInc_live in institutions                22861 non-null  bool    \n",
      " 16  Risk_stigma_REDEF                          22861 non-null  category\n",
      " 17  Structural_conflic                         22861 non-null  float64 \n",
      " 18  Age                                        22861 non-null  float64 \n",
      " 19  Sex_REDEF                                  22861 non-null  int64   \n",
      " 20  NumHijos                                   22861 non-null  float64 \n",
      " 21  Smoking_REDEF                              22861 non-null  int64   \n",
      " 22  Biological_vulnerability_REDEF             22861 non-null  int64   \n",
      " 23  Opiaceos_DxCIE_REDEF                       22861 non-null  int64   \n",
      " 24  Cannabis_DXCIE_REDEF                       22861 non-null  int64   \n",
      " 25  BZD_DxCIE_REDEF                            22861 non-null  int64   \n",
      " 26  Cocaina_DxCIE_REDEF                        22861 non-null  int64   \n",
      " 27  Alucinogenos_DXCIE_REDEF                   22861 non-null  int64   \n",
      " 28  Tabaco_DXCIE_REDEF                         22861 non-null  int64   \n",
      " 29  Frec30_1 día/semana                        22861 non-null  bool    \n",
      " 30  Frec30_2-3 días‎/semana                    22861 non-null  bool    \n",
      " 31  Frec30_4-6 días/semana                     22861 non-null  bool    \n",
      " 32  Frec30_Menos de 1 día‎/semana              22861 non-null  bool    \n",
      " 33  Frec30_No consumio                         22861 non-null  bool    \n",
      " 34  Frec30_Todos los días                      22861 non-null  bool    \n",
      " 35  Años_consumo_droga                         22861 non-null  float64 \n",
      " 36  OtrosDx_Psiquiatrico_REDEF                 22861 non-null  int64   \n",
      " 37  Tx_previos_REDEF                           22861 non-null  int64   \n",
      " 38  Adherencia_tto_recalc                      22861 non-null  float64 \n",
      " 39  Situacion_tratamiento_REDEF                22861 non-null  int64   \n",
      "dtypes: bool(20), category(1), float64(5), int64(14)\n",
      "memory usage: 3.9 MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Summary of the PRE set.\n",
    "# info() prints the report itself and returns None, so it is not wrapped in print()\n",
    "# (doing so only added a stray 'None' line to the output).\n",
    "conj_pre.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 10677 entries, 11 to 85156\n",
      "Data columns (total 40 columns):\n",
      " #   Column                                     Non-Null Count  Dtype   \n",
      "---  ------                                     --------------  -----   \n",
      " 0   Ed_Not Complete primary school             10677 non-null  bool    \n",
      " 1   Ed_Primary education                       10677 non-null  bool    \n",
      " 2   Ed_Secondary Education                     10677 non-null  bool    \n",
      " 3   Ed_Secondary more technical education      10677 non-null  bool    \n",
      " 4   Ed_Tertiary                                10677 non-null  bool    \n",
      " 5   Social_protection_REDEF                    10677 non-null  int64   \n",
      " 6   JobIn_Non-stable                           10677 non-null  bool    \n",
      " 7   JobIn_Stable                               10677 non-null  bool    \n",
      " 8   JobIn_Unemployed                           10677 non-null  bool    \n",
      " 9   Hous_Institutional                         10677 non-null  bool    \n",
      " 10  Hous_Stable                                10677 non-null  bool    \n",
      " 11  Hous_Unstable                              10677 non-null  bool    \n",
      " 12  Alterations_early_childhood_develop_REDEF  10677 non-null  int64   \n",
      " 13  SocInc_Live with families or friends       10677 non-null  bool    \n",
      " 14  SocInc_live alone                          10677 non-null  bool    \n",
      " 15  SocInc_live in institutions                10677 non-null  bool    \n",
      " 16  Risk_stigma_REDEF                          10677 non-null  category\n",
      " 17  Structural_conflic                         10677 non-null  float64 \n",
      " 18  Age                                        10677 non-null  float64 \n",
      " 19  Sex_REDEF                                  10677 non-null  int64   \n",
      " 20  NumHijos                                   10677 non-null  float64 \n",
      " 21  Smoking_REDEF                              10677 non-null  int64   \n",
      " 22  Biological_vulnerability_REDEF             10677 non-null  int64   \n",
      " 23  Opiaceos_DxCIE_REDEF                       10677 non-null  int64   \n",
      " 24  Cannabis_DXCIE_REDEF                       10677 non-null  int64   \n",
      " 25  BZD_DxCIE_REDEF                            10677 non-null  int64   \n",
      " 26  Cocaina_DxCIE_REDEF                        10677 non-null  int64   \n",
      " 27  Alucinogenos_DXCIE_REDEF                   10677 non-null  int64   \n",
      " 28  Tabaco_DXCIE_REDEF                         10677 non-null  int64   \n",
      " 29  Frec30_1 día/semana                        10677 non-null  bool    \n",
      " 30  Frec30_2-3 días‎/semana                    10677 non-null  bool    \n",
      " 31  Frec30_4-6 días/semana                     10677 non-null  bool    \n",
      " 32  Frec30_Menos de 1 día‎/semana              10677 non-null  bool    \n",
      " 33  Frec30_No consumio                         10677 non-null  bool    \n",
      " 34  Frec30_Todos los días                      10677 non-null  bool    \n",
      " 35  Años_consumo_droga                         10677 non-null  float64 \n",
      " 36  OtrosDx_Psiquiatrico_REDEF                 10677 non-null  int64   \n",
      " 37  Tx_previos_REDEF                           10677 non-null  int64   \n",
      " 38  Adherencia_tto_recalc                      10677 non-null  float64 \n",
      " 39  Situacion_tratamiento_REDEF                10677 non-null  int64   \n",
      "dtypes: bool(20), category(1), float64(5), int64(14)\n",
      "memory usage: 1.8 MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Summary of the POST set.\n",
    "# info() prints the report itself and returns None, so it is not wrapped in print()\n",
    "# (doing so only added a stray 'None' line to the output).\n",
    "conj_post.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build, for each set, a numpy matrix without the target variable (X) and the target series (y).\n",
    "target_col = \"Situacion_tratamiento_REDEF\"\n",
    "\n",
    "# Pick the feature columns by name once and reuse that selection everywhere, so the\n",
    "# columns of X_pre, the columns of X_post and the labels in `feat` are guaranteed to be\n",
    "# in the same order. (Previously `feat` was obtained by deleting the LAST column, which\n",
    "# only matched X while the target happened to be the final column of the frame.)\n",
    "feature_cols = conj_pre.columns[conj_pre.columns != target_col]\n",
    "\n",
    "X_pre, y_pre = conj_pre[feature_cols].to_numpy(), conj_pre[target_col]\n",
    "X_post, y_post = conj_post[feature_cols].to_numpy(), conj_post[target_col]\n",
    "feat = feature_cols.to_numpy() # Feature labels (target excluded), aligned with the columns of X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Ed_Not Complete primary school' 'Ed_Primary education'\n",
      " 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n",
      " 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n",
      " 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n",
      " 'Alterations_early_childhood_develop_REDEF'\n",
      " 'SocInc_Live with families or friends' 'SocInc_live alone'\n",
      " 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n",
      " 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n",
      " 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n",
      " 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n",
      " 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n",
      " 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n",
      " 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n",
      " 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n",
      " 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n"
     ]
    }
   ],
   "source": [
    "# Show the feature labels (column names with the target removed) used to name the columns of X_pre / X_post\n",
    "print(feat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(22861, 39)\n",
      "(10677, 39)\n",
      "(22861,)\n",
      "(10677,)\n",
      "39\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: shape of each feature matrix and target series, then the number of feature labels\n",
    "for dataset in (X_pre, X_post, y_pre, y_post):\n",
    "    print(dataset.shape)\n",
    "print(len(feat))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Feature subset selection (FSS): filter methods"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Mutual Info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mutual information between each feature and the target, plotted side by side for PRE and POST.\n",
    "#\n",
    "# mutual_info_classif adds small random noise to continuous features, so without a fixed\n",
    "# seed the scores (and therefore the ranking and the saved figure) change on every run.\n",
    "# NOTE(review): with the default discrete_features='auto' a dense X is treated as all-continuous,\n",
    "# although many columns here are boolean/binary - consider passing an explicit discrete mask.\n",
    "MI_SEED = 0\n",
    "MI_TOP_N = 20  # number of bars drawn per panel\n",
    "\n",
    "# Create subplots\n",
    "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n",
    "\n",
    "for ax, title, X_set, y_set in zip(axes, (\"PRE\", \"POST\"), (X_pre, X_post), (y_pre, y_post)):\n",
    "    importances_MI = mutual_info_classif(X_set, y_set, random_state=MI_SEED)\n",
    "    feat_importances_MI = pd.Series(importances_MI, feat).sort_values()\n",
    "    # Ascending sort + tail => the MI_TOP_N highest scores among the features with non-zero MI\n",
    "    top_MI = feat_importances_MI[feat_importances_MI != 0][-MI_TOP_N:]\n",
    "    ax.barh(top_MI.index, top_MI, color='teal')\n",
    "    ax.set_xlabel(\"Mutual Information\")\n",
    "    ax.set_title(title)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('EDA_plots/features/mutual_info.svg', format='svg', dpi=1200)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### ANOVA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ANOVA F-test (f_classif) p-value of each feature against the target, for PRE and POST.\n",
    "ANOVA_MIN_P = 0.005  # only features whose p-value exceeds this threshold are drawn\n",
    "ANOVA_TOP_N = 20     # number of bars drawn per panel\n",
    "\n",
    "# Create subplots\n",
    "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n",
    "anova_pvalues = {}\n",
    "\n",
    "for ax, title, X_set, y_set in zip(axes, (\"PRE\", \"POST\"), (X_pre, X_post), (y_pre, y_post)):\n",
    "    # k='all' keeps working when the number of features changes; the previous hard-coded\n",
    "    # k=39 only matched the current 39 columns. pvalues_ covers every feature either way.\n",
    "    selector = SelectKBest(f_classif, k='all')\n",
    "    selector.fit(X_set, y_set)\n",
    "    pvalues = pd.Series(selector.pvalues_, feat).sort_values()\n",
    "    anova_pvalues[title] = pvalues\n",
    "    # Ascending sort + tail => the ANOVA_TOP_N LARGEST p-values above the threshold,\n",
    "    # i.e. the panel shows the features with the weakest evidence of association.\n",
    "    shown = pvalues[pvalues > ANOVA_MIN_P][-ANOVA_TOP_N:]\n",
    "    ax.barh(shown.index, shown, color='teal')\n",
    "    ax.set_xlabel(\"p-value ANOVA\")\n",
    "    ax.set_title(title)\n",
    "\n",
    "# Keep the sorted p-value series available under their original names\n",
    "feat_importances_AN_pre = anova_pvalues[\"PRE\"]\n",
    "feat_importances_AN_post = anova_pvalues[\"POST\"]\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('EDA_plots/features/ANOVA.svg', format='svg', dpi=1200)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-feature variance as reported by VarianceThreshold, plotted side by side for PRE and POST\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n",
    "variances_by_set = {}\n",
    "\n",
    "for ax, title, X_set in zip(axes, (\"PRE\", \"POST\"), (X_pre, X_post)):\n",
    "    variance_filter = VarianceThreshold(threshold=0)\n",
    "    variance_filter.fit(X_set)\n",
    "    variances = pd.Series(variance_filter.variances_, feat).sort_values()\n",
    "    variances_by_set[title] = variances\n",
    "    # Ascending sort + tail => at most the 20 largest variances among those above 0.05\n",
    "    shown = variances[variances > 0.05][-20:]\n",
    "    ax.barh(shown.index, shown, color='teal')\n",
    "    ax.set_xlabel(\"Variance\")\n",
    "    ax.set_title(title)\n",
    "\n",
    "# Expose the sorted series under the names used for each set\n",
    "feat_importances_var_pre = variances_by_set[\"PRE\"]\n",
    "feat_importances_var_post = variances_by_set[\"POST\"]\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('EDA_plots/features/var_threshold.svg', format='svg', dpi=1200)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Export PRE and POST datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write both sets to CSV in the working directory, without the index column\n",
    "for frame, csv_path in ((conj_pre, 'pre_dataset.csv'), (conj_post, 'post_dataset.csv')):\n",
    "    frame.to_csv(csv_path, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}