EDA cleaned

0bbb8d6a · Joaquin Torres · d545dd10 · 0bbb8d6a · 0bbb8d6a · d545dd10
Commit 0bbb8d6a authored Jun 28, 2024 by Joaquin Torres
9 changed files
--- a/.gitignore
+++ b/.gitignore
 gen_train_data/input/
 gen_train_data/output/
-EDA/input/
\ No newline at end of file
+EDA/input/17_abril.sav
+EDA/output/datasets
\ No newline at end of file
--- a/EDA/EDA.ipynb
+++ b/EDA/EDA.ipynb
@@ -4,14 +4,15 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "### EDA"
+    "_Exploratory Data Analysis_ \\\n",
+    "_Author: Joaquín Torres Bravo_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Libraries"
+    "### Libraries"
   ]
  },
  {
@@ -25,7 +26,6 @@
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n",
-    "\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import f_classif\n",
@@ -36,19 +36,12 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Preparing Data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Reading and filtering"
+    "### First Steps"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -61,37 +54,11 @@
    "bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Defining sets of patients"
-   ]
-  },
  {
   "cell_type": "code",
-   "execution_count": 140,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:18: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
-      "Try using .loc[row_indexer,col_indexer] = value instead\n",
-      "\n",
-      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
-      "  conj_post['Group'] = 'Post'\n",
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:19: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
-      "Try using .loc[row_indexer,col_indexer] = value instead\n",
-      "\n",
-      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
-      "  conj_pre['Group'] = 'Pre'\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# Pre-pandemic\n",
    "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
@@ -117,22 +84,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 100,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "PRE: 22861\n",
-      "\tALTA: 2792\n",
-      "\tABANDONO: 20069\n",
-      "POST: 10677\n",
-      "\tALTA: 1882\n",
-      "\tABANDONO: 8795\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# Printing size of different datasets\n",
    "print(f\"PRE: {len(conj_pre)}\")\n",
@@ -144,20 +98,6 @@
    "print(f\"\\tABANDONO: {len(post_abandono)}\")"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### First Steps"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Inspecting the dataframes"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -191,114 +131,37 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### Replacing unknown values with the mode"
+    "### Missing and Unknown Values"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 141,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n",
-      "['Live with families or friends' 'live alone' 'live in institutions']\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# 9.0 represents unknown according to Variables.docx \n",
    "print(bd['Social_inclusion'].unique())\n",
    "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n",
-    "# print(mode_soc_inc)\n",
    "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n",
-    "print(bd['Social_inclusion'].unique())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 142,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['No alterations (first exposure at 11 or more years)'\n",
-      " 'Alterations (first exposure before 11 years old)' '9']\n",
-      "['No alterations (first exposure at 11 or more years)'\n",
-      " 'Alterations (first exposure before 11 years old)']\n"
-     ]
-    }
-   ],
-   "source": [
+    "print(bd['Social_inclusion'].unique())\n",
+    "\n",
    "print(bd['Alterations_early_childhood_develop'].unique())\n",
    "mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n",
    "bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n",
-    "print(bd['Alterations_early_childhood_develop'].unique())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 143,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[NaN, 'Yes', 'No']\n",
-      "Categories (3, object): [99.0, 'No', 'Yes']\n",
-      "[NaN, 'Yes', 'No']\n",
-      "Categories (2, object): ['No', 'Yes']\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\1073322024.py:3: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n",
-      "  bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n"
-     ]
-    }
-   ],
-   "source": [
+    "print(bd['Alterations_early_childhood_develop'].unique())\n",
+    "\n",
    "print(bd['Risk_stigma'].unique())\n",
    "mode_stigma = bd['Risk_stigma'].mode()[0]\n",
    "bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n",
-    "print(bd['Risk_stigma'].unique())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 144,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[nan  0.  1.  2.  3.  4.  5.  8. 10.  6. 11. 12.  9.  7. 99. 14. 15.]\n",
-      "[nan  0.  1.  2.  3.  4.  5.  8. 10.  6. 11. 12.  9.  7. 14. 15.]\n"
-     ]
-    }
-   ],
-   "source": [
+    "print(bd['Risk_stigma'].unique())\n",
+    "\n",
    "print(bd['NumHijos'].unique())\n",
    "mode_hijos = bd['NumHijos'].mode()[0]\n",
    "bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n",
    "print(bd['NumHijos'].unique())"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Quantifying Null Values"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -323,53 +186,11 @@
    "print(f\"\\t\\tMissing values NumHijos: {conj_post['NumHijos'].isnull().sum()}\")"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Replacing missing values with mode"
-   ]
-  },
  {
   "cell_type": "code",
-   "execution_count": 145,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
-      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
-      "\n",
-      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
-      "\n",
-      "\n",
-      "  bd['Age'].fillna(age_mode, inplace=True)\n",
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
-      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
-      "\n",
-      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
-      "\n",
-      "\n",
-      "  bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n",
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
-      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
-      "\n",
-      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
-      "\n",
-      "\n",
-      "  bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n",
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
-      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
-      "\n",
-      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
-      "\n",
-      "\n",
-      "  bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "age_mode = bd['Age'].mode()[0]\n",
    "bd['Age'].fillna(age_mode, inplace=True)\n",
@@ -388,14 +209,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Distribution of variables"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Classifying variables into numerical and discrete/categorical "
+    "### Distribution of Variables"
   ]
  },
  {
@@ -419,14 +233,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### Distribution of discrete attributes"
+    "#### Discrete"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### Count plots"
+    "##### Countplots"
   ]
  },
  {
@@ -464,7 +278,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### Normalized count plots"
+    "##### Normalized Countplots"
   ]
  },
  {
@@ -560,14 +374,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### Distribution of numeric attributes"
+    "#### Numerical"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### Summary statistics"
+    "##### Summary Stats"
   ]
  },
  {
@@ -583,7 +397,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### Boxplots"
+    "##### Boxplots"
   ]
  },
  {
@@ -615,7 +429,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### Histograms"
+    "##### Histograms"
   ]
  },
  {
@@ -655,19 +469,50 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Correlation Analysis"
+    "### Correlation Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### Turning binary variables into 0/1 values"
+    "#### Groups of Variables"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 146,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Two groups of feature columns, listed under their pre-rename dataset names\n",
+    "social_vars = [\n",
+    "    'Education',\n",
+    "    'Social_protection',\n",
+    "    'Job_insecurity',\n",
+    "    'Housing',\n",
+    "    'Alterations_early_childhood_develop',\n",
+    "    'Social_inclusion',\n",
+    "    'Risk_stigma',\n",
+    "    'Structural_conflic',\n",
+    "]\n",
+    "ind_vars = [\n",
+    "    'Age',\n",
+    "    'Sex',\n",
+    "    'NumHijos',\n",
+    "    'Smoking',\n",
+    "    'Biological_vulnerability',\n",
+    "    'Opiaceos_DxCIE',\n",
+    "    'Cannabis_DXCIE',\n",
+    "    'BZD_DxCIE',\n",
+    "    'Cocaina_DxCIE',\n",
+    "    'Alucinogenos_DXCIE',\n",
+    "    'Tabaco_DXCIE',\n",
+    "    'FrecuenciaConsumo30Dias',\n",
+    "    'Años_consumo_droga',\n",
+    "    'OtrosDx_Psiquiatrico',\n",
+    "    'Tx_previos',\n",
+    "    'Adherencia_tto_recalc',\n",
+    "]\n",
+    "\n",
+    "# Target column\n",
+    "target_var = 'Situacion_tratamiento'\n",
+    "\n",
+    "# Columns that are already numeric, so they need no recoding\n",
+    "no_redef_cols = [\n",
+    "    'Structural_conflic',\n",
+    "    'Age',\n",
+    "    'NumHijos',\n",
+    "    'Años_consumo_droga',\n",
+    "    'Adherencia_tto_recalc',\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Encoding Variables"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Binary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -739,43 +584,12 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### Defining groups of variables"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 127,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n",
-    "            'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n",
-    "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n",
-    "            'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n",
-    "            'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n",
-    "target_var = 'Situacion_tratamiento'"
+    "##### Categorical"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 128,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Columns that are already numeric and we don't need to redefine \n",
-    "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### One-hot encode categorical variables"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 147,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -796,21 +610,19 @@
    "    # Create one hot encoding version of attribute and concatenate new columns to main df\n",
    "    encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n",
    "    bd = pd.concat([bd, encoded_var], axis=1)\n",
-    "    one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()\n",
-    "\n",
-    "# print(one_hot_cols_dic['FrecuenciaConsumo30Dias'])"
+    "    one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### Defining final version of columns of interest"
+    "#### Final Columns"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 148,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -848,38 +660,40 @@
    "corr_cols = soc_vars_enc + ind_vars_enc"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop unknown columns.\n",
+    "# The names are listed once so the three filters below always exclude the same set.\n",
+    "unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n",
+    "corr_cols = [col for col in corr_cols if col not in unknown_cols]\n",
+    "soc_vars_enc = [col for col in soc_vars_enc if col not in unknown_cols]\n",
+    "ind_vars_enc = [col for col in ind_vars_enc if col not in unknown_cols]"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### Excluding unknown columns and renaming"
+    "##### Renaming and Filtering"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 149,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Drop unknown columns\n",
-    "corr_cols = [corr_col for corr_col in corr_cols if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n",
-    "soc_vars_enc = [corr_col for corr_col in soc_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n",
-    "ind_vars_enc = [corr_col for corr_col in soc_vars_enc if ind_vars_enc not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]"
+    "# Restrict bd to the feature columns plus the three bookkeeping columns\n",
+    "columns_to_keep = [\n",
+    "    *corr_cols,\n",
+    "    'Situacion_tratamiento',\n",
+    "    'Situacion_tratamiento_REDEF',\n",
+    "    'Pandemia_inicio_fin_tratamiento',\n",
+    "]\n",
+    "bd = bd[columns_to_keep]"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 150,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary', 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection', 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed', 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable', 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone', 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex', 'Num_Children', 'Smoking', 'Bio_Vulner', 'Opiods_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE', 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw', 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None', 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX', 'Previous_Treatments', 'Treatment_Adherence']\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "name_mapping = {\n",
    "    'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n",
@@ -900,7 +714,7 @@
    "    'SocInc_live in institutions': 'SocInc_Instit',\n",
    "    'Risk_stigma_REDEF': 'Risk_Stigma',\n",
    "    'Structural_conflic': 'Structural_Conflict',\n",
-    "    # 'Age': 'Age',\n",
+    "    'Age': 'Age',\n",
    "    'Sex_REDEF': 'Sex',\n",
    "    'NumHijos': 'Num_Children',\n",
    "    'Smoking_REDEF': 'Smoking',\n",
@@ -920,26 +734,28 @@
    "    'Años_consumo_droga': 'Years_Drug_Use',\n",
    "    'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n",
    "    'Tx_previos_REDEF': 'Previous_Treatments',\n",
-    "    'Adherencia_tto_recalc': 'Treatment_Adherence'\n",
+    "    'Adherencia_tto_recalc': 'Treatment_Adherence',\n",
+    "    'Situacion_tratamiento_REDEF': 'Treatment_Outcome',\n",
+    "    'Situacion_tratamiento': 'Situacion_tratamiento',\n",
+    "    'Pandemia_inicio_fin_tratamiento': 'Pandemia_inicio_fin_tratamiento'\n",
    "}\n",
    "\n",
    "# Update lists of feature names\n",
    "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n",
-    "print(corr_cols)\n",
    "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n",
-    "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]\n",
-    "\n",
-    "bd = bd.rename(columns=name_mapping)"
+    "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 133,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Create bd with just corr_cols and target\n",
-    "bd = bd[corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']]"
+    "# Export feature names: one .npy file per list, written in the order given\n",
+    "for _npy_path, _name_list in [\n",
+    "    ('./output/feature_names/feature_names.npy', corr_cols),\n",
+    "    ('./output/feature_names/soc_vars_names.npy', soc_vars_enc),\n",
+    "    ('./output/feature_names/ind_vars_names.npy', ind_vars_enc),\n",
+    "]:\n",
+    "    np.save(_npy_path, _name_list)"
   ]
  },
  {
@@ -948,25 +764,17 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Export feature names\n",
-    "np.save('./output/feature_names.npy', corr_cols)\n",
-    "np.save('./output/soc_vars_names.npy', soc_vars_enc)\n",
-    "np.save('./output/ind_vars_names.npy', ind_vars_enc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "###### Update main data frames"
+    "# Apply name_mapping to the column labels, then keep exactly the mapped\n",
+    "# columns in the order in which name_mapping lists them\n",
+    "bd = (\n",
+    "    bd\n",
+    "    .rename(columns=name_mapping)\n",
+    "    .loc[:, list(name_mapping.values())]\n",
+    ")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 134,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Update main dfs\n",
    "# Pre-pandemic\n",
    "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
    "# Pre-pandemic abandono\n",
@@ -988,110 +796,17 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### Building correlation matrix"
+    "#### Plotting Correlation Matrices"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 135,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Index(['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary',\n",
-      "       'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection',\n",
-      "       'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed',\n",
-      "       'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable',\n",
-      "       'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone',\n",
-      "       'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex',\n",
-      "       'Sex', 'Num_Children', 'Smoking', 'Smoking', 'Bio_Vulner',\n",
-      "       'Opiods_DXCIE', 'Cannabis_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE',\n",
-      "       'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw',\n",
-      "       'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None',\n",
-      "       'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX',\n",
-      "       'Previous_Treatments', 'Treatment_Adherence', 'Situacion_tratamiento',\n",
-      "       'Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento'],\n",
-      "      dtype='object')\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(bd.columns)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 137,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Ed_Not_Complete_Primary\n",
-      "2\n",
-      "Ed_Primary\n",
-      "2\n",
-      "Ed_Secondary\n",
-      "2\n",
-      "Ed_Secondary_Technical\n",
-      "2\n",
-      "Ed_Tertiary\n",
-      "2\n",
-      "Social_Protection\n",
-      "2\n",
-      "JobIn_Unstable\n",
-      "2\n",
-      "JobIn_Stable\n",
-      "2\n",
-      "JobIn_Unemployed\n",
-      "2\n",
-      "Hous_Institutional\n",
-      "2\n",
-      "Hous_Stable\n",
-      "2\n",
-      "Hous_Unstable\n",
-      "2\n",
-      "Early_Alterations\n",
-      "2\n",
-      "SocInc_Family_Friends\n",
-      "2\n",
-      "SocInc_Alone\n",
-      "2\n",
-      "SocInc_Instit\n",
-      "2\n",
-      "Risk_Stigma\n",
-      "2\n",
-      "Structural_Conflict\n",
-      "107\n",
-      "age\n",
-      "74\n",
-      "Sex\n"
-     ]
-    },
-    {
-     "ename": "AttributeError",
-     "evalue": "'DataFrame' object has no attribute 'unique'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19584\\340002156.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcorr_cols\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m     \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m     \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      6\u001b[0m \u001b[1;31m#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[1;31m#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;32mc:\\Users\\Joaquín Torres\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m   6292\u001b[0m             \u001b[1;32mand\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   6293\u001b[0m             \u001b[1;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   6294\u001b[0m         \u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   6295\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 6296\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'unique'"
-     ]
-    }
-   ],
-   "source": [
-    "# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\n",
-    "\n",
-    "for col in corr_cols:\n",
-    "    print(col)\n",
-    "    print(len(bd[col].unique()))\n",
-    "#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\n",
-    "#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]"
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Binary variables: every column of corr_cols holding exactly two distinct values\n",
+    "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2]\n",
+    "# The outcome and Risk_Stigma must always be treated as binary; append each one only\n",
+    "# when the filter above has not already selected it, so no name is listed twice\n",
+    "binary_vars += [\n",
+    "    col\n",
+    "    for col in (name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF'])\n",
+    "    if col not in binary_vars\n",
+    "]\n",
+    "\n",
+    "# Continuous variables, referred to by their renamed labels\n",
+    "cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]"
   ]
  },
  {
@@ -1149,7 +864,7 @@
    "    # Define columns based on sit_tto arg\n",
    "    if sit_tto == 1:\n",
    "        # Include target as another variable\n",
-    "        cols = [target_var + '_REDEF'] + corr_cols\n",
+    "        cols = ['Treatment_Outcome'] + corr_cols\n",
    "    else:\n",
    "        cols = corr_cols\n",
    "        \n",
@@ -1246,10 +961,8 @@
    "\n",
    "    corr_mats.append((corr_matrix_pre, corr_matrix_post))\n",
    "        \n",
-    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
-    "# Save the figure in SVG format in the \"./EDA_plots\" folder\n",
    "plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')"
   ]
  },
@@ -1257,7 +970,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### Finding significative differences between PRE and POST"
+    "#### Finding Differences PRE vs POST"
   ]
  },
  {
@@ -1326,11 +1039,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "tags": [
-     "keep"
-    ]
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 1: NO FILTERING------\")\n",
@@ -1340,11 +1049,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "tags": [
-     "keep"
-    ]
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 2: ABANDONO-----\")\n",
@@ -1354,11 +1059,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "tags": [
-     "keep"
-    ]
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 3: ALTA-----\")\n",
@@ -1369,46 +1070,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Feature Analysis and Selection"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Building final datasets to work with"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Work with columns of interest\n",
-    "cols_of_interest = corr_cols + ['Pandemia_inicio_fin_tratamiento'] + [target_var + \"_REDEF\"]\n",
-    "temp_bd = bd[cols_of_interest]\n",
-    "print(temp_bd.info()) # NaN values already dealt with (replaced by mode - this okay?)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Dropping unknown columns/categories for analysis purposes\n",
-    "unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n",
-    "temp_bd = temp_bd.drop(columns=unknown_cols)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(temp_bd.info())"
+    "### Final Datasets"
   ]
  },
  {
@@ -1417,53 +1079,20 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "bd = bd.drop(columns=['Situacion_tratamiento'], errors='ignore') # errors='ignore' keeps this cell re-runnable once the column is already gone\n",
+    "# print(len(bd.columns))\n",
+    "\n",
    "# For conj_pre dataframe\n",
-    "conj_pre = temp_bd[temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
+    "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
    "conj_pre = conj_pre.drop(columns=['Pandemia_inicio_fin_tratamiento'])\n",
    "\n",
    "# For conj_post dataframe\n",
-    "conj_post = temp_bd[(temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
-    "                    (temp_bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n",
-    "conj_post = conj_post.drop(columns=['Pandemia_inicio_fin_tratamiento'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(conj_pre.info())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(conj_post.info())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Creating a numpy matrix without the target variable (X) and a list with the target variable (y) \n",
-    "X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_pre.Situacion_tratamiento_REDEF\n",
-    "X_post, y_post = conj_post.loc[:, conj_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_post.Situacion_tratamiento_REDEF\n",
-    "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(feat)"
+    "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
+    "                    (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n",
+    "conj_post = conj_post.drop(columns=['Pandemia_inicio_fin_tratamiento'])\n",
+    "\n",
+    "# print(conj_post.columns)\n",
+    "# print(conj_pre.columns)"
   ]
  },
  {
@@ -1472,25 +1101,27 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "print(X_pre.shape)\n",
-    "print(X_post.shape)\n",
-    "print(y_pre.shape)\n",
-    "print(y_post.shape)\n",
-    "print(len(feat))"
+    "X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Treatment_Outcome\"].to_numpy(), conj_pre.Treatment_Outcome\n",
+    "X_post, y_post = conj_post.loc[:, conj_post.columns != \"Treatment_Outcome\"].to_numpy(), conj_post.Treatment_Outcome\n",
+    "feat = conj_pre.columns[conj_pre.columns != \"Treatment_Outcome\"].to_numpy() # Feature labels, selected with the same mask as X_pre so they stay aligned with its columns\n",
+    "\n",
+    "# Export datasets (./output/datasets is git-ignored and to_csv does not create it, so the folder must already exist)\n",
+    "conj_pre.to_csv('./output/datasets/pre_dataset.csv', index=False)\n",
+    "conj_post.to_csv('./output/datasets/post_dataset.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "##### FSS Filter methods"
+    "### Feature Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### Mutual Info"
+    "#### Mutual Info"
   ]
  },
  {
@@ -1527,7 +1158,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "###### ANOVA"
+    "#### ANOVA"
   ]
  },
  {
@@ -1562,6 +1193,13 @@
    "plt.show()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Variance Threshold"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -1593,23 +1231,6 @@
    "plt.savefig('./output/plots/feature_importance/var_threshold.svg', format='svg', dpi=1200)\n",
    "plt.show()"
   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Export PRE and POST datasets"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "conj_pre.to_csv('pre_dataset.csv', index=False)\n",
-    "conj_post.to_csv('post_dataset.csv', index=False)"
-   ]
  }
 ],
 "metadata": {

--- a/EDA/EDA_2.ipynb
+++ b/EDA/EDA_2.ipynb
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Libraries"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 128,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n",
-    "from sklearn.feature_selection import VarianceThreshold\n",
-    "from sklearn.feature_selection import SelectKBest\n",
-    "from sklearn.feature_selection import f_classif\n",
-    "from sklearn.feature_selection import mutual_info_classif"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### First Steps"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 129,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bd_all = pd.read_spss('./input/17_abril.sav')\n",
-    "\n",
-    "# Filter the dataset to work only with alcohol patients\n",
-    "bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n",
-    "\n",
-    "# Filter the dataset to work only with 'Situacion_tratamiento' == 'Abandono' or 'Alta'\n",
-    "bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 130,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\2495984927.py:18: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
-      "Try using .loc[row_indexer,col_indexer] = value instead\n",
-      "\n",
-      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
-      "  conj_post['Group'] = 'Post'\n",
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\2495984927.py:19: SettingWithCopyWarning: \n",
-      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
-      "Try using .loc[row_indexer,col_indexer] = value instead\n",
-      "\n",
-      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
-      "  conj_pre['Group'] = 'Pre'\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Pre-pandemic\n",
-    "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
-    "# Pre-pandemic abandono\n",
-    "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
-    "# Pre-pandemic alta\n",
-    "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
-    "\n",
-    "# Post-pandemic\n",
-    "# Merging last two classes to balance sets\n",
-    "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
-    "               (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n",
-    "# Post-pandemic abandono\n",
-    "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n",
-    "# Post-pandemic alta\n",
-    "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']\n",
-    "\n",
-    "# Concatenate the two data frames and add a new column to distinguish between them. Useful for plots\n",
-    "conj_post['Group'] = 'Post'\n",
-    "conj_pre['Group'] = 'Pre'\n",
-    "combined_pre_post = pd.concat([conj_post, conj_pre])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 131,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "PRE: 22861\n",
-      "\tALTA: 2792\n",
-      "\tABANDONO: 20069\n",
-      "POST: 10677\n",
-      "\tALTA: 1882\n",
-      "\tABANDONO: 8795\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Printing size of different datasets\n",
-    "print(f\"PRE: {len(conj_pre)}\")\n",
-    "print(f\"\\tALTA: {len(pre_alta)}\")\n",
-    "print(f\"\\tABANDONO: {len(pre_abandono)}\")\n",
-    "\n",
-    "print(f\"POST: {len(conj_post)}\")\n",
-    "print(f\"\\tALTA: {len(post_alta)}\")\n",
-    "print(f\"\\tABANDONO: {len(post_abandono)}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 132,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "PRE\n",
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Index: 22861 entries, 0 to 85164\n",
-      "Data columns (total 35 columns):\n",
-      " #   Column                               Non-Null Count  Dtype   \n",
-      "---  ------                               --------------  -----   \n",
-      " 0   CODPROYECTO                          22861 non-null  float64 \n",
-      " 1   Education                            22861 non-null  object  \n",
-      " 2   Social_protection                    22861 non-null  object  \n",
-      " 3   Job_insecurity                       22861 non-null  object  \n",
-      " 4   Housing                              22861 non-null  object  \n",
-      " 5   Alterations_early_childhood_develop  22861 non-null  object  \n",
-      " 6   Social_inclusion                     22861 non-null  object  \n",
-      " 7   Risk_stigma                          21606 non-null  category\n",
-      " 8   Structural_conflic                   22861 non-null  float64 \n",
-      " 9   Age                                  22852 non-null  float64 \n",
-      " 10  Sex                                  22861 non-null  object  \n",
-      " 11  NumHijos                             21647 non-null  float64 \n",
-      " 12  Smoking                              22861 non-null  object  \n",
-      " 13  Biological_vulnerability             22861 non-null  object  \n",
-      " 14  Alcohol_DxCIE                        22861 non-null  object  \n",
-      " 15  Opiaceos_DxCIE                       22861 non-null  object  \n",
-      " 16  Cannabis_DXCIE                       22861 non-null  object  \n",
-      " 17  BZD_DxCIE                            22861 non-null  object  \n",
-      " 18  Cocaina_DxCIE                        22861 non-null  object  \n",
-      " 19  Alucinogenos_DXCIE                   22861 non-null  object  \n",
-      " 20  Tabaco_DXCIE                         22861 non-null  object  \n",
-      " 21  FrecuenciaConsumo30Dias              22861 non-null  object  \n",
-      " 22  Años_consumo_droga                   22342 non-null  float64 \n",
-      " 23  OtrosDx_Psiquiatrico                 22861 non-null  object  \n",
-      " 24  Tx_previos                           22861 non-null  object  \n",
-      " 25  Adherencia_tto_recalc                22861 non-null  float64 \n",
-      " 26  Tiempo_tx                            22861 non-null  float64 \n",
-      " 27  Readmisiones_estudios                22861 non-null  object  \n",
-      " 28  Situacion_tratamiento                22861 non-null  object  \n",
-      " 29  Periodos_COVID                       22861 non-null  object  \n",
-      " 30  Pandemia_inicio_fin_tratamiento      22861 non-null  object  \n",
-      " 31  Nreadmision                          22861 non-null  float64 \n",
-      " 32  Readmisiones_PRECOVID                22861 non-null  float64 \n",
-      " 33  Readmisiones_COVID                   22861 non-null  float64 \n",
-      " 34  Group                                22861 non-null  object  \n",
-      "dtypes: category(1), float64(10), object(24)\n",
-      "memory usage: 6.1+ MB\n",
-      "None\n",
-      "-------------------------------\n",
-      "PRE-ABANDONO\n",
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Index: 20069 entries, 0 to 85164\n",
-      "Data columns (total 34 columns):\n",
-      " #   Column                               Non-Null Count  Dtype   \n",
-      "---  ------                               --------------  -----   \n",
-      " 0   CODPROYECTO                          20069 non-null  float64 \n",
-      " 1   Education                            20069 non-null  object  \n",
-      " 2   Social_protection                    20069 non-null  object  \n",
-      " 3   Job_insecurity                       20069 non-null  object  \n",
-      " 4   Housing                              20069 non-null  object  \n",
-      " 5   Alterations_early_childhood_develop  20069 non-null  object  \n",
-      " 6   Social_inclusion                     20069 non-null  object  \n",
-      " 7   Risk_stigma                          18919 non-null  category\n",
-      " 8   Structural_conflic                   20069 non-null  float64 \n",
-      " 9   Age                                  20061 non-null  float64 \n",
-      " 10  Sex                                  20069 non-null  object  \n",
-      " 11  NumHijos                             18958 non-null  float64 \n",
-      " 12  Smoking                              20069 non-null  object  \n",
-      " 13  Biological_vulnerability             20069 non-null  object  \n",
-      " 14  Alcohol_DxCIE                        20069 non-null  object  \n",
-      " 15  Opiaceos_DxCIE                       20069 non-null  object  \n",
-      " 16  Cannabis_DXCIE                       20069 non-null  object  \n",
-      " 17  BZD_DxCIE                            20069 non-null  object  \n",
-      " 18  Cocaina_DxCIE                        20069 non-null  object  \n",
-      " 19  Alucinogenos_DXCIE                   20069 non-null  object  \n",
-      " 20  Tabaco_DXCIE                         20069 non-null  object  \n",
-      " 21  FrecuenciaConsumo30Dias              20069 non-null  object  \n",
-      " 22  Años_consumo_droga                   19609 non-null  float64 \n",
-      " 23  OtrosDx_Psiquiatrico                 20069 non-null  object  \n",
-      " 24  Tx_previos                           20069 non-null  object  \n",
-      " 25  Adherencia_tto_recalc                20069 non-null  float64 \n",
-      " 26  Tiempo_tx                            20069 non-null  float64 \n",
-      " 27  Readmisiones_estudios                20069 non-null  object  \n",
-      " 28  Situacion_tratamiento                20069 non-null  object  \n",
-      " 29  Periodos_COVID                       20069 non-null  object  \n",
-      " 30  Pandemia_inicio_fin_tratamiento      20069 non-null  object  \n",
-      " 31  Nreadmision                          20069 non-null  float64 \n",
-      " 32  Readmisiones_PRECOVID                20069 non-null  float64 \n",
-      " 33  Readmisiones_COVID                   20069 non-null  float64 \n",
-      "dtypes: category(1), float64(10), object(23)\n",
-      "memory usage: 5.2+ MB\n",
-      "None\n",
-      "-------------------------------\n",
-      "PRE-ALTA\n",
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Index: 2792 entries, 23 to 85159\n",
-      "Data columns (total 34 columns):\n",
-      " #   Column                               Non-Null Count  Dtype   \n",
-      "---  ------                               --------------  -----   \n",
-      " 0   CODPROYECTO                          2792 non-null   float64 \n",
-      " 1   Education                            2792 non-null   object  \n",
-      " 2   Social_protection                    2792 non-null   object  \n",
-      " 3   Job_insecurity                       2792 non-null   object  \n",
-      " 4   Housing                              2792 non-null   object  \n",
-      " 5   Alterations_early_childhood_develop  2792 non-null   object  \n",
-      " 6   Social_inclusion                     2792 non-null   object  \n",
-      " 7   Risk_stigma                          2687 non-null   category\n",
-      " 8   Structural_conflic                   2792 non-null   float64 \n",
-      " 9   Age                                  2791 non-null   float64 \n",
-      " 10  Sex                                  2792 non-null   object  \n",
-      " 11  NumHijos                             2689 non-null   float64 \n",
-      " 12  Smoking                              2792 non-null   object  \n",
-      " 13  Biological_vulnerability             2792 non-null   object  \n",
-      " 14  Alcohol_DxCIE                        2792 non-null   object  \n",
-      " 15  Opiaceos_DxCIE                       2792 non-null   object  \n",
-      " 16  Cannabis_DXCIE                       2792 non-null   object  \n",
-      " 17  BZD_DxCIE                            2792 non-null   object  \n",
-      " 18  Cocaina_DxCIE                        2792 non-null   object  \n",
-      " 19  Alucinogenos_DXCIE                   2792 non-null   object  \n",
-      " 20  Tabaco_DXCIE                         2792 non-null   object  \n",
-      " 21  FrecuenciaConsumo30Dias              2792 non-null   object  \n",
-      " 22  Años_consumo_droga                   2733 non-null   float64 \n",
-      " 23  OtrosDx_Psiquiatrico                 2792 non-null   object  \n",
-      " 24  Tx_previos                           2792 non-null   object  \n",
-      " 25  Adherencia_tto_recalc                2792 non-null   float64 \n",
-      " 26  Tiempo_tx                            2792 non-null   float64 \n",
-      " 27  Readmisiones_estudios                2792 non-null   object  \n",
-      " 28  Situacion_tratamiento                2792 non-null   object  \n",
-      " 29  Periodos_COVID                       2792 non-null   object  \n",
-      " 30  Pandemia_inicio_fin_tratamiento      2792 non-null   object  \n",
-      " 31  Nreadmision                          2792 non-null   float64 \n",
-      " 32  Readmisiones_PRECOVID                2792 non-null   float64 \n",
-      " 33  Readmisiones_COVID                   2792 non-null   float64 \n",
-      "dtypes: category(1), float64(10), object(23)\n",
-      "memory usage: 744.5+ KB\n",
-      "None\n",
-      "-------------------------------\n",
-      "\n",
-      "\n",
-      "\n",
-      "\n",
-      "POST\n",
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Index: 10677 entries, 11 to 85156\n",
-      "Data columns (total 35 columns):\n",
-      " #   Column                               Non-Null Count  Dtype   \n",
-      "---  ------                               --------------  -----   \n",
-      " 0   CODPROYECTO                          10677 non-null  float64 \n",
-      " 1   Education                            10677 non-null  object  \n",
-      " 2   Social_protection                    10677 non-null  object  \n",
-      " 3   Job_insecurity                       10677 non-null  object  \n",
-      " 4   Housing                              10677 non-null  object  \n",
-      " 5   Alterations_early_childhood_develop  10677 non-null  object  \n",
-      " 6   Social_inclusion                     10677 non-null  object  \n",
-      " 7   Risk_stigma                          10085 non-null  category\n",
-      " 8   Structural_conflic                   10677 non-null  float64 \n",
-      " 9   Age                                  10676 non-null  float64 \n",
-      " 10  Sex                                  10677 non-null  object  \n",
-      " 11  NumHijos                             10103 non-null  float64 \n",
-      " 12  Smoking                              10677 non-null  object  \n",
-      " 13  Biological_vulnerability             10677 non-null  object  \n",
-      " 14  Alcohol_DxCIE                        10677 non-null  object  \n",
-      " 15  Opiaceos_DxCIE                       10677 non-null  object  \n",
-      " 16  Cannabis_DXCIE                       10677 non-null  object  \n",
-      " 17  BZD_DxCIE                            10677 non-null  object  \n",
-      " 18  Cocaina_DxCIE                        10677 non-null  object  \n",
-      " 19  Alucinogenos_DXCIE                   10677 non-null  object  \n",
-      " 20  Tabaco_DXCIE                         10677 non-null  object  \n",
-      " 21  FrecuenciaConsumo30Dias              10677 non-null  object  \n",
-      " 22  Años_consumo_droga                   10478 non-null  float64 \n",
-      " 23  OtrosDx_Psiquiatrico                 10677 non-null  object  \n",
-      " 24  Tx_previos                           10677 non-null  object  \n",
-      " 25  Adherencia_tto_recalc                10677 non-null  float64 \n",
-      " 26  Tiempo_tx                            10677 non-null  float64 \n",
-      " 27  Readmisiones_estudios                10677 non-null  object  \n",
-      " 28  Situacion_tratamiento                10677 non-null  object  \n",
-      " 29  Periodos_COVID                       10677 non-null  object  \n",
-      " 30  Pandemia_inicio_fin_tratamiento      10677 non-null  object  \n",
-      " 31  Nreadmision                          10677 non-null  float64 \n",
-      " 32  Readmisiones_PRECOVID                10677 non-null  float64 \n",
-      " 33  Readmisiones_COVID                   10677 non-null  float64 \n",
-      " 34  Group                                10677 non-null  object  \n",
-      "dtypes: category(1), float64(10), object(24)\n",
-      "memory usage: 2.9+ MB\n",
-      "None\n",
-      "-------------------------------\n",
-      "POST-ABANDONO\n",
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Index: 8795 entries, 11 to 85156\n",
-      "Data columns (total 34 columns):\n",
-      " #   Column                               Non-Null Count  Dtype   \n",
-      "---  ------                               --------------  -----   \n",
-      " 0   CODPROYECTO                          8795 non-null   float64 \n",
-      " 1   Education                            8795 non-null   object  \n",
-      " 2   Social_protection                    8795 non-null   object  \n",
-      " 3   Job_insecurity                       8795 non-null   object  \n",
-      " 4   Housing                              8795 non-null   object  \n",
-      " 5   Alterations_early_childhood_develop  8795 non-null   object  \n",
-      " 6   Social_inclusion                     8795 non-null   object  \n",
-      " 7   Risk_stigma                          8308 non-null   category\n",
-      " 8   Structural_conflic                   8795 non-null   float64 \n",
-      " 9   Age                                  8794 non-null   float64 \n",
-      " 10  Sex                                  8795 non-null   object  \n",
-      " 11  NumHijos                             8325 non-null   float64 \n",
-      " 12  Smoking                              8795 non-null   object  \n",
-      " 13  Biological_vulnerability             8795 non-null   object  \n",
-      " 14  Alcohol_DxCIE                        8795 non-null   object  \n",
-      " 15  Opiaceos_DxCIE                       8795 non-null   object  \n",
-      " 16  Cannabis_DXCIE                       8795 non-null   object  \n",
-      " 17  BZD_DxCIE                            8795 non-null   object  \n",
-      " 18  Cocaina_DxCIE                        8795 non-null   object  \n",
-      " 19  Alucinogenos_DXCIE                   8795 non-null   object  \n",
-      " 20  Tabaco_DXCIE                         8795 non-null   object  \n",
-      " 21  FrecuenciaConsumo30Dias              8795 non-null   object  \n",
-      " 22  Años_consumo_droga                   8627 non-null   float64 \n",
-      " 23  OtrosDx_Psiquiatrico                 8795 non-null   object  \n",
-      " 24  Tx_previos                           8795 non-null   object  \n",
-      " 25  Adherencia_tto_recalc                8795 non-null   float64 \n",
-      " 26  Tiempo_tx                            8795 non-null   float64 \n",
-      " 27  Readmisiones_estudios                8795 non-null   object  \n",
-      " 28  Situacion_tratamiento                8795 non-null   object  \n",
-      " 29  Periodos_COVID                       8795 non-null   object  \n",
-      " 30  Pandemia_inicio_fin_tratamiento      8795 non-null   object  \n",
-      " 31  Nreadmision                          8795 non-null   float64 \n",
-      " 32  Readmisiones_PRECOVID                8795 non-null   float64 \n",
-      " 33  Readmisiones_COVID                   8795 non-null   float64 \n",
-      "dtypes: category(1), float64(10), object(23)\n",
-      "memory usage: 2.3+ MB\n",
-      "None\n",
-      "-------------------------------\n",
-      "POST-ALTA\n",
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "Index: 1882 entries, 258 to 85149\n",
-      "Data columns (total 34 columns):\n",
-      " #   Column                               Non-Null Count  Dtype   \n",
-      "---  ------                               --------------  -----   \n",
-      " 0   CODPROYECTO                          1882 non-null   float64 \n",
-      " 1   Education                            1882 non-null   object  \n",
-      " 2   Social_protection                    1882 non-null   object  \n",
-      " 3   Job_insecurity                       1882 non-null   object  \n",
-      " 4   Housing                              1882 non-null   object  \n",
-      " 5   Alterations_early_childhood_develop  1882 non-null   object  \n",
-      " 6   Social_inclusion                     1882 non-null   object  \n",
-      " 7   Risk_stigma                          1777 non-null   category\n",
-      " 8   Structural_conflic                   1882 non-null   float64 \n",
-      " 9   Age                                  1882 non-null   float64 \n",
-      " 10  Sex                                  1882 non-null   object  \n",
-      " 11  NumHijos                             1778 non-null   float64 \n",
-      " 12  Smoking                              1882 non-null   object  \n",
-      " 13  Biological_vulnerability             1882 non-null   object  \n",
-      " 14  Alcohol_DxCIE                        1882 non-null   object  \n",
-      " 15  Opiaceos_DxCIE                       1882 non-null   object  \n",
-      " 16  Cannabis_DXCIE                       1882 non-null   object  \n",
-      " 17  BZD_DxCIE                            1882 non-null   object  \n",
-      " 18  Cocaina_DxCIE                        1882 non-null   object  \n",
-      " 19  Alucinogenos_DXCIE                   1882 non-null   object  \n",
-      " 20  Tabaco_DXCIE                         1882 non-null   object  \n",
-      " 21  FrecuenciaConsumo30Dias              1882 non-null   object  \n",
-      " 22  Años_consumo_droga                   1851 non-null   float64 \n",
-      " 23  OtrosDx_Psiquiatrico                 1882 non-null   object  \n",
-      " 24  Tx_previos                           1882 non-null   object  \n",
-      " 25  Adherencia_tto_recalc                1882 non-null   float64 \n",
-      " 26  Tiempo_tx                            1882 non-null   float64 \n",
-      " 27  Readmisiones_estudios                1882 non-null   object  \n",
-      " 28  Situacion_tratamiento                1882 non-null   object  \n",
-      " 29  Periodos_COVID                       1882 non-null   object  \n",
-      " 30  Pandemia_inicio_fin_tratamiento      1882 non-null   object  \n",
-      " 31  Nreadmision                          1882 non-null   float64 \n",
-      " 32  Readmisiones_PRECOVID                1882 non-null   float64 \n",
-      " 33  Readmisiones_COVID                   1882 non-null   float64 \n",
-      "dtypes: category(1), float64(10), object(23)\n",
-      "memory usage: 501.9+ KB\n",
-      "None\n",
-      "-------------------------------\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"PRE\")\n",
-    "print(conj_pre.info())\n",
-    "print (\"-------------------------------\")\n",
-    "print(\"PRE-ABANDONO\")\n",
-    "print(pre_abandono.info())\n",
-    "print (\"-------------------------------\")\n",
-    "print(\"PRE-ALTA\")\n",
-    "print(pre_alta.info())\n",
-    "print (\"-------------------------------\")\n",
-    "\n",
-    "print(\"\\n\\n\\n\")\n",
-    "\n",
-    "print (\"POST\")\n",
-    "print(conj_post.info())\n",
-    "print (\"-------------------------------\")\n",
-    "print(\"POST-ABANDONO\")\n",
-    "print(post_abandono.info())\n",
-    "print (\"-------------------------------\")\n",
-    "print(\"POST-ALTA\")\n",
-    "print(post_alta.info())\n",
-    "print (\"-------------------------------\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Missing and Unknown Values"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 133,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n",
-      "['Live with families or friends' 'live alone' 'live in institutions']\n",
-      "['No alterations (first exposure at 11 or more years)'\n",
-      " 'Alterations (first exposure before 11 years old)' '9']\n",
-      "['No alterations (first exposure at 11 or more years)'\n",
-      " 'Alterations (first exposure before 11 years old)']\n",
-      "[NaN, 'Yes', 'No']\n",
-      "Categories (3, object): [99.0, 'No', 'Yes']\n",
-      "[NaN, 'Yes', 'No']\n",
-      "Categories (2, object): ['No', 'Yes']\n",
-      "[nan  0.  1.  2.  3.  4.  5.  8. 10.  6. 11. 12.  9.  7. 99. 14. 15.]\n",
-      "[nan  0.  1.  2.  3.  4.  5.  8. 10.  6. 11. 12.  9.  7. 14. 15.]\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\1003504044.py:14: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n",
-      "  bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# 9.0 represents unknown according to Variables.docx \n",
-    "print(bd['Social_inclusion'].unique())\n",
-    "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n",
-    "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n",
-    "print(bd['Social_inclusion'].unique())\n",
-    "\n",
-    "print(bd['Alterations_early_childhood_develop'].unique())\n",
-    "mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n",
-    "bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n",
-    "print(bd['Alterations_early_childhood_develop'].unique())\n",
-    "\n",
-    "print(bd['Risk_stigma'].unique())\n",
-    "mode_stigma = bd['Risk_stigma'].mode()[0]\n",
-    "bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n",
-    "print(bd['Risk_stigma'].unique())\n",
-    "\n",
-    "print(bd['NumHijos'].unique())\n",
-    "mode_hijos = bd['NumHijos'].mode()[0]\n",
-    "bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n",
-    "print(bd['NumHijos'].unique())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 119,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Total missing values Age: 10\n",
-      "Total missing values Años_consumo_droga: 718\n",
-      "Total missing values Risk_stigma: 1847\n",
-      "Total missing values NumHijos: 1788\n",
-      "\tCONJUNTO PREPANDEMIA\n",
-      "\t\tMissing values Age: 9\n",
-      "\t\tMissing values Años_consumo_droga: 519\n",
-      "\t\tMissing values Risk_stigma: 1255\n",
-      "\t\tMissing values NumHijos: 1214\n",
-      "\tCONJUNTO POSTPANDEMIA\n",
-      "\t\tMissing values Age: 1\n",
-      "\t\tMissing values Años_consumo_droga: 199\n",
-      "\t\tMissing values Risk_stigma: 592\n",
-      "\t\tMissing values NumHijos: 574\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Columns audited for missing values before imputation\n",
-    "cols_to_check = ['Age', 'Años_consumo_droga', 'Risk_stigma', 'NumHijos']\n",
-    "\n",
-    "# Whole dataset\n",
-    "for col in cols_to_check:\n",
-    "    print(f\"Total missing values {col}: {bd[col].isnull().sum()}\")\n",
-    "\n",
-    "# Same report for the pre- and post-pandemic subsets\n",
-    "for header, subset in [(\"\\tCONJUNTO PREPANDEMIA\", conj_pre), (\"\\tCONJUNTO POSTPANDEMIA\", conj_post)]:\n",
-    "    print(header)\n",
-    "    for col in cols_to_check:\n",
-    "        print(f\"\\t\\tMissing values {col}: {subset[col].isnull().sum()}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 134,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
-      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
-      "\n",
-      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
-      "\n",
-      "\n",
-      "  bd['Age'].fillna(age_mode, inplace=True)\n",
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
-      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
-      "\n",
-      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
-      "\n",
-      "\n",
-      "  bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n",
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
-      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
-      "\n",
-      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
-      "\n",
-      "\n",
-      "  bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n",
-      "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
-      "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
-      "\n",
-      "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
-      "\n",
-      "\n",
-      "  bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Impute missing values with the mode of each column.\n",
-    "# The filled Series is assigned back to bd: calling fillna(inplace=True) on\n",
-    "# bd[col] is chained assignment, which raises a FutureWarning and will no\n",
-    "# longer modify bd in pandas 3.0.\n",
-    "age_mode = bd['Age'].mode()[0]\n",
-    "bd['Age'] = bd['Age'].fillna(age_mode)\n",
-    "\n",
-    "años_consumo_mode = bd['Años_consumo_droga'].mode()[0]\n",
-    "bd['Años_consumo_droga'] = bd['Años_consumo_droga'].fillna(años_consumo_mode)\n",
-    "\n",
-    "risk_stigma_mode = bd['Risk_stigma'].mode()[0]\n",
-    "bd['Risk_stigma'] = bd['Risk_stigma'].fillna(risk_stigma_mode)\n",
-    "\n",
-    "num_hijos_mode = bd['NumHijos'].mode()[0]\n",
-    "bd['NumHijos'] = bd['NumHijos'].fillna(num_hijos_mode)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Distribution of Variables"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n",
-    "        'Alterations_early_childhood_develop', 'Social_inclusion',\n",
-    "        'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n",
-    "        'Opiaceos_DxCIE', 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE',\n",
-    "        'Alucinogenos_DXCIE', 'Tabaco_DXCIE', 'FrecuenciaConsumo30Dias',\n",
-    "        'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n",
-    "        ]\n",
-    "\n",
-    "num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Discrete"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Countplots"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n",
-    "plt.subplots_adjust(hspace=0.75, wspace=1.25)\n",
-    "\n",
-    "# (outcome, period) pair of every row, used as hue. It is the same for all\n",
-    "# attributes, so it is built once instead of once per subplot.\n",
-    "hue_pairs = combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1)\n",
-    "hue_pairs_order = [('Abandono', 'Pre'), ('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')]\n",
-    "\n",
-    "for i, disc_att in enumerate(disc_atts):\n",
-    "    ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=hue_pairs, hue_order=hue_pairs_order, ax=axs[i])\n",
-    "    ax.set_title(disc_att, fontsize=16, fontweight='bold')\n",
-    "    ax.get_legend().set_title(\"Groups\")\n",
-    "\n",
-    "    # Adding count annotations (only patches labelled '_nolegend_' are annotated)\n",
-    "    for p in ax.patches:\n",
-    "        if p.get_label() == '_nolegend_':\n",
-    "            ax.annotate(format(p.get_height(), '.0f'),\n",
-    "                        (p.get_x() + p.get_width() / 2., p.get_height()),\n",
-    "                        ha='center', va='center',\n",
-    "                        xytext=(0, 9),\n",
-    "                        textcoords='offset points')\n",
-    "\n",
-    "# Adjust layout to prevent overlapping titles\n",
-    "plt.tight_layout()\n",
-    "\n",
-    "plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Normalized Countplots"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n",
-    "    \"\"\"\n",
-    "        Draw on row `i` of the global `axs` grid the distribution of `disc_att`,\n",
-    "        expressed as a percentage within each 'Situacion_tratamiento' class.\n",
-    "\n",
-    "        i: row of `axs` to draw on\n",
-    "        group: 1 (all), 2 (pre), 3 (post); selects bd, conj_pre or conj_post.\n",
-    "               The subplot column is group-2, so group 1 has no column in the grid.\n",
-    "        disc_att: name of the discrete column to plot\n",
-    "\n",
-    "        Raises ValueError if `group` is unknown or has no subplot column.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    # Define data to work with based on group\n",
-    "    data_by_group = {1: bd, 2: conj_pre, 3: conj_post}\n",
-    "    if group not in data_by_group:\n",
-    "        raise ValueError(f\"group must be 1 (all), 2 (pre) or 3 (post), got {group!r}\")\n",
-    "    df = data_by_group[group]\n",
-    "\n",
-    "    # A negative column would wrap around and silently draw over the last column\n",
-    "    col = group - 2\n",
-    "    if col < 0:\n",
-    "        raise ValueError(\"group 1 (all) has no subplot column: columns are indexed as group-2\")\n",
-    "\n",
-    "    # GOAL: find percentage of each possible category within the total of its situacion_tto subset\n",
-    "    # Group data by 'Situacion_tratamiento' and the attribute, and count occurrences\n",
-    "    grouped_counts = df.groupby(['Situacion_tratamiento', disc_att]).size().reset_index(name='count')\n",
-    "    # Calculate total count for each 'Situacion_tratamiento' group\n",
-    "    total_counts = df.groupby('Situacion_tratamiento')[disc_att].count()\n",
-    "    # Divide each count by its corresponding total count and calculate percentage\n",
-    "    grouped_counts['percentage'] = grouped_counts.apply(lambda row: row['count'] / total_counts[row['Situacion_tratamiento']] * 100, axis=1)\n",
-    "\n",
-    "    # Follow the same order in plot as in computations\n",
-    "    col_order = grouped_counts[grouped_counts['Situacion_tratamiento'] == 'Abandono'][disc_att].tolist()\n",
-    "\n",
-    "    # Create countplot and split each bar into two based on the value of sit_tto\n",
-    "    ax = sns.countplot(x=disc_att, hue='Situacion_tratamiento', data=df, order=col_order, ax=axs[i, col])\n",
-    "\n",
-    "    # Adjust y-axis to represent percentages out of the total count\n",
-    "    ax.set_ylim(0, 100)\n",
-    "\n",
-    "    # NOTE(review): bars are matched to percentages by position, which assumes\n",
-    "    # ax.patches follows the row order of grouped_counts - confirm for the seaborn version in use\n",
-    "    percentages = grouped_counts['percentage']\n",
-    "    for patch_idx, p in enumerate(ax.patches):\n",
-    "        # Skip going over the legend values\n",
-    "        if p.get_label() == \"_nolegend_\":\n",
-    "            # Set height to corresponding percentage and annotate result\n",
-    "            height = percentages[patch_idx]\n",
-    "            p.set_height(height)\n",
-    "            ax.annotate(f'{height:.2f}%', (p.get_x() + p.get_width() / 2., height),\n",
-    "                        ha='center', va='bottom', fontsize=6, color='black', xytext=(0, 5),\n",
-    "                        textcoords='offset points')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fig, axs = plt.subplots(len(disc_atts), 2, figsize=(15, 7*len(disc_atts)))\n",
-    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
-    "\n",
-    "# (group code passed to plot_count_perc_norm, subplot column, panel title)\n",
-    "panels = [(2, 0, \"\\nPRE\"), (3, 1, \"\\nPOST\")]\n",
-    "\n",
-    "for i, disc_att in enumerate(disc_atts):\n",
-    "    for group, col, title in panels:\n",
-    "        plot_count_perc_norm(i, group, disc_att)\n",
-    "        axs[i, col].set_title(title)\n",
-    "        axs[i, col].set_xlabel(disc_att, fontweight='bold')\n",
-    "        axs[i, col].set_ylabel(\"% of total within its Sit_tto group\")\n",
-    "        axs[i, col].tick_params(axis='x', rotation=90)\n",
-    "\n",
-    "# Adjust layout to prevent overlapping titles\n",
-    "plt.tight_layout()\n",
-    "\n",
-    "# Export the whole grid as a single SVG\n",
-    "plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Numerical"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Summary Stats"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(bd[num_atts].describe())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Boxplots"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fig, axs = plt.subplots(len(num_atts), 1, figsize=(12, 5*len(num_atts)))\n",
-    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
-    "\n",
-    "# One boxplot per numerical attribute, split by period (y) and outcome (hue).\n",
-    "# Each plot is drawn on the Axes created above rather than re-selecting it\n",
-    "# through the pyplot state machine (plt.subplot).\n",
-    "for i, num_att in enumerate(num_atts):\n",
-    "    sns.boxplot(\n",
-    "        data=combined_pre_post,\n",
-    "        x=num_att,\n",
-    "        y='Group',\n",
-    "        hue='Situacion_tratamiento',\n",
-    "        ax=axs[i],\n",
-    "    )\n",
-    "\n",
-    "# Adjust layout to prevent overlapping titles\n",
-    "plt.tight_layout()\n",
-    "\n",
-    "# Save the figure in SVG format in the \"./output/plots/distributions\" folder\n",
-    "plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Histograms"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fig, axs = plt.subplots(len(num_atts), 3, figsize=(15, 6*len(num_atts)))\n",
-    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
-    "\n",
-    "# One column per population: all patients, pre-pandemic, post-pandemic\n",
-    "populations = [(bd, 'ALL'), (conj_pre, 'PRE'), (conj_post, 'POST')]\n",
-    "\n",
-    "for i, num_att in enumerate(num_atts):\n",
-    "    for j, (population, label) in enumerate(populations):\n",
-    "        # Per-outcome histogram (each outcome normalised on its own) with a KDE overlay\n",
-    "        sns.histplot(data=population, x=num_att, bins=15, hue='Situacion_tratamiento', stat='probability',\n",
-    "                     common_norm=False, kde=True, line_kws={'lw': 5}, alpha=0.4, ax=axs[i, j])\n",
-    "        axs[i, j].set_title(f\"\\nDistr. of {num_att}  - {label}\")\n",
-    "\n",
-    "# Adjust layout to prevent overlapping titles\n",
-    "plt.tight_layout()\n",
-    "\n",
-    "# Export the whole grid as a single SVG\n",
-    "plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Correlation Analysis"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Groups of Variables"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 135,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n",
-    "            'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n",
-    "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n",
-    "            'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n",
-    "            'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n",
-    "target_var = 'Situacion_tratamiento'\n",
-    "\n",
-    "# Columns that are already numeric and we don't need to redefine \n",
-    "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### One-hot Encoding"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Binary"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 136,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Encode every two-level variable as 0/1 in a new '<column>_REDEF' column\n",
-    "\n",
-    "# 'Alterations_early_childhood_develop'\n",
-    "alterations_mapping = {\n",
-    "    'No alterations (first exposure at 11 or more years)' : 0,\n",
-    "    'Alterations (first exposure before 11 years old)': 1,\n",
-    "}\n",
-    "\n",
-    "# Shared by all the No/Sí variables\n",
-    "no_si_mapping = {'No': 0, 'Sí': 1}\n",
-    "\n",
-    "# Column -> {original label: code}, listed in the order the new columns are added to bd\n",
-    "binary_mappings = {\n",
-    "    'Alterations_early_childhood_develop': alterations_mapping,\n",
-    "    'Social_protection': no_si_mapping,\n",
-    "    'Risk_stigma': {'No': 0, 'Yes': 1},\n",
-    "    'Sex': {'Hombre': 0, 'Mujer': 1},\n",
-    "    'Smoking': no_si_mapping,\n",
-    "    'Biological_vulnerability': no_si_mapping,\n",
-    "    # 'Droga_DxCIE'\n",
-    "    'Opiaceos_DxCIE': no_si_mapping,\n",
-    "    'Cannabis_DXCIE': no_si_mapping,\n",
-    "    'BZD_DxCIE': no_si_mapping,\n",
-    "    'Cocaina_DxCIE': no_si_mapping,\n",
-    "    'Alucinogenos_DXCIE': no_si_mapping,\n",
-    "    'Tabaco_DXCIE': no_si_mapping,\n",
-    "    'OtrosDx_Psiquiatrico': no_si_mapping,\n",
-    "    'Tx_previos': no_si_mapping,\n",
-    "    # Target variable, important to define properly: Abandono is the positive class (1)\n",
-    "    'Situacion_tratamiento': {'Abandono': 1, 'Alta terapéutica': 0},\n",
-    "}\n",
-    "\n",
-    "for col, mapping in binary_mappings.items():\n",
-    "    bd[col + '_REDEF'] = bd[col].map(mapping)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Categorical"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 137,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Specify columns to one hot encode; empty list otherwise\n",
-    "one_hot_vars = ['Education', 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias']\n",
-    "\n",
-    "# Prefix given to the dummy columns of each variable\n",
-    "one_hots_vars_prefix = {\n",
-    "    'Education': 'Ed',\n",
-    "    'Job_insecurity': 'JobIn',\n",
-    "    'Housing': 'Hous', \n",
-    "    'Social_inclusion': 'SocInc',\n",
-    "    'FrecuenciaConsumo30Dias': 'Frec30',\n",
-    "}\n",
-    "\n",
-    "# One frame of dummy columns per variable\n",
-    "encoded_frames = {\n",
-    "    one_hot_var: pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n",
-    "    for one_hot_var in one_hot_vars\n",
-    "}\n",
-    "\n",
-    "# Names of the dummy columns generated for each variable\n",
-    "one_hot_cols_dic = {one_hot_var: frame.columns.tolist() for one_hot_var, frame in encoded_frames.items()}\n",
-    "\n",
-    "# Append all dummy columns with a single concat instead of re-copying bd once per variable\n",
-    "bd = pd.concat([bd, *encoded_frames.values()], axis=1)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Final Columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 142,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def _encoded_names(var_names):\n",
-    "    \"\"\"\n",
-    "        Return the encoded column names for `var_names`: the name itself if it is in\n",
-    "        no_redef_cols, its dummy columns if it was one-hot encoded, name + '_REDEF' otherwise.\n",
-    "    \"\"\"\n",
-    "    encoded = []\n",
-    "    for var in var_names:\n",
-    "        if var in no_redef_cols:\n",
-    "            # Already numeric: keep as is\n",
-    "            encoded.append(var)\n",
-    "        elif var in one_hot_vars:\n",
-    "            # One-hot encoded: use all its dummy columns\n",
-    "            encoded.extend(one_hot_cols_dic[var])\n",
-    "        else:\n",
-    "            # Redefined through a mapping\n",
-    "            encoded.append(var + '_REDEF')\n",
-    "    return encoded\n",
-    "\n",
-    "soc_vars_enc = _encoded_names(social_vars)\n",
-    "ind_vars_enc = _encoded_names(ind_vars)\n",
-    "\n",
-    "# Final version of columns we need to use for correlation analysis\n",
-    "corr_cols = soc_vars_enc + ind_vars_enc"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 143,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Dummy columns that encode an unknown category; they are left out of every feature list\n",
-    "unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n",
-    "\n",
-    "corr_cols = [name for name in corr_cols if name not in unknown_cols]\n",
-    "soc_vars_enc = [name for name in soc_vars_enc if name not in unknown_cols]\n",
-    "ind_vars_enc = [name for name in ind_vars_enc if name not in unknown_cols]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "##### Renaming and Filtering"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 144,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "columns_to_keep = corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n",
-    "bd = bd[columns_to_keep]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 145,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "name_mapping = {\n",
-    "    'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n",
-    "    'Ed_Primary education': 'Ed_Primary',\n",
-    "    'Ed_Secondary Education': 'Ed_Secondary',\n",
-    "    'Ed_Secondary more technical education': 'Ed_Secondary_Technical',\n",
-    "    'Ed_Tertiary': 'Ed_Tertiary',\n",
-    "    'Social_protection_REDEF': 'Social_Protection',\n",
-    "    'JobIn_Non-stable': 'JobIn_Unstable',\n",
-    "    'JobIn_Stable': 'JobIn_Stable',\n",
-    "    'JobIn_Unemployed': 'JobIn_Unemployed',\n",
-    "    'Hous_Institutional': 'Hous_Institutional',\n",
-    "    'Hous_Stable': 'Hous_Stable',\n",
-    "    'Hous_Unstable': 'Hous_Unstable',\n",
-    "    'Alterations_early_childhood_develop_REDEF': 'Early_Alterations',\n",
-    "    'SocInc_Live with families or friends': 'SocInc_Family_Friends',\n",
-    "    'SocInc_live alone': 'SocInc_Alone',\n",
-    "    'SocInc_live in institutions': 'SocInc_Instit',\n",
-    "    'Risk_stigma_REDEF': 'Risk_Stigma',\n",
-    "    'Structural_conflic': 'Structural_Conflict',\n",
-    "    'Age': 'Age',\n",
-    "    'Sex_REDEF': 'Sex',\n",
-    "    'NumHijos': 'Num_Children',\n",
-    "    'Smoking_REDEF': 'Smoking',\n",
-    "    'Biological_vulnerability_REDEF': 'Bio_Vulner',\n",
-    "    'Opiaceos_DxCIE_REDEF': 'Opiods_DXCIE',\n",
-    "    'Cannabis_DXCIE_REDEF': 'Cannabis_DXCIE',\n",
-    "    'BZD_DxCIE_REDEF': 'BZD_DXCIE',\n",
-    "    'Cocaina_DxCIE_REDEF': 'Cocaine_DXCIE',\n",
-    "    'Alucinogenos_DXCIE_REDEF': 'Hallucin_DXCIE',\n",
-    "    'Tabaco_DXCIE_REDEF': 'Tobacco_DXCIE',\n",
-    "    'Frec30_1 día/semana': 'Freq_1dpw',\n",
-    "    'Frec30_2-3 días\\u200e/semana': 'Freq_2-3dpw',\n",
-    "    'Frec30_4-6 días/semana': 'Freq_4-6dpw',\n",
-    "    'Frec30_Menos de 1 día\\u200e/semana': 'Freq_l1dpw',\n",
-    "    'Frec30_No consumio': 'Freq_None',\n",
-    "    'Frec30_Todos los días': 'Freq_Everyday',\n",
-    "    'Años_consumo_droga': 'Years_Drug_Use',\n",
-    "    'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n",
-    "    'Tx_previos_REDEF': 'Previous_Treatments',\n",
-    "    'Adherencia_tto_recalc': 'Treatment_Adherence',\n",
-    "    'Situacion_tratamiento_REDEF': 'Treatment_Outcome',\n",
-    "    'Situacion_tratamiento': 'Situacion_tratamiento',\n",
-    "    'Pandemia_inicio_fin_tratamiento': 'Pandemia_inicio_fin_tratamiento'\n",
-    "}\n",
-    "\n",
-    "# Update lists of feature names\n",
-    "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n",
-    "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n",
-    "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 146,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Export feature names\n",
-    "# Export feature names: one .npy file per list\n",
-    "exports = [\n",
-    "    ('./output/feature_names.npy', corr_cols),\n",
-    "    ('./output/soc_vars_names.npy', soc_vars_enc),\n",
-    "    ('./output/ind_vars_names.npy', ind_vars_enc),\n",
-    "]\n",
-    "for out_path, names in exports:\n",
-    "    np.save(out_path, names)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 147,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]\n",
-    "#print(bd.columns)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 148,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Update main dfs\n",
-    "# Pre-pandemic\n",
-    "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
-    "# Pre-pandemic abandono\n",
-    "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
-    "# Pre-pandemic alta\n",
-    "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
-    "\n",
-    "# Post-pandemic\n",
-    "# Merging last two classes to balance sets\n",
-    "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n",
-    "               (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n",
-    "# Post-pandemic abandono\n",
-    "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n",
-    "# Post-pandemic alta\n",
-    "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Correlation Matrices"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 149,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n",
-    "cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 150,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_corr_matrix(df, cols):\n",
-    "    \"\"\"\n",
-    "        Build the lower-triangular correlation matrix of `cols` in `df`.\n",
-    "\n",
-    "        Measure used for each pair:\n",
-    "            both in binary_vars -> tetrachoric\n",
-    "            both in cont_vars   -> Spearman\n",
-    "            any other pair      -> point-biserial\n",
-    "        The diagonal and the upper triangle are left at zero.\n",
-    "    \"\"\"\n",
-    "    n = len(cols)\n",
-    "    corr_matrix = np.zeros((n, n))\n",
-    "\n",
-    "    # Visit every pair below the diagonal exactly once\n",
-    "    for i in range(1, n):\n",
-    "        var_i = cols[i]\n",
-    "        for j in range(i):\n",
-    "            var_j = cols[j]\n",
-    "            if var_i in binary_vars and var_j in binary_vars:\n",
-    "                corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n",
-    "            elif var_i in cont_vars and var_j in cont_vars:\n",
-    "                # pandas is used here because pypair's continuous_continuous returned nan sometimes\n",
-    "                corr = df[var_i].corr(df[var_j], method='spearman')\n",
-    "            else:\n",
-    "                # Mixed pair: work out which of the two is the binary one\n",
-    "                bin_var, cont_var = (var_i, var_j) if var_i in binary_vars else (var_j, var_i)\n",
-    "                corr = binary_continuous(df[bin_var], df[cont_var], measure='point_biserial')\n",
-    "            corr_matrix[i][j] = corr\n",
-    "\n",
-    "    return corr_matrix"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 151,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_heatmap(sit_tto: int, group:int) -> None:\n",
-    "    \"\"\"\n",
-    "        sit_tto: 1 (include it as another var), 2 (only abandono), 3 (only alta)\n",
-    "        group: 1 (all alcohol patients), 2 (pre), 3 (post)\n",
-    "    \"\"\"\n",
-    "\n",
-    "    # Define columns based on sit_tto arg\n",
-    "    if sit_tto == 1:\n",
-    "        # Include target as another variable\n",
-    "        cols = ['Treatment_Outcome'] + corr_cols\n",
-    "    else:\n",
-    "        cols = corr_cols\n",
-    "        \n",
-    "    # Title plot and select data based on group and sit_tto\n",
-    "    if group == 1:\n",
-    "        plot_title = \"Correl Matrix - ALL\"\n",
-    "        if sit_tto == 1:\n",
-    "            bd_ca = bd[cols]\n",
-    "        elif sit_tto == 2:\n",
-    "            bd_ca = bd[bd['Situacion_tratamiento'] == 'Abandono'][cols]\n",
-    "        elif sit_tto == 3:\n",
-    "            bd_ca = bd[bd['Situacion_tratamiento'] == 'Alta terapéutica'][cols]\n",
-    "    elif group == 2:\n",
-    "        plot_title = \"Correl Matrix - PRE\"\n",
-    "        if sit_tto == 1:    \n",
-    "            bd_ca = conj_pre[cols]\n",
-    "        elif sit_tto == 2:\n",
-    "            bd_ca = pre_abandono[cols]\n",
-    "        elif sit_tto == 3:\n",
-    "            bd_ca = pre_alta[cols]\n",
-    "    elif group == 3:\n",
-    "        plot_title = \"Correl Matrix - POST\"\n",
-    "        if sit_tto == 1:    \n",
-    "            bd_ca = conj_post[cols]\n",
-    "        elif sit_tto == 2:\n",
-    "            bd_ca = post_abandono[cols]\n",
-    "        elif sit_tto == 3:\n",
-    "            bd_ca = post_alta[cols]\n",
-    "            \n",
-    "    # Complete title\n",
-    "    if sit_tto == 2:\n",
-    "        plot_title += \" - ABANDONO\"\n",
-    "    elif sit_tto == 3:\n",
-    "        plot_title += \" - ALTA\"\n",
-    "\n",
-    "    corr_matrix = get_corr_matrix(bd_ca, cols)\n",
-    "\n",
-    "    # Create a mask for the upper triangle\n",
-    "    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n",
-    "\n",
-    "    # Create heatmap correlation matrix\n",
-    "    dataplot = sns.heatmap(corr_matrix, mask=mask, xticklabels=cols, yticklabels=cols, cmap=\"coolwarm\", vmin=-1, vmax=1, annot=True, fmt=\".2f\", annot_kws={\"size\": 4})\n",
-    "\n",
-    "    # Group ind vs social vars by color and modify tick label names\n",
-    "    for tick_label in dataplot.axes.xaxis.get_ticklabels():\n",
-    "        if tick_label.get_text() in ind_vars_enc:\n",
-    "            tick_label.set_color('green')\n",
-    "        elif tick_label.get_text() in soc_vars_enc:\n",
-    "            tick_label.set_color('purple')  \n",
-    "    for tick_label in dataplot.axes.yaxis.get_ticklabels():\n",
-    "        if tick_label.get_text() in ind_vars_enc:\n",
-    "            tick_label.set_color('green')\n",
-    "        elif tick_label.get_text() in soc_vars_enc:\n",
-    "            tick_label.set_color('purple') \n",
-    "\n",
-    "    # Increase the size of xtick labels\n",
-    "    # dataplot.tick_params(axis='x', labelsize=12)\n",
-    "\n",
-    "    # Increase the size of ytick labels\n",
-    "    # dataplot.tick_params(axis='y', labelsize=12)\n",
-    "\n",
-    "    # Add legend and place it in lower left \n",
-    "    plt.legend(handles=[\n",
-    "        plt.Line2D([0], [0], marker='o', color='w', label='Social Factors', markerfacecolor='purple', markersize=10),\n",
-    "        plt.Line2D([0], [0], marker='o', color='w', label='Individual Factors', markerfacecolor='green', markersize=10)\n",
-    "    ], bbox_to_anchor=(-0.1, -0.1), fontsize = 20)\n",
-    "\n",
-    "    plt.title(\"\\n\\n\" + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})\n",
-    "\n",
-    "    return corr_matrix"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 152,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 3x3 grid of heatmaps: one row per 'Situacion_tratamiento' code, one column per group\n",
-    "fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(50, 50))\n",
-    "plt.subplots_adjust(hspace=0.75, wspace=2)\n",
-    "# One (pre, post) tuple of correlation matrices per row, to compare pre vs post\n",
-    "corr_mats = []\n",
-    "\n",
-    "for sit_tto in (1, 2, 3):\n",
-    "    # 1-based index of the leftmost subplot in this row\n",
-    "    first_slot = 3 * (sit_tto - 1) + 1\n",
-    "\n",
-    "    # ALL\n",
-    "    plt.subplot(3, 3, first_slot)\n",
-    "    _ = plot_heatmap(sit_tto, 1)\n",
-    "\n",
-    "    # PRE\n",
-    "    plt.subplot(3, 3, first_slot + 1)\n",
-    "    corr_matrix_pre = plot_heatmap(sit_tto, 2)\n",
-    "\n",
-    "    # POST\n",
-    "    plt.subplot(3, 3, first_slot + 2)\n",
-    "    corr_matrix_post = plot_heatmap(sit_tto, 3)\n",
-    "\n",
-    "    corr_mats.append((corr_matrix_pre, corr_matrix_post))\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "\n",
-    "plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/EDA/output/feature_names.npy
+++ b/EDA/output/feature_names.npy
--- a/EDA/output/ind_vars_names.npy
+++ b/EDA/output/ind_vars_names.npy
--- a/EDA/output/soc_vars_names.npy
+++ b/EDA/output/soc_vars_names.npy
--- a/EDA/output/plots/feature_importance/ANOVA.svg
+++ b/EDA/output/plots/feature_importance/ANOVA.svg
--- a/EDA/output/plots/feature_importance/mutual_info.svg
+++ b/EDA/output/plots/feature_importance/mutual_info.svg
--- a/EDA/output/plots/feature_importance/var_threshold.svg
+++ b/EDA/output/plots/feature_importance/var_threshold.svg