EDA.ipynb 45.2 KB
Newer Older
Joaquin Torres's avatar
Joaquin Torres committed
1 2 3 4 5 6
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
7
    "**Exploratory Data Analysis** \\\n",
Joaquin Torres's avatar
Joaquin Torres committed
8
    "_Author: Joaquín Torres Bravo_"
Joaquin Torres's avatar
Joaquin Torres committed
9 10 11 12 13 14
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
15
    "### Libraries"
Joaquin Torres's avatar
Joaquin Torres committed
16 17 18 19
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
20
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
21 22 23 24 25 26
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
27
    "import numpy as np\n",
Joaquin Torres's avatar
Joaquin Torres committed
28 29 30 31 32
    "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import f_classif\n",
    "from sklearn.feature_selection import mutual_info_classif"
Joaquin Torres's avatar
Joaquin Torres committed
33 34 35 36 37 38
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
39
    "### First Steps"
Joaquin Torres's avatar
Joaquin Torres committed
40 41 42 43
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
44
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
45 46 47
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
48
    "bd_all = pd.read_spss('./input/17_abril.sav')\n",
    "\n",
    "# Keep only alcohol patients whose 'Situacion_tratamiento' is\n",
    "# 'Abandono' or 'Alta terapéutica'\n",
    "is_alcohol = bd_all['Alcohol_DxCIE'] == 'Sí'\n",
    "is_finished = bd_all['Situacion_tratamiento'].isin(['Abandono', 'Alta terapéutica'])\n",
    "\n",
    "# .copy() makes bd an independent frame, so the column assignments done on bd\n",
    "# in later cells do not raise SettingWithCopyWarning\n",
    "bd = bd_all[is_alcohol & is_finished].copy()"
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
59 60 61
   "execution_count": null,
   "metadata": {},
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
62
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
    "# Pre-pandemic\n",
    "# .copy() gives an independent frame, so adding the 'Group' column below\n",
    "# does not raise SettingWithCopyWarning\n",
    "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia'].copy()\n",
    "# Pre-pandemic abandono\n",
    "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
    "# Pre-pandemic alta\n",
    "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
    "\n",
    "# Post-pandemic\n",
    "# Merging last two classes to balance sets\n",
    "post_periods = ['Inicio prepandemia y fin en pandemia', 'inicio y fin en pandemia']\n",
    "conj_post = bd[bd['Pandemia_inicio_fin_tratamiento'].isin(post_periods)].copy()\n",
    "# Post-pandemic abandono\n",
    "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n",
    "# Post-pandemic alta\n",
    "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']\n",
    "\n",
    "# Concatenate the two data frames and add a new column to distinguish between them. Useful for plots.\n",
    "# The *_abandono / *_alta frames were sliced above, so they do not carry 'Group'.\n",
    "conj_post['Group'] = 'Post'\n",
    "conj_pre['Group'] = 'Pre'\n",
    "combined_pre_post = pd.concat([conj_post, conj_pre])"
   ]
  },
85 86
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
87 88 89
   "execution_count": null,
   "metadata": {},
   "outputs": [],
90 91 92 93 94 95 96 97 98 99 100
   "source": [
    "# Report the number of rows of each period subset and of its two outcome subsets\n",
    "size_report = [\n",
    "    f\"PRE: {len(conj_pre)}\",\n",
    "    f\"\\tALTA: {len(pre_alta)}\",\n",
    "    f\"\\tABANDONO: {len(pre_abandono)}\",\n",
    "    f\"POST: {len(conj_post)}\",\n",
    "    f\"\\tALTA: {len(post_alta)}\",\n",
    "    f\"\\tABANDONO: {len(post_abandono)}\",\n",
    "]\n",
    "print(\"\\n\".join(size_report))"
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
101
  {
102
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
103
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
104
   "metadata": {},
Joaquin Torres's avatar
Joaquin Torres committed
105
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
   "source": [
    "# DataFrame.info() prints its report itself and returns None, so it is called\n",
    "# directly: wrapping it in print() adds a stray 'None' line after every report.\n",
    "separator = \"-------------------------------\"\n",
    "\n",
    "for label, frame in [(\"PRE\", conj_pre), (\"PRE-ABANDONO\", pre_abandono), (\"PRE-ALTA\", pre_alta)]:\n",
    "    print(label)\n",
    "    frame.info()\n",
    "    print(separator)\n",
    "\n",
    "print(\"\\n\\n\\n\")\n",
    "\n",
    "for label, frame in [(\"POST\", conj_post), (\"POST-ABANDONO\", post_abandono), (\"POST-ALTA\", post_alta)]:\n",
    "    print(label)\n",
    "    frame.info()\n",
    "    print(separator)"
   ]
  },
  {
   "cell_type": "markdown",
Joaquin Torres's avatar
Joaquin Torres committed
132 133
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
134
    "### Missing and Unknown Values"
135 136 137 138
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
139 140 141
   "execution_count": null,
   "metadata": {},
   "outputs": [],
142
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
143 144 145 146
    "def _mode_excluding(series, unknown):\n",
    "    \"\"\"Return the most frequent value of `series`, ignoring the `unknown` code.\n",
    "\n",
    "    Computing the mode on the raw column would let the unknown code be chosen\n",
    "    as its own replacement. Falls back to the plain mode when every non-null\n",
    "    value equals `unknown`.\n",
    "    \"\"\"\n",
    "    known = series[series != unknown].dropna()\n",
    "    return (series if known.empty else known).mode()[0]\n",
    "\n",
    "# NOTE(review): conj_pre / conj_post were sliced from bd in an earlier cell, so the\n",
    "# replacements made here on bd are not reflected in them.\n",
    "\n",
    "# 9.0 represents unknown according to Variables.docx\n",
    "print(bd['Social_inclusion'].unique())\n",
    "mode_soc_inc = _mode_excluding(bd['Social_inclusion'], '9.0')\n",
    "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n",
    "print(bd['Social_inclusion'].unique())\n",
    "\n",
    "print(bd['Alterations_early_childhood_develop'].unique())\n",
    "mode_alt = _mode_excluding(bd['Alterations_early_childhood_develop'], '9')\n",
    "bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n",
    "print(bd['Alterations_early_childhood_develop'].unique())\n",
    "\n",
    "print(bd['Risk_stigma'].unique())\n",
    "mode_stigma = _mode_excluding(bd['Risk_stigma'], 99.0)\n",
    "bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n",
    "print(bd['Risk_stigma'].unique())\n",
    "\n",
    "print(bd['NumHijos'].unique())\n",
    "mode_hijos = _mode_excluding(bd['NumHijos'], 99.0)\n",
    "bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n",
    "print(bd['NumHijos'].unique())"
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
165 166
  {
   "cell_type": "code",
167
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
168
   "metadata": {},
169
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
170 171 172
   "source": [
    "# Columns whose missing values are counted, in report order\n",
    "missing_cols = ['Age', 'Años_consumo_droga', 'Risk_stigma', 'NumHijos']\n",
    "\n",
    "# Whole filtered dataset\n",
    "for col in missing_cols:\n",
    "    print(f\"Total missing values {col}: {bd[col].isnull().sum()}\")\n",
    "\n",
    "# Same count within each period subset\n",
    "for header, subset in [(\"CONJUNTO PREPANDEMIA\", conj_pre), (\"CONJUNTO POSTPANDEMIA\", conj_post)]:\n",
    "    print(f\"\\t{header}\")\n",
    "    for col in missing_cols:\n",
    "        print(f\"\\t\\tMissing values {col}: {subset[col].isnull().sum()}\")"
Joaquin Torres's avatar
Joaquin Torres committed
187 188
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
189 190
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
191 192 193
   "execution_count": null,
   "metadata": {},
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
194 195 196 197 198 199 200 201 202 203 204 205 206 207
   "source": [
    "# Fill the remaining missing values with each column's mode.\n",
    "# The filled column is assigned back to bd: fillna(inplace=True) on bd[col] acts on\n",
    "# a temporary selection, which pandas deprecates and which does not update bd\n",
    "# under Copy-on-Write.\n",
    "age_mode = bd['Age'].mode()[0]\n",
    "bd['Age'] = bd['Age'].fillna(age_mode)\n",
    "\n",
    "años_consumo_mode = bd['Años_consumo_droga'].mode()[0]\n",
    "bd['Años_consumo_droga'] = bd['Años_consumo_droga'].fillna(años_consumo_mode)\n",
    "\n",
    "risk_stigma_mode = bd['Risk_stigma'].mode()[0]\n",
    "bd['Risk_stigma'] = bd['Risk_stigma'].fillna(risk_stigma_mode)\n",
    "\n",
    "num_hijos_mode = bd['NumHijos'].mode()[0]\n",
    "bd['NumHijos'] = bd['NumHijos'].fillna(num_hijos_mode)"
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
208 209 210 211
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
212
    "### Distribution of Variables"
Joaquin Torres's avatar
Joaquin Torres committed
213 214 215 216
   ]
  },
  {
   "cell_type": "code",
217
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
218 219 220 221 222
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discrete attributes (plotted with countplots)\n",
    "disc_atts = [\n",
    "    'Education', 'Social_protection', 'Job_insecurity',\n",
    "    'Housing', 'Alterations_early_childhood_develop', 'Social_inclusion',\n",
    "    'Risk_stigma', 'Sex', 'NumHijos',\n",
    "    'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE',\n",
    "    'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE',\n",
    "    'Alucinogenos_DXCIE', 'Tabaco_DXCIE', 'FrecuenciaConsumo30Dias',\n",
    "    'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios',\n",
    "    'Nreadmision',\n",
    "]\n",
    "\n",
    "# Numerical attributes (summary stats, boxplots and histograms)\n",
    "num_atts = [\n",
    "    'Structural_conflic',\n",
    "    'Adherencia_tto_recalc',\n",
    "    'Age',\n",
    "    'Años_consumo_droga',\n",
    "    'Tiempo_tx',\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
236
    "#### Discrete"
Joaquin Torres's avatar
Joaquin Torres committed
237 238 239 240 241 242
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
243
    "##### Countplots"
Joaquin Torres's avatar
Joaquin Torres committed
244 245 246 247
   ]
  },
  {
   "cell_type": "code",
248
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
249
   "metadata": {},
250
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
   "source": [
    "fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.25)\n",
    "\n",
    "# One hue level per (Situacion_tratamiento, Group) pair. It does not depend on the\n",
    "# attribute being plotted, so it is built once instead of once per subplot.\n",
    "hue_groups = combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1)\n",
    "hue_order = [('Abandono', 'Pre'), ('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')]\n",
    "\n",
    "for i, disc_att in enumerate(disc_atts):\n",
    "    ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=hue_groups,\n",
    "                       hue_order=hue_order, ax=axs[i])\n",
    "    ax.set_title(disc_att, fontsize=16, fontweight='bold')\n",
    "    ax.get_legend().set_title(\"Groups\")\n",
    "\n",
    "    # Adding count annotations\n",
    "    for p in ax.patches:\n",
    "        # Only patches labelled '_nolegend_' are annotated\n",
    "        if p.get_label() == '_nolegend_':\n",
    "            ax.annotate(format(p.get_height(), '.0f'),\n",
    "                        (p.get_x() + p.get_width() / 2., p.get_height()),\n",
    "                        ha = 'center', va = 'center',\n",
    "                        xytext = (0, 9),\n",
    "                        textcoords = 'offset points')\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
Joaquin Torres's avatar
Joaquin Torres committed
274
    "plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')"
Joaquin Torres's avatar
Joaquin Torres committed
275 276 277 278 279 280
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
281
    "##### Normalized Countplots"
Joaquin Torres's avatar
Joaquin Torres committed
282 283 284 285
   ]
  },
  {
   "cell_type": "code",
286
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n",
    "    \"\"\"\n",
    "    Draw on the global `axs` grid a bar chart of `disc_att` split by\n",
    "    'Situacion_tratamiento', with bar heights rescaled to percentages.\n",
    "\n",
    "    Each bar shows the share of one category of `disc_att` within the total of\n",
    "    its own 'Situacion_tratamiento' subset.\n",
    "\n",
    "    Args:\n",
    "        i: row of `axs` to draw on.\n",
    "        group: 1 (all, uses `bd`), 2 (pre, uses `conj_pre`), 3 (post, uses `conj_post`).\n",
    "            The column of `axs` that is used is `group - 2`.\n",
    "        disc_att: name of the discrete column to plot.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `group` is not 1, 2 or 3.\n",
    "    \"\"\"\n",
    "\n",
    "    # Define data to work with based on group\n",
    "    if group == 1:\n",
    "        df = bd\n",
    "    elif group == 2:\n",
    "        df = conj_pre\n",
    "    elif group == 3:\n",
    "        df = conj_post\n",
    "    else:\n",
    "        raise ValueError(f\"group must be 1 (all), 2 (pre) or 3 (post), got {group!r}\")\n",
    "\n",
    "    # GOAL: find percentage of each possible category within the total of its situacion_tto subset\n",
    "    # Group data by 'Situacion_tratamiento' and the attribute and count occurrences\n",
    "    grouped_counts = df.groupby(['Situacion_tratamiento', disc_att]).size().reset_index(name='count')\n",
    "    # Calculate total count for each 'Situacion_tratamiento' group\n",
    "    total_counts = df.groupby('Situacion_tratamiento')[disc_att].count()\n",
    "    # Divide each count by its corresponding total count and calculate percentage\n",
    "    grouped_counts['percentage'] = grouped_counts.apply(lambda row: row['count'] / total_counts[row['Situacion_tratamiento']] * 100, axis=1)\n",
    "\n",
    "    # Follow the same order in plot as in computations\n",
    "    col_order = grouped_counts[grouped_counts['Situacion_tratamiento'] == 'Abandono'][disc_att].tolist()\n",
    "\n",
    "    # Create countplot and split each bar into two based on the value of sit_tto\n",
    "    # NOTE(review): for group 1 the column index is -1, i.e. the last column of axs;\n",
    "    # confirm that is intended before plotting the ALL group.\n",
    "    ax = sns.countplot(x=disc_att, hue='Situacion_tratamiento', data=df, order=col_order, ax=axs[i, group-2])\n",
    "\n",
    "    # Adjust y-axis to represent percentages out of the total count\n",
    "    ax.set_ylim(0, 100)\n",
    "\n",
    "    # Patches are matched to the rows of grouped_counts by position\n",
    "    percentages = grouped_counts['percentage']\n",
    "    for patch_idx, p in enumerate(ax.patches):\n",
    "        # Skip going over the legend values\n",
    "        if p.get_label() == \"_nolegend_\":\n",
    "            # Set height to corresponding percentage and annotate result\n",
    "            height = percentages[patch_idx]\n",
    "            p.set_height(height)\n",
    "            ax.annotate(f'{height:.2f}%', (p.get_x() + p.get_width() / 2., height),\n",
    "                        ha='center', va='bottom', fontsize=6, color='black', xytext=(0, 5),\n",
    "                        textcoords='offset points')"
   ]
  },
  {
   "cell_type": "code",
335
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
336
   "metadata": {},
337
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
   "source": [
    "fig, axs = plt.subplots(len(disc_atts), 2, figsize=(15, 7*len(disc_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
    "\n",
    "# (group code, panel title) per column of the grid. plot_count_perc_norm draws\n",
    "# group g on column g - 2, so PRE goes to column 0 and POST to column 1.\n",
    "# Group 1 (ALL) is not plotted: the grid only has two columns.\n",
    "panels = [(2, \"\\nPRE\"), (3, \"\\nPOST\")]\n",
    "\n",
    "for i, disc_att in enumerate(disc_atts):\n",
    "    for group, title in panels:\n",
    "        plot_count_perc_norm(i, group, disc_att)\n",
    "        panel_ax = axs[i, group - 2]\n",
    "        panel_ax.set_title(title)\n",
    "        panel_ax.set_xlabel(disc_att, fontweight='bold')\n",
    "        panel_ax.set_ylabel(\"% of total within its Sit_tto group\")\n",
    "        panel_ax.tick_params(axis='x', rotation=90)\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save the figure in SVG format with DPI=600 in the \"./output/plots/distributions\" folder\n",
    "plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')"
Joaquin Torres's avatar
Joaquin Torres committed
371 372 373 374 375 376
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
377
    "#### Numerical"
Joaquin Torres's avatar
Joaquin Torres committed
378 379 380 381 382 383
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
384
    "##### Summary Stats"
Joaquin Torres's avatar
Joaquin Torres committed
385 386 387 388
   ]
  },
  {
   "cell_type": "code",
389
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
390
   "metadata": {},
391
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
392 393 394 395 396 397 398 399
   "source": [
    "# Last expression of the cell: rendered with the notebook's rich table display\n",
    "bd[num_atts].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
400
    "##### Boxplots"
Joaquin Torres's avatar
Joaquin Torres committed
401 402 403 404
   ]
  },
  {
   "cell_type": "code",
405
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
406
   "metadata": {},
407
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423
   "source": [
    "fig, axs = plt.subplots(len(num_atts), 1, figsize=(12, 5*len(num_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
    "\n",
    "# Draw on the axes created above instead of re-selecting them through plt.subplot().\n",
    "# np.atleast_1d keeps this working when there is a single numerical attribute\n",
    "# (plt.subplots then returns one Axes, not an array).\n",
    "for ax, num_att in zip(np.atleast_1d(axs), num_atts):\n",
    "    sns.boxplot(\n",
    "        data=combined_pre_post,\n",
    "        x=num_att,\n",
    "        y='Group',\n",
    "        hue='Situacion_tratamiento',\n",
    "        ax=ax,\n",
    "    )\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save the figure in SVG format with DPI=600 in the \"./output/plots/distributions\" folder\n",
    "plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')"
Joaquin Torres's avatar
Joaquin Torres committed
426 427 428 429 430 431
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
432
    "##### Histograms"
Joaquin Torres's avatar
Joaquin Torres committed
433 434 435 436
   ]
  },
  {
   "cell_type": "code",
437
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
438
   "metadata": {},
439
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463
   "source": [
    "fig, axs = plt.subplots(len(num_atts), 3, figsize=(15, 6*len(num_atts)))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n",
    "\n",
    "# One column per dataset: all alcohol patients, pre-pandemic, post-pandemic\n",
    "hist_panels = [(\"ALL\", bd), (\"PRE\", conj_pre), (\"POST\", conj_post)]\n",
    "\n",
    "for i, num_att in enumerate(num_atts):\n",
    "    for j, (label, panel_data) in enumerate(hist_panels):\n",
    "        # common_norm=False: each 'Situacion_tratamiento' level is normalized on its own\n",
    "        sns.histplot(data=panel_data, x=num_att, bins=15, hue='Situacion_tratamiento',\n",
    "                     stat='probability', common_norm=False, kde=True,\n",
    "                     line_kws={'lw': 5}, alpha=0.4, ax=axs[i, j])\n",
    "        axs[i, j].set_title(f\"\\nDistr. of {num_att}  - {label}\")\n",
    "\n",
    "# Adjust layout to prevent overlapping titles\n",
    "plt.tight_layout()\n",
    "\n",
    "# Save the figure in SVG format with DPI=600 in the \"./output/plots/distributions\" folder\n",
    "plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')"
Joaquin Torres's avatar
Joaquin Torres committed
466 467 468 469 470 471
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
472
    "### Correlation Analysis"
Joaquin Torres's avatar
Joaquin Torres committed
473 474 475 476 477 478
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
479
    "#### Groups of Variables"
Joaquin Torres's avatar
Joaquin Torres committed
480 481 482 483
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Social variables\n",
    "social_vars = [\n",
    "    'Education', 'Social_protection', 'Job_insecurity', 'Housing',\n",
    "    'Alterations_early_childhood_develop', 'Social_inclusion',\n",
    "    'Risk_stigma', 'Structural_conflic',\n",
    "]\n",
    "\n",
    "# Individual variables\n",
    "ind_vars = [\n",
    "    'Age', 'Sex', 'NumHijos', 'Smoking',\n",
    "    'Biological_vulnerability', 'Opiaceos_DxCIE', 'Cannabis_DXCIE', 'BZD_DxCIE',\n",
    "    'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', 'FrecuenciaConsumo30Dias',\n",
    "    'Años_consumo_droga', 'OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc',\n",
    "]\n",
    "\n",
    "target_var = 'Situacion_tratamiento'\n",
    "\n",
    "# Columns that are already numeric and we don't need to redefine\n",
    "no_redef_cols = [\n",
    "    'Structural_conflic',\n",
    "    'Age',\n",
    "    'NumHijos',\n",
    "    'Años_consumo_droga',\n",
    "    'Adherencia_tto_recalc',\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### One-hot Encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Binary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
516 517 518
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538
    "# 'Alterations_early_childhood_develop'\n",
    "alterations_mapping = {\n",
    "    'No alterations (first exposure at 11 or more years)' : 0,\n",
    "    'Alterations (first exposure before 11 years old)': 1,\n",
    "}\n",
    "\n",
    "# Shared coding of the No/Sí columns\n",
    "no_si_mapping = {'No': 0, 'Sí': 1}\n",
    "\n",
    "# Source column -> {label: code}. Each entry creates a '<column>_REDEF' column;\n",
    "# labels missing from a mapping become NaN (Series.map behaviour).\n",
    "binary_mappings = {\n",
    "    'Alterations_early_childhood_develop': alterations_mapping,\n",
    "    'Social_protection': no_si_mapping,\n",
    "    # NOTE(review): this column is mapped with 'Yes' while the others use 'Sí' - confirm against the data\n",
    "    'Risk_stigma': {'No': 0, 'Yes': 1},\n",
    "    'Sex': {'Hombre': 0, 'Mujer': 1},\n",
    "    'Smoking': no_si_mapping,\n",
    "    'Biological_vulnerability': no_si_mapping,\n",
    "    # 'Droga_DxCIE'\n",
    "    'Opiaceos_DxCIE': no_si_mapping,\n",
    "    'Cannabis_DXCIE': no_si_mapping,\n",
    "    'BZD_DxCIE': no_si_mapping,\n",
    "    'Cocaina_DxCIE': no_si_mapping,\n",
    "    'Alucinogenos_DXCIE': no_si_mapping,\n",
    "    'Tabaco_DXCIE': no_si_mapping,\n",
    "    'OtrosDx_Psiquiatrico': no_si_mapping,\n",
    "    'Tx_previos': no_si_mapping,\n",
    "    # 'Situacion_tratamiento (!!!!!)\n",
    "    # Important to define properly: 'Abandono' is coded as 1\n",
    "    'Situacion_tratamiento': {'Abandono': 1, 'Alta terapéutica': 0},\n",
    "}\n",
    "\n",
    "for source_col, mapping in binary_mappings.items():\n",
    "    bd[source_col + '_REDEF'] = bd[source_col].map(mapping)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
587
    "##### Categorical"
Joaquin Torres's avatar
Joaquin Torres committed
588 589 590 591
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
592
   "execution_count": null,
593 594
   "metadata": {},
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
595 596
   "source": [
    "# Columns to one hot encode and the prefix of their dummy columns; empty dict otherwise\n",
    "one_hots_vars_prefix = {\n",
    "    'Education': 'Ed',\n",
    "    'Job_insecurity': 'JobIn',\n",
    "    'Housing': 'Hous', \n",
    "    'Social_inclusion': 'SocInc',\n",
    "    'FrecuenciaConsumo30Dias': 'Frec30',\n",
    "}\n",
    "\n",
    "# Derived from the prefix dict so the two cannot get out of sync\n",
    "one_hot_vars = list(one_hots_vars_prefix)\n",
    "\n",
    "# Encode every variable first, then attach all dummy columns with a single concat\n",
    "encoded_frames = [pd.get_dummies(bd[var], prefix=one_hots_vars_prefix[var]) for var in one_hot_vars]\n",
    "\n",
    "# Original variable -> names of its dummy columns\n",
    "one_hot_cols_dic = {var: frame.columns.tolist() for var, frame in zip(one_hot_vars, encoded_frames)}\n",
    "\n",
    "# Drop dummy columns left over from a previous run of this cell, so that\n",
    "# re-running it does not produce duplicated column names\n",
    "stale_cols = [col for cols in one_hot_cols_dic.values() for col in cols if col in bd.columns]\n",
    "bd = pd.concat([bd.drop(columns=stale_cols), *encoded_frames], axis=1)"
Joaquin Torres's avatar
Joaquin Torres committed
614 615 616 617 618 619
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
620
    "#### Final Columns"
Joaquin Torres's avatar
Joaquin Torres committed
621 622 623 624
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
625
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
   "metadata": {},
   "outputs": [],
   "source": [
    "def _encoded_columns(variables):\n",
    "    \"\"\"Return the encoded column names that stand for `variables`, in order.\n",
    "\n",
    "    - columns in `no_redef_cols` are used as they are;\n",
    "    - one-hot encoded columns are expanded to all their dummy columns;\n",
    "    - any other column is replaced by its mapped '<name>_REDEF' version.\n",
    "    \"\"\"\n",
    "    enc_cols = []\n",
    "    for var in variables:\n",
    "        if var in no_redef_cols:\n",
    "            enc_cols.append(var)\n",
    "        elif var in one_hot_vars:\n",
    "            enc_cols.extend(one_hot_cols_dic[var])\n",
    "        else:\n",
    "            enc_cols.append(var + '_REDEF')\n",
    "    return enc_cols\n",
    "\n",
    "soc_vars_enc = _encoded_columns(social_vars)\n",
    "ind_vars_enc = _encoded_columns(ind_vars)\n",
    "\n",
    "# Final version of columns we need to use for correlation analysis\n",
    "corr_cols = soc_vars_enc + ind_vars_enc"
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
663 664 665 666 667 668 669 670 671 672 673 674
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop unknown columns\n",
    "# NOTE(review): these names have to match the generated dummy column names exactly;\n",
    "# the spelling of 'JobIn_unkwnodledge' should be checked against the data.\n",
    "unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n",
    "\n",
    "# Apply the same filter to the three lists\n",
    "corr_cols, soc_vars_enc, ind_vars_enc = (\n",
    "    [col for col in cols if col not in unknown_cols]\n",
    "    for cols in (corr_cols, soc_vars_enc, ind_vars_enc)\n",
    ")"
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
675 676 677 678
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
679
    "##### Renaming and Filtering"
Joaquin Torres's avatar
Joaquin Torres committed
680 681 682 683
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
684
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
685 686 687
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
688 689
    "columns_to_keep = corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n",
    "bd = bd[columns_to_keep]"
Joaquin Torres's avatar
Joaquin Torres committed
690 691
   ]
  },
692 693
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
694 695 696
   "execution_count": null,
   "metadata": {},
   "outputs": [],
697
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716
    "name_mapping = {\n",
    "    'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n",
    "    'Ed_Primary education': 'Ed_Primary',\n",
    "    'Ed_Secondary Education': 'Ed_Secondary',\n",
    "    'Ed_Secondary more technical education': 'Ed_Secondary_Technical',\n",
    "    'Ed_Tertiary': 'Ed_Tertiary',\n",
    "    'Social_protection_REDEF': 'Social_Protection',\n",
    "    'JobIn_Non-stable': 'JobIn_Unstable',\n",
    "    'JobIn_Stable': 'JobIn_Stable',\n",
    "    'JobIn_Unemployed': 'JobIn_Unemployed',\n",
    "    'Hous_Institutional': 'Hous_Institutional',\n",
    "    'Hous_Stable': 'Hous_Stable',\n",
    "    'Hous_Unstable': 'Hous_Unstable',\n",
    "    'Alterations_early_childhood_develop_REDEF': 'Early_Alterations',\n",
    "    'SocInc_Live with families or friends': 'SocInc_Family_Friends',\n",
    "    'SocInc_live alone': 'SocInc_Alone',\n",
    "    'SocInc_live in institutions': 'SocInc_Instit',\n",
    "    'Risk_stigma_REDEF': 'Risk_Stigma',\n",
    "    'Structural_conflic': 'Structural_Conflict',\n",
Joaquin Torres's avatar
Joaquin Torres committed
717
    "    'Age': 'Age',\n",
Joaquin Torres's avatar
Joaquin Torres committed
718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736
    "    'Sex_REDEF': 'Sex',\n",
    "    'NumHijos': 'Num_Children',\n",
    "    'Smoking_REDEF': 'Smoking',\n",
    "    'Biological_vulnerability_REDEF': 'Bio_Vulner',\n",
    "    'Opiaceos_DxCIE_REDEF': 'Opiods_DXCIE',\n",
    "    'Cannabis_DXCIE_REDEF': 'Cannabis_DXCIE',\n",
    "    'BZD_DxCIE_REDEF': 'BZD_DXCIE',\n",
    "    'Cocaina_DxCIE_REDEF': 'Cocaine_DXCIE',\n",
    "    'Alucinogenos_DXCIE_REDEF': 'Hallucin_DXCIE',\n",
    "    'Tabaco_DXCIE_REDEF': 'Tobacco_DXCIE',\n",
    "    'Frec30_1 día/semana': 'Freq_1dpw',\n",
    "    'Frec30_2-3 días\\u200e/semana': 'Freq_2-3dpw',\n",
    "    'Frec30_4-6 días/semana': 'Freq_4-6dpw',\n",
    "    'Frec30_Menos de 1 día\\u200e/semana': 'Freq_l1dpw',\n",
    "    'Frec30_No consumio': 'Freq_None',\n",
    "    'Frec30_Todos los días': 'Freq_Everyday',\n",
    "    'Años_consumo_droga': 'Years_Drug_Use',\n",
    "    'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n",
    "    'Tx_previos_REDEF': 'Previous_Treatments',\n",
Joaquin Torres's avatar
Joaquin Torres committed
737 738 739 740
    "    'Adherencia_tto_recalc': 'Treatment_Adherence',\n",
    "    'Situacion_tratamiento_REDEF': 'Treatment_Outcome',\n",
    "    'Situacion_tratamiento': 'Situacion_tratamiento',\n",
    "    'Pandemia_inicio_fin_tratamiento': 'Pandemia_inicio_fin_tratamiento'\n",
Joaquin Torres's avatar
Joaquin Torres committed
741 742 743 744 745
    "}\n",
    "\n",
    "# Update lists of feature names\n",
    "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n",
    "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n",
Joaquin Torres's avatar
Joaquin Torres committed
746
    "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]"
Joaquin Torres's avatar
Joaquin Torres committed
747 748 749 750
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
751
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
752 753 754
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
755 756 757 758
    "# Export feature names\n",
    "np.save('./output/feature_names/feature_names.npy', corr_cols)\n",
    "np.save('./output/feature_names/soc_vars_names.npy', soc_vars_enc)\n",
    "np.save('./output/feature_names/ind_vars_names.npy', ind_vars_enc)"
Joaquin Torres's avatar
Joaquin Torres committed
759 760 761 762 763
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
764 765 766
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
767 768
    "bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]\n",
    "#print(bd.columns)"
Joaquin Torres's avatar
Joaquin Torres committed
769 770 771 772
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
773
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
774 775 776
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
777
    "# Update main dfs\n",
Joaquin Torres's avatar
Joaquin Torres committed
778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798
    "# Pre-pandemic subset, then its split by treatment outcome\n",
    "conj_pre = bd.loc[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
    "pre_abandono = conj_pre.loc[conj_pre['Situacion_tratamiento'] == 'Abandono']\n",
    "pre_alta = conj_pre.loc[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n",
    "\n",
    "# Post-pandemic subset: the last two classes are merged to balance the sets\n",
    "conj_post = bd.loc[bd['Pandemia_inicio_fin_tratamiento'].isin(\n",
    "    ['Inicio prepandemia y fin en pandemia', 'inicio y fin en pandemia'])]\n",
    "post_abandono = conj_post.loc[conj_post['Situacion_tratamiento'] == 'Abandono']\n",
    "post_alta = conj_post.loc[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
799
    "#### Plotting Correlation Matrices"
800 801 802 803
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
804 805 806 807 808 809
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary variables: exactly two distinct values (missing values count as a value), plus outcome and stigma\n",
    "binary_vars = [col for col in corr_cols if bd[col].nunique(dropna=False) == 2]\n",
    "binary_vars += [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n",
    "# Continuous variables, listed by original name and translated to the renamed columns\n",
    "cont_vars = [name_mapping[col] for col in ('Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc')]"
810 811 812 813
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
814
   "execution_count": null,
815 816 817 818 819 820 821 822 823 824 825
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_corr_matrix(df, cols):\n",
    "    \"\"\"Compute the lower-triangular matrix of pairwise correlations for `cols`.\n",
    "\n",
    "    The measure is picked from the global lists `binary_vars` and `cont_vars`:\n",
    "    tetrachoric when both variables are binary, Spearman when both are continuous\n",
    "    and point-biserial for every other pair. Entries on and above the diagonal\n",
    "    are left at zero.\n",
    "    \"\"\"\n",
    "    names = list(cols)\n",
    "    size = len(names)\n",
    "    corr_matrix = np.zeros((size, size))\n",
    "\n",
    "    for row, var_i in enumerate(names):\n",
    "        row_is_binary = var_i in binary_vars\n",
    "        # Only pairs below the diagonal (col < row) are evaluated\n",
    "        for col, var_j in enumerate(names[:row]):\n",
    "            if row_is_binary and var_j in binary_vars:\n",
    "                value = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n",
    "            elif var_i in cont_vars and var_j in cont_vars:\n",
    "                # pandas is used because pypair's continuous_continuous returned nan sometimes\n",
    "                value = df[var_i].corr(df[var_j], method='spearman')\n",
    "            else:\n",
    "                # Mixed pair: the variable not flagged as binary is treated as continuous\n",
    "                bin_var, cont_var = (var_i, var_j) if row_is_binary else (var_j, var_i)\n",
    "                value = binary_continuous(df[bin_var], df[cont_var], measure='point_biserial')\n",
    "            corr_matrix[row, col] = value\n",
    "\n",
    "    return corr_matrix"
Joaquin Torres's avatar
Joaquin Torres committed
850 851 852 853
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
854
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
855 856 857 858 859 860 861 862 863 864 865 866
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_heatmap(sit_tto: int, group: int) -> np.ndarray:\n",
    "    \"\"\"\n",
    "        Plot one correlation heatmap on the current axes and return its matrix.\n",
    "\n",
    "        sit_tto: 1 (include it as another var), 2 (only abandono), 3 (only alta)\n",
    "        group: 1 (all alcohol patients), 2 (pre), 3 (post)\n",
    "\n",
    "        Returns the lower-triangular matrix computed by get_corr_matrix.\n",
    "        Raises ValueError if sit_tto or group is not 1, 2 or 3.\n",
    "    \"\"\"\n",
    "    valid_codes = (1, 2, 3)\n",
    "    if sit_tto not in valid_codes or group not in valid_codes:\n",
    "        raise ValueError(f'sit_tto and group must be 1, 2 or 3 (got sit_tto={sit_tto}, group={group})')\n",
    "\n",
    "    # Include target as another variable only when not filtering by outcome\n",
    "    cols = ['Treatment_Outcome'] + corr_cols if sit_tto == 1 else corr_cols\n",
    "\n",
    "    # Title plot and select data based on group and sit_tto\n",
    "    if group == 1:\n",
    "        plot_title = \"Correl Matrix - ALL\"\n",
    "        if sit_tto == 1:\n",
    "            subset = bd\n",
    "        else:\n",
    "            outcome = 'Abandono' if sit_tto == 2 else 'Alta terapéutica'\n",
    "            subset = bd[bd['Situacion_tratamiento'] == outcome]\n",
    "    elif group == 2:\n",
    "        plot_title = \"Correl Matrix - PRE\"\n",
    "        subset = {1: conj_pre, 2: pre_abandono, 3: pre_alta}[sit_tto]\n",
    "    else:\n",
    "        plot_title = \"Correl Matrix - POST\"\n",
    "        subset = {1: conj_post, 2: post_abandono, 3: post_alta}[sit_tto]\n",
    "    bd_ca = subset[cols]\n",
    "\n",
    "    # Complete title\n",
    "    if sit_tto == 2:\n",
    "        plot_title += \" - ABANDONO\"\n",
    "    elif sit_tto == 3:\n",
    "        plot_title += \" - ALTA\"\n",
    "\n",
    "    corr_matrix = get_corr_matrix(bd_ca, cols)\n",
    "\n",
    "    # Create a mask for the upper triangle (get_corr_matrix only fills the lower one)\n",
    "    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n",
    "\n",
    "    # Create heatmap correlation matrix\n",
    "    dataplot = sns.heatmap(corr_matrix, mask=mask, xticklabels=cols, yticklabels=cols, cmap=\"coolwarm\", vmin=-1, vmax=1, annot=True, fmt=\".2f\", annot_kws={\"size\": 4})\n",
    "\n",
    "    # Group ind vs social vars by color on both axes\n",
    "    for axis in (dataplot.axes.xaxis, dataplot.axes.yaxis):\n",
    "        for tick_label in axis.get_ticklabels():\n",
    "            if tick_label.get_text() in ind_vars_enc:\n",
    "                tick_label.set_color('green')\n",
    "            elif tick_label.get_text() in soc_vars_enc:\n",
    "                tick_label.set_color('purple')\n",
    "\n",
    "    # Add legend and place it in lower left \n",
    "    plt.legend(handles=[\n",
    "        plt.Line2D([0], [0], marker='o', color='w', label='Social Factors', markerfacecolor='purple', markersize=10),\n",
    "        plt.Line2D([0], [0], marker='o', color='w', label='Individual Factors', markerfacecolor='green', markersize=10)\n",
    "    ], bbox_to_anchor=(-0.1, -0.1), fontsize = 20)\n",
    "\n",
    "    plt.title(\"\\n\\n\" + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})\n",
    "\n",
    "    return corr_matrix"
Joaquin Torres's avatar
Joaquin Torres committed
938 939 940 941
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
942
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
943
   "metadata": {},
Joaquin Torres's avatar
Joaquin Torres committed
944
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
945 946 947
   "source": [
    "fig, axs = plt.subplots(3, 3, figsize=(50, 50))\n",
    "plt.subplots_adjust(hspace=0.75, wspace=2)\n",
    "corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n",
    "\n",
    "# One row per 'Situacion_tratamiento' value, one column per group (ALL, PRE, POST)\n",
    "for sit_tto in range(1,4):\n",
    "    row_mats = []\n",
    "    for group in range(1,4):\n",
    "        plt.subplot(3, 3, 3*(sit_tto-1) + group)  # Calculate the subplot position dynamically\n",
    "        row_mats.append(plot_heatmap(sit_tto, group))\n",
    "    # Keep only the PRE and POST matrices; the ALL matrix is not compared\n",
    "    corr_mats.append((row_mats[1], row_mats[2]))\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    "plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')"
Joaquin Torres's avatar
Joaquin Torres committed
967
   ]
968 969 970 971 972
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
973
    "#### Finding Differences PRE vs POST"
974 975 976 977
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
978
   "execution_count": null,
979 980 981 982
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_diff(sit_tto:int, m_pre, m_post):\n",
    "    \"\"\"Print the variable pairs whose correlation differs most between PRE and POST.\n",
    "\n",
    "    sit_tto: 1 (outcome included as a variable), 2 (only abandono), 3 (only alta);\n",
    "        it must be the value used when the matrices were built by plot_heatmap.\n",
    "    m_pre, m_post: lower-triangular correlation matrices of the two periods.\n",
    "\n",
    "    Prints up to 100 pairs sorted by decreasing absolute difference. Variables in\n",
    "    `ind_vars_enc` are shown in green, all other variables in purple.\n",
    "    \"\"\"\n",
    "    # Same column layout plot_heatmap used to build the matrices\n",
    "    cols = ['Treatment_Outcome'] + corr_cols if sit_tto == 1 else corr_cols\n",
    "\n",
    "    diff_list = []  # List to store tuples of (difference, variable_i, variable_j, pre, post)\n",
    "    for i, var_i in enumerate(cols):\n",
    "        # Only the lower triangle (j < i) is filled by get_corr_matrix\n",
    "        for j, var_j in enumerate(cols[:i]):\n",
    "            val_pre = m_pre[i][j]\n",
    "            val_post = m_post[i][j]\n",
    "            diff = abs(val_pre - val_post)\n",
    "            # NaN has no defined ordering and would make the sort below unreliable\n",
    "            if np.isnan(diff):\n",
    "                continue\n",
    "            diff_list.append((diff, var_i, var_j, val_pre, val_post))\n",
    "\n",
    "    # Sort the list based on the difference value in descending order\n",
    "    diff_list.sort(key=lambda x: x[0], reverse=True)\n",
    "\n",
    "    def colored(name):\n",
    "        # Give ind vs soc vars their corresponding color and always reset afterwards\n",
    "        color = colors.GREEN if name in ind_vars_enc else colors.PURPLE\n",
    "        return color + name + colors.RESET\n",
    "\n",
    "    # Print the sorted list\n",
    "    for diff, var_i, var_j, val_pre, val_post in diff_list[0:100]:\n",
    "        print(f\"{colored(var_i)} & {colored(var_j)} --> Diff: {diff:.2f} (PRE: {val_pre:.2f}; POST: {val_post:.2f})\")"
1015 1016 1017 1018
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
1019
   "execution_count": null,
1020
   "metadata": {},
Joaquin Torres's avatar
Joaquin Torres committed
1021
   "outputs": [],
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036
   "source": [
    "class colors:\n",
    "    RED = '\\033[91m'\n",
    "    GREEN = '\\033[92m'\n",
    "    YELLOW = '\\033[93m'\n",
    "    BLUE = '\\033[94m'\n",
    "    PURPLE = '\\033[95m'\n",
    "    CYAN = '\\033[96m'\n",
    "    WHITE = '\\033[97m'\n",
    "    RESET = '\\033[0m'\n",
    "\n",
    "# Print colored text\n",
    "print(colors.RED + \"This is red text.\" + colors.RESET)\n",
    "print(colors.GREEN + \"This is green text.\" + colors.RESET)\n",
    "print(colors.BLUE + \"This is blue text.\" + colors.RESET)"
1037 1038 1039 1040
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
1041
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
1042
   "metadata": {},
Joaquin Torres's avatar
Joaquin Torres committed
1043 1044 1045 1046 1047 1048 1049 1050 1051
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 1: NO FILTERING------\")\n",
    "find_diff(1, corr_mats[0][0], corr_mats[0][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
1052
   "metadata": {},
Joaquin Torres's avatar
Joaquin Torres committed
1053 1054 1055 1056 1057 1058 1059 1060 1061
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 2: ABANDONO-----\")\n",
    "find_diff(2, corr_mats[1][0], corr_mats[1][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
1062
   "metadata": {},
Joaquin Torres's avatar
Joaquin Torres committed
1063 1064 1065 1066 1067 1068 1069 1070 1071 1072
   "outputs": [],
   "source": [
    "print(\"------SIT_TTO 3: ALTA-----\")\n",
    "find_diff(3, corr_mats[2][0], corr_mats[2][1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
1073
    "### Final Datasets"
Joaquin Torres's avatar
Joaquin Torres committed
1074 1075 1076 1077
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
1078
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
1079 1080 1081
   "metadata": {},
   "outputs": [],
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
1082 1083 1084
    "# The raw outcome column is no longer needed; only the encoded target is kept\n",
    "bd = bd.drop(columns=['Situacion_tratamiento'])\n",
    "\n",
    "# Pre-pandemic dataset, without the period column used to build it\n",
    "conj_pre = (\n",
    "    bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
    "    .drop(columns=['Pandemia_inicio_fin_tratamiento'])\n",
    ")\n",
    "\n",
    "# Post-pandemic dataset (both pandemic classes), without the period column\n",
    "conj_post = (\n",
    "    bd[bd['Pandemia_inicio_fin_tratamiento'].isin(\n",
    "        ['Inicio prepandemia y fin en pandemia', 'inicio y fin en pandemia'])]\n",
    "    .drop(columns=['Pandemia_inicio_fin_tratamiento'])\n",
    ")"
Joaquin Torres's avatar
Joaquin Torres committed
1096 1097 1098 1099
   ]
  },
  {
   "cell_type": "code",
Joaquin Torres's avatar
Joaquin Torres committed
1100 1101 1102
   "execution_count": null,
   "metadata": {},
   "outputs": [],
1103
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
1104 1105 1106 1107 1108 1109 1110
    "# Feature labels are selected by name, so they stay aligned with X wherever the target column sits\n",
    "feat = conj_pre.columns[conj_pre.columns != \"Treatment_Outcome\"].to_numpy()\n",
    "X_pre, y_pre = conj_pre.loc[:, feat].to_numpy(), conj_pre[\"Treatment_Outcome\"]\n",
    "X_post, y_post = conj_post.loc[:, feat].to_numpy(), conj_post[\"Treatment_Outcome\"]\n",
    "\n",
    "# Export datasets\n",
    "conj_pre.to_csv('./output/datasets/pre_dataset.csv', index=False)\n",
    "conj_post.to_csv('./output/datasets/post_dataset.csv', index=False)"
Joaquin Torres's avatar
Joaquin Torres committed
1111 1112 1113 1114 1115 1116
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
1117
    "### Feature Analysis"
Joaquin Torres's avatar
Joaquin Torres committed
1118 1119
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
1120 1121 1122 1123
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
1124
    "#### Mutual Info"
Joaquin Torres's avatar
Joaquin Torres committed
1125 1126
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
1127 1128
  {
   "cell_type": "code",
1129
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
1130
   "metadata": {},
1131
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
1132
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
1133 1134 1135 1136
    "# Create subplots\n",
    "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n",
    "\n",
    "for ax, X_period, y_period, title in [(axes[0], X_pre, y_pre, \"PRE\"), (axes[1], X_post, y_post, \"POST\")]:\n",
    "    # Fixed seed: the estimator uses random noise, so unseeded runs give different scores\n",
    "    importances_MI = mutual_info_classif(X_period, y_period, random_state=42)\n",
    "    feat_importances_MI = pd.Series(importances_MI, feat).sort_values()\n",
    "    # Plot the 20 highest non-zero scores\n",
    "    top_MI = feat_importances_MI[feat_importances_MI != 0][-20:]\n",
    "    ax.barh(top_MI.index, top_MI, color='teal')\n",
    "    ax.set_xlabel(\"Mutual Information\")\n",
    "    ax.set_title(title)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('./output/plots/feature_importance/mutual_info.svg', format='svg', dpi=1200)\n",
    "plt.show()"
Joaquin Torres's avatar
Joaquin Torres committed
1155 1156 1157 1158 1159 1160
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
1161
    "#### ANOVA"
Joaquin Torres's avatar
Joaquin Torres committed
1162 1163 1164 1165
   ]
  },
  {
   "cell_type": "code",
1166
   "execution_count": null,
Joaquin Torres's avatar
Joaquin Torres committed
1167
   "metadata": {},
1168
   "outputs": [],
Joaquin Torres's avatar
Joaquin Torres committed
1169
   "source": [
Joaquin Torres's avatar
Joaquin Torres committed
1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184
    "# Create subplots\n",
    "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n",
    "\n",
    "def anova_pvalues(X, y):\n",
    "    \"\"\"Return the ANOVA F-test p-value of every feature, sorted ascending and indexed by `feat`.\"\"\"\n",
    "    # k='all': only the p-values are read, so no feature count has to be hard-coded\n",
    "    selector = SelectKBest(f_classif, k='all')\n",
    "    selector.fit(X, y)\n",
    "    return pd.Series(selector.pvalues_, feat).sort_values()\n",
    "\n",
    "feat_importances_AN_pre = anova_pvalues(X_pre, y_pre)\n",
    "feat_importances_AN_post = anova_pvalues(X_post, y_post)\n",
    "\n",
    "# NOTE(review): this plots the 20 largest p-values above 0.005 (the least significant features) - confirm intended\n",
    "for ax, p_values, title in zip(axes, (feat_importances_AN_pre, feat_importances_AN_post), (\"PRE\", \"POST\")):\n",
    "    shown = p_values[p_values > 0.005][-20:]\n",
    "    ax.barh(shown.index, shown, color='teal')\n",
    "    ax.set_xlabel(\"p-value ANOVA\")\n",
    "    ax.set_title(title)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('./output/plots/feature_importance/ANOVA.svg', format='svg', dpi=1200)\n",
    "plt.show()"
1194
   ]
Joaquin Torres's avatar
Joaquin Torres committed
1195
  },
Joaquin Torres's avatar
Joaquin Torres committed
1196 1197 1198 1199 1200 1201 1202
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Variance Threshold"
   ]
  },
Joaquin Torres's avatar
Joaquin Torres committed
1203 1204 1205 1206 1207
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230
   "source": [
    "# Per-feature variance for both periods, sorted ascending and indexed by feature name\n",
    "feat_importances_var_pre = pd.Series(VarianceThreshold(threshold=0).fit(X_pre).variances_, feat).sort_values()\n",
    "feat_importances_var_post = pd.Series(VarianceThreshold(threshold=0).fit(X_post).variances_, feat).sort_values()\n",
    "\n",
    "# One panel per period, showing the (up to) 20 largest variances above 0.05\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n",
    "for ax, variances, title in zip(axes, (feat_importances_var_pre, feat_importances_var_post), (\"PRE\", \"POST\")):\n",
    "    shown = variances[variances > 0.05][-20:]\n",
    "    ax.barh(shown.index, shown, color='teal')\n",
    "    ax.set_xlabel(\"Variance\")\n",
    "    ax.set_title(title)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('./output/plots/feature_importance/var_threshold.svg', format='svg', dpi=1200)\n",
    "plt.show()"
   ]
Joaquin Torres's avatar
Joaquin Torres committed
1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}