{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### EDA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n", "\n", "from sklearn.feature_selection import VarianceThreshold\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.feature_selection import f_classif\n", "from sklearn.feature_selection import mutual_info_classif" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Preparing Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Reading and filtering" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "bd_all = pd.read_spss('17_abril.sav')\n", "\n", "# Filter the dataset to work only with alcohol patients\n", "bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n", "\n", "# Filter the dataset to work only with 'Situacion_tratamiento' == 'Abandono' or 'Alta'\n", "bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Defining sets of patients" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\2495984927.py:18: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " conj_post['Group'] = 'Post'\n", "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\2495984927.py:19: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " conj_pre['Group'] = 'Pre'\n" ] } ], "source": [ "# Pre-pandemic\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "# Pre-pandemic abandono\n", "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n", "# Pre-pandemic alta\n", "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n", "\n", "# Post-pandemic\n", "# Merging last two classes to balance sets\n", "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", " (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", "# Post-pandemic abandono\n", "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n", "# Post-pandemic alta\n", "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']\n", "\n", "# Concatenate the two data frames and add a new column to distinguish between them. Useful for plots\n", "conj_post['Group'] = 'Post'\n", "conj_pre['Group'] = 'Pre'\n", "combined_pre_post = pd.concat([conj_post, conj_pre])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PRE: 22861\n", "\tALTA: 2792\n", "\tABANDONO: 20069\n", "POST: 10677\n", "\tALTA: 1882\n", "\tABANDONO: 8795\n" ] } ], "source": [ "# Printing size of different datasets\n", "print(f\"PRE: {len(conj_pre)}\")\n", "print(f\"\\tALTA: {len(pre_alta)}\")\n", "print(f\"\\tABANDONO: {len(pre_abandono)}\")\n", "\n", "print(f\"POST: {len(conj_post)}\")\n", "print(f\"\\tALTA: {len(post_alta)}\")\n", "print(f\"\\tABANDONO: {len(post_abandono)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### First Steps" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Inspecting the dataframes" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PRE\n", "\n", "Index: 22861 entries, 0 to 85164\n", "Data columns (total 35 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 CODPROYECTO 22861 non-null float64 \n", " 1 Education 22861 non-null object \n", " 2 Social_protection 22861 non-null object \n", " 3 Job_insecurity 22861 non-null object \n", " 4 Housing 22861 non-null object \n", " 5 Alterations_early_childhood_develop 22861 non-null object \n", " 6 Social_inclusion 22861 non-null object \n", " 7 Risk_stigma 21606 non-null category\n", " 8 Structural_conflic 22861 non-null float64 \n", " 9 Age 22852 non-null float64 \n", " 10 Sex 22861 non-null object \n", " 11 NumHijos 21647 non-null float64 \n", " 12 Smoking 22861 non-null object \n", " 13 Biological_vulnerability 22861 non-null object \n", " 14 Alcohol_DxCIE 22861 non-null object \n", " 15 Opiaceos_DxCIE 22861 non-null object \n", " 16 Cannabis_DXCIE 22861 non-null object \n", " 17 BZD_DxCIE 22861 non-null object \n", " 18 Cocaina_DxCIE 22861 non-null object \n", " 19 Alucinogenos_DXCIE 22861 non-null object \n", " 20 Tabaco_DXCIE 22861 non-null object \n", " 21 FrecuenciaConsumo30Dias 22861 non-null object \n", " 22 Años_consumo_droga 22342 non-null float64 \n", " 23 OtrosDx_Psiquiatrico 22861 non-null object \n", " 24 Tx_previos 22861 non-null object \n", " 25 Adherencia_tto_recalc 22861 non-null float64 \n", " 26 Tiempo_tx 22861 non-null float64 \n", " 27 Readmisiones_estudios 22861 non-null object \n", " 28 Situacion_tratamiento 22861 non-null object \n", " 29 Periodos_COVID 22861 non-null object \n", " 30 Pandemia_inicio_fin_tratamiento 22861 non-null object \n", " 31 Nreadmision 22861 non-null float64 \n", " 32 Readmisiones_PRECOVID 22861 non-null float64 \n", " 33 Readmisiones_COVID 22861 non-null float64 \n", " 34 Group 22861 non-null object \n", "dtypes: category(1), float64(10), object(24)\n", "memory usage: 6.1+ MB\n", "None\n", "-------------------------------\n", "PRE-ABANDONO\n", "\n", "Index: 20069 entries, 0 to 85164\n", "Data columns (total 34 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 CODPROYECTO 20069 non-null float64 \n", " 1 Education 20069 non-null object \n", " 2 Social_protection 20069 non-null object \n", " 3 Job_insecurity 20069 non-null object \n", " 4 Housing 20069 non-null object \n", " 5 Alterations_early_childhood_develop 20069 non-null object \n", " 6 Social_inclusion 20069 non-null object \n", " 7 Risk_stigma 18919 non-null category\n", " 8 Structural_conflic 20069 non-null float64 \n", " 9 Age 20061 non-null float64 \n", " 10 Sex 20069 non-null object \n", " 11 NumHijos 18958 non-null float64 \n", " 12 Smoking 20069 non-null object \n", " 13 Biological_vulnerability 20069 non-null object \n", " 14 Alcohol_DxCIE 20069 non-null object \n", " 15 Opiaceos_DxCIE 20069 non-null object \n", " 16 Cannabis_DXCIE 20069 non-null object \n", " 17 BZD_DxCIE 20069 non-null object \n", " 18 Cocaina_DxCIE 20069 non-null object \n", " 19 Alucinogenos_DXCIE 20069 non-null object \n", " 20 Tabaco_DXCIE 20069 non-null object \n", " 21 FrecuenciaConsumo30Dias 20069 non-null object \n", " 22 Años_consumo_droga 19609 non-null float64 \n", " 23 OtrosDx_Psiquiatrico 20069 non-null object \n", " 24 Tx_previos 20069 non-null object \n", " 25 Adherencia_tto_recalc 20069 non-null float64 \n", " 26 Tiempo_tx 20069 non-null float64 \n", " 27 Readmisiones_estudios 20069 non-null object \n", " 28 Situacion_tratamiento 20069 non-null object \n", " 29 Periodos_COVID 20069 non-null object \n", " 30 Pandemia_inicio_fin_tratamiento 20069 non-null object \n", " 31 Nreadmision 20069 non-null float64 \n", " 32 Readmisiones_PRECOVID 20069 non-null float64 \n", " 33 Readmisiones_COVID 20069 non-null float64 \n", "dtypes: category(1), float64(10), object(23)\n", "memory usage: 5.2+ MB\n", "None\n", "-------------------------------\n", "PRE-ALTA\n", "\n", "Index: 2792 entries, 23 to 85159\n", "Data columns (total 34 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 CODPROYECTO 2792 non-null float64 \n", " 1 Education 2792 non-null object \n", " 2 Social_protection 2792 non-null object \n", " 3 Job_insecurity 2792 non-null object \n", " 4 Housing 2792 non-null object \n", " 5 Alterations_early_childhood_develop 2792 non-null object \n", " 6 Social_inclusion 2792 non-null object \n", " 7 Risk_stigma 2687 non-null category\n", " 8 Structural_conflic 2792 non-null float64 \n", " 9 Age 2791 non-null float64 \n", " 10 Sex 2792 non-null object \n", " 11 NumHijos 2689 non-null float64 \n", " 12 Smoking 2792 non-null object \n", " 13 Biological_vulnerability 2792 non-null object \n", " 14 Alcohol_DxCIE 2792 non-null object \n", " 15 Opiaceos_DxCIE 2792 non-null object \n", " 16 Cannabis_DXCIE 2792 non-null object \n", " 17 BZD_DxCIE 2792 non-null object \n", " 18 Cocaina_DxCIE 2792 non-null object \n", " 19 Alucinogenos_DXCIE 2792 non-null object \n", " 20 Tabaco_DXCIE 2792 non-null object \n", " 21 FrecuenciaConsumo30Dias 2792 non-null object \n", " 22 Años_consumo_droga 2733 non-null float64 \n", " 23 OtrosDx_Psiquiatrico 2792 non-null object \n", " 24 Tx_previos 2792 non-null object \n", " 25 Adherencia_tto_recalc 2792 non-null float64 \n", " 26 Tiempo_tx 2792 non-null float64 \n", " 27 Readmisiones_estudios 2792 non-null object \n", " 28 Situacion_tratamiento 2792 non-null object \n", " 29 Periodos_COVID 2792 non-null object \n", " 30 Pandemia_inicio_fin_tratamiento 2792 non-null object \n", " 31 Nreadmision 2792 non-null float64 \n", " 32 Readmisiones_PRECOVID 2792 non-null float64 \n", " 33 Readmisiones_COVID 2792 non-null float64 \n", "dtypes: category(1), float64(10), object(23)\n", "memory usage: 744.5+ KB\n", "None\n", "-------------------------------\n", "\n", "\n", "\n", "\n", "POST\n", "\n", "Index: 10677 entries, 11 to 85156\n", "Data columns (total 35 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 CODPROYECTO 10677 non-null float64 \n", " 1 Education 10677 non-null object \n", " 2 Social_protection 10677 non-null object \n", " 3 Job_insecurity 10677 non-null object \n", " 4 Housing 10677 non-null object \n", " 5 Alterations_early_childhood_develop 10677 non-null object \n", " 6 Social_inclusion 10677 non-null object \n", " 7 Risk_stigma 10085 non-null category\n", " 8 Structural_conflic 10677 non-null float64 \n", " 9 Age 10676 non-null float64 \n", " 10 Sex 10677 non-null object \n", " 11 NumHijos 10103 non-null float64 \n", " 12 Smoking 10677 non-null object \n", " 13 Biological_vulnerability 10677 non-null object \n", " 14 Alcohol_DxCIE 10677 non-null object \n", " 15 Opiaceos_DxCIE 10677 non-null object \n", " 16 Cannabis_DXCIE 10677 non-null object \n", " 17 BZD_DxCIE 10677 non-null object \n", " 18 Cocaina_DxCIE 10677 non-null object \n", " 19 Alucinogenos_DXCIE 10677 non-null object \n", " 20 Tabaco_DXCIE 10677 non-null object \n", " 21 FrecuenciaConsumo30Dias 10677 non-null object \n", " 22 Años_consumo_droga 10478 non-null float64 \n", " 23 OtrosDx_Psiquiatrico 10677 non-null object \n", " 24 Tx_previos 10677 non-null object \n", " 25 Adherencia_tto_recalc 10677 non-null float64 \n", " 26 Tiempo_tx 10677 non-null float64 \n", " 27 Readmisiones_estudios 10677 non-null object \n", " 28 Situacion_tratamiento 10677 non-null object \n", " 29 Periodos_COVID 10677 non-null object \n", " 30 Pandemia_inicio_fin_tratamiento 10677 non-null object \n", " 31 Nreadmision 10677 non-null float64 \n", " 32 Readmisiones_PRECOVID 10677 non-null float64 \n", " 33 Readmisiones_COVID 10677 non-null float64 \n", " 34 Group 10677 non-null object \n", "dtypes: category(1), float64(10), object(24)\n", "memory usage: 2.9+ MB\n", "None\n", "-------------------------------\n", "POST-ABANDONO\n", "\n", "Index: 8795 entries, 11 to 85156\n", "Data columns (total 34 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 CODPROYECTO 8795 non-null float64 \n", " 1 Education 8795 non-null object \n", " 2 Social_protection 8795 non-null object \n", " 3 Job_insecurity 8795 non-null object \n", " 4 Housing 8795 non-null object \n", " 5 Alterations_early_childhood_develop 8795 non-null object \n", " 6 Social_inclusion 8795 non-null object \n", " 7 Risk_stigma 8308 non-null category\n", " 8 Structural_conflic 8795 non-null float64 \n", " 9 Age 8794 non-null float64 \n", " 10 Sex 8795 non-null object \n", " 11 NumHijos 8325 non-null float64 \n", " 12 Smoking 8795 non-null object \n", " 13 Biological_vulnerability 8795 non-null object \n", " 14 Alcohol_DxCIE 8795 non-null object \n", " 15 Opiaceos_DxCIE 8795 non-null object \n", " 16 Cannabis_DXCIE 8795 non-null object \n", " 17 BZD_DxCIE 8795 non-null object \n", " 18 Cocaina_DxCIE 8795 non-null object \n", " 19 Alucinogenos_DXCIE 8795 non-null object \n", " 20 Tabaco_DXCIE 8795 non-null object \n", " 21 FrecuenciaConsumo30Dias 8795 non-null object \n", " 22 Años_consumo_droga 8627 non-null float64 \n", " 23 OtrosDx_Psiquiatrico 8795 non-null object \n", " 24 Tx_previos 8795 non-null object \n", " 25 Adherencia_tto_recalc 8795 non-null float64 \n", " 26 Tiempo_tx 8795 non-null float64 \n", " 27 Readmisiones_estudios 8795 non-null object \n", " 28 Situacion_tratamiento 8795 non-null object \n", " 29 Periodos_COVID 8795 non-null object \n", " 30 Pandemia_inicio_fin_tratamiento 8795 non-null object \n", " 31 Nreadmision 8795 non-null float64 \n", " 32 Readmisiones_PRECOVID 8795 non-null float64 \n", " 33 Readmisiones_COVID 8795 non-null float64 \n", "dtypes: category(1), float64(10), object(23)\n", "memory usage: 2.3+ MB\n", "None\n", "-------------------------------\n", "POST-ALTA\n", "\n", "Index: 1882 entries, 258 to 85149\n", "Data columns (total 34 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 CODPROYECTO 1882 non-null float64 \n", " 1 Education 1882 non-null object \n", " 2 Social_protection 1882 non-null object \n", " 3 Job_insecurity 1882 non-null object \n", " 4 Housing 1882 non-null object \n", " 5 Alterations_early_childhood_develop 1882 non-null object \n", " 6 Social_inclusion 1882 non-null object \n", " 7 Risk_stigma 1777 non-null category\n", " 8 Structural_conflic 1882 non-null float64 \n", " 9 Age 1882 non-null float64 \n", " 10 Sex 1882 non-null object \n", " 11 NumHijos 1778 non-null float64 \n", " 12 Smoking 1882 non-null object \n", " 13 Biological_vulnerability 1882 non-null object \n", " 14 Alcohol_DxCIE 1882 non-null object \n", " 15 Opiaceos_DxCIE 1882 non-null object \n", " 16 Cannabis_DXCIE 1882 non-null object \n", " 17 BZD_DxCIE 1882 non-null object \n", " 18 Cocaina_DxCIE 1882 non-null object \n", " 19 Alucinogenos_DXCIE 1882 non-null object \n", " 20 Tabaco_DXCIE 1882 non-null object \n", " 21 FrecuenciaConsumo30Dias 1882 non-null object \n", " 22 Años_consumo_droga 1851 non-null float64 \n", " 23 OtrosDx_Psiquiatrico 1882 non-null object \n", " 24 Tx_previos 1882 non-null object \n", " 25 Adherencia_tto_recalc 1882 non-null float64 \n", " 26 Tiempo_tx 1882 non-null float64 \n", " 27 Readmisiones_estudios 1882 non-null object \n", " 28 Situacion_tratamiento 1882 non-null object \n", " 29 Periodos_COVID 1882 non-null object \n", " 30 Pandemia_inicio_fin_tratamiento 1882 non-null object \n", " 31 Nreadmision 1882 non-null float64 \n", " 32 Readmisiones_PRECOVID 1882 non-null float64 \n", " 33 Readmisiones_COVID 1882 non-null float64 \n", "dtypes: category(1), float64(10), object(23)\n", "memory usage: 501.9+ KB\n", "None\n", "-------------------------------\n" ] } ], "source": [ "print(\"PRE\")\n", "print(conj_pre.info())\n", "print (\"-------------------------------\")\n", "print(\"PRE-ABANDONO\")\n", "print(pre_abandono.info())\n", "print (\"-------------------------------\")\n", "print(\"PRE-ALTA\")\n", "print(pre_alta.info())\n", "print (\"-------------------------------\")\n", "\n", "print(\"\\n\\n\\n\")\n", "\n", "print (\"POST\")\n", "print(conj_post.info())\n", "print (\"-------------------------------\")\n", "print(\"POST-ABANDONO\")\n", "print(post_abandono.info())\n", "print (\"-------------------------------\")\n", "print(\"POST-ALTA\")\n", "print(post_alta.info())\n", "print (\"-------------------------------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Replacing unknown values with the mode" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n", "['Live with families or friends' 'live alone' 'live in institutions']\n" ] } ], "source": [ "# 9.0 represents unknown according to Variables.docx \n", "print(bd['Social_inclusion'].unique())\n", "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n", "# print(mode_soc_inc)\n", "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n", "print(bd['Social_inclusion'].unique())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['No alterations (first exposure at 11 or more years)'\n", " 'Alterations (first exposure before 11 years old)' '9']\n", "['No alterations (first exposure at 11 or more years)'\n", " 'Alterations (first exposure before 11 years old)']\n" ] } ], "source": [ "print(bd['Alterations_early_childhood_develop'].unique())\n", "mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n", "bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n", "print(bd['Alterations_early_childhood_develop'].unique())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[NaN, 'Yes', 'No']\n", "Categories (3, object): [99.0, 'No', 'Yes']\n", "[NaN, 'Yes', 'No']\n", "Categories (2, object): ['No', 'Yes']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\1073322024.py:3: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n", " bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n" ] } ], "source": [ "print(bd['Risk_stigma'].unique())\n", "mode_stigma = bd['Risk_stigma'].mode()[0]\n", "bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n", "print(bd['Risk_stigma'].unique())" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n", "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n" ] } ], "source": [ "print(bd['NumHijos'].unique())\n", "mode_hijos = bd['NumHijos'].mode()[0]\n", "bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n", "print(bd['NumHijos'].unique())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Quantifying Null Values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Total missing values Age: {bd['Age'].isnull().sum()}\")\n", "print(f\"Total missing values Años_consumo_droga: {bd['Años_consumo_droga'].isnull().sum()}\")\n", "print(f\"Total missing values Risk_stigma: {bd['Risk_stigma'].isnull().sum()}\")\n", "print(f\"Total missing values NumHijos: {bd['NumHijos'].isnull().sum()}\")\n", "\n", "print(\"\\tCONJUNTO PREPANDEMIA\")\n", "print(f\"\\t\\tMissing values Age: {conj_pre['Age'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values Años_consumo_droga: {conj_pre['Años_consumo_droga'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values Risk_stigma: {conj_pre['Risk_stigma'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values NumHijos: {conj_pre['NumHijos'].isnull().sum()}\")\n", "\n", "print(\"\\tCONJUNTO POSTPANDEMIA\")\n", "print(f\"\\t\\tMissing values Age: {conj_post['Age'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values Años_consumo_droga: {conj_post['Años_consumo_droga'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values Risk_stigma: {conj_post['Risk_stigma'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values NumHijos: {conj_post['NumHijos'].isnull().sum()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Replacing missing values with mode" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " bd['Age'].fillna(age_mode, inplace=True)\n", "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n" ] } ], "source": [ "age_mode = bd['Age'].mode()[0]\n", "bd['Age'].fillna(age_mode, inplace=True)\n", "\n", "años_consumo_mode = bd['Años_consumo_droga'].mode()[0]\n", "bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", "\n", "risk_stigma_mode = bd['Risk_stigma'].mode()[0]\n", "bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", "\n", "num_hijos_mode = bd['NumHijos'].mode()[0]\n", "bd['NumHijos'].fillna(num_hijos_mode, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Distribution of variables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Classifying variables into numerical and discrete/categorical " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n", " 'Alterations_early_childhood_develop', 'Social_inclusion',\n", " 'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n", " 'Opiaceos_DxCIE', 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE',\n", " 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', 'FrecuenciaConsumo30Dias',\n", " 'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n", " ]\n", "\n", "num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Distribution of discrete attributes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Count plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n", "plt.subplots_adjust(hspace=0.75, wspace=1.25)\n", "\n", "for i, disc_att in enumerate(disc_atts):\n", " ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1),\n", " hue_order=[('Abandono', 'Pre'),('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')],\n", " ax=axs[i])\n", " ax.set_title(disc_att, fontsize=16, fontweight='bold')\n", " ax.get_legend().set_title(\"Groups\")\n", " \n", " # Adding count annotations\n", " for p in ax.patches:\n", " if p.get_label() == '_nolegend_':\n", " ax.annotate(format(p.get_height(), '.0f'), \n", " (p.get_x() + p.get_width() / 2., p.get_height()), \n", " ha = 'center', va = 'center', \n", " xytext = (0, 9), \n", " textcoords = 'offset points')\n", "\n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", "plt.savefig('./EDA_plots/countplots.svg', dpi=600, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Normalized count plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Function to plot countplot \n", "def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n", " \"\"\"\n", " group: 1 (all), 2 (pre), 3 (post) \n", " \"\"\"\n", "\n", " # Define data to work with based on group\n", " if group == 1:\n", " df = bd \n", " elif group == 2:\n", " df = conj_pre\n", " elif group == 3:\n", " df = conj_post\n", "\n", " # GOAL: find percentage of each possible category within the total of its situacion_tto subset\n", " # Group data by 'Situacion_tratamiento' and 'Education' and count occurrences\n", " grouped_counts = df.groupby(['Situacion_tratamiento', disc_att]).size().reset_index(name='count')\n", " # Calculate total count for each 'Situacion_tratamiento' group\n", " total_counts = df.groupby('Situacion_tratamiento')[disc_att].count()\n", " # Divide each count by its corresponding total count and calculate percentage\n", " grouped_counts['percentage'] = grouped_counts.apply(lambda row: row['count'] / total_counts[row['Situacion_tratamiento']] * 100, axis=1)\n", " \n", " # Follow the same order in plot as in computations\n", " col_order = grouped_counts[grouped_counts['Situacion_tratamiento'] == 'Abandono'][disc_att].tolist()\n", "\n", " # Create countplot and split each bar into two based on the value of sit_tto\n", " ax = sns.countplot(x=disc_att, hue='Situacion_tratamiento', data=df, order=col_order, ax=axs[i, group-2])\n", "\n", " # Adjust y-axis to represent percentages out of the total count\n", " ax.set_ylim(0, 100)\n", "\n", " percentages = grouped_counts['percentage']\n", " for i, p in enumerate(ax.patches):\n", " # Skip going over the legend values\n", " if p.get_label() == \"_nolegend_\":\n", " # Set height to corresponding percentage and annotate result\n", " height = percentages[i]\n", " p.set_height(height)\n", " ax.annotate(f'{height:.2f}%', (p.get_x() + p.get_width() / 2., height),\n", " ha='center', va='bottom', fontsize=6, color='black', xytext=(0, 5),\n", " textcoords='offset points')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(len(disc_atts), 2, figsize=(15, 7*len(disc_atts)))\n", "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", "\n", "for i, disc_att in enumerate(disc_atts):\n", "\n", " # # 1: ALL \n", " # plot_count_perc_norm(i, 1, disc_att)\n", " # axs[i, 0].set_title(\"\\nALL\")\n", " # axs[i, 0].set_xlabel(disc_att, fontweight='bold')\n", " # axs[i, 0].set_ylabel(\"% of total within its Sit_tto group\")\n", " # axs[i, 0].tick_params(axis='x', rotation=90)\n", " \n", " # 2: PRE\n", " plot_count_perc_norm(i, 2, disc_att)\n", " axs[i, 0].set_title(\"\\nPRE\")\n", " axs[i, 0].set_xlabel(disc_att, fontweight='bold')\n", " axs[i, 0].set_ylabel(\"% of total within its Sit_tto group\")\n", " axs[i, 0].tick_params(axis='x', rotation=90)\n", "\n", " # 3: POST\n", " plot_count_perc_norm(i, 3, disc_att)\n", " axs[i, 1].set_title(\"\\nPOST\")\n", " axs[i, 1].set_xlabel(disc_att, fontweight='bold')\n", " axs[i, 1].set_ylabel(\"% of total within its Sit_tto group\")\n", " axs[i, 1].tick_params(axis='x', rotation=90)\n", "\n", " \n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", "plt.savefig('./EDA_plots/norm_countplots.svg', dpi=600, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Distribution of numeric attributes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Summary statistics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(bd[num_atts].describe())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Boxplots" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(len(num_atts), 1, figsize=(12, 5*len(num_atts)))\n", "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", "\n", "for i, num_att in enumerate(num_atts):\n", " plt.subplot(len(num_atts), 1, i+1)\n", " sns.boxplot(\n", " data=combined_pre_post,\n", " x = num_att,\n", " y = 'Group',\n", " hue='Situacion_tratamiento',\n", " )\n", "\n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", "plt.savefig('./EDA_plots/boxplots.svg', dpi=600, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Histograms" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(len(num_atts), 3, figsize=(15, 6*len(num_atts)))\n", "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", "\n", "for i, num_att in enumerate(num_atts):\n", "\n", " # 1: All alcohol patients\n", " sns.histplot(data=bd,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True,\n", " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 0])\n", " axs[i, 0].set_title(f\"\\nDistr. of {num_att} - ALL\")\n", "\n", " # 2: PRE\n", " sns.histplot(data=conj_pre,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True, \n", " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 1])\n", " axs[i, 1].set_title(f\"\\nDistr. of {num_att} - PRE\")\n", "\n", " # Subplot 3: POST\n", " sns.histplot(data=conj_post,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True, \n", " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 2])\n", " axs[i, 2].set_title(f\"\\nDistr. of {num_att} - POST\")\n", "\n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", "plt.savefig('./EDA_plots/histograms.svg', dpi=600, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Correlation Analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Turning binary variables into 0/1 values" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# --------------------------------------------------------------------------\n", "\n", "# 'Alterations_early_childhood_develop'\n", "alterations_mapping = {\n", " 'No alterations (first exposure at 11 or more years)' : 0,\n", " 'Alterations (first exposure before 11 years old)': 1,\n", "}\n", "\n", "bd['Alterations_early_childhood_develop_REDEF'] = bd['Alterations_early_childhood_develop'].map(alterations_mapping)\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# Social protection\n", "bd['Social_protection_REDEF'] = bd['Social_protection'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Risk_stigma'\n", "bd['Risk_stigma_REDEF'] = bd['Risk_stigma'].map({'No':0, 'Yes':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Sex'\n", "bd['Sex_REDEF'] = bd['Sex'].map({'Hombre':0, 'Mujer':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Smoking'\n", "bd['Smoking_REDEF'] = bd['Smoking'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Biological_vulnerability'\n", "bd['Biological_vulnerability_REDEF'] = bd['Biological_vulnerability'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Droga_DxCIE'\n", "bd['Opiaceos_DxCIE_REDEF'] = bd['Opiaceos_DxCIE'].map({'No': 0, 'Sí': 1})\n", "bd['Cannabis_DXCIE_REDEF'] = bd['Cannabis_DXCIE'].map({'No': 0, 'Sí': 1})\n", "bd['BZD_DxCIE_REDEF'] = bd['BZD_DxCIE'].map({'No': 0, 'Sí': 1})\n", "bd['Cocaina_DxCIE_REDEF'] = bd['Cocaina_DxCIE'].map({'No': 0, 'Sí': 1})\n", "bd['Alucinogenos_DXCIE_REDEF'] = bd['Alucinogenos_DXCIE'].map({'No': 0, 'Sí': 1})\n", "bd['Tabaco_DXCIE_REDEF'] = bd['Tabaco_DXCIE'].map({'No': 0, 'Sí': 1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'OtrosDx_Psiquiatrico'\n", "bd['OtrosDx_Psiquiatrico_REDEF'] = bd['OtrosDx_Psiquiatrico'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Tx_previos'\n", "bd['Tx_previos_REDEF'] = bd['Tx_previos'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Situacion_tratamiento (!!!!!)\n", "# Important to define properly\n", "bd['Situacion_tratamiento_REDEF'] = bd['Situacion_tratamiento'].map({'Abandono':1, 'Alta terapéutica':0})\n", "\n", "# --------------------------------------------------------------------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Defining groups of variables" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n", " 'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n", "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n", " 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n", " 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n", "target_var = 'Situacion_tratamiento'" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Columns that are already numeric and we don't need to redefine \n", "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# res_vars = ['Tiempo_tx', 'Readmisiones_estudios', 'Periodos_COVID', 'Pandemia_inicio_fin_tratamiento', \n", "# 'Nreadmision', 'Readmisiones_PRECOVID', 'Readmisiones_COVID']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### One-hot encode categorical variables" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Specify columns to one hot encode; empty list otherwise\n", "one_hot_vars = ['Education', 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias']\n", "\n", "one_hots_vars_prefix = {\n", " 'Education': 'Ed',\n", " 'Job_insecurity': 'JobIn',\n", " 'Housing': 'Hous', \n", " 'Social_inclusion': 'SocInc',\n", " 'FrecuenciaConsumo30Dias': 'Frec30',\n", "}\n", "\n", "one_hot_cols_dic = {}\n", "\n", "for one_hot_var in one_hot_vars:\n", " # Create one hot encoding version of attribute and concatenate new columns to main df\n", " encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n", " bd = pd.concat([bd, encoded_var], axis=1)\n", " one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()\n", "\n", "# print(one_hot_cols_dic['FrecuenciaConsumo30Dias'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Defining final version of columns of interest" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "soc_vars_enc = []\n", "for soc_var in social_vars:\n", " # If no need to redefine, append directly\n", " if soc_var in no_redef_cols:\n", " soc_vars_enc.append(soc_var)\n", " # If need to redefine\n", " else:\n", " # Check if it was one-hot encoded\n", " if soc_var in one_hot_vars:\n", " # Append all one hot columns\n", " soc_vars_enc = soc_vars_enc + one_hot_cols_dic[soc_var]\n", " # If not, use redefined version through mapping\n", " else:\n", " soc_vars_enc.append(soc_var + '_REDEF')\n", "\n", "ind_vars_enc = []\n", "for ind_var in ind_vars:\n", " # If no need to redefine, append directly\n", " if ind_var in no_redef_cols:\n", " ind_vars_enc.append(ind_var)\n", " # If need to redefine\n", " else:\n", " # Check if it was one-hot encoded\n", " if ind_var in one_hot_vars:\n", " # Append all one hot columns\n", " ind_vars_enc = ind_vars_enc + one_hot_cols_dic[ind_var]\n", " # If not, use redefined version through mapping\n", " else:\n", " ind_vars_enc.append(ind_var + '_REDEF')\n", "\n", "# Final version of columns we need to use for correlation analysis\n", "corr_cols = soc_vars_enc + ind_vars_enc" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Ed_Not Complete primary school', 'Ed_Primary education', 'Ed_Secondary Education', 'Ed_Secondary more technical education', 'Ed_Tertiary', 'Ed_Unknowledge', 'Social_protection_REDEF', 'JobIn_Non-stable', 'JobIn_Stable', 'JobIn_Unemployed', 'JobIn_unkwnodledge', 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable', 'Hous_unknowledge', 'Alterations_early_childhood_develop_REDEF', 'SocInc_Live with families or friends', 'SocInc_live alone', 'SocInc_live in institutions', 'Risk_stigma_REDEF', 'Structural_conflic']\n" ] } ], "source": [ "# Export column names for future programs\n", "np.save('./soc_vars_names.npy', soc_vars_enc)\n", "np.save('./ind_vars_names.npy', soc_vars_enc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Update main data frames" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Pre-pandemic\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "# Pre-pandemic abandono\n", "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n", "# Pre-pandemic alta\n", "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n", "\n", "# Post-pandemic\n", "# Merging last two classes to balance sets\n", "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", " (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", "# Post-pandemic abandono\n", "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n", "# Post-pandemic alta\n", "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Building correlation matrix" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', 'Risk_stigma_REDEF']\n", "cont_vars = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def get_corr_matrix(df, cols):\n", " \n", " # Initialize nxn matrix to zeroes\n", " n = len(cols)\n", " corr_matrix = np.zeros((n,n))\n", "\n", " for i, var_i in enumerate(cols):\n", " for j, var_j in enumerate(cols):\n", " # Fill lower triangle of matrix\n", " if i > j:\n", " # Binary with binary correlation: tetrachoric\n", " if var_i in binary_vars and var_j in binary_vars:\n", " corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n", " # Continuous with continuous correlation: \n", " elif var_i in cont_vars and var_j in cont_vars:\n", " # Returning nan sometimes:\n", " # corr_tuple = continuous_continuous(df[var_i], df[var_j], measure = 'spearman')\n", " # corr = corr_tuple[0]\n", " corr = df[var_i].corr(df[var_j], method='spearman')\n", " # Binary vs Continuous correlation:\n", " else:\n", " if var_i in binary_vars:\n", " bin_var = var_i\n", " cont_var = var_j\n", " else:\n", " bin_var = var_j\n", " cont_var = var_i\n", " corr = binary_continuous(df[bin_var], df[cont_var], measure='point_biserial')\n", " # Assign value to matrix\n", " corr_matrix[i][j] = corr \n", " \n", " return corr_matrix" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def plot_heatmap(sit_tto: int, group:int) -> None:\n", " \"\"\"\n", " sit_tto: 1 (include it as another var), 2 (only abandono), 3 (only alta)\n", " group: 1 (all alcohol patients), 2 (pre), 3 (post)\n", " \"\"\"\n", "\n", " # Define columns based on sit_tto arg\n", " if sit_tto == 1:\n", " # Include target as another variable\n", " cols = [target_var + '_REDEF'] + corr_cols\n", " else:\n", " cols = corr_cols\n", " \n", " # Title plot and select datat based on group and sit_tto\n", " if group == 1:\n", " plot_title = \"Correl Matrix - ALL\"\n", " if sit_tto == 1:\n", " bd_ca = bd[cols]\n", " elif sit_tto == 2:\n", " bd_ca = bd[bd['Situacion_tratamiento'] == 'Abandono'][cols]\n", " elif sit_tto == 3:\n", " bd_ca = bd[bd['Situacion_tratamiento'] == 'Alta terapéutica'][cols]\n", " elif group == 2:\n", " plot_title = \"Correl Matrix - PRE\"\n", " if sit_tto == 1: \n", " bd_ca = conj_pre[cols]\n", " elif sit_tto == 2:\n", " bd_ca = pre_abandono[cols]\n", " elif sit_tto == 3:\n", " bd_ca = pre_alta[cols]\n", " elif group == 3:\n", " plot_title = \"Correl Matrix - POST\"\n", " if sit_tto == 1: \n", " bd_ca = conj_post[cols]\n", " elif sit_tto == 2:\n", " bd_ca = post_abandono[cols]\n", " elif sit_tto == 3:\n", " bd_ca = post_alta[cols]\n", " \n", " # Complete title\n", " if sit_tto == 2:\n", " plot_title += \" - ABANDONO\"\n", " elif sit_tto == 3:\n", " plot_title += \" - ALTA\"\n", "\n", " corr_matrix = get_corr_matrix(bd_ca, cols)\n", "\n", " # Create a mask for the upper triangle\n", " mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n", "\n", " # Create heatmap correlation matrix\n", " dataplot = sns.heatmap(corr_matrix, mask=mask, xticklabels=cols, yticklabels=cols, cmap=\"coolwarm\", vmin=-1, vmax=1, annot=True, fmt=\".2f\", annot_kws={\"size\": 4})\n", "\n", " # Group ind vs social vars by color and modify tick label names\n", " for tick_label in dataplot.axes.xaxis.get_ticklabels():\n", " if tick_label.get_text() in ind_vars_enc:\n", " tick_label.set_color('green')\n", " elif tick_label.get_text() in soc_vars_enc:\n", " tick_label.set_color('purple') \n", " for tick_label in dataplot.axes.yaxis.get_ticklabels():\n", " if tick_label.get_text() in ind_vars_enc:\n", " tick_label.set_color('green')\n", " elif tick_label.get_text() in soc_vars_enc:\n", " tick_label.set_color('purple') \n", "\n", " # Increase the size of xtick labels\n", " # dataplot.tick_params(axis='x', labelsize=12)\n", "\n", " # Increase the size of ytick labels\n", " # dataplot.tick_params(axis='y', labelsize=12)\n", "\n", " # Add legend and place it in lower left \n", " plt.legend(handles=[\n", " plt.Line2D([0], [0], marker='o', color='w', label='Social Factors', markerfacecolor='purple', markersize=10),\n", " plt.Line2D([0], [0], marker='o', color='w', label='Individual Factors', markerfacecolor='green', markersize=10)\n", " ], bbox_to_anchor=(-0.1, -0.1), fontsize = 20)\n", "\n", " plt.title(\"\\n\\n\" + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})\n", "\n", " return corr_matrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(3, 3, figsize=(50, 50))\n", "plt.subplots_adjust(hspace=0.75, wspace=2)\n", "corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n", "\n", "# Go through possible values for 'Situacion_tratamiento' and 'Group'\n", "for sit_tto in range(1,4):\n", " # ALL\n", " plt.subplot(3, 3, 3*(sit_tto-1) + 1) # Calculate the subplot position dynamically\n", " _ = plot_heatmap(sit_tto, 1)\n", " # PRE\n", " plt.subplot(3, 3, 3*(sit_tto-1) + 2) \n", " corr_matrix_pre = plot_heatmap(sit_tto, 2)\n", " # POST\n", " plt.subplot(3, 3, 3*(sit_tto-1) + 3)\n", " corr_matrix_post = plot_heatmap(sit_tto, 3)\n", "\n", " corr_mats.append((corr_matrix_pre, corr_matrix_post))\n", " \n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format in the \"./EDA_plots\" folder\n", "plt.savefig('./EDA_plots/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Finding significative differences between PRE and POST" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def find_diff (sit_tto:int, m_pre, m_post):\n", "\n", " diff_list = [] # List to store tuples of (difference, variable_i, variable_j)\n", "\n", " if sit_tto == 1:\n", " cols = [target_var + '_REDEF'] + corr_cols\n", " else:\n", " cols = corr_cols\n", " # Go through matrices\n", " for i, var_i in enumerate(cols):\n", " for j, var_j in enumerate(cols):\n", " # If difference greater than certain threshold, print variables \n", " val_pre = m_pre[i][j]\n", " val_post = m_post[i][j]\n", " diff = abs(val_pre - val_post)\n", " diff_list.append((diff, var_i, var_j, val_pre, val_post))\n", " \n", " # Sort the list based on the difference value in descending order\n", " diff_list.sort(key=lambda x: x[0], reverse=True)\n", " \n", " # Print the sorted list\n", " for diff, var_i, var_j, val_pre, val_post in diff_list[0:100]:\n", " # Give ind vs soc vars their corresponding color\n", " if var_i in ind_vars_enc:\n", " print(colors.GREEN + var_i + colors.RESET, end=' ')\n", " else:\n", " print(colors.PURPLE + var_i + colors.PURPLE, end=' ')\n", " print(\"& \", end='')\n", " if var_j in ind_vars_enc:\n", " print(colors.GREEN + var_j + colors.RESET, end=' ')\n", " else:\n", " print(colors.PURPLE + var_j + colors.RESET, end=' ')\n", " print(f\"--> Diff: {diff:.2f} (PRE: {val_pre:.2f}; POST: {val_post:.2f})\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class colors:\n", " RED = '\\033[91m'\n", " GREEN = '\\033[92m'\n", " YELLOW = '\\033[93m'\n", " BLUE = '\\033[94m'\n", " PURPLE = '\\033[95m'\n", " CYAN = '\\033[96m'\n", " WHITE = '\\033[97m'\n", " RESET = '\\033[0m'\n", "\n", "# Print colored text\n", "print(colors.RED + \"This is red text.\" + colors.RESET)\n", "print(colors.GREEN + \"This is green text.\" + colors.RESET)\n", "print(colors.BLUE + \"This is blue text.\" + colors.RESET)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "keep" ] }, "outputs": [], "source": [ "print(\"------SIT_TTO 1: NO FILTERING------\")\n", "find_diff(1, corr_mats[0][0], corr_mats[0][1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "keep" ] }, "outputs": [], "source": [ "print(\"------SIT_TTO 2: ABANDONO-----\")\n", "find_diff(2, corr_mats[1][0], corr_mats[1][1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "keep" ] }, "outputs": [], "source": [ "print(\"------SIT_TTO 3: ALTA-----\")\n", "find_diff(3, corr_mats[2][0], corr_mats[2][1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Feature Analysis and Selection" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Building final datasets to work with" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 33538 entries, 0 to 85164\n", "Data columns (total 45 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Ed_Not Complete primary school 33538 non-null bool \n", " 1 Ed_Primary education 33538 non-null bool \n", " 2 Ed_Secondary Education 33538 non-null bool \n", " 3 Ed_Secondary more technical education 33538 non-null bool \n", " 4 Ed_Tertiary 33538 non-null bool \n", " 5 Ed_Unknowledge 33538 non-null bool \n", " 6 Social_protection_REDEF 33538 non-null int64 \n", " 7 JobIn_Non-stable 33538 non-null bool \n", " 8 JobIn_Stable 33538 non-null bool \n", " 9 JobIn_Unemployed 33538 non-null bool \n", " 10 JobIn_unkwnodledge 33538 non-null bool \n", " 11 Hous_Institutional 33538 non-null bool \n", " 12 Hous_Stable 33538 non-null bool \n", " 13 Hous_Unstable 33538 non-null bool \n", " 14 Hous_unknowledge 33538 non-null bool \n", " 15 Alterations_early_childhood_develop_REDEF 33538 non-null int64 \n", " 16 SocInc_Live with families or friends 33538 non-null bool \n", " 17 SocInc_live alone 33538 non-null bool \n", " 18 SocInc_live in institutions 33538 non-null bool \n", " 19 Risk_stigma_REDEF 33538 non-null category\n", " 20 Structural_conflic 33538 non-null float64 \n", " 21 Age 33538 non-null float64 \n", " 22 Sex_REDEF 33538 non-null int64 \n", " 23 NumHijos 33538 non-null float64 \n", " 24 Smoking_REDEF 33538 non-null int64 \n", " 25 Biological_vulnerability_REDEF 33538 non-null int64 \n", " 26 Opiaceos_DxCIE_REDEF 33538 non-null int64 \n", " 27 Cannabis_DXCIE_REDEF 33538 non-null int64 \n", " 28 BZD_DxCIE_REDEF 33538 non-null int64 \n", " 29 Cocaina_DxCIE_REDEF 33538 non-null int64 \n", " 30 Alucinogenos_DXCIE_REDEF 33538 non-null int64 \n", " 31 Tabaco_DXCIE_REDEF 33538 non-null int64 \n", " 32 Frec30_1 día/semana 33538 non-null bool \n", " 33 Frec30_2-3 días‎/semana 33538 non-null bool \n", " 34 Frec30_4-6 días/semana 33538 non-null bool \n", " 35 Frec30_Desconocido 33538 non-null bool \n", " 36 Frec30_Menos de 1 día‎/semana 33538 non-null bool \n", " 37 Frec30_No consumio 33538 non-null bool \n", " 38 Frec30_Todos los días 33538 non-null bool \n", " 39 Años_consumo_droga 33538 non-null float64 \n", " 40 OtrosDx_Psiquiatrico_REDEF 33538 non-null int64 \n", " 41 Tx_previos_REDEF 33538 non-null int64 \n", " 42 Adherencia_tto_recalc 33538 non-null float64 \n", " 43 Pandemia_inicio_fin_tratamiento 33538 non-null object \n", " 44 Situacion_tratamiento_REDEF 33538 non-null int64 \n", "dtypes: bool(24), category(1), float64(5), int64(14), object(1)\n", "memory usage: 6.2+ MB\n", "None\n" ] } ], "source": [ "# Work with columns of interest\n", "cols_of_interest = corr_cols + ['Pandemia_inicio_fin_tratamiento'] + [target_var + \"_REDEF\"]\n", "temp_bd = bd[cols_of_interest]\n", "print(temp_bd.info()) # NaN values already dealt with (replaced by mode - this okay?)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Dropping unknown columns/categories for analysis purposes\n", "unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n", "temp_bd = temp_bd.drop(columns=unknown_cols)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 33538 entries, 0 to 85164\n", "Data columns (total 41 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Ed_Not Complete primary school 33538 non-null bool \n", " 1 Ed_Primary education 33538 non-null bool \n", " 2 Ed_Secondary Education 33538 non-null bool \n", " 3 Ed_Secondary more technical education 33538 non-null bool \n", " 4 Ed_Tertiary 33538 non-null bool \n", " 5 Social_protection_REDEF 33538 non-null int64 \n", " 6 JobIn_Non-stable 33538 non-null bool \n", " 7 JobIn_Stable 33538 non-null bool \n", " 8 JobIn_Unemployed 33538 non-null bool \n", " 9 Hous_Institutional 33538 non-null bool \n", " 10 Hous_Stable 33538 non-null bool \n", " 11 Hous_Unstable 33538 non-null bool \n", " 12 Alterations_early_childhood_develop_REDEF 33538 non-null int64 \n", " 13 SocInc_Live with families or friends 33538 non-null bool \n", " 14 SocInc_live alone 33538 non-null bool \n", " 15 SocInc_live in institutions 33538 non-null bool \n", " 16 Risk_stigma_REDEF 33538 non-null category\n", " 17 Structural_conflic 33538 non-null float64 \n", " 18 Age 33538 non-null float64 \n", " 19 Sex_REDEF 33538 non-null int64 \n", " 20 NumHijos 33538 non-null float64 \n", " 21 Smoking_REDEF 33538 non-null int64 \n", " 22 Biological_vulnerability_REDEF 33538 non-null int64 \n", " 23 Opiaceos_DxCIE_REDEF 33538 non-null int64 \n", " 24 Cannabis_DXCIE_REDEF 33538 non-null int64 \n", " 25 BZD_DxCIE_REDEF 33538 non-null int64 \n", " 26 Cocaina_DxCIE_REDEF 33538 non-null int64 \n", " 27 Alucinogenos_DXCIE_REDEF 33538 non-null int64 \n", " 28 Tabaco_DXCIE_REDEF 33538 non-null int64 \n", " 29 Frec30_1 día/semana 33538 non-null bool \n", " 30 Frec30_2-3 días‎/semana 33538 non-null bool \n", " 31 Frec30_4-6 días/semana 33538 non-null bool \n", " 32 Frec30_Menos de 1 día‎/semana 33538 non-null bool \n", " 33 Frec30_No consumio 33538 non-null bool \n", " 34 Frec30_Todos los días 33538 non-null bool \n", " 35 Años_consumo_droga 33538 non-null float64 \n", " 36 OtrosDx_Psiquiatrico_REDEF 33538 non-null int64 \n", " 37 Tx_previos_REDEF 33538 non-null int64 \n", " 38 Adherencia_tto_recalc 33538 non-null float64 \n", " 39 Pandemia_inicio_fin_tratamiento 33538 non-null object \n", " 40 Situacion_tratamiento_REDEF 33538 non-null int64 \n", "dtypes: bool(20), category(1), float64(5), int64(14), object(1)\n", "memory usage: 6.0+ MB\n", "None\n" ] } ], "source": [ "print(temp_bd.info())" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# For conj_pre dataframe\n", "conj_pre = temp_bd[temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "conj_pre = conj_pre.drop(columns=['Pandemia_inicio_fin_tratamiento'])\n", "\n", "# For conj_post dataframe\n", "conj_post = temp_bd[(temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", " (temp_bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", "conj_post = conj_post.drop(columns=['Pandemia_inicio_fin_tratamiento'])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 22861 entries, 0 to 85164\n", "Data columns (total 40 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Ed_Not Complete primary school 22861 non-null bool \n", " 1 Ed_Primary education 22861 non-null bool \n", " 2 Ed_Secondary Education 22861 non-null bool \n", " 3 Ed_Secondary more technical education 22861 non-null bool \n", " 4 Ed_Tertiary 22861 non-null bool \n", " 5 Social_protection_REDEF 22861 non-null int64 \n", " 6 JobIn_Non-stable 22861 non-null bool \n", " 7 JobIn_Stable 22861 non-null bool \n", " 8 JobIn_Unemployed 22861 non-null bool \n", " 9 Hous_Institutional 22861 non-null bool \n", " 10 Hous_Stable 22861 non-null bool \n", " 11 Hous_Unstable 22861 non-null bool \n", " 12 Alterations_early_childhood_develop_REDEF 22861 non-null int64 \n", " 13 SocInc_Live with families or friends 22861 non-null bool \n", " 14 SocInc_live alone 22861 non-null bool \n", " 15 SocInc_live in institutions 22861 non-null bool \n", " 16 Risk_stigma_REDEF 22861 non-null category\n", " 17 Structural_conflic 22861 non-null float64 \n", " 18 Age 22861 non-null float64 \n", " 19 Sex_REDEF 22861 non-null int64 \n", " 20 NumHijos 22861 non-null float64 \n", " 21 Smoking_REDEF 22861 non-null int64 \n", " 22 Biological_vulnerability_REDEF 22861 non-null int64 \n", " 23 Opiaceos_DxCIE_REDEF 22861 non-null int64 \n", " 24 Cannabis_DXCIE_REDEF 22861 non-null int64 \n", " 25 BZD_DxCIE_REDEF 22861 non-null int64 \n", " 26 Cocaina_DxCIE_REDEF 22861 non-null int64 \n", " 27 Alucinogenos_DXCIE_REDEF 22861 non-null int64 \n", " 28 Tabaco_DXCIE_REDEF 22861 non-null int64 \n", " 29 Frec30_1 día/semana 22861 non-null bool \n", " 30 Frec30_2-3 días‎/semana 22861 non-null bool \n", " 31 Frec30_4-6 días/semana 22861 non-null bool \n", " 32 Frec30_Menos de 1 día‎/semana 22861 non-null bool \n", " 33 Frec30_No consumio 22861 non-null bool \n", " 34 Frec30_Todos los días 22861 non-null bool \n", " 35 Años_consumo_droga 22861 non-null float64 \n", " 36 OtrosDx_Psiquiatrico_REDEF 22861 non-null int64 \n", " 37 Tx_previos_REDEF 22861 non-null int64 \n", " 38 Adherencia_tto_recalc 22861 non-null float64 \n", " 39 Situacion_tratamiento_REDEF 22861 non-null int64 \n", "dtypes: bool(20), category(1), float64(5), int64(14)\n", "memory usage: 3.9 MB\n", "None\n" ] } ], "source": [ "print(conj_pre.info())" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 10677 entries, 11 to 85156\n", "Data columns (total 40 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Ed_Not Complete primary school 10677 non-null bool \n", " 1 Ed_Primary education 10677 non-null bool \n", " 2 Ed_Secondary Education 10677 non-null bool \n", " 3 Ed_Secondary more technical education 10677 non-null bool \n", " 4 Ed_Tertiary 10677 non-null bool \n", " 5 Social_protection_REDEF 10677 non-null int64 \n", " 6 JobIn_Non-stable 10677 non-null bool \n", " 7 JobIn_Stable 10677 non-null bool \n", " 8 JobIn_Unemployed 10677 non-null bool \n", " 9 Hous_Institutional 10677 non-null bool \n", " 10 Hous_Stable 10677 non-null bool \n", " 11 Hous_Unstable 10677 non-null bool \n", " 12 Alterations_early_childhood_develop_REDEF 10677 non-null int64 \n", " 13 SocInc_Live with families or friends 10677 non-null bool \n", " 14 SocInc_live alone 10677 non-null bool \n", " 15 SocInc_live in institutions 10677 non-null bool \n", " 16 Risk_stigma_REDEF 10677 non-null category\n", " 17 Structural_conflic 10677 non-null float64 \n", " 18 Age 10677 non-null float64 \n", " 19 Sex_REDEF 10677 non-null int64 \n", " 20 NumHijos 10677 non-null float64 \n", " 21 Smoking_REDEF 10677 non-null int64 \n", " 22 Biological_vulnerability_REDEF 10677 non-null int64 \n", " 23 Opiaceos_DxCIE_REDEF 10677 non-null int64 \n", " 24 Cannabis_DXCIE_REDEF 10677 non-null int64 \n", " 25 BZD_DxCIE_REDEF 10677 non-null int64 \n", " 26 Cocaina_DxCIE_REDEF 10677 non-null int64 \n", " 27 Alucinogenos_DXCIE_REDEF 10677 non-null int64 \n", " 28 Tabaco_DXCIE_REDEF 10677 non-null int64 \n", " 29 Frec30_1 día/semana 10677 non-null bool \n", " 30 Frec30_2-3 días‎/semana 10677 non-null bool \n", " 31 Frec30_4-6 días/semana 10677 non-null bool \n", " 32 Frec30_Menos de 1 día‎/semana 10677 non-null bool \n", " 33 Frec30_No consumio 10677 non-null bool \n", " 34 Frec30_Todos los días 10677 non-null bool \n", " 35 Años_consumo_droga 10677 non-null float64 \n", " 36 OtrosDx_Psiquiatrico_REDEF 10677 non-null int64 \n", " 37 Tx_previos_REDEF 10677 non-null int64 \n", " 38 Adherencia_tto_recalc 10677 non-null float64 \n", " 39 Situacion_tratamiento_REDEF 10677 non-null int64 \n", "dtypes: bool(20), category(1), float64(5), int64(14)\n", "memory usage: 1.8 MB\n", "None\n" ] } ], "source": [ "print(conj_post.info())" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# Creating a numpy matrix without the target variable (X) and a list with the target variable (y) \n", "X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_pre.Situacion_tratamiento_REDEF\n", "X_post, y_post = conj_post.loc[:, conj_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_post.Situacion_tratamiento_REDEF\n", "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target " ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Ed_Not Complete primary school' 'Ed_Primary education'\n", " 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n", " 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n", " 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n", " 'Alterations_early_childhood_develop_REDEF'\n", " 'SocInc_Live with families or friends' 'SocInc_live alone'\n", " 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n", " 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n", " 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n", " 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n", " 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n", " 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n", " 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n", " 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n", " 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n" ] } ], "source": [ "print(feat)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(22861, 39)\n", "(10677, 39)\n", "(22861,)\n", "(10677,)\n", "39\n" ] } ], "source": [ "print(X_pre.shape)\n", "print(X_post.shape)\n", "print(y_pre.shape)\n", "print(y_post.shape)\n", "print(len(feat))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### FSS Filter methods" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Mutual Info" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create subplots\n", "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n", "\n", "# PRE\n", "importances_MI = mutual_info_classif(X_pre, y_pre)\n", "feat_importances_MI = pd.Series(importances_MI, feat)\n", "feat_importances_MI.sort_values(inplace=True)\n", "axes[0].barh(feat_importances_MI[feat_importances_MI != 0][-20:].index, feat_importances_MI[feat_importances_MI != 0][-20:], color='teal')\n", "axes[0].set_xlabel(\"Mutual Information\")\n", "axes[0].set_title(\"PRE\")\n", "\n", "# POST\n", "importances_MI = mutual_info_classif(X_post, y_post)\n", "feat_importances_MI = pd.Series(importances_MI, feat)\n", "feat_importances_MI.sort_values(inplace=True)\n", "axes[1].barh(feat_importances_MI[feat_importances_MI != 0][-20:].index, feat_importances_MI[feat_importances_MI != 0][-20:], color='teal')\n", "axes[1].set_xlabel(\"Mutual Information\")\n", "axes[1].set_title(\"POST\")\n", "\n", "plt.tight_layout()\n", "plt.savefig('EDA_plots/features/mutual_info.svg', format='svg', dpi=1200)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### ANOVA" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create subplots\n", "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n", "\n", "# PRE\n", "selector = SelectKBest(f_classif, k=39)\n", "selector.fit(X_pre, y_pre)\n", "feat_importances_AN_pre = pd.Series(selector.pvalues_, feat)\n", "feat_importances_AN_pre.sort_values(inplace=True)\n", "axes[0].barh(feat_importances_AN_pre[feat_importances_AN_pre > 0.005][-20:].index, feat_importances_AN_pre[feat_importances_AN_pre > 0.005][-20:], color='teal')\n", "axes[0].set_xlabel(\"p-value ANOVA\")\n", "axes[0].set_title(\"PRE\")\n", "\n", "# POST\n", "selector = SelectKBest(f_classif, k=39)\n", "selector.fit(X_post, y_post)\n", "feat_importances_AN_post = pd.Series(selector.pvalues_, feat)\n", "feat_importances_AN_post.sort_values(inplace=True)\n", "axes[1].barh(feat_importances_AN_post[feat_importances_AN_post > 0.005][-20:].index, feat_importances_AN_post[feat_importances_AN_post > 0.005][-20:], color='teal') \n", "axes[1].set_xlabel(\"p-value ANOVA\")\n", "axes[1].set_title(\"POST\")\n", "\n", "plt.tight_layout()\n", "plt.savefig('EDA_plots/features/ANOVA.svg', format='svg', dpi=1200)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create subplots\n", "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n", "\n", "# PRE\n", "variance_filter = VarianceThreshold(threshold=0)\n", "variance_filter.fit(X_pre)\n", "feat_importances_var_pre = pd.Series(variance_filter.variances_, feat)\n", "feat_importances_var_pre.sort_values(inplace=True)\n", "axes[0].barh(feat_importances_var_pre[feat_importances_var_pre > 0.05][-20:].index, feat_importances_var_pre[feat_importances_var_pre > 0.05][-20:], color='teal')\n", "axes[0].set_xlabel(\"Variance\")\n", "axes[0].set_title(\"PRE\")\n", "\n", "# POST\n", "variance_filter = VarianceThreshold(threshold=0)\n", "variance_filter.fit(X_post)\n", "feat_importances_var_post = pd.Series(variance_filter.variances_, feat)\n", "feat_importances_var_post.sort_values(inplace=True)\n", "axes[1].barh(feat_importances_var_post[feat_importances_var_post > 0.05][-20:].index, feat_importances_var_post[feat_importances_var_post > 0.05][-20:], color='teal')\n", "axes[1].set_xlabel(\"Variance\")\n", "axes[1].set_title(\"POST\")\n", "\n", "plt.tight_layout()\n", "plt.savefig('EDA_plots/features/var_threshold.svg', format='svg', dpi=1200)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Export PRE and POST datasets" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "conj_pre.to_csv('pre_dataset.csv', index=False)\n", "conj_post.to_csv('post_dataset.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }