{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### EDA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Libraries" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n", "\n", "from sklearn.feature_selection import VarianceThreshold\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.feature_selection import f_classif\n", "from sklearn.feature_selection import mutual_info_classif" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Preparing Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Reading and filtering" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [], "source": [ "bd_all = pd.read_spss('./input/17_abril.sav')\n", "\n", "# Filter the dataset to work only with alcohol patients\n", "bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n", "\n", "# Filter the dataset to work only with 'Situacion_tratamiento' == 'Abandono' or 'Alta'\n", "bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Defining sets of patients" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:18: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " conj_post['Group'] = 'Post'\n", "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:19: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " conj_pre['Group'] = 'Pre'\n" ] } ], "source": [ "# Pre-pandemic\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "# Pre-pandemic abandono\n", "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n", "# Pre-pandemic alta\n", "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n", "\n", "# Post-pandemic\n", "# Merging last two classes to balance sets\n", "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", " (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", "# Post-pandemic abandono\n", "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n", "# Post-pandemic alta\n", "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']\n", "\n", "# Concatenate the two data frames and add a new column to distinguish between them. 
Useful for plots\n", "conj_post['Group'] = 'Post'\n", "conj_pre['Group'] = 'Pre'\n", "combined_pre_post = pd.concat([conj_post, conj_pre])" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PRE: 22861\n", "\tALTA: 2792\n", "\tABANDONO: 20069\n", "POST: 10677\n", "\tALTA: 1882\n", "\tABANDONO: 8795\n" ] } ], "source": [ "# Printing size of different datasets\n", "print(f\"PRE: {len(conj_pre)}\")\n", "print(f\"\\tALTA: {len(pre_alta)}\")\n", "print(f\"\\tABANDONO: {len(pre_abandono)}\")\n", "\n", "print(f\"POST: {len(conj_post)}\")\n", "print(f\"\\tALTA: {len(post_alta)}\")\n", "print(f\"\\tABANDONO: {len(post_abandono)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### First Steps" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Inspecting the dataframes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"PRE\")\n", "print(conj_pre.info())\n", "print (\"-------------------------------\")\n", "print(\"PRE-ABANDONO\")\n", "print(pre_abandono.info())\n", "print (\"-------------------------------\")\n", "print(\"PRE-ALTA\")\n", "print(pre_alta.info())\n", "print (\"-------------------------------\")\n", "\n", "print(\"\\n\\n\\n\")\n", "\n", "print (\"POST\")\n", "print(conj_post.info())\n", "print (\"-------------------------------\")\n", "print(\"POST-ABANDONO\")\n", "print(post_abandono.info())\n", "print (\"-------------------------------\")\n", "print(\"POST-ALTA\")\n", "print(post_alta.info())\n", "print (\"-------------------------------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Replacing unknown values with the mode" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n", "['Live with families or friends' 'live alone' 'live in institutions']\n" ] } ], "source": [ "# 9.0 represents unknown according to Variables.docx \n", "print(bd['Social_inclusion'].unique())\n", "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n", "# print(mode_soc_inc)\n", "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n", "print(bd['Social_inclusion'].unique())" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['No alterations (first exposure at 11 or more years)'\n", " 'Alterations (first exposure before 11 years old)' '9']\n", "['No alterations (first exposure at 11 or more years)'\n", " 'Alterations (first exposure before 11 years old)']\n" ] } ], "source": [ "print(bd['Alterations_early_childhood_develop'].unique())\n", "mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n", "bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n", "print(bd['Alterations_early_childhood_develop'].unique())" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[NaN, 'Yes', 'No']\n", "Categories (3, object): [99.0, 'No', 'Yes']\n", "[NaN, 'Yes', 'No']\n", "Categories (2, object): ['No', 'Yes']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\1073322024.py:3: FutureWarning: The behavior of Series.replace (and 
DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n", " bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n" ] } ], "source": [ "print(bd['Risk_stigma'].unique())\n", "mode_stigma = bd['Risk_stigma'].mode()[0]\n", "bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n", "print(bd['Risk_stigma'].unique())" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n", "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n" ] } ], "source": [ "print(bd['NumHijos'].unique())\n", "mode_hijos = bd['NumHijos'].mode()[0]\n", "bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n", "print(bd['NumHijos'].unique())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Quantifying Null Values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Total missing values Age: {bd['Age'].isnull().sum()}\")\n", "print(f\"Total missing values Años_consumo_droga: {bd['Años_consumo_droga'].isnull().sum()}\")\n", "print(f\"Total missing values Risk_stigma: {bd['Risk_stigma'].isnull().sum()}\")\n", "print(f\"Total missing values NumHijos: {bd['NumHijos'].isnull().sum()}\")\n", "\n", "print(\"\\tCONJUNTO PREPANDEMIA\")\n", "print(f\"\\t\\tMissing values Age: {conj_pre['Age'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values Años_consumo_droga: {conj_pre['Años_consumo_droga'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values Risk_stigma: {conj_pre['Risk_stigma'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values NumHijos: {conj_pre['NumHijos'].isnull().sum()}\")\n", "\n", "print(\"\\tCONJUNTO POSTPANDEMIA\")\n", "print(f\"\\t\\tMissing values Age: {conj_post['Age'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values Años_consumo_droga: {conj_post['Años_consumo_droga'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values Risk_stigma: {conj_post['Risk_stigma'].isnull().sum()}\")\n", "print(f\"\\t\\tMissing values NumHijos: {conj_post['NumHijos'].isnull().sum()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Replacing missing values with mode" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " bd['Age'].fillna(age_mode, inplace=True)\n", "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", "\n", "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", "\n", "\n", " bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n" ] } ], "source": [ "age_mode = bd['Age'].mode()[0]\n", "bd['Age'].fillna(age_mode, inplace=True)\n", "\n", "años_consumo_mode = bd['Años_consumo_droga'].mode()[0]\n", "bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", "\n", "risk_stigma_mode = bd['Risk_stigma'].mode()[0]\n", "bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", "\n", "num_hijos_mode = bd['NumHijos'].mode()[0]\n", "bd['NumHijos'].fillna(num_hijos_mode, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Distribution of variables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Classifying variables into numerical and discrete/categorical " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n", " 'Alterations_early_childhood_develop', 'Social_inclusion',\n", " 'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n", " 'Opiaceos_DxCIE', 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE',\n", " 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', 'FrecuenciaConsumo30Dias',\n", " 'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n", " ]\n", "\n", "num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Distribution of discrete attributes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Count plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n", "plt.subplots_adjust(hspace=0.75, 
wspace=1.25)\n", "\n", "for i, disc_att in enumerate(disc_atts):\n", " ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1),\n", " hue_order=[('Abandono', 'Pre'),('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')],\n", " ax=axs[i])\n", " ax.set_title(disc_att, fontsize=16, fontweight='bold')\n", " ax.get_legend().set_title(\"Groups\")\n", " \n", " # Adding count annotations\n", " for p in ax.patches:\n", " if p.get_label() == '_nolegend_':\n", " ax.annotate(format(p.get_height(), '.0f'), \n", " (p.get_x() + p.get_width() / 2., p.get_height()), \n", " ha = 'center', va = 'center', \n", " xytext = (0, 9), \n", " textcoords = 'offset points')\n", "\n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Normalized count plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Function to plot countplot \n", "def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n", " \"\"\"\n", " group: 1 (all), 2 (pre), 3 (post) \n", " \"\"\"\n", "\n", " # Define data to work with based on group\n", " if group == 1:\n", " df = bd \n", " elif group == 2:\n", " df = conj_pre\n", " elif group == 3:\n", " df = conj_post\n", "\n", " # GOAL: find percentage of each possible category within the total of its situacion_tto subset\n", " # Group data by 'Situacion_tratamiento' and 'Education' and count occurrences\n", " grouped_counts = df.groupby(['Situacion_tratamiento', disc_att]).size().reset_index(name='count')\n", " # Calculate total count for each 'Situacion_tratamiento' group\n", " total_counts = df.groupby('Situacion_tratamiento')[disc_att].count()\n", " # Divide each count by its corresponding total count and calculate percentage\n", " grouped_counts['percentage'] = grouped_counts.apply(lambda row: row['count'] / total_counts[row['Situacion_tratamiento']] * 100, axis=1)\n", " \n", " # Follow the same order in plot as in computations\n", " col_order = grouped_counts[grouped_counts['Situacion_tratamiento'] == 'Abandono'][disc_att].tolist()\n", "\n", " # Create countplot and split each bar into two based on the value of sit_tto\n", " ax = sns.countplot(x=disc_att, hue='Situacion_tratamiento', data=df, order=col_order, ax=axs[i, group-2])\n", "\n", " # Adjust y-axis to represent percentages out of the total count\n", " ax.set_ylim(0, 100)\n", "\n", " percentages = grouped_counts['percentage']\n", " for i, p in enumerate(ax.patches):\n", " # Skip going over the legend values\n", " if p.get_label() == \"_nolegend_\":\n", " # Set height to corresponding percentage and annotate result\n", " height = percentages[i]\n", " p.set_height(height)\n", " ax.annotate(f'{height:.2f}%', (p.get_x() + p.get_width() / 2., height),\n", " ha='center', va='bottom', fontsize=6, color='black', xytext=(0, 5),\n", " textcoords='offset points')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(len(disc_atts), 2, figsize=(15, 7*len(disc_atts)))\n", "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", "\n", "for i, disc_att in enumerate(disc_atts):\n", "\n", " # # 1: ALL \n", " # plot_count_perc_norm(i, 1, disc_att)\n", " # axs[i, 0].set_title(\"\\nALL\")\n", " # axs[i, 0].set_xlabel(disc_att, fontweight='bold')\n", " # 
axs[i, 0].set_ylabel(\"% of total within its Sit_tto group\")\n", " # axs[i, 0].tick_params(axis='x', rotation=90)\n", " \n", " # 2: PRE\n", " plot_count_perc_norm(i, 2, disc_att)\n", " axs[i, 0].set_title(\"\\nPRE\")\n", " axs[i, 0].set_xlabel(disc_att, fontweight='bold')\n", " axs[i, 0].set_ylabel(\"% of total within its Sit_tto group\")\n", " axs[i, 0].tick_params(axis='x', rotation=90)\n", "\n", " # 3: POST\n", " plot_count_perc_norm(i, 3, disc_att)\n", " axs[i, 1].set_title(\"\\nPOST\")\n", " axs[i, 1].set_xlabel(disc_att, fontweight='bold')\n", " axs[i, 1].set_ylabel(\"% of total within its Sit_tto group\")\n", " axs[i, 1].tick_params(axis='x', rotation=90)\n", "\n", " \n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format with DPI=600 in the \"._plots\" folder\n", "plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Distribution of numeric attributes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Summary statistics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(bd[num_atts].describe())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Boxplots" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(len(num_atts), 1, figsize=(12, 5*len(num_atts)))\n", "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", "\n", "for i, num_att in enumerate(num_atts):\n", " plt.subplot(len(num_atts), 1, i+1)\n", " sns.boxplot(\n", " data=combined_pre_post,\n", " x = num_att,\n", " y = 'Group',\n", " hue='Situacion_tratamiento',\n", " )\n", "\n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", "plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Histograms" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(len(num_atts), 3, figsize=(15, 6*len(num_atts)))\n", "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", "\n", "for i, num_att in enumerate(num_atts):\n", "\n", " # 1: All alcohol patients\n", " sns.histplot(data=bd,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True,\n", " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 0])\n", " axs[i, 0].set_title(f\"\\nDistr. of {num_att} - ALL\")\n", "\n", " # 2: PRE\n", " sns.histplot(data=conj_pre,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True, \n", " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 1])\n", " axs[i, 1].set_title(f\"\\nDistr. of {num_att} - PRE\")\n", "\n", " # Subplot 3: POST\n", " sns.histplot(data=conj_post,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True, \n", " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 2])\n", " axs[i, 2].set_title(f\"\\nDistr. 
of {num_att} - POST\")\n", "\n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", "plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Correlation Analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Turning binary variables into 0/1 values" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [], "source": [ "# --------------------------------------------------------------------------\n", "\n", "# 'Alterations_early_childhood_develop'\n", "alterations_mapping = {\n", " 'No alterations (first exposure at 11 or more years)' : 0,\n", " 'Alterations (first exposure before 11 years old)': 1,\n", "}\n", "\n", "bd['Alterations_early_childhood_develop_REDEF'] = bd['Alterations_early_childhood_develop'].map(alterations_mapping)\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# Social protection\n", "bd['Social_protection_REDEF'] = bd['Social_protection'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Risk_stigma'\n", "bd['Risk_stigma_REDEF'] = bd['Risk_stigma'].map({'No':0, 'Yes':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Sex'\n", "bd['Sex_REDEF'] = bd['Sex'].map({'Hombre':0, 'Mujer':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Smoking'\n", "bd['Smoking_REDEF'] = bd['Smoking'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Biological_vulnerability'\n", "bd['Biological_vulnerability_REDEF'] = bd['Biological_vulnerability'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Droga_DxCIE'\n", "bd['Opiaceos_DxCIE_REDEF'] = bd['Opiaceos_DxCIE'].map({'No': 0, 'Sí': 1})\n", "bd['Cannabis_DXCIE_REDEF'] = bd['Cannabis_DXCIE'].map({'No': 0, 'Sí': 1})\n", "bd['BZD_DxCIE_REDEF'] = bd['BZD_DxCIE'].map({'No': 0, 'Sí': 1})\n", "bd['Cocaina_DxCIE_REDEF'] = bd['Cocaina_DxCIE'].map({'No': 0, 'Sí': 1})\n", "bd['Alucinogenos_DXCIE_REDEF'] = bd['Alucinogenos_DXCIE'].map({'No': 0, 'Sí': 1})\n", "bd['Tabaco_DXCIE_REDEF'] = bd['Tabaco_DXCIE'].map({'No': 0, 'Sí': 1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'OtrosDx_Psiquiatrico'\n", "bd['OtrosDx_Psiquiatrico_REDEF'] = bd['OtrosDx_Psiquiatrico'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Tx_previos'\n", "bd['Tx_previos_REDEF'] = bd['Tx_previos'].map({'No':0, 'Sí':1})\n", "\n", "# --------------------------------------------------------------------------\n", "\n", "# 'Situacion_tratamiento (!!!!!)\n", "# Important to define properly\n", "bd['Situacion_tratamiento_REDEF'] = bd['Situacion_tratamiento'].map({'Abandono':1, 'Alta terapéutica':0})\n", "\n", "# --------------------------------------------------------------------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Defining groups of variables" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [], "source": [ "social_vars = ['Education', 'Social_protection', 
'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n", " 'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n", "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n", " 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n", " 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n", "target_var = 'Situacion_tratamiento'" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [], "source": [ "# Columns that are already numeric and we don't need to redefine \n", "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### One-hot encode categorical variables" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [], "source": [ "# Specify columns to one hot encode; empty list otherwise\n", "one_hot_vars = ['Education', 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias']\n", "\n", "one_hots_vars_prefix = {\n", " 'Education': 'Ed',\n", " 'Job_insecurity': 'JobIn',\n", " 'Housing': 'Hous', \n", " 'Social_inclusion': 'SocInc',\n", " 'FrecuenciaConsumo30Dias': 'Frec30',\n", "}\n", "\n", "one_hot_cols_dic = {}\n", "\n", "for one_hot_var in one_hot_vars:\n", " # Create one hot encoding version of attribute and concatenate new columns to main df\n", " encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n", " bd = pd.concat([bd, encoded_var], axis=1)\n", " one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()\n", "\n", "# print(one_hot_cols_dic['FrecuenciaConsumo30Dias'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Defining final version of columns of interest" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [], "source": [ "soc_vars_enc = []\n", "for soc_var in social_vars:\n", " # If no need to redefine, append directly\n", " if soc_var in no_redef_cols:\n", " soc_vars_enc.append(soc_var)\n", " # If need to redefine\n", " else:\n", " # Check if it was one-hot encoded\n", " if soc_var in one_hot_vars:\n", " # Append all one hot columns\n", " soc_vars_enc = soc_vars_enc + one_hot_cols_dic[soc_var]\n", " # If not, use redefined version through mapping\n", " else:\n", " soc_vars_enc.append(soc_var + '_REDEF')\n", "\n", "ind_vars_enc = []\n", "for ind_var in ind_vars:\n", " # If no need to redefine, append directly\n", " if ind_var in no_redef_cols:\n", " ind_vars_enc.append(ind_var)\n", " # If need to redefine\n", " else:\n", " # Check if it was one-hot encoded\n", " if ind_var in one_hot_vars:\n", " # Append all one hot columns\n", " ind_vars_enc = ind_vars_enc + one_hot_cols_dic[ind_var]\n", " # If not, use redefined version through mapping\n", " else:\n", " ind_vars_enc.append(ind_var + '_REDEF')\n", "\n", "# Final version of columns we need to use for correlation analysis\n", "corr_cols = soc_vars_enc + ind_vars_enc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Excluding unknown columns and renaming" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [], "source": [ "# Drop unknown columns\n", "corr_cols = [corr_col for corr_col in corr_cols if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", "soc_vars_enc = [corr_col for corr_col in soc_vars_enc if 
corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", "ind_vars_enc = [corr_col for corr_col in ind_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary', 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection', 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed', 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable', 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone', 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex', 'Num_Children', 'Smoking', 'Bio_Vulner', 'Opiods_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE', 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw', 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None', 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX', 'Previous_Treatments', 'Treatment_Adherence']\n" ] } ], "source": [ "name_mapping = {\n", " 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n", " 'Ed_Primary education': 'Ed_Primary',\n", " 'Ed_Secondary Education': 'Ed_Secondary',\n", " 'Ed_Secondary more technical education': 'Ed_Secondary_Technical',\n", " 'Ed_Tertiary': 'Ed_Tertiary',\n", " 'Social_protection_REDEF': 'Social_Protection',\n", " 'JobIn_Non-stable': 'JobIn_Unstable',\n", " 'JobIn_Stable': 'JobIn_Stable',\n", " 'JobIn_Unemployed': 'JobIn_Unemployed',\n", " 'Hous_Institutional': 'Hous_Institutional',\n", " 'Hous_Stable': 'Hous_Stable',\n", " 'Hous_Unstable': 'Hous_Unstable',\n", " 'Alterations_early_childhood_develop_REDEF': 'Early_Alterations',\n", " 'SocInc_Live with families or friends': 'SocInc_Family_Friends',\n", " 'SocInc_live alone': 'SocInc_Alone',\n", " 'SocInc_live in institutions': 'SocInc_Instit',\n", " 'Risk_stigma_REDEF': 'Risk_Stigma',\n", " 'Structural_conflic': 'Structural_Conflict',\n", " 'Age': 'age',\n", " 'Sex_REDEF': 'Sex',\n", " 'NumHijos': 'Num_Children',\n", " 'Smoking_REDEF': 'Smoking',\n", " 'Biological_vulnerability_REDEF': 'Bio_Vulner',\n", " 'Opiaceos_DxCIE_REDEF': 'Opiods_DXCIE',\n", " 'Cannabis_DXCIE_REDEF': 'Cannabis_DXCIE',\n", " 'BZD_DxCIE_REDEF': 'BZD_DXCIE',\n", " 'Cocaina_DxCIE_REDEF': 'Cocaine_DXCIE',\n", " 'Alucinogenos_DXCIE_REDEF': 'Hallucin_DXCIE',\n", " 'Tabaco_DXCIE_REDEF': 'Tobacco_DXCIE',\n", " 'Frec30_1 día/semana': 'Freq_1dpw',\n", " 'Frec30_2-3 días\\u200e/semana': 'Freq_2-3dpw',\n", " 'Frec30_4-6 días/semana': 'Freq_4-6dpw',\n", " 'Frec30_Menos de 1 día\\u200e/semana': 'Freq_l1dpw',\n", " 'Frec30_No consumio': 'Freq_None',\n", " 'Frec30_Todos los días': 'Freq_Everyday',\n", " 'Años_consumo_droga': 'Years_Drug_Use',\n", " 'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n", " 'Tx_previos_REDEF': 'Previous_Treatments',\n", " 'Adherencia_tto_recalc': 'Treatment_Adherence'\n", "}\n", "\n", "# Update lists of feature names\n", "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n", "print(corr_cols)\n", "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n", "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]\n", "\n", "bd = bd.rename(columns=name_mapping)" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [], "source": [ "# Create bd with just corr_cols and target\n", "bd = bd[corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']]" ] }, { 
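"cell_type": "markdown", "metadata": {}, "source": [ "###### Checking for duplicate column names (added sketch)\n", "\n", "A minimal, optional sanity check: the rename above can leave duplicate column names (for example, the original `Sex` column alongside `Sex_REDEF`, which is also renamed to `Sex`), and duplicated names make `bd[col]` return a DataFrame instead of a Series (see the `AttributeError` further down). The cell below simply lists any duplicates." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Minimal sanity check (added sketch): list duplicate column names introduced by the renaming above.\n", "# If this prints a non-empty list, bd[col] returns a DataFrame for those names, which breaks\n", "# calls such as bd[col].unique() further down.\n", "dup_cols = bd.columns[bd.columns.duplicated()].unique().tolist()\n", "print(f\"Duplicated column names after renaming: {dup_cols}\")" ] }, { 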
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Export feature names\n", "np.save('./output/feature_names.npy', corr_cols)\n", "np.save('./output/soc_vars_names.npy', soc_vars_enc)\n", "np.save('./output/ind_vars_names.npy', ind_vars_enc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Update main data frames" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [], "source": [ "# Pre-pandemic\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "# Pre-pandemic abandono\n", "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n", "# Pre-pandemic alta\n", "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n", "\n", "# Post-pandemic\n", "# Merging last two classes to balance sets\n", "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", " (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", "# Post-pandemic abandono\n", "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n", "# Post-pandemic alta\n", "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Building correlation matrix" ] }, { "cell_type": "code", "execution_count": 135, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary',\n", " 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection',\n", " 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed',\n", " 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable',\n", " 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone',\n", " 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex',\n", " 'Sex', 'Num_Children', 'Smoking', 'Smoking', 'Bio_Vulner',\n", " 'Opiods_DXCIE', 'Cannabis_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE',\n", " 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw',\n", " 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None',\n", " 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX',\n", " 'Previous_Treatments', 'Treatment_Adherence', 'Situacion_tratamiento',\n", " 'Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento'],\n", " dtype='object')\n" ] } ], "source": [ "print(bd.columns)" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ed_Not_Complete_Primary\n", "2\n", "Ed_Primary\n", "2\n", "Ed_Secondary\n", "2\n", "Ed_Secondary_Technical\n", "2\n", "Ed_Tertiary\n", "2\n", "Social_Protection\n", "2\n", "JobIn_Unstable\n", "2\n", "JobIn_Stable\n", "2\n", "JobIn_Unemployed\n", "2\n", "Hous_Institutional\n", "2\n", "Hous_Stable\n", "2\n", "Hous_Unstable\n", "2\n", "Early_Alterations\n", "2\n", "SocInc_Family_Friends\n", "2\n", "SocInc_Alone\n", "2\n", "SocInc_Instit\n", "2\n", "Risk_Stigma\n", "2\n", "Structural_Conflict\n", "107\n", "age\n", "74\n", "Sex\n" ] }, { "ename": "AttributeError", "evalue": "'DataFrame' object has no attribute 'unique'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19584\\340002156.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m 
1\u001b[0m \u001b[1;31m# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcorr_cols\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;31m#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;31m#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\\Users\\Joaquín Torres\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 6292\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6293\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6294\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6295\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 6296\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'unique'" ] } ], "source": [ "# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\n", "\n", "for col in corr_cols:\n", " print(col)\n", " print(len(bd[col].unique()))\n", "#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\n", "#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_corr_matrix(df, cols):\n", " \n", " # Initialize nxn matrix to zeroes\n", " n = len(cols)\n", " corr_matrix = 
np.zeros((n,n))\n", "\n", " for i, var_i in enumerate(cols):\n", " for j, var_j in enumerate(cols):\n", " # Fill lower triangle of matrix\n", " if i > j:\n", " # Binary with binary correlation: tetrachoric\n", " if var_i in binary_vars and var_j in binary_vars:\n", " corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n", " # Continuous with continuous correlation: \n", " elif var_i in cont_vars and var_j in cont_vars:\n", " # Returning nan sometimes:\n", " # corr_tuple = continuous_continuous(df[var_i], df[var_j], measure = 'spearman')\n", " # corr = corr_tuple[0]\n", " corr = df[var_i].corr(df[var_j], method='spearman')\n", " # Binary vs Continuous correlation:\n", " else:\n", " if var_i in binary_vars:\n", " bin_var = var_i\n", " cont_var = var_j\n", " else:\n", " bin_var = var_j\n", " cont_var = var_i\n", " corr = binary_continuous(df[bin_var], df[cont_var], measure='point_biserial')\n", " # Assign value to matrix\n", " corr_matrix[i][j] = corr \n", " \n", " return corr_matrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_heatmap(sit_tto: int, group:int) -> None:\n", " \"\"\"\n", " sit_tto: 1 (include it as another var), 2 (only abandono), 3 (only alta)\n", " group: 1 (all alcohol patients), 2 (pre), 3 (post)\n", " \"\"\"\n", "\n", " # Define columns based on sit_tto arg\n", " if sit_tto == 1:\n", " # Include target as another variable\n", " cols = [target_var + '_REDEF'] + corr_cols\n", " else:\n", " cols = corr_cols\n", " \n", " # Title plot and select datat based on group and sit_tto\n", " if group == 1:\n", " plot_title = \"Correl Matrix - ALL\"\n", " if sit_tto == 1:\n", " bd_ca = bd[cols]\n", " elif sit_tto == 2:\n", " bd_ca = bd[bd['Situacion_tratamiento'] == 'Abandono'][cols]\n", " elif sit_tto == 3:\n", " bd_ca = bd[bd['Situacion_tratamiento'] == 'Alta terapéutica'][cols]\n", " elif group == 2:\n", " plot_title = \"Correl Matrix - PRE\"\n", " if sit_tto == 1: \n", " bd_ca = conj_pre[cols]\n", " elif sit_tto == 2:\n", " bd_ca = pre_abandono[cols]\n", " elif sit_tto == 3:\n", " bd_ca = pre_alta[cols]\n", " elif group == 3:\n", " plot_title = \"Correl Matrix - POST\"\n", " if sit_tto == 1: \n", " bd_ca = conj_post[cols]\n", " elif sit_tto == 2:\n", " bd_ca = post_abandono[cols]\n", " elif sit_tto == 3:\n", " bd_ca = post_alta[cols]\n", " \n", " # Complete title\n", " if sit_tto == 2:\n", " plot_title += \" - ABANDONO\"\n", " elif sit_tto == 3:\n", " plot_title += \" - ALTA\"\n", "\n", " corr_matrix = get_corr_matrix(bd_ca, cols)\n", "\n", " # Create a mask for the upper triangle\n", " mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n", "\n", " # Create heatmap correlation matrix\n", " dataplot = sns.heatmap(corr_matrix, mask=mask, xticklabels=cols, yticklabels=cols, cmap=\"coolwarm\", vmin=-1, vmax=1, annot=True, fmt=\".2f\", annot_kws={\"size\": 4})\n", "\n", " # Group ind vs social vars by color and modify tick label names\n", " for tick_label in dataplot.axes.xaxis.get_ticklabels():\n", " if tick_label.get_text() in ind_vars_enc:\n", " tick_label.set_color('green')\n", " elif tick_label.get_text() in soc_vars_enc:\n", " tick_label.set_color('purple') \n", " for tick_label in dataplot.axes.yaxis.get_ticklabels():\n", " if tick_label.get_text() in ind_vars_enc:\n", " tick_label.set_color('green')\n", " elif tick_label.get_text() in soc_vars_enc:\n", " tick_label.set_color('purple') \n", "\n", " # Increase the size of xtick labels\n", " # dataplot.tick_params(axis='x', labelsize=12)\n", "\n", " # 
Increase the size of ytick labels\n", " # dataplot.tick_params(axis='y', labelsize=12)\n", "\n", " # Add legend and place it in lower left \n", " plt.legend(handles=[\n", " plt.Line2D([0], [0], marker='o', color='w', label='Social Factors', markerfacecolor='purple', markersize=10),\n", " plt.Line2D([0], [0], marker='o', color='w', label='Individual Factors', markerfacecolor='green', markersize=10)\n", " ], bbox_to_anchor=(-0.1, -0.1), fontsize = 20)\n", "\n", " plt.title(\"\\n\\n\" + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})\n", "\n", " return corr_matrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(3, 3, figsize=(50, 50))\n", "plt.subplots_adjust(hspace=0.75, wspace=2)\n", "corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n", "\n", "# Go through possible values for 'Situacion_tratamiento' and 'Group'\n", "for sit_tto in range(1,4):\n", " # ALL\n", " plt.subplot(3, 3, 3*(sit_tto-1) + 1) # Calculate the subplot position dynamically\n", " _ = plot_heatmap(sit_tto, 1)\n", " # PRE\n", " plt.subplot(3, 3, 3*(sit_tto-1) + 2) \n", " corr_matrix_pre = plot_heatmap(sit_tto, 2)\n", " # POST\n", " plt.subplot(3, 3, 3*(sit_tto-1) + 3)\n", " corr_matrix_post = plot_heatmap(sit_tto, 3)\n", "\n", " corr_mats.append((corr_matrix_pre, corr_matrix_post))\n", " \n", "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", "# Save the figure in SVG format in the \"./output/plots/correlations\" folder\n", "plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Finding significant differences between PRE and POST" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def find_diff(sit_tto: int, m_pre, m_post):\n", "\n", " diff_list = [] # List to store tuples of (difference, variable_i, variable_j, val_pre, val_post)\n", "\n", " if sit_tto == 1:\n", " cols = [target_var + '_REDEF'] + corr_cols\n", " else:\n", " cols = corr_cols\n", " # Go through matrices\n", " for i, var_i in enumerate(cols):\n", " for j, var_j in enumerate(cols):\n", " # Absolute difference between the PRE and POST correlation values for this pair\n", " val_pre = m_pre[i][j]\n", " val_post = m_post[i][j]\n", " diff = abs(val_pre - val_post)\n", " diff_list.append((diff, var_i, var_j, val_pre, val_post))\n", " \n", " # Sort the list based on the difference value in descending order\n", " diff_list.sort(key=lambda x: x[0], reverse=True)\n", " \n", " # Print the 100 largest differences\n", " for diff, var_i, var_j, val_pre, val_post in diff_list[0:100]:\n", " # Give ind vs soc vars their corresponding color\n", " if var_i in ind_vars_enc:\n", " print(colors.GREEN + var_i + colors.RESET, end=' ')\n", " else:\n", " print(colors.PURPLE + var_i + colors.RESET, end=' ')\n", " print(\"& \", end='')\n", " if var_j in ind_vars_enc:\n", " print(colors.GREEN + var_j + colors.RESET, end=' ')\n", " else:\n", " print(colors.PURPLE + var_j + colors.RESET, end=' ')\n", " print(f\"--> Diff: {diff:.2f} (PRE: {val_pre:.2f}; POST: {val_post:.2f})\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class colors:\n", " RED = '\\033[91m'\n", " GREEN = '\\033[92m'\n", " YELLOW = '\\033[93m'\n", " BLUE = '\\033[94m'\n", " PURPLE = '\\033[95m'\n", " CYAN = '\\033[96m'\n", " WHITE = '\\033[97m'\n", " RESET = '\\033[0m'\n", "\n", "# Print colored text\n", 
"print(colors.RED + \"This is red text.\" + colors.RESET)\n", "print(colors.GREEN + \"This is green text.\" + colors.RESET)\n", "print(colors.BLUE + \"This is blue text.\" + colors.RESET)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "keep" ] }, "outputs": [], "source": [ "print(\"------SIT_TTO 1: NO FILTERING------\")\n", "find_diff(1, corr_mats[0][0], corr_mats[0][1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "keep" ] }, "outputs": [], "source": [ "print(\"------SIT_TTO 2: ABANDONO-----\")\n", "find_diff(2, corr_mats[1][0], corr_mats[1][1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "keep" ] }, "outputs": [], "source": [ "print(\"------SIT_TTO 3: ALTA-----\")\n", "find_diff(3, corr_mats[2][0], corr_mats[2][1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Feature Analysis and Selection" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Building final datasets to work with" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Work with columns of interest\n", "cols_of_interest = corr_cols + ['Pandemia_inicio_fin_tratamiento'] + [target_var + \"_REDEF\"]\n", "temp_bd = bd[cols_of_interest]\n", "print(temp_bd.info()) # NaN values already dealt with (replaced by mode - this okay?)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Dropping unknown columns/categories for analysis purposes\n", "unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n", "temp_bd = temp_bd.drop(columns=unknown_cols)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(temp_bd.info())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# For conj_pre dataframe\n", "conj_pre = temp_bd[temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "conj_pre = conj_pre.drop(columns=['Pandemia_inicio_fin_tratamiento'])\n", "\n", "# For conj_post dataframe\n", "conj_post = temp_bd[(temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", " (temp_bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", "conj_post = conj_post.drop(columns=['Pandemia_inicio_fin_tratamiento'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(conj_pre.info())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(conj_post.info())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Creating a numpy matrix without the target variable (X) and a list with the target variable (y) \n", "X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_pre.Situacion_tratamiento_REDEF\n", "X_post, y_post = conj_post.loc[:, conj_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_post.Situacion_tratamiento_REDEF\n", "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(feat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(X_pre.shape)\n", "print(X_post.shape)\n", "print(y_pre.shape)\n", "print(y_post.shape)\n", "print(len(feat))" 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### FSS Filter methods" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Mutual Info" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create subplots\n", "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n", "\n", "# PRE\n", "importances_MI = mutual_info_classif(X_pre, y_pre)\n", "feat_importances_MI = pd.Series(importances_MI, feat)\n", "feat_importances_MI.sort_values(inplace=True)\n", "axes[0].barh(feat_importances_MI[feat_importances_MI != 0][-20:].index, feat_importances_MI[feat_importances_MI != 0][-20:], color='teal')\n", "axes[0].set_xlabel(\"Mutual Information\")\n", "axes[0].set_title(\"PRE\")\n", "\n", "# POST\n", "importances_MI = mutual_info_classif(X_post, y_post)\n", "feat_importances_MI = pd.Series(importances_MI, feat)\n", "feat_importances_MI.sort_values(inplace=True)\n", "axes[1].barh(feat_importances_MI[feat_importances_MI != 0][-20:].index, feat_importances_MI[feat_importances_MI != 0][-20:], color='teal')\n", "axes[1].set_xlabel(\"Mutual Information\")\n", "axes[1].set_title(\"POST\")\n", "\n", "plt.tight_layout()\n", "plt.savefig('./output/plots/feature_importance/mutual_info.svg', format='svg', dpi=1200)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### ANOVA" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create subplots\n", "fig, axes = plt.subplots(1, 2, figsize=(12, 6))\n", "\n", "# PRE\n", "selector = SelectKBest(f_classif, k=39)\n", "selector.fit(X_pre, y_pre)\n", "feat_importances_AN_pre = pd.Series(selector.pvalues_, feat)\n", "feat_importances_AN_pre.sort_values(inplace=True)\n", "axes[0].barh(feat_importances_AN_pre[feat_importances_AN_pre > 0.005][-20:].index, feat_importances_AN_pre[feat_importances_AN_pre > 0.005][-20:], color='teal')\n", "axes[0].set_xlabel(\"p-value ANOVA\")\n", "axes[0].set_title(\"PRE\")\n", "\n", "# POST\n", "selector = SelectKBest(f_classif, k=39)\n", "selector.fit(X_post, y_post)\n", "feat_importances_AN_post = pd.Series(selector.pvalues_, feat)\n", "feat_importances_AN_post.sort_values(inplace=True)\n", "axes[1].barh(feat_importances_AN_post[feat_importances_AN_post > 0.005][-20:].index, feat_importances_AN_post[feat_importances_AN_post > 0.005][-20:], color='teal') \n", "axes[1].set_xlabel(\"p-value ANOVA\")\n", "axes[1].set_title(\"POST\")\n", "\n", "plt.tight_layout()\n", "plt.savefig('./output/plots/feature_importance/ANOVA.svg', format='svg', dpi=1200)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create subplots\n", "fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n", "\n", "# PRE\n", "variance_filter = VarianceThreshold(threshold=0)\n", "variance_filter.fit(X_pre)\n", "feat_importances_var_pre = pd.Series(variance_filter.variances_, feat)\n", "feat_importances_var_pre.sort_values(inplace=True)\n", "axes[0].barh(feat_importances_var_pre[feat_importances_var_pre > 0.05][-20:].index, feat_importances_var_pre[feat_importances_var_pre > 0.05][-20:], color='teal')\n", "axes[0].set_xlabel(\"Variance\")\n", "axes[0].set_title(\"PRE\")\n", "\n", "# POST\n", "variance_filter = VarianceThreshold(threshold=0)\n", "variance_filter.fit(X_post)\n", "feat_importances_var_post = pd.Series(variance_filter.variances_, feat)\n", "feat_importances_var_post.sort_values(inplace=True)\n", "axes[1].barh(feat_importances_var_post[feat_importances_var_post > 
0.05][-20:].index, feat_importances_var_post[feat_importances_var_post > 0.05][-20:], color='teal')\n", "axes[1].set_xlabel(\"Variance\")\n", "axes[1].set_title(\"POST\")\n", "\n", "plt.tight_layout()\n", "plt.savefig('./output/plots/feature_importance/var_threshold.svg', format='svg', dpi=1200)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Export PRE and POST datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "conj_pre.to_csv('pre_dataset.csv', index=False)\n", "conj_post.to_csv('post_dataset.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }