diff --git a/EDA/EDA.ipynb b/EDA/EDA.ipynb index 514304f4530e247401aa6c54fc2a1ac17c657cd4..89734793969ce3b73cd54c53d47415d57ac71287 100644 --- a/EDA/EDA.ipynb +++ b/EDA/EDA.ipynb @@ -25,7 +25,8 @@ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n", + "from pypair.association import binary_binary, continuous_continuous, binary_continuous # Correlations\n", + "# Feature Importance\n", "from sklearn.feature_selection import VarianceThreshold\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.feature_selection import f_classif\n", @@ -50,7 +51,7 @@ "# Filter the dataset to work only with alcohol patients\n", "bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n", "\n", - "# Filter the dataset to work only with 'Situacion_tratamiento' == 'Abandono' or 'Alta'\n", + "# Filter the dataset to work only with 'Abandono' or 'Alta' patients\n", "bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]" ] }, @@ -60,15 +61,15 @@ "metadata": {}, "outputs": [], "source": [ - "# Pre-pandemic\n", + "# Pre-pandemic group\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "# Pre-pandemic abandono\n", "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n", "# Pre-pandemic alta\n", "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n", "\n", - "# Post-pandemic\n", - "# Merging last two classes to balance sets\n", + "# Post-pandemic group\n", + "# Merging last two classes\n", "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", " (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", "# Post-pandemic abandono\n", @@ -142,7 +143,9 @@ "source": [ "# 9.0 represents unknown according to Variables.docx \n", "print(bd['Social_inclusion'].unique())\n", + "# Obtain mode for this feature\n", "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n", + "# Replace unknown value by the mode\n", "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n", "print(bd['Social_inclusion'].unique())\n", "\n", @@ -192,6 +195,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Replace NaN values by the mode\n", "age_mode = bd['Age'].mode()[0]\n", "bd['Age'].fillna(age_mode, inplace=True)\n", "\n", @@ -218,6 +222,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Discrete attributes\n", "disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n", " 'Alterations_early_childhood_develop', 'Social_inclusion',\n", " 'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n", @@ -226,6 +231,7 @@ " 'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n", " ]\n", "\n", + "# Numerical attributes\n", "num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']" ] }, @@ -252,7 +258,9 @@ "fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n", "plt.subplots_adjust(hspace=0.75, wspace=1.25)\n", "\n", + "# Generate countplot for each attribute\n", "for i, disc_att in enumerate(disc_atts):\n", + " # For each possible value of the attribute, consider the PRE-POST and ALTA-ABANDONO combinations in the same subplot\n", " ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1),\n", " hue_order=[('Abandono', 'Pre'),('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')],\n", " ax=axs[i])\n", @@ -268,9 +276,7 @@ " xytext = (0, 9), \n", " textcoords = 'offset points')\n", "\n", - "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", - "\n", "plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')" ] }, @@ -287,12 +293,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Function to plot countplot \n", + "# Function to plot normalized countplot \n", "def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n", " \"\"\"\n", " group: 1 (all), 2 (pre), 3 (post) \n", " \"\"\"\n", - "\n", + " \n", " # Define data to work with based on group\n", " if group == 1:\n", " df = bd \n", @@ -340,7 +346,6 @@ "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", "\n", "for i, disc_att in enumerate(disc_atts):\n", - "\n", " # # 1: ALL \n", " # plot_count_perc_norm(i, 1, disc_att)\n", " # axs[i, 0].set_title(\"\\nALL\")\n", @@ -362,11 +367,7 @@ " axs[i, 1].set_ylabel(\"% of total within its Sit_tto group\")\n", " axs[i, 1].tick_params(axis='x', rotation=90)\n", "\n", - " \n", - "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", - "\n", - "# Save the figure in SVG format with DPI=600 in the \"._plots\" folder\n", "plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')" ] }, @@ -413,15 +414,12 @@ " plt.subplot(len(num_atts), 1, i+1)\n", " sns.boxplot(\n", " data=combined_pre_post,\n", - " x = num_att,\n", - " y = 'Group',\n", - " hue='Situacion_tratamiento',\n", + " x = num_att, # attribute value in the x axis\n", + " y = 'Group', # pre and post in y axis\n", + " hue='Situacion_tratamiento', # side by side abandono vs alta\n", " )\n", "\n", - "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", - "\n", - "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", "plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')" ] }, @@ -458,10 +456,7 @@ " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 2])\n", " axs[i, 2].set_title(f\"\\nDistr. of {num_att} - POST\")\n", "\n", - "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", - "\n", - "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", "plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')" ] }, @@ -492,7 +487,7 @@ " 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n", "target_var = 'Situacion_tratamiento'\n", "\n", - "# Columns that are already numeric and we don't need to redefine \n", + "# Columns that are already numeric and we do not need to redefine \n", "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" ] }, @@ -607,7 +602,7 @@ "one_hot_cols_dic = {}\n", "\n", "for one_hot_var in one_hot_vars:\n", - " # Create one hot encoding version of attribute and concatenate new columns to main df\n", + " # Create one hot encoding version of attribute and concatenate new columns to main df using specified prefix\n", " encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n", " bd = pd.concat([bd, encoded_var], axis=1)\n", " one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()" @@ -685,6 +680,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Keep target column and Pandemia_inicio_fin_tratamiento to update df and split again into PRE and POST\n", "columns_to_keep = corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n", "bd = bd[columns_to_keep]" ] @@ -695,6 +691,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Cleaning attribute names\n", "name_mapping = {\n", " 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n", " 'Ed_Primary education': 'Ed_Primary',\n", @@ -764,8 +761,8 @@ "metadata": {}, "outputs": [], "source": [ - "bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]\n", - "#print(bd.columns)" + "# Renaming columns\n", + "bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]" ] }, { @@ -805,7 +802,7 @@ "metadata": {}, "outputs": [], "source": [ - "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n", + "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF']] #, name_mapping['Risk_stigma_REDEF']]\n", "cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]" ] }, @@ -825,16 +822,16 @@ " for j, var_j in enumerate(cols):\n", " # Fill lower triangle of matrix\n", " if i > j:\n", - " # Binary with binary correlation: tetrachoric\n", + " # Binary with binary correlation -> tetrachoric\n", " if var_i in binary_vars and var_j in binary_vars:\n", " corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n", - " # Continuous with continuous correlation: \n", + " # Continuous with continuous correlation -> Spearman\n", " elif var_i in cont_vars and var_j in cont_vars:\n", " # Returning nan sometimes:\n", " # corr_tuple = continuous_continuous(df[var_i], df[var_j], measure = 'spearman')\n", " # corr = corr_tuple[0]\n", " corr = df[var_i].corr(df[var_j], method='spearman')\n", - " # Binary vs Continuous correlation:\n", + " # Binary vs Continuous correlation -> Point Biserial\n", " else:\n", " if var_i in binary_vars:\n", " bin_var = var_i\n", @@ -868,7 +865,7 @@ " else:\n", " cols = corr_cols\n", " \n", - " # Title plot and select datat based on group and sit_tto\n", + " # Title plot and select data based on group and sit_tto\n", " if group == 1:\n", " plot_title = \"Correl Matrix - ALL\"\n", " if sit_tto == 1:\n", @@ -947,7 +944,7 @@ "plt.subplots_adjust(hspace=0.75, wspace=2)\n", "corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n", "\n", - "# Go through possible values for 'Situacion_tratamiento' and 'Group'\n", + "# Go through possible values for 'Situacion_tratamiento': 1 (include it as another var), 2 (only abandono), 3 (only alta)\n", "for sit_tto in range(1,4):\n", " # ALL\n", " plt.subplot(3, 3, 3*(sit_tto-1) + 1) # Calculate the subplot position dynamically\n", @@ -990,7 +987,6 @@ " # Go through matrices\n", " for i, var_i in enumerate(cols):\n", " for j, var_j in enumerate(cols):\n", - " # If difference greater than certain threshold, print variables \n", " val_pre = m_pre[i][j]\n", " val_post = m_post[i][j]\n", " diff = abs(val_pre - val_post)\n", @@ -1079,6 +1075,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Drop target\n", "bd = bd.drop(columns=['Situacion_tratamiento'])\n", "# print(len(bd.columns))\n", "\n", @@ -1101,10 +1098,18 @@ "metadata": {}, "outputs": [], "source": [ + "# Numpy matrices for features and target\n", "X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Treatment_Outcome\"].to_numpy(), conj_pre.Treatment_Outcome\n", "X_post, y_post = conj_post.loc[:, conj_post.columns != \"Treatment_Outcome\"].to_numpy(), conj_post.Treatment_Outcome\n", - "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target \n", - "\n", + "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Export datasets\n", "conj_pre.to_csv('./output/datasets/pre_dataset.csv', index=False)\n", "conj_post.to_csv('./output/datasets/post_dataset.csv', index=False)"