diff --git a/.gitignore b/.gitignore index 0c6f380c18eedcea231d0fa63c7d84bf5938fc7e..643c7461f7527cc0375c16a6f4d6ffa333c5dace 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ gen_train_data/input/ gen_train_data/output/ -EDA/input/ \ No newline at end of file +EDA/input/17_abril.sav +EDA/output/datasets \ No newline at end of file diff --git a/EDA/EDA.ipynb b/EDA/EDA.ipynb index 9426e5017064d955f4fd3ddf0958ee15d4690ad7..b33f9fbf89f6ee6a7e530f15873dbcf3b019fd5a 100644 --- a/EDA/EDA.ipynb +++ b/EDA/EDA.ipynb @@ -4,14 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### EDA" + "_Exploratory Data Analysis_ \\\n", + "_Author: Joaquín Torres Bravo_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Libraries" + "### Libraries" ] }, { @@ -25,7 +26,6 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n", - "\n", "from sklearn.feature_selection import VarianceThreshold\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.feature_selection import f_classif\n", @@ -36,19 +36,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Preparing Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Reading and filtering" + "### First Steps" ] }, { "cell_type": "code", - "execution_count": 139, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -61,37 +54,11 @@ "bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Defining sets of patients" - ] - }, { "cell_type": "code", - "execution_count": 140, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:18: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " conj_post['Group'] = 'Post'\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " conj_pre['Group'] = 'Pre'\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Pre-pandemic\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", @@ -117,22 +84,9 @@ }, { "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PRE: 22861\n", - "\tALTA: 2792\n", - "\tABANDONO: 20069\n", - "POST: 10677\n", - "\tALTA: 1882\n", - "\tABANDONO: 8795\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Printing size of different datasets\n", "print(f\"PRE: {len(conj_pre)}\")\n", @@ -144,20 +98,6 @@ "print(f\"\\tABANDONO: {len(post_abandono)}\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### First Steps" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Inspecting the dataframes" - ] - }, { "cell_type": "code", "execution_count": null, @@ -191,114 +131,37 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Replacing unknown values with the mode" + "### Missing and Unknown Values" ] }, { "cell_type": "code", - "execution_count": 141, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n", - "['Live with families or friends' 'live alone' 'live in institutions']\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 9.0 represents unknown according to Variables.docx \n", "print(bd['Social_inclusion'].unique())\n", "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n", - "# print(mode_soc_inc)\n", "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n", - "print(bd['Social_inclusion'].unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['No alterations (first exposure at 11 or more years)'\n", - " 'Alterations (first exposure before 11 years old)' '9']\n", - "['No alterations (first exposure at 11 or more years)'\n", - " 'Alterations (first exposure before 11 years old)']\n" - ] - } - ], - "source": [ + "print(bd['Social_inclusion'].unique())\n", + "\n", "print(bd['Alterations_early_childhood_develop'].unique())\n", "mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n", "bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n", - "print(bd['Alterations_early_childhood_develop'].unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NaN, 'Yes', 'No']\n", - "Categories (3, object): [99.0, 'No', 'Yes']\n", - "[NaN, 'Yes', 'No']\n", - "Categories (2, object): ['No', 'Yes']\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\1073322024.py:3: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n", - " bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n" - ] - } - ], - "source": [ + "print(bd['Alterations_early_childhood_develop'].unique())\n", + "\n", "print(bd['Risk_stigma'].unique())\n", "mode_stigma = bd['Risk_stigma'].mode()[0]\n", "bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n", - "print(bd['Risk_stigma'].unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n", - "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n" - ] - } - ], - "source": [ + "print(bd['Risk_stigma'].unique())\n", + "\n", "print(bd['NumHijos'].unique())\n", "mode_hijos = bd['NumHijos'].mode()[0]\n", "bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n", "print(bd['NumHijos'].unique())" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Quantifying Null Values" - ] - }, { "cell_type": "code", "execution_count": null, @@ -323,53 +186,11 @@ "print(f\"\\t\\tMissing values NumHijos: {conj_post['NumHijos'].isnull().sum()}\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Replacing missing values with mode" - ] - }, { "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Age'].fillna(age_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "age_mode = bd['Age'].mode()[0]\n", "bd['Age'].fillna(age_mode, inplace=True)\n", @@ -388,14 +209,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Distribution of variables" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Classifying variables into numerical and discrete/categorical " + "### Distribution of Variables" ] }, { @@ -419,14 +233,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Distribution of discrete attributes" + "#### Discrete" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "###### Count plots" + "##### Countplots" ] }, { @@ -464,7 +278,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "###### Normalized count plots" + "##### Normalized Countplots" ] }, { @@ -560,14 +374,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Distribution of numeric attributes" + "#### Numerical" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "###### Summary statistics" + "##### Summary Stats" ] }, { @@ -583,7 +397,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "###### Boxplots" + "##### Boxplots" ] }, { @@ -615,7 +429,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "###### Histograms" + "##### Histograms" ] }, { @@ -655,19 +469,50 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Correlation Analysis" + "### Correlation Analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "##### Turning binary variables into 0/1 values" + "#### Groups of Variables" ] }, { "cell_type": "code", - "execution_count": 146, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n", + " 'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n", + "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n", + " 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n", + " 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n", + "target_var = 'Situacion_tratamiento'\n", + "\n", + "# Columns that are already numeric and we don't need to redefine \n", + "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### One-hot Encoding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Binary" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -739,43 +584,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Defining groups of variables" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [], - "source": [ - "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n", - " 'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n", - "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n", - " 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n", - " 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n", - "target_var = 'Situacion_tratamiento'" + "##### Categorical" ] }, { "cell_type": "code", - "execution_count": 128, - "metadata": {}, - "outputs": [], - "source": [ - "# Columns that are already numeric and we don't need to redefine \n", - "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### One-hot encode categorical variables" - ] - }, - { - "cell_type": "code", - "execution_count": 147, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -796,21 +610,19 @@ " # Create one hot encoding version of attribute and concatenate new columns to main df\n", " encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n", " bd = pd.concat([bd, encoded_var], axis=1)\n", - " one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()\n", - "\n", - "# print(one_hot_cols_dic['FrecuenciaConsumo30Dias'])" + " one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "###### Defining final version of columns of interest" + "#### Final Columns" ] }, { "cell_type": "code", - "execution_count": 148, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -848,38 +660,40 @@ "corr_cols = soc_vars_enc + ind_vars_enc" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop unknown columns\n", + "corr_cols = [corr_col for corr_col in corr_cols if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", + "soc_vars_enc = [corr_col for corr_col in soc_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", + "ind_vars_enc = [corr_col for corr_col in ind_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "###### Excluding unknown columns and renaming" + "##### Renaming and Filtering" ] }, { "cell_type": "code", - "execution_count": 149, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Drop unknown columns\n", - "corr_cols = [corr_col for corr_col in corr_cols if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", - "soc_vars_enc = [corr_col for corr_col in soc_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", - "ind_vars_enc = [corr_col for corr_col in soc_vars_enc if ind_vars_enc not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]" + "columns_to_keep = corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n", + "bd = bd[columns_to_keep]" ] }, { "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary', 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection', 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed', 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable', 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone', 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex', 'Num_Children', 'Smoking', 'Bio_Vulner', 'Opiods_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE', 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw', 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None', 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX', 'Previous_Treatments', 'Treatment_Adherence']\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "name_mapping = {\n", " 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n", @@ -900,7 +714,7 @@ " 'SocInc_live in institutions': 'SocInc_Instit',\n", " 'Risk_stigma_REDEF': 'Risk_Stigma',\n", " 'Structural_conflic': 'Structural_Conflict',\n", - " # 'Age': 'Age',\n", + " 'Age': 'Age',\n", " 'Sex_REDEF': 'Sex',\n", " 'NumHijos': 'Num_Children',\n", " 'Smoking_REDEF': 'Smoking',\n", @@ -920,26 +734,28 @@ " 'Años_consumo_droga': 'Years_Drug_Use',\n", " 'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n", " 'Tx_previos_REDEF': 'Previous_Treatments',\n", - " 'Adherencia_tto_recalc': 'Treatment_Adherence'\n", + " 'Adherencia_tto_recalc': 'Treatment_Adherence',\n", + " 'Situacion_tratamiento_REDEF': 'Treatment_Outcome',\n", + " 'Situacion_tratamiento': 'Situacion_tratamiento',\n", + " 'Pandemia_inicio_fin_tratamiento': 'Pandemia_inicio_fin_tratamiento'\n", "}\n", "\n", "# Update lists of feature names\n", "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n", - "print(corr_cols)\n", "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n", - "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]\n", - "\n", - "bd = bd.rename(columns=name_mapping)" + "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]" ] }, { "cell_type": "code", - "execution_count": 133, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Create bd with just corr_cols and target\n", - "bd = bd[corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']]" + "# Export feature names\n", + "np.save('./output/feature_names/feature_names.npy', corr_cols)\n", + "np.save('./output/feature_names/soc_vars_names.npy', soc_vars_enc)\n", + "np.save('./output/feature_names/ind_vars_names.npy', ind_vars_enc)" ] }, { @@ -948,25 +764,17 @@ "metadata": {}, "outputs": [], "source": [ - "# Export feature names\n", - "np.save('./output/feature_names.npy', corr_cols)\n", - "np.save('./output/soc_vars_names.npy', soc_vars_enc)\n", - "np.save('./output/ind_vars_names.npy', ind_vars_enc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "###### Update main data frames" + "bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]\n", + "#print(bd.columns)" ] }, { "cell_type": "code", - "execution_count": 134, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "# Update main dfs\n", "# Pre-pandemic\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "# Pre-pandemic abandono\n", @@ -988,110 +796,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Building correlation matrix" + "#### Plotting Correlation Matrices" ] }, { "cell_type": "code", - "execution_count": 135, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary',\n", - " 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection',\n", - " 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed',\n", - " 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable',\n", - " 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone',\n", - " 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex',\n", - " 'Sex', 'Num_Children', 'Smoking', 'Smoking', 'Bio_Vulner',\n", - " 'Opiods_DXCIE', 'Cannabis_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE',\n", - " 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw',\n", - " 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None',\n", - " 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX',\n", - " 'Previous_Treatments', 'Treatment_Adherence', 'Situacion_tratamiento',\n", - " 'Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "print(bd.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ed_Not_Complete_Primary\n", - "2\n", - "Ed_Primary\n", - "2\n", - "Ed_Secondary\n", - "2\n", - "Ed_Secondary_Technical\n", - "2\n", - "Ed_Tertiary\n", - "2\n", - "Social_Protection\n", - "2\n", - "JobIn_Unstable\n", - "2\n", - "JobIn_Stable\n", - "2\n", - "JobIn_Unemployed\n", - "2\n", - "Hous_Institutional\n", - "2\n", - "Hous_Stable\n", - "2\n", - "Hous_Unstable\n", - "2\n", - "Early_Alterations\n", - "2\n", - "SocInc_Family_Friends\n", - "2\n", - "SocInc_Alone\n", - "2\n", - "SocInc_Instit\n", - "2\n", - "Risk_Stigma\n", - "2\n", - "Structural_Conflict\n", - "107\n", - "age\n", - "74\n", - "Sex\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'DataFrame' object has no attribute 'unique'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19584\\340002156.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcorr_cols\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;31m#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;31m#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\Users\\Joaquín Torres\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 6292\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6293\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6294\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6295\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 6296\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'unique'" - ] - } - ], - "source": [ - "# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\n", - "\n", - "for col in corr_cols:\n", - " print(col)\n", - " print(len(bd[col].unique()))\n", - "#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\n", - "#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]" + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n", + "cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]" ] }, { @@ -1149,7 +864,7 @@ " # Define columns based on sit_tto arg\n", " if sit_tto == 1:\n", " # Include target as another variable\n", - " cols = [target_var + '_REDEF'] + corr_cols\n", + " cols = ['Treatment_Outcome'] + corr_cols\n", " else:\n", " cols = corr_cols\n", " \n", @@ -1246,10 +961,8 @@ "\n", " corr_mats.append((corr_matrix_pre, corr_matrix_post))\n", " \n", - "# Adjust layout to prevent overlapping titles\n", "plt.tight_layout()\n", "\n", - "# Save the figure in SVG format in the \"./EDA_plots\" folder\n", "plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')" ] }, @@ -1257,7 +970,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Finding significative differences between PRE and POST" + "#### Finding Differences PRE vs POST" ] }, { @@ -1326,11 +1039,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [ - "keep" - ] - }, + "metadata": {}, "outputs": [], "source": [ "print(\"------SIT_TTO 1: NO FILTERING------\")\n", @@ -1340,11 +1049,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [ - "keep" - ] - }, + "metadata": {}, "outputs": [], "source": [ "print(\"------SIT_TTO 2: ABANDONO-----\")\n", @@ -1354,11 +1059,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [ - "keep" - ] - }, + "metadata": {}, "outputs": [], "source": [ "print(\"------SIT_TTO 3: ALTA-----\")\n", @@ -1369,46 +1070,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Feature Analysis and Selection" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Building final datasets to work with" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Work with columns of interest\n", - "cols_of_interest = corr_cols + ['Pandemia_inicio_fin_tratamiento'] + [target_var + \"_REDEF\"]\n", - "temp_bd = bd[cols_of_interest]\n", - "print(temp_bd.info()) # NaN values already dealt with (replaced by mode - this okay?)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Dropping unknown columns/categories for analysis purposes\n", - "unknown_cols = ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']\n", - "temp_bd = temp_bd.drop(columns=unknown_cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(temp_bd.info())" + "### Final Datasets" ] }, { @@ -1417,53 +1079,20 @@ "metadata": {}, "outputs": [], "source": [ + "bd = bd.drop(columns=['Situacion_tratamiento'])\n", + "# print(len(bd.columns))\n", + "\n", "# For conj_pre dataframe\n", - "conj_pre = temp_bd[temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", + "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", "conj_pre = conj_pre.drop(columns=['Pandemia_inicio_fin_tratamiento'])\n", "\n", "# For conj_post dataframe\n", - "conj_post = temp_bd[(temp_bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", - " (temp_bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", - "conj_post = conj_post.drop(columns=['Pandemia_inicio_fin_tratamiento'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(conj_pre.info())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(conj_post.info())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating a numpy matrix without the target variable (X) and a list with the target variable (y) \n", - "X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_pre.Situacion_tratamiento_REDEF\n", - "X_post, y_post = conj_post.loc[:, conj_post.columns != \"Situacion_tratamiento_REDEF\"].to_numpy(), conj_post.Situacion_tratamiento_REDEF\n", - "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(feat)" + "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", + " (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", + "conj_post = conj_post.drop(columns=['Pandemia_inicio_fin_tratamiento'])\n", + "\n", + "# print(conj_post.columns)\n", + "# print(conj_pre.columns)" ] }, { @@ -1472,25 +1101,27 @@ "metadata": {}, "outputs": [], "source": [ - "print(X_pre.shape)\n", - "print(X_post.shape)\n", - "print(y_pre.shape)\n", - "print(y_post.shape)\n", - "print(len(feat))" + "X_pre, y_pre = conj_pre.loc[:, conj_pre.columns != \"Treatment_Outcome\"].to_numpy(), conj_pre.Treatment_Outcome\n", + "X_post, y_post = conj_post.loc[:, conj_post.columns != \"Treatment_Outcome\"].to_numpy(), conj_post.Treatment_Outcome\n", + "feat = np.delete(conj_pre.columns.to_numpy(),-1) # Get labels and remove target \n", + "\n", + "# Export datasets\n", + "conj_pre.to_csv('./output/datasets/pre_dataset.csv', index=False)\n", + "conj_post.to_csv('./output/datasets/post_dataset.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "##### FSS Filter methods" + "### Feature Analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "###### Mutual Info" + "#### Mutual Info" ] }, { @@ -1527,7 +1158,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "###### ANOVA" + "#### ANOVA" ] }, { @@ -1562,6 +1193,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Variance Threshold" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1593,23 +1231,6 @@ "plt.savefig('./output/plots/feature_importance/var_threshold.svg', format='svg', dpi=1200)\n", "plt.show()" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Export PRE and POST datasets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conj_pre.to_csv('pre_dataset.csv', index=False)\n", - "conj_post.to_csv('post_dataset.csv', index=False)" - ] } ], "metadata": { diff --git a/EDA/EDA_2.ipynb b/EDA/EDA_2.ipynb deleted file mode 100644 index 594502ba79b32fba41fed892529a757af7df7f95..0000000000000000000000000000000000000000 --- a/EDA/EDA_2.ipynb +++ /dev/null @@ -1,1380 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 128, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "from pypair.association import binary_binary, continuous_continuous, binary_continuous\n", - "from sklearn.feature_selection import VarianceThreshold\n", - "from sklearn.feature_selection import SelectKBest\n", - "from sklearn.feature_selection import f_classif\n", - "from sklearn.feature_selection import mutual_info_classif" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### First Steps" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "metadata": {}, - "outputs": [], - "source": [ - "bd_all = pd.read_spss('./input/17_abril.sav')\n", - "\n", - "# Filter the dataset to work only with alcohol patients\n", - "bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n", - "\n", - "# Filter the dataset to work only with 'Situacion_tratamiento' == 'Abandono' or 'Alta'\n", - "bd = bd[(bd['Situacion_tratamiento'] == 'Abandono') | (bd['Situacion_tratamiento'] == 'Alta terapéutica')]" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\2495984927.py:18: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " conj_post['Group'] = 'Post'\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\2495984927.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " conj_pre['Group'] = 'Pre'\n" - ] - } - ], - "source": [ - "# Pre-pandemic\n", - "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", - "# Pre-pandemic abandono\n", - "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n", - "# Pre-pandemic alta\n", - "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n", - "\n", - "# Post-pandemic\n", - "# Merging last two classes to balance sets\n", - "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", - " (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", - "# Post-pandemic abandono\n", - "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n", - "# Post-pandemic alta\n", - "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']\n", - "\n", - "# Concatenate the two data frames and add a new column to distinguish between them. Useful for plots\n", - "conj_post['Group'] = 'Post'\n", - "conj_pre['Group'] = 'Pre'\n", - "combined_pre_post = pd.concat([conj_post, conj_pre])" - ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PRE: 22861\n", - "\tALTA: 2792\n", - "\tABANDONO: 20069\n", - "POST: 10677\n", - "\tALTA: 1882\n", - "\tABANDONO: 8795\n" - ] - } - ], - "source": [ - "# Printing size of different datasets\n", - "print(f\"PRE: {len(conj_pre)}\")\n", - "print(f\"\\tALTA: {len(pre_alta)}\")\n", - "print(f\"\\tABANDONO: {len(pre_abandono)}\")\n", - "\n", - "print(f\"POST: {len(conj_post)}\")\n", - "print(f\"\\tALTA: {len(post_alta)}\")\n", - "print(f\"\\tABANDONO: {len(post_abandono)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PRE\n", - "\n", - "Index: 22861 entries, 0 to 85164\n", - "Data columns (total 35 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 22861 non-null float64 \n", - " 1 Education 22861 non-null object \n", - " 2 Social_protection 22861 non-null object \n", - " 3 Job_insecurity 22861 non-null object \n", - " 4 Housing 22861 non-null object \n", - " 5 Alterations_early_childhood_develop 22861 non-null object \n", - " 6 Social_inclusion 22861 non-null object \n", - " 7 Risk_stigma 21606 non-null category\n", - " 8 Structural_conflic 22861 non-null float64 \n", - " 9 Age 22852 non-null float64 \n", - " 10 Sex 22861 non-null object \n", - " 11 NumHijos 21647 non-null float64 \n", - " 12 Smoking 22861 non-null object \n", - " 13 Biological_vulnerability 22861 non-null object \n", - " 14 Alcohol_DxCIE 22861 non-null object \n", - " 15 Opiaceos_DxCIE 22861 non-null object \n", - " 16 Cannabis_DXCIE 22861 non-null object \n", - " 17 BZD_DxCIE 22861 non-null object \n", - " 18 Cocaina_DxCIE 22861 non-null object \n", - " 19 Alucinogenos_DXCIE 22861 non-null object \n", - " 20 Tabaco_DXCIE 22861 non-null object \n", - " 21 FrecuenciaConsumo30Dias 22861 non-null object \n", - " 22 Años_consumo_droga 22342 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 22861 non-null object \n", - " 24 Tx_previos 22861 non-null object \n", - " 25 Adherencia_tto_recalc 22861 non-null float64 \n", - " 26 Tiempo_tx 22861 non-null float64 \n", - " 27 Readmisiones_estudios 22861 non-null object \n", - " 28 Situacion_tratamiento 22861 non-null object \n", - " 29 Periodos_COVID 22861 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 22861 non-null object \n", - " 31 Nreadmision 22861 non-null float64 \n", - " 32 Readmisiones_PRECOVID 22861 non-null float64 \n", - " 33 Readmisiones_COVID 22861 non-null float64 \n", - " 34 Group 22861 non-null object \n", - "dtypes: category(1), float64(10), object(24)\n", - "memory usage: 6.1+ MB\n", - "None\n", - "-------------------------------\n", - "PRE-ABANDONO\n", - "\n", - "Index: 20069 entries, 0 to 85164\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 20069 non-null float64 \n", - " 1 Education 20069 non-null object \n", - " 2 Social_protection 20069 non-null object \n", - " 3 Job_insecurity 20069 non-null object \n", - " 4 Housing 20069 non-null object \n", - " 5 Alterations_early_childhood_develop 20069 non-null object \n", - " 6 Social_inclusion 20069 non-null object \n", - " 7 Risk_stigma 18919 non-null category\n", - " 8 Structural_conflic 20069 non-null float64 \n", - " 9 Age 20061 non-null float64 \n", - " 10 Sex 20069 non-null object \n", - " 11 NumHijos 18958 non-null float64 \n", - " 12 Smoking 20069 non-null object \n", - " 13 Biological_vulnerability 20069 non-null object \n", - " 14 Alcohol_DxCIE 20069 non-null object \n", - " 15 Opiaceos_DxCIE 20069 non-null object \n", - " 16 Cannabis_DXCIE 20069 non-null object \n", - " 17 BZD_DxCIE 20069 non-null object \n", - " 18 Cocaina_DxCIE 20069 non-null object \n", - " 19 Alucinogenos_DXCIE 20069 non-null object \n", - " 20 Tabaco_DXCIE 20069 non-null object \n", - " 21 FrecuenciaConsumo30Dias 20069 non-null object \n", - " 22 Años_consumo_droga 19609 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 20069 non-null object \n", - " 24 Tx_previos 20069 non-null object \n", - " 25 Adherencia_tto_recalc 20069 non-null float64 \n", - " 26 Tiempo_tx 20069 non-null float64 \n", - " 27 Readmisiones_estudios 20069 non-null object \n", - " 28 Situacion_tratamiento 20069 non-null object \n", - " 29 Periodos_COVID 20069 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 20069 non-null object \n", - " 31 Nreadmision 20069 non-null float64 \n", - " 32 Readmisiones_PRECOVID 20069 non-null float64 \n", - " 33 Readmisiones_COVID 20069 non-null float64 \n", - "dtypes: category(1), float64(10), object(23)\n", - "memory usage: 5.2+ MB\n", - "None\n", - "-------------------------------\n", - "PRE-ALTA\n", - "\n", - "Index: 2792 entries, 23 to 85159\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 2792 non-null float64 \n", - " 1 Education 2792 non-null object \n", - " 2 Social_protection 2792 non-null object \n", - " 3 Job_insecurity 2792 non-null object \n", - " 4 Housing 2792 non-null object \n", - " 5 Alterations_early_childhood_develop 2792 non-null object \n", - " 6 Social_inclusion 2792 non-null object \n", - " 7 Risk_stigma 2687 non-null category\n", - " 8 Structural_conflic 2792 non-null float64 \n", - " 9 Age 2791 non-null float64 \n", - " 10 Sex 2792 non-null object \n", - " 11 NumHijos 2689 non-null float64 \n", - " 12 Smoking 2792 non-null object \n", - " 13 Biological_vulnerability 2792 non-null object \n", - " 14 Alcohol_DxCIE 2792 non-null object \n", - " 15 Opiaceos_DxCIE 2792 non-null object \n", - " 16 Cannabis_DXCIE 2792 non-null object \n", - " 17 BZD_DxCIE 2792 non-null object \n", - " 18 Cocaina_DxCIE 2792 non-null object \n", - " 19 Alucinogenos_DXCIE 2792 non-null object \n", - " 20 Tabaco_DXCIE 2792 non-null object \n", - " 21 FrecuenciaConsumo30Dias 2792 non-null object \n", - " 22 Años_consumo_droga 2733 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 2792 non-null object \n", - " 24 Tx_previos 2792 non-null object \n", - " 25 Adherencia_tto_recalc 2792 non-null float64 \n", - " 26 Tiempo_tx 2792 non-null float64 \n", - " 27 Readmisiones_estudios 2792 non-null object \n", - " 28 Situacion_tratamiento 2792 non-null object \n", - " 29 Periodos_COVID 2792 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 2792 non-null object \n", - " 31 Nreadmision 2792 non-null float64 \n", - " 32 Readmisiones_PRECOVID 2792 non-null float64 \n", - " 33 Readmisiones_COVID 2792 non-null float64 \n", - "dtypes: category(1), float64(10), object(23)\n", - "memory usage: 744.5+ KB\n", - "None\n", - "-------------------------------\n", - "\n", - "\n", - "\n", - "\n", - "POST\n", - "\n", - "Index: 10677 entries, 11 to 85156\n", - "Data columns (total 35 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 10677 non-null float64 \n", - " 1 Education 10677 non-null object \n", - " 2 Social_protection 10677 non-null object \n", - " 3 Job_insecurity 10677 non-null object \n", - " 4 Housing 10677 non-null object \n", - " 5 Alterations_early_childhood_develop 10677 non-null object \n", - " 6 Social_inclusion 10677 non-null object \n", - " 7 Risk_stigma 10085 non-null category\n", - " 8 Structural_conflic 10677 non-null float64 \n", - " 9 Age 10676 non-null float64 \n", - " 10 Sex 10677 non-null object \n", - " 11 NumHijos 10103 non-null float64 \n", - " 12 Smoking 10677 non-null object \n", - " 13 Biological_vulnerability 10677 non-null object \n", - " 14 Alcohol_DxCIE 10677 non-null object \n", - " 15 Opiaceos_DxCIE 10677 non-null object \n", - " 16 Cannabis_DXCIE 10677 non-null object \n", - " 17 BZD_DxCIE 10677 non-null object \n", - " 18 Cocaina_DxCIE 10677 non-null object \n", - " 19 Alucinogenos_DXCIE 10677 non-null object \n", - " 20 Tabaco_DXCIE 10677 non-null object \n", - " 21 FrecuenciaConsumo30Dias 10677 non-null object \n", - " 22 Años_consumo_droga 10478 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 10677 non-null object \n", - " 24 Tx_previos 10677 non-null object \n", - " 25 Adherencia_tto_recalc 10677 non-null float64 \n", - " 26 Tiempo_tx 10677 non-null float64 \n", - " 27 Readmisiones_estudios 10677 non-null object \n", - " 28 Situacion_tratamiento 10677 non-null object \n", - " 29 Periodos_COVID 10677 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 10677 non-null object \n", - " 31 Nreadmision 10677 non-null float64 \n", - " 32 Readmisiones_PRECOVID 10677 non-null float64 \n", - " 33 Readmisiones_COVID 10677 non-null float64 \n", - " 34 Group 10677 non-null object \n", - "dtypes: category(1), float64(10), object(24)\n", - "memory usage: 2.9+ MB\n", - "None\n", - "-------------------------------\n", - "POST-ABANDONO\n", - "\n", - "Index: 8795 entries, 11 to 85156\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 8795 non-null float64 \n", - " 1 Education 8795 non-null object \n", - " 2 Social_protection 8795 non-null object \n", - " 3 Job_insecurity 8795 non-null object \n", - " 4 Housing 8795 non-null object \n", - " 5 Alterations_early_childhood_develop 8795 non-null object \n", - " 6 Social_inclusion 8795 non-null object \n", - " 7 Risk_stigma 8308 non-null category\n", - " 8 Structural_conflic 8795 non-null float64 \n", - " 9 Age 8794 non-null float64 \n", - " 10 Sex 8795 non-null object \n", - " 11 NumHijos 8325 non-null float64 \n", - " 12 Smoking 8795 non-null object \n", - " 13 Biological_vulnerability 8795 non-null object \n", - " 14 Alcohol_DxCIE 8795 non-null object \n", - " 15 Opiaceos_DxCIE 8795 non-null object \n", - " 16 Cannabis_DXCIE 8795 non-null object \n", - " 17 BZD_DxCIE 8795 non-null object \n", - " 18 Cocaina_DxCIE 8795 non-null object \n", - " 19 Alucinogenos_DXCIE 8795 non-null object \n", - " 20 Tabaco_DXCIE 8795 non-null object \n", - " 21 FrecuenciaConsumo30Dias 8795 non-null object \n", - " 22 Años_consumo_droga 8627 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 8795 non-null object \n", - " 24 Tx_previos 8795 non-null object \n", - " 25 Adherencia_tto_recalc 8795 non-null float64 \n", - " 26 Tiempo_tx 8795 non-null float64 \n", - " 27 Readmisiones_estudios 8795 non-null object \n", - " 28 Situacion_tratamiento 8795 non-null object \n", - " 29 Periodos_COVID 8795 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 8795 non-null object \n", - " 31 Nreadmision 8795 non-null float64 \n", - " 32 Readmisiones_PRECOVID 8795 non-null float64 \n", - " 33 Readmisiones_COVID 8795 non-null float64 \n", - "dtypes: category(1), float64(10), object(23)\n", - "memory usage: 2.3+ MB\n", - "None\n", - "-------------------------------\n", - "POST-ALTA\n", - "\n", - "Index: 1882 entries, 258 to 85149\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 1882 non-null float64 \n", - " 1 Education 1882 non-null object \n", - " 2 Social_protection 1882 non-null object \n", - " 3 Job_insecurity 1882 non-null object \n", - " 4 Housing 1882 non-null object \n", - " 5 Alterations_early_childhood_develop 1882 non-null object \n", - " 6 Social_inclusion 1882 non-null object \n", - " 7 Risk_stigma 1777 non-null category\n", - " 8 Structural_conflic 1882 non-null float64 \n", - " 9 Age 1882 non-null float64 \n", - " 10 Sex 1882 non-null object \n", - " 11 NumHijos 1778 non-null float64 \n", - " 12 Smoking 1882 non-null object \n", - " 13 Biological_vulnerability 1882 non-null object \n", - " 14 Alcohol_DxCIE 1882 non-null object \n", - " 15 Opiaceos_DxCIE 1882 non-null object \n", - " 16 Cannabis_DXCIE 1882 non-null object \n", - " 17 BZD_DxCIE 1882 non-null object \n", - " 18 Cocaina_DxCIE 1882 non-null object \n", - " 19 Alucinogenos_DXCIE 1882 non-null object \n", - " 20 Tabaco_DXCIE 1882 non-null object \n", - " 21 FrecuenciaConsumo30Dias 1882 non-null object \n", - " 22 Años_consumo_droga 1851 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 1882 non-null object \n", - " 24 Tx_previos 1882 non-null object \n", - " 25 Adherencia_tto_recalc 1882 non-null float64 \n", - " 26 Tiempo_tx 1882 non-null float64 \n", - " 27 Readmisiones_estudios 1882 non-null object \n", - " 28 Situacion_tratamiento 1882 non-null object \n", - " 29 Periodos_COVID 1882 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 1882 non-null object \n", - " 31 Nreadmision 1882 non-null float64 \n", - " 32 Readmisiones_PRECOVID 1882 non-null float64 \n", - " 33 Readmisiones_COVID 1882 non-null float64 \n", - "dtypes: category(1), float64(10), object(23)\n", - "memory usage: 501.9+ KB\n", - "None\n", - "-------------------------------\n" - ] - } - ], - "source": [ - "print(\"PRE\")\n", - "print(conj_pre.info())\n", - "print (\"-------------------------------\")\n", - "print(\"PRE-ABANDONO\")\n", - "print(pre_abandono.info())\n", - "print (\"-------------------------------\")\n", - "print(\"PRE-ALTA\")\n", - "print(pre_alta.info())\n", - "print (\"-------------------------------\")\n", - "\n", - "print(\"\\n\\n\\n\")\n", - "\n", - "print (\"POST\")\n", - "print(conj_post.info())\n", - "print (\"-------------------------------\")\n", - "print(\"POST-ABANDONO\")\n", - "print(post_abandono.info())\n", - "print (\"-------------------------------\")\n", - "print(\"POST-ALTA\")\n", - "print(post_alta.info())\n", - "print (\"-------------------------------\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Missing and Unknown Values" - ] - }, - { - "cell_type": "code", - "execution_count": 133, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n", - "['Live with families or friends' 'live alone' 'live in institutions']\n", - "['No alterations (first exposure at 11 or more years)'\n", - " 'Alterations (first exposure before 11 years old)' '9']\n", - "['No alterations (first exposure at 11 or more years)'\n", - " 'Alterations (first exposure before 11 years old)']\n", - "[NaN, 'Yes', 'No']\n", - "Categories (3, object): [99.0, 'No', 'Yes']\n", - "[NaN, 'Yes', 'No']\n", - "Categories (2, object): ['No', 'Yes']\n", - "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n", - "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\1003504044.py:14: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n", - " bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n" - ] - } - ], - "source": [ - "# 9.0 represents unknown according to Variables.docx \n", - "print(bd['Social_inclusion'].unique())\n", - "mode_soc_inc = bd['Social_inclusion'].mode()[0]\n", - "bd['Social_inclusion'] = bd['Social_inclusion'].replace('9.0', mode_soc_inc)\n", - "print(bd['Social_inclusion'].unique())\n", - "\n", - "print(bd['Alterations_early_childhood_develop'].unique())\n", - "mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n", - "bd['Alterations_early_childhood_develop'] = bd['Alterations_early_childhood_develop'].replace('9', mode_alt)\n", - "print(bd['Alterations_early_childhood_develop'].unique())\n", - "\n", - "print(bd['Risk_stigma'].unique())\n", - "mode_stigma = bd['Risk_stigma'].mode()[0]\n", - "bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n", - "print(bd['Risk_stigma'].unique())\n", - "\n", - "print(bd['NumHijos'].unique())\n", - "mode_hijos = bd['NumHijos'].mode()[0]\n", - "bd['NumHijos'] = bd['NumHijos'].replace(99.0, mode_hijos)\n", - "print(bd['NumHijos'].unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total missing values Age: 10\n", - "Total missing values Años_consumo_droga: 718\n", - "Total missing values Risk_stigma: 1847\n", - "Total missing values NumHijos: 1788\n", - "\tCONJUNTO PREPANDEMIA\n", - "\t\tMissing values Age: 9\n", - "\t\tMissing values Años_consumo_droga: 519\n", - "\t\tMissing values Risk_stigma: 1255\n", - "\t\tMissing values NumHijos: 1214\n", - "\tCONJUNTO POSTPANDEMIA\n", - "\t\tMissing values Age: 1\n", - "\t\tMissing values Años_consumo_droga: 199\n", - "\t\tMissing values Risk_stigma: 592\n", - "\t\tMissing values NumHijos: 574\n" - ] - } - ], - "source": [ - "print(f\"Total missing values Age: {bd['Age'].isnull().sum()}\")\n", - "print(f\"Total missing values Años_consumo_droga: {bd['Años_consumo_droga'].isnull().sum()}\")\n", - "print(f\"Total missing values Risk_stigma: {bd['Risk_stigma'].isnull().sum()}\")\n", - "print(f\"Total missing values NumHijos: {bd['NumHijos'].isnull().sum()}\")\n", - "\n", - "print(\"\\tCONJUNTO PREPANDEMIA\")\n", - "print(f\"\\t\\tMissing values Age: {conj_pre['Age'].isnull().sum()}\")\n", - "print(f\"\\t\\tMissing values Años_consumo_droga: {conj_pre['Años_consumo_droga'].isnull().sum()}\")\n", - "print(f\"\\t\\tMissing values Risk_stigma: {conj_pre['Risk_stigma'].isnull().sum()}\")\n", - "print(f\"\\t\\tMissing values NumHijos: {conj_pre['NumHijos'].isnull().sum()}\")\n", - "\n", - "print(\"\\tCONJUNTO POSTPANDEMIA\")\n", - "print(f\"\\t\\tMissing values Age: {conj_post['Age'].isnull().sum()}\")\n", - "print(f\"\\t\\tMissing values Años_consumo_droga: {conj_post['Años_consumo_droga'].isnull().sum()}\")\n", - "print(f\"\\t\\tMissing values Risk_stigma: {conj_post['Risk_stigma'].isnull().sum()}\")\n", - "print(f\"\\t\\tMissing values NumHijos: {conj_post['NumHijos'].isnull().sum()}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Age'].fillna(age_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_8220\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n" - ] - } - ], - "source": [ - "age_mode = bd['Age'].mode()[0]\n", - "bd['Age'].fillna(age_mode, inplace=True)\n", - "\n", - "años_consumo_mode = bd['Años_consumo_droga'].mode()[0]\n", - "bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", - "\n", - "risk_stigma_mode = bd['Risk_stigma'].mode()[0]\n", - "bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", - "\n", - "num_hijos_mode = bd['NumHijos'].mode()[0]\n", - "bd['NumHijos'].fillna(num_hijos_mode, inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Distribution of Variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "disc_atts = ['Education', 'Social_protection', 'Job_insecurity', 'Housing',\n", - " 'Alterations_early_childhood_develop', 'Social_inclusion',\n", - " 'Risk_stigma', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability',\n", - " 'Opiaceos_DxCIE', 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE',\n", - " 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', 'FrecuenciaConsumo30Dias',\n", - " 'OtrosDx_Psiquiatrico', 'Tx_previos', 'Readmisiones_estudios', 'Nreadmision'\n", - " ]\n", - "\n", - "num_atts = ['Structural_conflic', 'Adherencia_tto_recalc', 'Age', 'Años_consumo_droga', 'Tiempo_tx']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Discrete" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Countplots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axs = plt.subplots(len(disc_atts), 1, figsize=(15, 5*len(disc_atts)))\n", - "plt.subplots_adjust(hspace=0.75, wspace=1.25)\n", - "\n", - "for i, disc_att in enumerate(disc_atts):\n", - " ax = sns.countplot(x=disc_att, data=combined_pre_post, hue=combined_pre_post[['Situacion_tratamiento', 'Group']].apply(tuple, axis=1),\n", - " hue_order=[('Abandono', 'Pre'),('Alta terapéutica', 'Pre'), ('Abandono', 'Post'), ('Alta terapéutica', 'Post')],\n", - " ax=axs[i])\n", - " ax.set_title(disc_att, fontsize=16, fontweight='bold')\n", - " ax.get_legend().set_title(\"Groups\")\n", - " \n", - " # Adding count annotations\n", - " for p in ax.patches:\n", - " if p.get_label() == '_nolegend_':\n", - " ax.annotate(format(p.get_height(), '.0f'), \n", - " (p.get_x() + p.get_width() / 2., p.get_height()), \n", - " ha = 'center', va = 'center', \n", - " xytext = (0, 9), \n", - " textcoords = 'offset points')\n", - "\n", - "# Adjust layout to prevent overlapping titles\n", - "plt.tight_layout()\n", - "\n", - "plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Normalized Countplots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Function to plot countplot \n", - "def plot_count_perc_norm(i: int, group:int, disc_att:str) -> None:\n", - " \"\"\"\n", - " group: 1 (all), 2 (pre), 3 (post) \n", - " \"\"\"\n", - "\n", - " # Define data to work with based on group\n", - " if group == 1:\n", - " df = bd \n", - " elif group == 2:\n", - " df = conj_pre\n", - " elif group == 3:\n", - " df = conj_post\n", - "\n", - " # GOAL: find percentage of each possible category within the total of its situacion_tto subset\n", - " # Group data by 'Situacion_tratamiento' and 'Education' and count occurrences\n", - " grouped_counts = df.groupby(['Situacion_tratamiento', disc_att]).size().reset_index(name='count')\n", - " # Calculate total count for each 'Situacion_tratamiento' group\n", - " total_counts = df.groupby('Situacion_tratamiento')[disc_att].count()\n", - " # Divide each count by its corresponding total count and calculate percentage\n", - " grouped_counts['percentage'] = grouped_counts.apply(lambda row: row['count'] / total_counts[row['Situacion_tratamiento']] * 100, axis=1)\n", - " \n", - " # Follow the same order in plot as in computations\n", - " col_order = grouped_counts[grouped_counts['Situacion_tratamiento'] == 'Abandono'][disc_att].tolist()\n", - "\n", - " # Create countplot and split each bar into two based on the value of sit_tto\n", - " ax = sns.countplot(x=disc_att, hue='Situacion_tratamiento', data=df, order=col_order, ax=axs[i, group-2])\n", - "\n", - " # Adjust y-axis to represent percentages out of the total count\n", - " ax.set_ylim(0, 100)\n", - "\n", - " percentages = grouped_counts['percentage']\n", - " for i, p in enumerate(ax.patches):\n", - " # Skip going over the legend values\n", - " if p.get_label() == \"_nolegend_\":\n", - " # Set height to corresponding percentage and annotate result\n", - " height = percentages[i]\n", - " p.set_height(height)\n", - " ax.annotate(f'{height:.2f}%', (p.get_x() + p.get_width() / 2., height),\n", - " ha='center', va='bottom', fontsize=6, color='black', xytext=(0, 5),\n", - " textcoords='offset points')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axs = plt.subplots(len(disc_atts), 2, figsize=(15, 7*len(disc_atts)))\n", - "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", - "\n", - "for i, disc_att in enumerate(disc_atts):\n", - "\n", - " # # 1: ALL \n", - " # plot_count_perc_norm(i, 1, disc_att)\n", - " # axs[i, 0].set_title(\"\\nALL\")\n", - " # axs[i, 0].set_xlabel(disc_att, fontweight='bold')\n", - " # axs[i, 0].set_ylabel(\"% of total within its Sit_tto group\")\n", - " # axs[i, 0].tick_params(axis='x', rotation=90)\n", - " \n", - " # 2: PRE\n", - " plot_count_perc_norm(i, 2, disc_att)\n", - " axs[i, 0].set_title(\"\\nPRE\")\n", - " axs[i, 0].set_xlabel(disc_att, fontweight='bold')\n", - " axs[i, 0].set_ylabel(\"% of total within its Sit_tto group\")\n", - " axs[i, 0].tick_params(axis='x', rotation=90)\n", - "\n", - " # 3: POST\n", - " plot_count_perc_norm(i, 3, disc_att)\n", - " axs[i, 1].set_title(\"\\nPOST\")\n", - " axs[i, 1].set_xlabel(disc_att, fontweight='bold')\n", - " axs[i, 1].set_ylabel(\"% of total within its Sit_tto group\")\n", - " axs[i, 1].tick_params(axis='x', rotation=90)\n", - "\n", - " \n", - "# Adjust layout to prevent overlapping titles\n", - "plt.tight_layout()\n", - "\n", - "# Save the figure in SVG format with DPI=600 in the \"._plots\" folder\n", - "plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Numerical" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Summary Stats" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(bd[num_atts].describe())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Boxplots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axs = plt.subplots(len(num_atts), 1, figsize=(12, 5*len(num_atts)))\n", - "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", - "\n", - "for i, num_att in enumerate(num_atts):\n", - " plt.subplot(len(num_atts), 1, i+1)\n", - " sns.boxplot(\n", - " data=combined_pre_post,\n", - " x = num_att,\n", - " y = 'Group',\n", - " hue='Situacion_tratamiento',\n", - " )\n", - "\n", - "# Adjust layout to prevent overlapping titles\n", - "plt.tight_layout()\n", - "\n", - "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", - "plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Histograms" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axs = plt.subplots(len(num_atts), 3, figsize=(15, 6*len(num_atts)))\n", - "plt.subplots_adjust(hspace=0.75, wspace=1.5)\n", - "\n", - "for i, num_att in enumerate(num_atts):\n", - "\n", - " # 1: All alcohol patients\n", - " sns.histplot(data=bd,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True,\n", - " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 0])\n", - " axs[i, 0].set_title(f\"\\nDistr. of {num_att} - ALL\")\n", - "\n", - " # 2: PRE\n", - " sns.histplot(data=conj_pre,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True, \n", - " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 1])\n", - " axs[i, 1].set_title(f\"\\nDistr. of {num_att} - PRE\")\n", - "\n", - " # Subplot 3: POST\n", - " sns.histplot(data=conj_post,x=num_att,bins=15, hue='Situacion_tratamiento', stat='probability', common_norm=False, kde=True, \n", - " line_kws={'lw': 5}, alpha = 0.4, ax=axs[i, 2])\n", - " axs[i, 2].set_title(f\"\\nDistr. of {num_att} - POST\")\n", - "\n", - "# Adjust layout to prevent overlapping titles\n", - "plt.tight_layout()\n", - "\n", - "# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n", - "plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Correlation Analysis" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Groups of Variables" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": {}, - "outputs": [], - "source": [ - "social_vars = ['Education', 'Social_protection', 'Job_insecurity', 'Housing', 'Alterations_early_childhood_develop', \n", - " 'Social_inclusion', 'Risk_stigma', 'Structural_conflic']\n", - "ind_vars = ['Age', 'Sex', 'NumHijos', 'Smoking', 'Biological_vulnerability', 'Opiaceos_DxCIE', \n", - " 'Cannabis_DXCIE', 'BZD_DxCIE', 'Cocaina_DxCIE', 'Alucinogenos_DXCIE', 'Tabaco_DXCIE', \n", - " 'FrecuenciaConsumo30Dias', 'Años_consumo_droga','OtrosDx_Psiquiatrico', 'Tx_previos', 'Adherencia_tto_recalc'] \n", - "target_var = 'Situacion_tratamiento'\n", - "\n", - "# Columns that are already numeric and we don't need to redefine \n", - "no_redef_cols = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### One-hot Encoding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Binary" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": {}, - "outputs": [], - "source": [ - "# --------------------------------------------------------------------------\n", - "\n", - "# 'Alterations_early_childhood_develop'\n", - "alterations_mapping = {\n", - " 'No alterations (first exposure at 11 or more years)' : 0,\n", - " 'Alterations (first exposure before 11 years old)': 1,\n", - "}\n", - "\n", - "bd['Alterations_early_childhood_develop_REDEF'] = bd['Alterations_early_childhood_develop'].map(alterations_mapping)\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# Social protection\n", - "bd['Social_protection_REDEF'] = bd['Social_protection'].map({'No':0, 'Sí':1})\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# 'Risk_stigma'\n", - "bd['Risk_stigma_REDEF'] = bd['Risk_stigma'].map({'No':0, 'Yes':1})\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# 'Sex'\n", - "bd['Sex_REDEF'] = bd['Sex'].map({'Hombre':0, 'Mujer':1})\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# 'Smoking'\n", - "bd['Smoking_REDEF'] = bd['Smoking'].map({'No':0, 'Sí':1})\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# 'Biological_vulnerability'\n", - "bd['Biological_vulnerability_REDEF'] = bd['Biological_vulnerability'].map({'No':0, 'Sí':1})\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# 'Droga_DxCIE'\n", - "bd['Opiaceos_DxCIE_REDEF'] = bd['Opiaceos_DxCIE'].map({'No': 0, 'Sí': 1})\n", - "bd['Cannabis_DXCIE_REDEF'] = bd['Cannabis_DXCIE'].map({'No': 0, 'Sí': 1})\n", - "bd['BZD_DxCIE_REDEF'] = bd['BZD_DxCIE'].map({'No': 0, 'Sí': 1})\n", - "bd['Cocaina_DxCIE_REDEF'] = bd['Cocaina_DxCIE'].map({'No': 0, 'Sí': 1})\n", - "bd['Alucinogenos_DXCIE_REDEF'] = bd['Alucinogenos_DXCIE'].map({'No': 0, 'Sí': 1})\n", - "bd['Tabaco_DXCIE_REDEF'] = bd['Tabaco_DXCIE'].map({'No': 0, 'Sí': 1})\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# 'OtrosDx_Psiquiatrico'\n", - "bd['OtrosDx_Psiquiatrico_REDEF'] = bd['OtrosDx_Psiquiatrico'].map({'No':0, 'Sí':1})\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# 'Tx_previos'\n", - "bd['Tx_previos_REDEF'] = bd['Tx_previos'].map({'No':0, 'Sí':1})\n", - "\n", - "# --------------------------------------------------------------------------\n", - "\n", - "# 'Situacion_tratamiento (!!!!!)\n", - "# Important to define properly\n", - "bd['Situacion_tratamiento_REDEF'] = bd['Situacion_tratamiento'].map({'Abandono':1, 'Alta terapéutica':0})\n", - "\n", - "# --------------------------------------------------------------------------" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Categorical" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": {}, - "outputs": [], - "source": [ - "# Specify columns to one hot encode; empty list otherwise\n", - "one_hot_vars = ['Education', 'Job_insecurity', 'Housing', 'Social_inclusion', 'FrecuenciaConsumo30Dias']\n", - "\n", - "one_hots_vars_prefix = {\n", - " 'Education': 'Ed',\n", - " 'Job_insecurity': 'JobIn',\n", - " 'Housing': 'Hous', \n", - " 'Social_inclusion': 'SocInc',\n", - " 'FrecuenciaConsumo30Dias': 'Frec30',\n", - "}\n", - "\n", - "one_hot_cols_dic = {}\n", - "\n", - "for one_hot_var in one_hot_vars:\n", - " # Create one hot encoding version of attribute and concatenate new columns to main df\n", - " encoded_var = pd.get_dummies(bd[one_hot_var], prefix=one_hots_vars_prefix[one_hot_var])\n", - " bd = pd.concat([bd, encoded_var], axis=1)\n", - " one_hot_cols_dic[one_hot_var] = encoded_var.columns.tolist()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Final Columns" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [], - "source": [ - "soc_vars_enc = []\n", - "for soc_var in social_vars:\n", - " # If no need to redefine, append directly\n", - " if soc_var in no_redef_cols:\n", - " soc_vars_enc.append(soc_var)\n", - " # If need to redefine\n", - " else:\n", - " # Check if it was one-hot encoded\n", - " if soc_var in one_hot_vars:\n", - " # Append all one hot columns\n", - " soc_vars_enc = soc_vars_enc + one_hot_cols_dic[soc_var]\n", - " # If not, use redefined version through mapping\n", - " else:\n", - " soc_vars_enc.append(soc_var + '_REDEF')\n", - "\n", - "ind_vars_enc = []\n", - "for ind_var in ind_vars:\n", - " # If no need to redefine, append directly\n", - " if ind_var in no_redef_cols:\n", - " ind_vars_enc.append(ind_var)\n", - " # If need to redefine\n", - " else:\n", - " # Check if it was one-hot encoded\n", - " if ind_var in one_hot_vars:\n", - " # Append all one hot columns\n", - " ind_vars_enc = ind_vars_enc + one_hot_cols_dic[ind_var]\n", - " # If not, use redefined version through mapping\n", - " else:\n", - " ind_vars_enc.append(ind_var + '_REDEF')\n", - "\n", - "# Final version of columns we need to use for correlation analysis\n", - "corr_cols = soc_vars_enc + ind_vars_enc" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [], - "source": [ - "# Drop unknown columns\n", - "corr_cols = [corr_col for corr_col in corr_cols if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", - "soc_vars_enc = [corr_col for corr_col in soc_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]\n", - "ind_vars_enc = [corr_col for corr_col in ind_vars_enc if corr_col not in ['Ed_Unknowledge', 'JobIn_unkwnodledge', 'Hous_unknowledge', 'Frec30_Desconocido']]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Renaming and Filtering" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [], - "source": [ - "columns_to_keep = corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']\n", - "bd = bd[columns_to_keep]" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [], - "source": [ - "name_mapping = {\n", - " 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n", - " 'Ed_Primary education': 'Ed_Primary',\n", - " 'Ed_Secondary Education': 'Ed_Secondary',\n", - " 'Ed_Secondary more technical education': 'Ed_Secondary_Technical',\n", - " 'Ed_Tertiary': 'Ed_Tertiary',\n", - " 'Social_protection_REDEF': 'Social_Protection',\n", - " 'JobIn_Non-stable': 'JobIn_Unstable',\n", - " 'JobIn_Stable': 'JobIn_Stable',\n", - " 'JobIn_Unemployed': 'JobIn_Unemployed',\n", - " 'Hous_Institutional': 'Hous_Institutional',\n", - " 'Hous_Stable': 'Hous_Stable',\n", - " 'Hous_Unstable': 'Hous_Unstable',\n", - " 'Alterations_early_childhood_develop_REDEF': 'Early_Alterations',\n", - " 'SocInc_Live with families or friends': 'SocInc_Family_Friends',\n", - " 'SocInc_live alone': 'SocInc_Alone',\n", - " 'SocInc_live in institutions': 'SocInc_Instit',\n", - " 'Risk_stigma_REDEF': 'Risk_Stigma',\n", - " 'Structural_conflic': 'Structural_Conflict',\n", - " 'Age': 'Age',\n", - " 'Sex_REDEF': 'Sex',\n", - " 'NumHijos': 'Num_Children',\n", - " 'Smoking_REDEF': 'Smoking',\n", - " 'Biological_vulnerability_REDEF': 'Bio_Vulner',\n", - " 'Opiaceos_DxCIE_REDEF': 'Opiods_DXCIE',\n", - " 'Cannabis_DXCIE_REDEF': 'Cannabis_DXCIE',\n", - " 'BZD_DxCIE_REDEF': 'BZD_DXCIE',\n", - " 'Cocaina_DxCIE_REDEF': 'Cocaine_DXCIE',\n", - " 'Alucinogenos_DXCIE_REDEF': 'Hallucin_DXCIE',\n", - " 'Tabaco_DXCIE_REDEF': 'Tobacco_DXCIE',\n", - " 'Frec30_1 día/semana': 'Freq_1dpw',\n", - " 'Frec30_2-3 días\\u200e/semana': 'Freq_2-3dpw',\n", - " 'Frec30_4-6 días/semana': 'Freq_4-6dpw',\n", - " 'Frec30_Menos de 1 día\\u200e/semana': 'Freq_l1dpw',\n", - " 'Frec30_No consumio': 'Freq_None',\n", - " 'Frec30_Todos los días': 'Freq_Everyday',\n", - " 'Años_consumo_droga': 'Years_Drug_Use',\n", - " 'OtrosDx_Psiquiatrico_REDEF': 'Other_Psychiatric_DX',\n", - " 'Tx_previos_REDEF': 'Previous_Treatments',\n", - " 'Adherencia_tto_recalc': 'Treatment_Adherence',\n", - " 'Situacion_tratamiento_REDEF': 'Treatment_Outcome',\n", - " 'Situacion_tratamiento': 'Situacion_tratamiento',\n", - " 'Pandemia_inicio_fin_tratamiento': 'Pandemia_inicio_fin_tratamiento'\n", - "}\n", - "\n", - "# Update lists of feature names\n", - "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n", - "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n", - "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": {}, - "outputs": [], - "source": [ - "# Export feature names\n", - "np.save('./output/feature_names.npy', corr_cols)\n", - "np.save('./output/soc_vars_names.npy', soc_vars_enc)\n", - "np.save('./output/ind_vars_names.npy', ind_vars_enc)" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "metadata": {}, - "outputs": [], - "source": [ - "bd = bd.rename(columns=name_mapping)[list(name_mapping.values())]\n", - "#print(bd.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "metadata": {}, - "outputs": [], - "source": [ - "# Update main dfs\n", - "# Pre-pandemic\n", - "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", - "# Pre-pandemic abandono\n", - "pre_abandono = conj_pre[conj_pre['Situacion_tratamiento'] == 'Abandono']\n", - "# Pre-pandemic alta\n", - "pre_alta = conj_pre[conj_pre['Situacion_tratamiento'] == 'Alta terapéutica']\n", - "\n", - "# Post-pandemic\n", - "# Merging last two classes to balance sets\n", - "conj_post = bd[(bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio prepandemia y fin en pandemia') | \n", - " (bd['Pandemia_inicio_fin_tratamiento'] == 'inicio y fin en pandemia')]\n", - "# Post-pandemic abandono\n", - "post_abandono = conj_post[conj_post['Situacion_tratamiento'] == 'Abandono']\n", - "# Post-pandemic alta\n", - "post_alta = conj_post[conj_post['Situacion_tratamiento'] == 'Alta terapéutica']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Correlation Matrices" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": {}, - "outputs": [], - "source": [ - "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + [name_mapping['Situacion_tratamiento_REDEF'], name_mapping['Risk_stigma_REDEF']]\n", - "cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [], - "source": [ - "def get_corr_matrix(df, cols):\n", - " \n", - " # Initialize nxn matrix to zeroes\n", - " n = len(cols)\n", - " corr_matrix = np.zeros((n,n))\n", - "\n", - " for i, var_i in enumerate(cols):\n", - " for j, var_j in enumerate(cols):\n", - " # Fill lower triangle of matrix\n", - " if i > j:\n", - " # Binary with binary correlation: tetrachoric\n", - " if var_i in binary_vars and var_j in binary_vars:\n", - " corr = binary_binary(df[var_i], df[var_j], measure='tetrachoric')\n", - " # Continuous with continuous correlation: \n", - " elif var_i in cont_vars and var_j in cont_vars:\n", - " # Returning nan sometimes:\n", - " # corr_tuple = continuous_continuous(df[var_i], df[var_j], measure = 'spearman')\n", - " # corr = corr_tuple[0]\n", - " corr = df[var_i].corr(df[var_j], method='spearman')\n", - " # Binary vs Continuous correlation:\n", - " else:\n", - " if var_i in binary_vars:\n", - " bin_var = var_i\n", - " cont_var = var_j\n", - " else:\n", - " bin_var = var_j\n", - " cont_var = var_i\n", - " corr = binary_continuous(df[bin_var], df[cont_var], measure='point_biserial')\n", - " # Assign value to matrix\n", - " corr_matrix[i][j] = corr \n", - " \n", - " return corr_matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_heatmap(sit_tto: int, group:int) -> None:\n", - " \"\"\"\n", - " sit_tto: 1 (include it as another var), 2 (only abandono), 3 (only alta)\n", - " group: 1 (all alcohol patients), 2 (pre), 3 (post)\n", - " \"\"\"\n", - "\n", - " # Define columns based on sit_tto arg\n", - " if sit_tto == 1:\n", - " # Include target as another variable\n", - " cols = ['Treatment_Outcome'] + corr_cols\n", - " else:\n", - " cols = corr_cols\n", - " \n", - " # Title plot and select datat based on group and sit_tto\n", - " if group == 1:\n", - " plot_title = \"Correl Matrix - ALL\"\n", - " if sit_tto == 1:\n", - " bd_ca = bd[cols]\n", - " elif sit_tto == 2:\n", - " bd_ca = bd[bd['Situacion_tratamiento'] == 'Abandono'][cols]\n", - " elif sit_tto == 3:\n", - " bd_ca = bd[bd['Situacion_tratamiento'] == 'Alta terapéutica'][cols]\n", - " elif group == 2:\n", - " plot_title = \"Correl Matrix - PRE\"\n", - " if sit_tto == 1: \n", - " bd_ca = conj_pre[cols]\n", - " elif sit_tto == 2:\n", - " bd_ca = pre_abandono[cols]\n", - " elif sit_tto == 3:\n", - " bd_ca = pre_alta[cols]\n", - " elif group == 3:\n", - " plot_title = \"Correl Matrix - POST\"\n", - " if sit_tto == 1: \n", - " bd_ca = conj_post[cols]\n", - " elif sit_tto == 2:\n", - " bd_ca = post_abandono[cols]\n", - " elif sit_tto == 3:\n", - " bd_ca = post_alta[cols]\n", - " \n", - " # Complete title\n", - " if sit_tto == 2:\n", - " plot_title += \" - ABANDONO\"\n", - " elif sit_tto == 3:\n", - " plot_title += \" - ALTA\"\n", - "\n", - " corr_matrix = get_corr_matrix(bd_ca, cols)\n", - "\n", - " # Create a mask for the upper triangle\n", - " mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n", - "\n", - " # Create heatmap correlation matrix\n", - " dataplot = sns.heatmap(corr_matrix, mask=mask, xticklabels=cols, yticklabels=cols, cmap=\"coolwarm\", vmin=-1, vmax=1, annot=True, fmt=\".2f\", annot_kws={\"size\": 4})\n", - "\n", - " # Group ind vs social vars by color and modify tick label names\n", - " for tick_label in dataplot.axes.xaxis.get_ticklabels():\n", - " if tick_label.get_text() in ind_vars_enc:\n", - " tick_label.set_color('green')\n", - " elif tick_label.get_text() in soc_vars_enc:\n", - " tick_label.set_color('purple') \n", - " for tick_label in dataplot.axes.yaxis.get_ticklabels():\n", - " if tick_label.get_text() in ind_vars_enc:\n", - " tick_label.set_color('green')\n", - " elif tick_label.get_text() in soc_vars_enc:\n", - " tick_label.set_color('purple') \n", - "\n", - " # Increase the size of xtick labels\n", - " # dataplot.tick_params(axis='x', labelsize=12)\n", - "\n", - " # Increase the size of ytick labels\n", - " # dataplot.tick_params(axis='y', labelsize=12)\n", - "\n", - " # Add legend and place it in lower left \n", - " plt.legend(handles=[\n", - " plt.Line2D([0], [0], marker='o', color='w', label='Social Factors', markerfacecolor='purple', markersize=10),\n", - " plt.Line2D([0], [0], marker='o', color='w', label='Individual Factors', markerfacecolor='green', markersize=10)\n", - " ], bbox_to_anchor=(-0.1, -0.1), fontsize = 20)\n", - "\n", - " plt.title(\"\\n\\n\" + plot_title, fontdict={'fontsize': 30, 'fontweight': 'bold'})\n", - "\n", - " return corr_matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axs = plt.subplots(3, 3, figsize=(50, 50))\n", - "plt.subplots_adjust(hspace=0.75, wspace=2)\n", - "corr_mats = [] # List of tuples (m1, m2) to store the 3 pairs of matrices to compare (pre vs post)\n", - "\n", - "# Go through possible values for 'Situacion_tratamiento' and 'Group'\n", - "for sit_tto in range(1,4):\n", - " # ALL\n", - " plt.subplot(3, 3, 3*(sit_tto-1) + 1) # Calculate the subplot position dynamically\n", - " _ = plot_heatmap(sit_tto, 1)\n", - " # PRE\n", - " plt.subplot(3, 3, 3*(sit_tto-1) + 2) \n", - " corr_matrix_pre = plot_heatmap(sit_tto, 2)\n", - " # POST\n", - " plt.subplot(3, 3, 3*(sit_tto-1) + 3)\n", - " corr_matrix_post = plot_heatmap(sit_tto, 3)\n", - "\n", - " corr_mats.append((corr_matrix_pre, corr_matrix_post))\n", - " \n", - "plt.tight_layout()\n", - "\n", - "plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/EDA/output/feature_names.npy b/EDA/output/feature_names/feature_names.npy similarity index 100% rename from EDA/output/feature_names.npy rename to EDA/output/feature_names/feature_names.npy diff --git a/EDA/output/ind_vars_names.npy b/EDA/output/feature_names/ind_vars_names.npy similarity index 100% rename from EDA/output/ind_vars_names.npy rename to EDA/output/feature_names/ind_vars_names.npy diff --git a/EDA/output/soc_vars_names.npy b/EDA/output/feature_names/soc_vars_names.npy similarity index 100% rename from EDA/output/soc_vars_names.npy rename to EDA/output/feature_names/soc_vars_names.npy diff --git a/EDA/output/plots/feature_importance/ANOVA.svg b/EDA/output/plots/feature_importance/ANOVA.svg index 3c4e8a7efe41b3624c0ec0ec233c4c27403a477f..bd44d500eb77efe03c9e3aa779f83984a3d61234 100644 --- a/EDA/output/plots/feature_importance/ANOVA.svg +++ b/EDA/output/plots/feature_importance/ANOVA.svg @@ -6,7 +6,7 @@ - 2024-04-25T15:11:31.191722 + 2024-06-28T14:50:53.726915 image/svg+xml @@ -30,188 +30,188 @@ z - - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - +" clip-path="url(#pdbc6a8b906)" style="fill: #008080"/> - - + - + - + - + - + - + - + - + - + - + - + - - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + - + - - + + - - - - - - - - - - + - - + + - + - - - - - - - - - - - - - - - - - - - - + + + + + + + + - + - - + + + + + @@ -1168,31 +1083,40 @@ z - - - - - - - - - - - - + + + + + + - + - + + - + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - + - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1470,12 +1451,12 @@ z - + - - + + - - - - - - - - - - - - - - - - + + + + + + - + - - + + @@ -1530,28 +1501,23 @@ z - - - - - - - - - - + + + + + - + - - + + @@ -1559,38 +1525,24 @@ z - - - - - - - - - - - - - - - - - - - - + + + + + + - + - - + + @@ -1598,48 +1550,42 @@ z - - - - - - - - - - - - - - - - + + + + + + + + + + - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + - + - - - - + @@ -1815,12 +1689,12 @@ z - + - - + + @@ -1833,118 +1707,53 @@ z - - - - - - - - - - - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + - + - - - - - - - + + @@ -1952,47 +1761,32 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - - + + @@ -2000,69 +1794,58 @@ z - - - - - - - + - - + + - - - - - - - - - - - - - - - - - + + + + + + + + + + + + - - - - - + @@ -2071,175 +1854,175 @@ L 436.635 26.88 - - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - +" clip-path="url(#p71656fb1ff)" style="fill: #008080"/> - + - + @@ -2249,12 +2032,12 @@ z - + - + @@ -2264,12 +2047,12 @@ z - + - + @@ -2279,12 +2062,12 @@ z - + - + @@ -2294,12 +2077,12 @@ z - + - + @@ -2308,7 +2091,7 @@ z - + @@ -2329,12 +2112,12 @@ z - + - + @@ -2352,46 +2135,35 @@ z - + - - + + - - - - - - - - - - - - - - - - - - - + + + + + + + + - + - - + + @@ -2404,43 +2176,28 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + - + - + @@ -2465,115 +2222,93 @@ z - + - - + + - - - - - - - - - - - - - - - - + + + + + + - + - - + + - - - - - - - - - - - - - - - - - + + + + + + + + + + + + - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + - + - - + + @@ -2586,108 +2321,55 @@ z - - - - - - - - - - - + - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + - + - - + + - - - - - - - + - - + + @@ -2695,34 +2377,28 @@ z - - - - - - - - - - - - - - - - + + + + + + + + + + - + - - + + @@ -2730,152 +2406,128 @@ z - - - - - - - - - - + + + + + - + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - + + - - - - - - - - - - - - - - - - - - + + + + + + + + + + - + - - + + - - - - - - - - - - + + + + + + + + - + - - + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + - + - - + + @@ -2883,60 +2535,67 @@ z - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - + - - + + + + + @@ -2948,20 +2607,21 @@ z - - - - - - - + + + + + + + + - @@ -2970,18 +2630,18 @@ L 853.2 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/> - - - + @@ -2991,11 +2651,11 @@ L 853.2 26.88 - - + + - - + + diff --git a/EDA/output/plots/feature_importance/mutual_info.svg b/EDA/output/plots/feature_importance/mutual_info.svg index 4847501ec7e3d7bb095242cb857e5dd4c4510012..8eeb6cc7e925059f3b2f65255765621907d49b32 100644 --- a/EDA/output/plots/feature_importance/mutual_info.svg +++ b/EDA/output/plots/feature_importance/mutual_info.svg @@ -6,7 +6,7 @@ - 2024-04-25T15:31:34.349880 + 2024-06-28T14:50:52.047787 image/svg+xml @@ -30,188 +30,188 @@ z - - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - +" clip-path="url(#p780fbf8d15)" style="fill: #008080"/> - - + - - + + + - + - - + + + + + + + + + + + + + + + + + + + + + + + - - + + - + - - - + + + + + + + + + + + + + + + + + + + + + - - + + - + - - - + + + + + + + + + + + + + + + + + + + + + - - + + - + - - - + + + + + + + + + + + + + + + + + + + + + - + - + - + - - + - - - + + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + - + - + - - - + + + - + - - - - - - - - - - - - + + + + + + + + + + + + + - + - + - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + - + - + - - - - - - - + + + @@ -1296,359 +1349,95 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + - + - + - - + + - - - - - - - - - - - - - - - - - - - + + + + + + + + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + @@ -1739,148 +1664,220 @@ z - - + + - + - - - + + + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - - + + - + - - - + + + - + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1892,93 +1889,68 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + - - - - - + - + - @@ -1990,183 +1962,183 @@ z - - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - +" clip-path="url(#p21a1ed5bcf)" style="fill: #008080"/> - - + + - + - + - + @@ -2174,15 +2146,15 @@ z - - + + - + - + - + @@ -2190,15 +2162,15 @@ z - - + + - + - + - + @@ -2206,15 +2178,15 @@ z - - + + - + - + - + @@ -2222,15 +2194,15 @@ z - - + + - + - + - + @@ -2238,52 +2210,9 @@ z - - - - - - - - - - - - - - - - - - - - + - + @@ -2307,56 +2236,145 @@ z - + - + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - + - - - + + + @@ -2364,367 +2382,225 @@ z - - - - - - - - + + - + - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - + + - + - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + - - - + + + - - - - - - - - - - - - - - - - + + + + + + + + - - + + - + - - - + + + - - - - - - + + + + + + + + - - + + - + - - - + + + - - - - - - - - + + + + + + + + + - - + + - + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - + + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + - - + + - + - - - + + + @@ -2732,154 +2608,102 @@ z - - - - - - - - - - + + + + + + - - + + - + - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + - - + + - + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - + + - + - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - + - + - + @@ -2894,59 +2718,46 @@ z - - - - - - - - - - - - - - - - - + + - + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - - + + - + - - - + + + @@ -2958,53 +2769,21 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + - @@ -3013,41 +2792,18 @@ L 853.2 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/> - - - + - - - - + @@ -3057,11 +2813,11 @@ z - - + + - - + + diff --git a/EDA/output/plots/feature_importance/var_threshold.svg b/EDA/output/plots/feature_importance/var_threshold.svg index 06e69b66bfed6c64c480aa36eb9e16d65ad80a25..eb6ef43ca4cf64b525a023f6ecd4c6d4cb28985f 100644 --- a/EDA/output/plots/feature_importance/var_threshold.svg +++ b/EDA/output/plots/feature_importance/var_threshold.svg @@ -6,7 +6,7 @@ - 2024-04-25T15:26:39.246988 + 2024-06-28T14:50:55.177769 image/svg+xml @@ -30,188 +30,188 @@ z - - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - +" clip-path="url(#pc10ecad988)" style="fill: #008080"/> - - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -476,12 +476,12 @@ z - + - + @@ -492,7 +492,7 @@ z - + - - + - - + + - - - - - @@ -856,34 +776,28 @@ z - - - - - - - - - - - - - - - - + + + + + + + + + + - + - - + + - - @@ -911,28 +830,23 @@ z - - - - - - - - - - + + + + + - + - - + + - - - - - - - + - - + + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - + - - + + + - @@ -1189,28 +1090,18 @@ z - - - - - - - - - - - + - - + + - @@ -1302,42 +1200,35 @@ z - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + - + - + - - - - - - - - - - + + + + + + + @@ -1458,79 +1316,90 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - - + + - - - - - - - - - - - - - + + + + + + + - + - - + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + - + - - + + @@ -1691,29 +1509,47 @@ z - - - - - - - - - - - + - - + + + + @@ -1734,30 +1604,24 @@ z - - - - - - - - - - - - + + + + + + - + - + - + - - + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + - + - - + + @@ -1895,98 +1729,42 @@ z - - - - - - - + - - - - - - + + - - - - - + + + + + + + + + - + - - - - + @@ -1996,85 +1774,75 @@ z - + - - + + - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - - + + + + + @@ -2086,20 +1854,21 @@ z - - - - - - - + + + + + + + + - @@ -2108,18 +1877,18 @@ L 519.94399 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/> - - - + @@ -2128,183 +1897,183 @@ L 519.94399 26.88 - - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - +" clip-path="url(#pb339196ebc)" style="fill: #008080"/> - + - + @@ -2312,12 +2081,12 @@ z - + - + @@ -2328,12 +2097,12 @@ z - + - + @@ -2344,12 +2113,12 @@ z - + - + @@ -2360,12 +2129,12 @@ z - + - + @@ -2376,12 +2145,12 @@ z - + - + @@ -2393,12 +2162,12 @@ z - + - + @@ -2410,12 +2179,12 @@ z - + - + @@ -2426,7 +2195,7 @@ z - + @@ -2442,50 +2211,44 @@ z - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - + - - + + @@ -2493,28 +2256,23 @@ z - - - - - - - - - - + + + + + - + - - + + @@ -2527,91 +2285,68 @@ z - - - - - - - - - - - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + - + - - + + - - - - - - - + - - + + @@ -2619,62 +2354,59 @@ z - - - - - - - - - - - - - - - - + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - + - - + + @@ -2682,78 +2414,57 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - - + + - - - - - - - - - - - - - + + + + + + + - + - + @@ -2772,41 +2483,35 @@ z - + - - + + - - - - - - - - - - - - + + + + + + - + - - + + @@ -2817,70 +2522,40 @@ z - - - - - - - - - - - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + - + - + @@ -2903,12 +2578,12 @@ z - + - - + + @@ -2916,77 +2591,67 @@ z - - - - - - - + - - + + - - - - - - - - - - - - - - - - - - + + + + + + + + + + - + - - + + - - - - - + + + + + + + + + - + - + @@ -2996,42 +2661,38 @@ z - + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - + - - + + @@ -3043,20 +2704,21 @@ z - - - - - - - + + + + + + + + - @@ -3065,18 +2727,18 @@ L 1054.54399 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/> - - - + @@ -3086,11 +2748,11 @@ L 1054.54399 26.88 - - + + - - + +