diff --git a/EDA/EDA.ipynb b/EDA/EDA.ipynb index bcc6f79f5f7d61584456866ce74564eb6e0b7ab8..9426e5017064d955f4fd3ddf0958ee15d4690ad7 100644 --- a/EDA/EDA.ipynb +++ b/EDA/EDA.ipynb @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ @@ -70,9 +70,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:18: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " conj_post['Group'] = 'Post'\n", + "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\2495984927.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " conj_pre['Group'] = 'Pre'\n" + ] + } + ], "source": [ "# Pre-pandemic\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", @@ -98,9 +117,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PRE: 22861\n", + "\tALTA: 2792\n", + "\tABANDONO: 20069\n", + "POST: 10677\n", + "\tALTA: 1882\n", + "\tABANDONO: 8795\n" + ] + } + ], "source": [ "# Printing size of different datasets\n", "print(f\"PRE: {len(conj_pre)}\")\n", @@ -164,9 +196,18 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n", + "['Live with families or friends' 'live alone' 'live in institutions']\n" + ] + } + ], "source": [ "# 9.0 represents unknown according to Variables.docx \n", "print(bd['Social_inclusion'].unique())\n", @@ -178,9 +219,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['No alterations (first exposure at 11 or more years)'\n", + " 'Alterations (first exposure before 11 years old)' '9']\n", + "['No alterations (first exposure at 11 or more years)'\n", + " 'Alterations (first exposure before 11 years old)']\n" + ] + } + ], "source": [ "print(bd['Alterations_early_childhood_develop'].unique())\n", "mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n", @@ -190,9 +242,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NaN, 'Yes', 'No']\n", + "Categories (3, object): [99.0, 'No', 'Yes']\n", + "[NaN, 'Yes', 'No']\n", + "Categories (2, object): ['No', 'Yes']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\1073322024.py:3: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n", + " bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n" + ] + } + ], "source": [ "print(bd['Risk_stigma'].unique())\n", "mode_stigma = bd['Risk_stigma'].mode()[0]\n", @@ -202,9 +273,18 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n", + "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n" + ] + } + ], "source": [ "print(bd['NumHijos'].unique())\n", "mode_hijos = bd['NumHijos'].mode()[0]\n", @@ -252,9 +332,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " bd['Age'].fillna(age_mode, inplace=True)\n", + "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", + "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", + "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_19584\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n" + ] + } + ], "source": [ "age_mode = bd['Age'].mode()[0]\n", "bd['Age'].fillna(age_mode, inplace=True)\n", @@ -552,7 +667,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -629,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -643,7 +758,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -660,7 +775,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 147, "metadata": {}, "outputs": [], "source": [ @@ -695,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 148, "metadata": {}, "outputs": [], "source": [ @@ -742,7 +857,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 149, "metadata": {}, "outputs": [], "source": [ @@ -754,9 +869,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary', 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection', 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed', 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable', 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone', 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex', 'Num_Children', 'Smoking', 'Bio_Vulner', 'Opiods_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE', 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw', 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None', 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX', 'Previous_Treatments', 'Treatment_Adherence']\n" + ] + } + ], "source": [ "name_mapping = {\n", " 'Ed_Not Complete primary school': 'Ed_Not_Complete_Primary',\n", @@ -777,7 +900,7 @@ " 'SocInc_live in institutions': 'SocInc_Instit',\n", " 'Risk_stigma_REDEF': 'Risk_Stigma',\n", " 'Structural_conflic': 'Structural_Conflict',\n", - " 'Age': 'Age',\n", + " # 'Age': 'Age',\n", " 'Sex_REDEF': 'Sex',\n", " 'NumHijos': 'Num_Children',\n", " 'Smoking_REDEF': 'Smoking',\n", @@ -802,13 +925,26 @@ "\n", "# Update lists of feature names\n", "corr_cols = [name_mapping[corr_col] for corr_col in corr_cols]\n", + "print(corr_cols)\n", "soc_vars_enc = [name_mapping[col] for col in soc_vars_enc]\n", - "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]" + "ind_vars_enc = [name_mapping[col] for col in ind_vars_enc]\n", + "\n", + "bd = bd.rename(columns=name_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "# Create bd with just corr_cols and target\n", + "bd = bd[corr_cols + ['Situacion_tratamiento','Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento']]" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -827,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -857,12 +993,105 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', 'Risk_stigma_REDEF']\n", - "cont_vars = ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']" + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Ed_Not_Complete_Primary', 'Ed_Primary', 'Ed_Secondary',\n", + " 'Ed_Secondary_Technical', 'Ed_Tertiary', 'Social_Protection',\n", + " 'JobIn_Unstable', 'JobIn_Stable', 'JobIn_Unemployed',\n", + " 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable',\n", + " 'Early_Alterations', 'SocInc_Family_Friends', 'SocInc_Alone',\n", + " 'SocInc_Instit', 'Risk_Stigma', 'Structural_Conflict', 'age', 'Sex',\n", + " 'Sex', 'Num_Children', 'Smoking', 'Smoking', 'Bio_Vulner',\n", + " 'Opiods_DXCIE', 'Cannabis_DXCIE', 'Cannabis_DXCIE', 'BZD_DXCIE',\n", + " 'Cocaine_DXCIE', 'Hallucin_DXCIE', 'Tobacco_DXCIE', 'Freq_1dpw',\n", + " 'Freq_2-3dpw', 'Freq_4-6dpw', 'Freq_l1dpw', 'Freq_None',\n", + " 'Freq_Everyday', 'Years_Drug_Use', 'Other_Psychiatric_DX',\n", + " 'Previous_Treatments', 'Treatment_Adherence', 'Situacion_tratamiento',\n", + " 'Situacion_tratamiento_REDEF', 'Pandemia_inicio_fin_tratamiento'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "print(bd.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ed_Not_Complete_Primary\n", + "2\n", + "Ed_Primary\n", + "2\n", + "Ed_Secondary\n", + "2\n", + "Ed_Secondary_Technical\n", + "2\n", + "Ed_Tertiary\n", + "2\n", + "Social_Protection\n", + "2\n", + "JobIn_Unstable\n", + "2\n", + "JobIn_Stable\n", + "2\n", + "JobIn_Unemployed\n", + "2\n", + "Hous_Institutional\n", + "2\n", + "Hous_Stable\n", + "2\n", + "Hous_Unstable\n", + "2\n", + "Early_Alterations\n", + "2\n", + "SocInc_Family_Friends\n", + "2\n", + "SocInc_Alone\n", + "2\n", + "SocInc_Instit\n", + "2\n", + "Risk_Stigma\n", + "2\n", + "Structural_Conflict\n", + "107\n", + "age\n", + "74\n", + "Sex\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'unique'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_19584\\340002156.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcorr_cols\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;31m#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;31m#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\Joaquín Torres\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 6292\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_accessors\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6293\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6294\u001b[0m \u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6295\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 6296\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'unique'" + ] + } + ], + "source": [ + "# print(len(bd['Cocaine_DXCIE'].unique()) == 2)\n", + "\n", + "for col in corr_cols:\n", + " print(col)\n", + " print(len(bd[col].unique()))\n", + "#binary_vars = [col for col in corr_cols if len(bd[col].unique()) == 2] + ['Situacion_tratamiento_REDEF', name_mapping['Risk_stigma_REDEF']]\n", + "#cont_vars = [name_mapping[col] for col in ['Structural_conflic', 'Age', 'NumHijos', 'Años_consumo_droga', 'Adherencia_tto_recalc']]" ] }, {