diff --git a/EDA/EDA.ipynb b/EDA/EDA.ipynb index 67ef1f8fd5297ba81c7c5d9724ec78fcfc0a9ff6..514304f4530e247401aa6c54fc2a1ac17c657cd4 100644 --- a/EDA/EDA.ipynb +++ b/EDA/EDA.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -56,28 +56,9 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\2495984927.py:18: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " conj_post['Group'] = 'Post'\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\2495984927.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " conj_pre['Group'] = 'Pre'\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Pre-pandemic\n", "conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n", @@ -103,22 +84,9 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PRE: 22861\n", - "\tALTA: 2792\n", - "\tABANDONO: 20069\n", - "POST: 10677\n", - "\tALTA: 1882\n", - "\tABANDONO: 8795\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Printing size of different datasets\n", "print(f\"PRE: {len(conj_pre)}\")\n", @@ -132,286 +100,9 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PRE\n", - "\n", - "Index: 22861 entries, 0 to 85164\n", - "Data columns (total 35 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 22861 non-null float64 \n", - " 1 Education 22861 non-null object \n", - " 2 Social_protection 22861 non-null object \n", - " 3 Job_insecurity 22861 non-null object \n", - " 4 Housing 22861 non-null object \n", - " 5 Alterations_early_childhood_develop 22861 non-null object \n", - " 6 Social_inclusion 22861 non-null object \n", - " 7 Risk_stigma 21606 non-null category\n", - " 8 Structural_conflic 22861 non-null float64 \n", - " 9 Age 22852 non-null float64 \n", - " 10 Sex 22861 non-null object \n", - " 11 NumHijos 21647 non-null float64 \n", - " 12 Smoking 22861 non-null object \n", - " 13 Biological_vulnerability 22861 non-null object \n", - " 14 Alcohol_DxCIE 22861 non-null object \n", - " 15 Opiaceos_DxCIE 22861 non-null object \n", - " 16 Cannabis_DXCIE 22861 non-null object \n", - " 17 BZD_DxCIE 22861 non-null object \n", - " 18 Cocaina_DxCIE 22861 non-null object \n", - " 19 Alucinogenos_DXCIE 22861 non-null object \n", - " 20 Tabaco_DXCIE 22861 non-null object \n", - " 21 FrecuenciaConsumo30Dias 22861 non-null object \n", - " 22 Años_consumo_droga 22342 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 22861 non-null object \n", - " 24 Tx_previos 22861 non-null object \n", - " 25 Adherencia_tto_recalc 22861 non-null float64 \n", - " 26 Tiempo_tx 22861 non-null float64 \n", - " 27 Readmisiones_estudios 22861 non-null object \n", - " 28 Situacion_tratamiento 22861 non-null object \n", - " 29 Periodos_COVID 22861 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 22861 non-null object \n", - " 31 Nreadmision 22861 non-null float64 \n", - " 32 Readmisiones_PRECOVID 22861 non-null float64 \n", - " 33 Readmisiones_COVID 22861 non-null float64 \n", - " 34 Group 22861 non-null object \n", - "dtypes: category(1), float64(10), object(24)\n", - "memory usage: 6.1+ MB\n", - "None\n", - "-------------------------------\n", - "PRE-ABANDONO\n", - "\n", - "Index: 20069 entries, 0 to 85164\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 20069 non-null float64 \n", - " 1 Education 20069 non-null object \n", - " 2 Social_protection 20069 non-null object \n", - " 3 Job_insecurity 20069 non-null object \n", - " 4 Housing 20069 non-null object \n", - " 5 Alterations_early_childhood_develop 20069 non-null object \n", - " 6 Social_inclusion 20069 non-null object \n", - " 7 Risk_stigma 18919 non-null category\n", - " 8 Structural_conflic 20069 non-null float64 \n", - " 9 Age 20061 non-null float64 \n", - " 10 Sex 20069 non-null object \n", - " 11 NumHijos 18958 non-null float64 \n", - " 12 Smoking 20069 non-null object \n", - " 13 Biological_vulnerability 20069 non-null object \n", - " 14 Alcohol_DxCIE 20069 non-null object \n", - " 15 Opiaceos_DxCIE 20069 non-null object \n", - " 16 Cannabis_DXCIE 20069 non-null object \n", - " 17 BZD_DxCIE 20069 non-null object \n", - " 18 Cocaina_DxCIE 20069 non-null object \n", - " 19 Alucinogenos_DXCIE 20069 non-null object \n", - " 20 Tabaco_DXCIE 20069 non-null object \n", - " 21 FrecuenciaConsumo30Dias 20069 non-null object \n", - " 22 Años_consumo_droga 19609 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 20069 non-null object \n", - " 24 Tx_previos 20069 non-null object \n", - " 25 Adherencia_tto_recalc 20069 non-null float64 \n", - " 26 Tiempo_tx 20069 non-null float64 \n", - " 27 Readmisiones_estudios 20069 non-null object \n", - " 28 Situacion_tratamiento 20069 non-null object \n", - " 29 Periodos_COVID 20069 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 20069 non-null object \n", - " 31 Nreadmision 20069 non-null float64 \n", - " 32 Readmisiones_PRECOVID 20069 non-null float64 \n", - " 33 Readmisiones_COVID 20069 non-null float64 \n", - "dtypes: category(1), float64(10), object(23)\n", - "memory usage: 5.2+ MB\n", - "None\n", - "-------------------------------\n", - "PRE-ALTA\n", - "\n", - "Index: 2792 entries, 23 to 85159\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 2792 non-null float64 \n", - " 1 Education 2792 non-null object \n", - " 2 Social_protection 2792 non-null object \n", - " 3 Job_insecurity 2792 non-null object \n", - " 4 Housing 2792 non-null object \n", - " 5 Alterations_early_childhood_develop 2792 non-null object \n", - " 6 Social_inclusion 2792 non-null object \n", - " 7 Risk_stigma 2687 non-null category\n", - " 8 Structural_conflic 2792 non-null float64 \n", - " 9 Age 2791 non-null float64 \n", - " 10 Sex 2792 non-null object \n", - " 11 NumHijos 2689 non-null float64 \n", - " 12 Smoking 2792 non-null object \n", - " 13 Biological_vulnerability 2792 non-null object \n", - " 14 Alcohol_DxCIE 2792 non-null object \n", - " 15 Opiaceos_DxCIE 2792 non-null object \n", - " 16 Cannabis_DXCIE 2792 non-null object \n", - " 17 BZD_DxCIE 2792 non-null object \n", - " 18 Cocaina_DxCIE 2792 non-null object \n", - " 19 Alucinogenos_DXCIE 2792 non-null object \n", - " 20 Tabaco_DXCIE 2792 non-null object \n", - " 21 FrecuenciaConsumo30Dias 2792 non-null object \n", - " 22 Años_consumo_droga 2733 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 2792 non-null object \n", - " 24 Tx_previos 2792 non-null object \n", - " 25 Adherencia_tto_recalc 2792 non-null float64 \n", - " 26 Tiempo_tx 2792 non-null float64 \n", - " 27 Readmisiones_estudios 2792 non-null object \n", - " 28 Situacion_tratamiento 2792 non-null object \n", - " 29 Periodos_COVID 2792 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 2792 non-null object \n", - " 31 Nreadmision 2792 non-null float64 \n", - " 32 Readmisiones_PRECOVID 2792 non-null float64 \n", - " 33 Readmisiones_COVID 2792 non-null float64 \n", - "dtypes: category(1), float64(10), object(23)\n", - "memory usage: 744.5+ KB\n", - "None\n", - "-------------------------------\n", - "\n", - "\n", - "\n", - "\n", - "POST\n", - "\n", - "Index: 10677 entries, 11 to 85156\n", - "Data columns (total 35 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 10677 non-null float64 \n", - " 1 Education 10677 non-null object \n", - " 2 Social_protection 10677 non-null object \n", - " 3 Job_insecurity 10677 non-null object \n", - " 4 Housing 10677 non-null object \n", - " 5 Alterations_early_childhood_develop 10677 non-null object \n", - " 6 Social_inclusion 10677 non-null object \n", - " 7 Risk_stigma 10085 non-null category\n", - " 8 Structural_conflic 10677 non-null float64 \n", - " 9 Age 10676 non-null float64 \n", - " 10 Sex 10677 non-null object \n", - " 11 NumHijos 10103 non-null float64 \n", - " 12 Smoking 10677 non-null object \n", - " 13 Biological_vulnerability 10677 non-null object \n", - " 14 Alcohol_DxCIE 10677 non-null object \n", - " 15 Opiaceos_DxCIE 10677 non-null object \n", - " 16 Cannabis_DXCIE 10677 non-null object \n", - " 17 BZD_DxCIE 10677 non-null object \n", - " 18 Cocaina_DxCIE 10677 non-null object \n", - " 19 Alucinogenos_DXCIE 10677 non-null object \n", - " 20 Tabaco_DXCIE 10677 non-null object \n", - " 21 FrecuenciaConsumo30Dias 10677 non-null object \n", - " 22 Años_consumo_droga 10478 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 10677 non-null object \n", - " 24 Tx_previos 10677 non-null object \n", - " 25 Adherencia_tto_recalc 10677 non-null float64 \n", - " 26 Tiempo_tx 10677 non-null float64 \n", - " 27 Readmisiones_estudios 10677 non-null object \n", - " 28 Situacion_tratamiento 10677 non-null object \n", - " 29 Periodos_COVID 10677 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 10677 non-null object \n", - " 31 Nreadmision 10677 non-null float64 \n", - " 32 Readmisiones_PRECOVID 10677 non-null float64 \n", - " 33 Readmisiones_COVID 10677 non-null float64 \n", - " 34 Group 10677 non-null object \n", - "dtypes: category(1), float64(10), object(24)\n", - "memory usage: 2.9+ MB\n", - "None\n", - "-------------------------------\n", - "POST-ABANDONO\n", - "\n", - "Index: 8795 entries, 11 to 85156\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 8795 non-null float64 \n", - " 1 Education 8795 non-null object \n", - " 2 Social_protection 8795 non-null object \n", - " 3 Job_insecurity 8795 non-null object \n", - " 4 Housing 8795 non-null object \n", - " 5 Alterations_early_childhood_develop 8795 non-null object \n", - " 6 Social_inclusion 8795 non-null object \n", - " 7 Risk_stigma 8308 non-null category\n", - " 8 Structural_conflic 8795 non-null float64 \n", - " 9 Age 8794 non-null float64 \n", - " 10 Sex 8795 non-null object \n", - " 11 NumHijos 8325 non-null float64 \n", - " 12 Smoking 8795 non-null object \n", - " 13 Biological_vulnerability 8795 non-null object \n", - " 14 Alcohol_DxCIE 8795 non-null object \n", - " 15 Opiaceos_DxCIE 8795 non-null object \n", - " 16 Cannabis_DXCIE 8795 non-null object \n", - " 17 BZD_DxCIE 8795 non-null object \n", - " 18 Cocaina_DxCIE 8795 non-null object \n", - " 19 Alucinogenos_DXCIE 8795 non-null object \n", - " 20 Tabaco_DXCIE 8795 non-null object \n", - " 21 FrecuenciaConsumo30Dias 8795 non-null object \n", - " 22 Años_consumo_droga 8627 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 8795 non-null object \n", - " 24 Tx_previos 8795 non-null object \n", - " 25 Adherencia_tto_recalc 8795 non-null float64 \n", - " 26 Tiempo_tx 8795 non-null float64 \n", - " 27 Readmisiones_estudios 8795 non-null object \n", - " 28 Situacion_tratamiento 8795 non-null object \n", - " 29 Periodos_COVID 8795 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 8795 non-null object \n", - " 31 Nreadmision 8795 non-null float64 \n", - " 32 Readmisiones_PRECOVID 8795 non-null float64 \n", - " 33 Readmisiones_COVID 8795 non-null float64 \n", - "dtypes: category(1), float64(10), object(23)\n", - "memory usage: 2.3+ MB\n", - "None\n", - "-------------------------------\n", - "POST-ALTA\n", - "\n", - "Index: 1882 entries, 258 to 85149\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 CODPROYECTO 1882 non-null float64 \n", - " 1 Education 1882 non-null object \n", - " 2 Social_protection 1882 non-null object \n", - " 3 Job_insecurity 1882 non-null object \n", - " 4 Housing 1882 non-null object \n", - " 5 Alterations_early_childhood_develop 1882 non-null object \n", - " 6 Social_inclusion 1882 non-null object \n", - " 7 Risk_stigma 1777 non-null category\n", - " 8 Structural_conflic 1882 non-null float64 \n", - " 9 Age 1882 non-null float64 \n", - " 10 Sex 1882 non-null object \n", - " 11 NumHijos 1778 non-null float64 \n", - " 12 Smoking 1882 non-null object \n", - " 13 Biological_vulnerability 1882 non-null object \n", - " 14 Alcohol_DxCIE 1882 non-null object \n", - " 15 Opiaceos_DxCIE 1882 non-null object \n", - " 16 Cannabis_DXCIE 1882 non-null object \n", - " 17 BZD_DxCIE 1882 non-null object \n", - " 18 Cocaina_DxCIE 1882 non-null object \n", - " 19 Alucinogenos_DXCIE 1882 non-null object \n", - " 20 Tabaco_DXCIE 1882 non-null object \n", - " 21 FrecuenciaConsumo30Dias 1882 non-null object \n", - " 22 Años_consumo_droga 1851 non-null float64 \n", - " 23 OtrosDx_Psiquiatrico 1882 non-null object \n", - " 24 Tx_previos 1882 non-null object \n", - " 25 Adherencia_tto_recalc 1882 non-null float64 \n", - " 26 Tiempo_tx 1882 non-null float64 \n", - " 27 Readmisiones_estudios 1882 non-null object \n", - " 28 Situacion_tratamiento 1882 non-null object \n", - " 29 Periodos_COVID 1882 non-null object \n", - " 30 Pandemia_inicio_fin_tratamiento 1882 non-null object \n", - " 31 Nreadmision 1882 non-null float64 \n", - " 32 Readmisiones_PRECOVID 1882 non-null float64 \n", - " 33 Readmisiones_COVID 1882 non-null float64 \n", - "dtypes: category(1), float64(10), object(23)\n", - "memory usage: 501.9+ KB\n", - "None\n", - "-------------------------------\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"PRE\")\n", "print(conj_pre.info())\n", @@ -445,36 +136,9 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n", - "['Live with families or friends' 'live alone' 'live in institutions']\n", - "['No alterations (first exposure at 11 or more years)'\n", - " 'Alterations (first exposure before 11 years old)' '9']\n", - "['No alterations (first exposure at 11 or more years)'\n", - " 'Alterations (first exposure before 11 years old)']\n", - "[NaN, 'Yes', 'No']\n", - "Categories (3, object): [99.0, 'No', 'Yes']\n", - "[NaN, 'Yes', 'No']\n", - "Categories (2, object): ['No', 'Yes']\n", - "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n", - "[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\1003504044.py:14: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n", - " bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# 9.0 represents unknown according to Variables.docx \n", "print(bd['Social_inclusion'].unique())\n", @@ -500,30 +164,9 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total missing values Age: 10\n", - "Total missing values Años_consumo_droga: 718\n", - "Total missing values Risk_stigma: 1847\n", - "Total missing values NumHijos: 1788\n", - "\tCONJUNTO PREPANDEMIA\n", - "\t\tMissing values Age: 9\n", - "\t\tMissing values Años_consumo_droga: 519\n", - "\t\tMissing values Risk_stigma: 1255\n", - "\t\tMissing values NumHijos: 1214\n", - "\tCONJUNTO POSTPANDEMIA\n", - "\t\tMissing values Age: 1\n", - "\t\tMissing values Años_consumo_droga: 199\n", - "\t\tMissing values Risk_stigma: 592\n", - "\t\tMissing values NumHijos: 574\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(f\"Total missing values Age: {bd['Age'].isnull().sum()}\")\n", "print(f\"Total missing values Años_consumo_droga: {bd['Años_consumo_droga'].isnull().sum()}\")\n", @@ -545,44 +188,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Age'].fillna(age_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n", - "C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_10184\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", - "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", - "\n", - "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", - "\n", - "\n", - " bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "age_mode = bd['Age'].mode()[0]\n", "bd['Age'].fillna(age_mode, inplace=True)\n", @@ -873,7 +481,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -904,7 +512,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -981,7 +589,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1014,7 +622,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1054,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1073,7 +681,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1083,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1140,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/explainability/compute_shap_inter_vals.py b/explainability/compute_shap_inter_vals.py index 36fe890ba279c690b6755774c64dc2895bf1deee..a16d8c3ab43a521db465aa428e57fc016ccbf2ea 100644 --- a/explainability/compute_shap_inter_vals.py +++ b/explainability/compute_shap_inter_vals.py @@ -36,7 +36,7 @@ if __name__ == "__main__": # Setup # -------------------------------------------------------------------------------------------------------- # Retrieve attribute names in order - attribute_names = list(np.load('../EDA/output/feature_names/feature_names.npy', allow_pickle=True)) + attribute_names = list(np.load('../EDA/output/feature_names/all_features.npy', allow_pickle=True)) # Reading data data_dic = read_test_data(attribute_names) method_names = { diff --git a/explainability/compute_shap_vals.py b/explainability/compute_shap_vals.py index c34b8ee6eab817b7e5af7f6fd9d4149c80f35a56..4b18213181ad61e753939345b025635a1a03bc80 100644 --- a/explainability/compute_shap_vals.py +++ b/explainability/compute_shap_vals.py @@ -36,7 +36,7 @@ if __name__ == "__main__": # Setup # -------------------------------------------------------------------------------------------------------- # Retrieve attribute names in order - attribute_names = list(np.load('../EDA/output/feature_names/feature_names.npy', allow_pickle=True)) + attribute_names = list(np.load('../EDA/output/feature_names/all_features.npy', allow_pickle=True)) # Reading data data_dic = read_test_data(attribute_names) method_names = { diff --git a/explainability/fit_final_models.py b/explainability/fit_final_models.py index 39206d433592d5bb1e34594847e8d8de55caa747..650500acbb628d8001fc8a000407ff59b6f41b0a 100644 --- a/explainability/fit_final_models.py +++ b/explainability/fit_final_models.py @@ -111,7 +111,7 @@ if __name__ == "__main__": # Setup # -------------------------------------------------------------------------------------------------------- # Retrieve attribute names in order - attribute_names = list(np.load('../EDA/output/feature_names/feature_names.npy', allow_pickle=True)) + attribute_names = list(np.load('../EDA/output/feature_names/all_features.npy', allow_pickle=True)) # Reading data data_dic = read_training_data(attribute_names) method_names = { diff --git a/explainability/shap_plots.ipynb b/explainability/shap_plots.ipynb index 91ec6ed1c5537461ce7e7e58cd7f7e07f6026873..8e3fcd30dc36cbaa4d137db0104b77dd20f72290 100644 --- a/explainability/shap_plots.ipynb +++ b/explainability/shap_plots.ipynb @@ -35,7 +35,7 @@ "outputs": [], "source": [ "# Retrieve attribute names in order\n", - "attribute_names = attribute_names = list(np.load('../EDA/output/feature_names/feature_names.npy', allow_pickle=True))\n", + "attribute_names = attribute_names = list(np.load('../EDA/output/feature_names/all_features.npy', allow_pickle=True))\n", "\n", "# Load test data\n", "X_test_pre = np.load('../gen_train_data/output/pre/X_test_pre.npy', allow_pickle=True)\n", @@ -71,8 +71,8 @@ " \"UNDER\": \"XGB\"\n", "}\n", "\n", - "soc_var_names = np.load('../EDA/output/feature_names/soc_vars_names.npy', allow_pickle=True)\n", - "ind_var_names = np.load('../EDA/output/feature_names/ind_vars_names.npy', allow_pickle=True)" + "soc_var_names = np.load('../EDA/output/feature_names/social_factors.npy', allow_pickle=True)\n", + "ind_var_names = np.load('../EDA/output/feature_names/individual_factors.npy', allow_pickle=True)" ] }, {