Commit c32883c4 authored by Joaquin Torres's avatar Joaquin Torres

Updated paths EDA

parent 2460bf0c
......@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -48,11 +48,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd_all = pd.read_spss('17_abril.sav')\n",
"bd_all = pd.read_spss('./input/17_abril.sav')\n",
"\n",
"# Filter the dataset to work only with alcohol patients\n",
"bd = bd_all[bd_all['Alcohol_DxCIE'] == 'Sí']\n",
......@@ -70,28 +70,9 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\2495984927.py:18: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" conj_post['Group'] = 'Post'\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\2495984927.py:19: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" conj_pre['Group'] = 'Pre'\n"
]
}
],
"outputs": [],
"source": [
"# Pre-pandemic\n",
"conj_pre = bd[bd['Pandemia_inicio_fin_tratamiento'] == 'Inicio y fin prepandemia']\n",
......@@ -117,22 +98,9 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PRE: 22861\n",
"\tALTA: 2792\n",
"\tABANDONO: 20069\n",
"POST: 10677\n",
"\tALTA: 1882\n",
"\tABANDONO: 8795\n"
]
}
],
"outputs": [],
"source": [
"# Printing size of different datasets\n",
"print(f\"PRE: {len(conj_pre)}\")\n",
......@@ -160,286 +128,9 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PRE\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 22861 entries, 0 to 85164\n",
"Data columns (total 35 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 22861 non-null float64 \n",
" 1 Education 22861 non-null object \n",
" 2 Social_protection 22861 non-null object \n",
" 3 Job_insecurity 22861 non-null object \n",
" 4 Housing 22861 non-null object \n",
" 5 Alterations_early_childhood_develop 22861 non-null object \n",
" 6 Social_inclusion 22861 non-null object \n",
" 7 Risk_stigma 21606 non-null category\n",
" 8 Structural_conflic 22861 non-null float64 \n",
" 9 Age 22852 non-null float64 \n",
" 10 Sex 22861 non-null object \n",
" 11 NumHijos 21647 non-null float64 \n",
" 12 Smoking 22861 non-null object \n",
" 13 Biological_vulnerability 22861 non-null object \n",
" 14 Alcohol_DxCIE 22861 non-null object \n",
" 15 Opiaceos_DxCIE 22861 non-null object \n",
" 16 Cannabis_DXCIE 22861 non-null object \n",
" 17 BZD_DxCIE 22861 non-null object \n",
" 18 Cocaina_DxCIE 22861 non-null object \n",
" 19 Alucinogenos_DXCIE 22861 non-null object \n",
" 20 Tabaco_DXCIE 22861 non-null object \n",
" 21 FrecuenciaConsumo30Dias 22861 non-null object \n",
" 22 Años_consumo_droga 22342 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 22861 non-null object \n",
" 24 Tx_previos 22861 non-null object \n",
" 25 Adherencia_tto_recalc 22861 non-null float64 \n",
" 26 Tiempo_tx 22861 non-null float64 \n",
" 27 Readmisiones_estudios 22861 non-null object \n",
" 28 Situacion_tratamiento 22861 non-null object \n",
" 29 Periodos_COVID 22861 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 22861 non-null object \n",
" 31 Nreadmision 22861 non-null float64 \n",
" 32 Readmisiones_PRECOVID 22861 non-null float64 \n",
" 33 Readmisiones_COVID 22861 non-null float64 \n",
" 34 Group 22861 non-null object \n",
"dtypes: category(1), float64(10), object(24)\n",
"memory usage: 6.1+ MB\n",
"None\n",
"-------------------------------\n",
"PRE-ABANDONO\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 20069 entries, 0 to 85164\n",
"Data columns (total 34 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 20069 non-null float64 \n",
" 1 Education 20069 non-null object \n",
" 2 Social_protection 20069 non-null object \n",
" 3 Job_insecurity 20069 non-null object \n",
" 4 Housing 20069 non-null object \n",
" 5 Alterations_early_childhood_develop 20069 non-null object \n",
" 6 Social_inclusion 20069 non-null object \n",
" 7 Risk_stigma 18919 non-null category\n",
" 8 Structural_conflic 20069 non-null float64 \n",
" 9 Age 20061 non-null float64 \n",
" 10 Sex 20069 non-null object \n",
" 11 NumHijos 18958 non-null float64 \n",
" 12 Smoking 20069 non-null object \n",
" 13 Biological_vulnerability 20069 non-null object \n",
" 14 Alcohol_DxCIE 20069 non-null object \n",
" 15 Opiaceos_DxCIE 20069 non-null object \n",
" 16 Cannabis_DXCIE 20069 non-null object \n",
" 17 BZD_DxCIE 20069 non-null object \n",
" 18 Cocaina_DxCIE 20069 non-null object \n",
" 19 Alucinogenos_DXCIE 20069 non-null object \n",
" 20 Tabaco_DXCIE 20069 non-null object \n",
" 21 FrecuenciaConsumo30Dias 20069 non-null object \n",
" 22 Años_consumo_droga 19609 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 20069 non-null object \n",
" 24 Tx_previos 20069 non-null object \n",
" 25 Adherencia_tto_recalc 20069 non-null float64 \n",
" 26 Tiempo_tx 20069 non-null float64 \n",
" 27 Readmisiones_estudios 20069 non-null object \n",
" 28 Situacion_tratamiento 20069 non-null object \n",
" 29 Periodos_COVID 20069 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 20069 non-null object \n",
" 31 Nreadmision 20069 non-null float64 \n",
" 32 Readmisiones_PRECOVID 20069 non-null float64 \n",
" 33 Readmisiones_COVID 20069 non-null float64 \n",
"dtypes: category(1), float64(10), object(23)\n",
"memory usage: 5.2+ MB\n",
"None\n",
"-------------------------------\n",
"PRE-ALTA\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 2792 entries, 23 to 85159\n",
"Data columns (total 34 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 2792 non-null float64 \n",
" 1 Education 2792 non-null object \n",
" 2 Social_protection 2792 non-null object \n",
" 3 Job_insecurity 2792 non-null object \n",
" 4 Housing 2792 non-null object \n",
" 5 Alterations_early_childhood_develop 2792 non-null object \n",
" 6 Social_inclusion 2792 non-null object \n",
" 7 Risk_stigma 2687 non-null category\n",
" 8 Structural_conflic 2792 non-null float64 \n",
" 9 Age 2791 non-null float64 \n",
" 10 Sex 2792 non-null object \n",
" 11 NumHijos 2689 non-null float64 \n",
" 12 Smoking 2792 non-null object \n",
" 13 Biological_vulnerability 2792 non-null object \n",
" 14 Alcohol_DxCIE 2792 non-null object \n",
" 15 Opiaceos_DxCIE 2792 non-null object \n",
" 16 Cannabis_DXCIE 2792 non-null object \n",
" 17 BZD_DxCIE 2792 non-null object \n",
" 18 Cocaina_DxCIE 2792 non-null object \n",
" 19 Alucinogenos_DXCIE 2792 non-null object \n",
" 20 Tabaco_DXCIE 2792 non-null object \n",
" 21 FrecuenciaConsumo30Dias 2792 non-null object \n",
" 22 Años_consumo_droga 2733 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 2792 non-null object \n",
" 24 Tx_previos 2792 non-null object \n",
" 25 Adherencia_tto_recalc 2792 non-null float64 \n",
" 26 Tiempo_tx 2792 non-null float64 \n",
" 27 Readmisiones_estudios 2792 non-null object \n",
" 28 Situacion_tratamiento 2792 non-null object \n",
" 29 Periodos_COVID 2792 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 2792 non-null object \n",
" 31 Nreadmision 2792 non-null float64 \n",
" 32 Readmisiones_PRECOVID 2792 non-null float64 \n",
" 33 Readmisiones_COVID 2792 non-null float64 \n",
"dtypes: category(1), float64(10), object(23)\n",
"memory usage: 744.5+ KB\n",
"None\n",
"-------------------------------\n",
"\n",
"\n",
"\n",
"\n",
"POST\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 10677 entries, 11 to 85156\n",
"Data columns (total 35 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 10677 non-null float64 \n",
" 1 Education 10677 non-null object \n",
" 2 Social_protection 10677 non-null object \n",
" 3 Job_insecurity 10677 non-null object \n",
" 4 Housing 10677 non-null object \n",
" 5 Alterations_early_childhood_develop 10677 non-null object \n",
" 6 Social_inclusion 10677 non-null object \n",
" 7 Risk_stigma 10085 non-null category\n",
" 8 Structural_conflic 10677 non-null float64 \n",
" 9 Age 10676 non-null float64 \n",
" 10 Sex 10677 non-null object \n",
" 11 NumHijos 10103 non-null float64 \n",
" 12 Smoking 10677 non-null object \n",
" 13 Biological_vulnerability 10677 non-null object \n",
" 14 Alcohol_DxCIE 10677 non-null object \n",
" 15 Opiaceos_DxCIE 10677 non-null object \n",
" 16 Cannabis_DXCIE 10677 non-null object \n",
" 17 BZD_DxCIE 10677 non-null object \n",
" 18 Cocaina_DxCIE 10677 non-null object \n",
" 19 Alucinogenos_DXCIE 10677 non-null object \n",
" 20 Tabaco_DXCIE 10677 non-null object \n",
" 21 FrecuenciaConsumo30Dias 10677 non-null object \n",
" 22 Años_consumo_droga 10478 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 10677 non-null object \n",
" 24 Tx_previos 10677 non-null object \n",
" 25 Adherencia_tto_recalc 10677 non-null float64 \n",
" 26 Tiempo_tx 10677 non-null float64 \n",
" 27 Readmisiones_estudios 10677 non-null object \n",
" 28 Situacion_tratamiento 10677 non-null object \n",
" 29 Periodos_COVID 10677 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 10677 non-null object \n",
" 31 Nreadmision 10677 non-null float64 \n",
" 32 Readmisiones_PRECOVID 10677 non-null float64 \n",
" 33 Readmisiones_COVID 10677 non-null float64 \n",
" 34 Group 10677 non-null object \n",
"dtypes: category(1), float64(10), object(24)\n",
"memory usage: 2.9+ MB\n",
"None\n",
"-------------------------------\n",
"POST-ABANDONO\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8795 entries, 11 to 85156\n",
"Data columns (total 34 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 8795 non-null float64 \n",
" 1 Education 8795 non-null object \n",
" 2 Social_protection 8795 non-null object \n",
" 3 Job_insecurity 8795 non-null object \n",
" 4 Housing 8795 non-null object \n",
" 5 Alterations_early_childhood_develop 8795 non-null object \n",
" 6 Social_inclusion 8795 non-null object \n",
" 7 Risk_stigma 8308 non-null category\n",
" 8 Structural_conflic 8795 non-null float64 \n",
" 9 Age 8794 non-null float64 \n",
" 10 Sex 8795 non-null object \n",
" 11 NumHijos 8325 non-null float64 \n",
" 12 Smoking 8795 non-null object \n",
" 13 Biological_vulnerability 8795 non-null object \n",
" 14 Alcohol_DxCIE 8795 non-null object \n",
" 15 Opiaceos_DxCIE 8795 non-null object \n",
" 16 Cannabis_DXCIE 8795 non-null object \n",
" 17 BZD_DxCIE 8795 non-null object \n",
" 18 Cocaina_DxCIE 8795 non-null object \n",
" 19 Alucinogenos_DXCIE 8795 non-null object \n",
" 20 Tabaco_DXCIE 8795 non-null object \n",
" 21 FrecuenciaConsumo30Dias 8795 non-null object \n",
" 22 Años_consumo_droga 8627 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 8795 non-null object \n",
" 24 Tx_previos 8795 non-null object \n",
" 25 Adherencia_tto_recalc 8795 non-null float64 \n",
" 26 Tiempo_tx 8795 non-null float64 \n",
" 27 Readmisiones_estudios 8795 non-null object \n",
" 28 Situacion_tratamiento 8795 non-null object \n",
" 29 Periodos_COVID 8795 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 8795 non-null object \n",
" 31 Nreadmision 8795 non-null float64 \n",
" 32 Readmisiones_PRECOVID 8795 non-null float64 \n",
" 33 Readmisiones_COVID 8795 non-null float64 \n",
"dtypes: category(1), float64(10), object(23)\n",
"memory usage: 2.3+ MB\n",
"None\n",
"-------------------------------\n",
"POST-ALTA\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 1882 entries, 258 to 85149\n",
"Data columns (total 34 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 CODPROYECTO 1882 non-null float64 \n",
" 1 Education 1882 non-null object \n",
" 2 Social_protection 1882 non-null object \n",
" 3 Job_insecurity 1882 non-null object \n",
" 4 Housing 1882 non-null object \n",
" 5 Alterations_early_childhood_develop 1882 non-null object \n",
" 6 Social_inclusion 1882 non-null object \n",
" 7 Risk_stigma 1777 non-null category\n",
" 8 Structural_conflic 1882 non-null float64 \n",
" 9 Age 1882 non-null float64 \n",
" 10 Sex 1882 non-null object \n",
" 11 NumHijos 1778 non-null float64 \n",
" 12 Smoking 1882 non-null object \n",
" 13 Biological_vulnerability 1882 non-null object \n",
" 14 Alcohol_DxCIE 1882 non-null object \n",
" 15 Opiaceos_DxCIE 1882 non-null object \n",
" 16 Cannabis_DXCIE 1882 non-null object \n",
" 17 BZD_DxCIE 1882 non-null object \n",
" 18 Cocaina_DxCIE 1882 non-null object \n",
" 19 Alucinogenos_DXCIE 1882 non-null object \n",
" 20 Tabaco_DXCIE 1882 non-null object \n",
" 21 FrecuenciaConsumo30Dias 1882 non-null object \n",
" 22 Años_consumo_droga 1851 non-null float64 \n",
" 23 OtrosDx_Psiquiatrico 1882 non-null object \n",
" 24 Tx_previos 1882 non-null object \n",
" 25 Adherencia_tto_recalc 1882 non-null float64 \n",
" 26 Tiempo_tx 1882 non-null float64 \n",
" 27 Readmisiones_estudios 1882 non-null object \n",
" 28 Situacion_tratamiento 1882 non-null object \n",
" 29 Periodos_COVID 1882 non-null object \n",
" 30 Pandemia_inicio_fin_tratamiento 1882 non-null object \n",
" 31 Nreadmision 1882 non-null float64 \n",
" 32 Readmisiones_PRECOVID 1882 non-null float64 \n",
" 33 Readmisiones_COVID 1882 non-null float64 \n",
"dtypes: category(1), float64(10), object(23)\n",
"memory usage: 501.9+ KB\n",
"None\n",
"-------------------------------\n"
]
}
],
"outputs": [],
"source": [
"print(\"PRE\")\n",
"print(conj_pre.info())\n",
......@@ -473,18 +164,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Live with families or friends' 'live alone' 'live in institutions' '9.0']\n",
"['Live with families or friends' 'live alone' 'live in institutions']\n"
]
}
],
"outputs": [],
"source": [
"# 9.0 represents unknown according to Variables.docx \n",
"print(bd['Social_inclusion'].unique())\n",
......@@ -496,20 +178,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['No alterations (first exposure at 11 or more years)'\n",
" 'Alterations (first exposure before 11 years old)' '9']\n",
"['No alterations (first exposure at 11 or more years)'\n",
" 'Alterations (first exposure before 11 years old)']\n"
]
}
],
"outputs": [],
"source": [
"print(bd['Alterations_early_childhood_develop'].unique())\n",
"mode_alt = bd['Alterations_early_childhood_develop'].mode()[0]\n",
......@@ -519,28 +190,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NaN, 'Yes', 'No']\n",
"Categories (3, object): [99.0, 'No', 'Yes']\n",
"[NaN, 'Yes', 'No']\n",
"Categories (2, object): ['No', 'Yes']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\1073322024.py:3: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.\n",
" bd['Risk_stigma'] = bd['Risk_stigma'].replace(99.0, mode_stigma)\n"
]
}
],
"outputs": [],
"source": [
"print(bd['Risk_stigma'].unique())\n",
"mode_stigma = bd['Risk_stigma'].mode()[0]\n",
......@@ -550,18 +202,9 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 99. 14. 15.]\n",
"[nan 0. 1. 2. 3. 4. 5. 8. 10. 6. 11. 12. 9. 7. 14. 15.]\n"
]
}
],
"outputs": [],
"source": [
"print(bd['NumHijos'].unique())\n",
"mode_hijos = bd['NumHijos'].mode()[0]\n",
......@@ -609,44 +252,9 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\3303146707.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Age'].fillna(age_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\3303146707.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Años_consumo_droga'].fillna(años_consumo_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\3303146707.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['Risk_stigma'].fillna(risk_stigma_mode, inplace=True)\n",
"C:\\Users\\Joaquín Torres\\AppData\\Local\\Temp\\ipykernel_11540\\3303146707.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" bd['NumHijos'].fillna(num_hijos_mode, inplace=True)\n"
]
}
],
"outputs": [],
"source": [
"age_mode = bd['Age'].mode()[0]\n",
"bd['Age'].fillna(age_mode, inplace=True)\n",
......@@ -734,8 +342,7 @@
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
"plt.savefig('./EDA_plots/countplots.svg', dpi=600, bbox_inches='tight')"
"plt.savefig('./output/plots/distributions/countplots.svg', dpi=600, bbox_inches='tight')"
]
},
{
......@@ -830,8 +437,8 @@
"# Adjust layout to prevent overlapping titles\n",
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600 in the \"./EDA_plots\" folder\n",
"plt.savefig('./EDA_plots/norm_countplots.svg', dpi=600, bbox_inches='tight')"
"# Save the figure in SVG format with DPI=600 in the \"./output/plots/distributions\" folder\n",
"plt.savefig('./output/plots/distributions/norm_countplots.svg', dpi=600, bbox_inches='tight')"
]
},
{
......@@ -886,7 +493,7 @@
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600 in the \"./output/plots/distributions\" folder\n",
"plt.savefig('./EDA_plots/boxplots.svg', dpi=600, bbox_inches='tight')"
"plt.savefig('./output/plots/distributions/boxplots.svg', dpi=600, bbox_inches='tight')"
]
},
{
......@@ -926,7 +533,7 @@
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format with DPI=600 in the \"./output/plots/distributions\" folder\n",
"plt.savefig('./EDA_plots/histograms.svg', dpi=600, bbox_inches='tight')"
"plt.savefig('./output/plots/distributions/histograms.svg', dpi=600, bbox_inches='tight')"
]
},
{
......@@ -945,7 +552,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
......@@ -1022,7 +629,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
......@@ -1036,7 +643,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
......@@ -1063,7 +670,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
......@@ -1098,7 +705,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
......@@ -1138,21 +745,13 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Ed_Not Complete primary school', 'Ed_Primary education', 'Ed_Secondary Education', 'Ed_Secondary more technical education', 'Ed_Tertiary', 'Ed_Unknowledge', 'Social_protection_REDEF', 'JobIn_Non-stable', 'JobIn_Stable', 'JobIn_Unemployed', 'JobIn_unkwnodledge', 'Hous_Institutional', 'Hous_Stable', 'Hous_Unstable', 'Hous_unknowledge', 'Alterations_early_childhood_develop_REDEF', 'SocInc_Live with families or friends', 'SocInc_live alone', 'SocInc_live in institutions', 'Risk_stigma_REDEF', 'Structural_conflic']\n"
]
}
],
"outputs": [],
"source": [
"# Export column names for future programs\n",
"np.save('./soc_vars_names.npy', soc_vars_enc)\n",
"np.save('./ind_vars_names.npy', soc_vars_enc)"
"np.save('./output/soc_vars_names.npy', soc_vars_enc)\n",
"np.save('./output/ind_vars_names.npy', soc_vars_enc)"
]
},
{
......@@ -1164,7 +763,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -1194,7 +793,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -1204,7 +803,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -1244,7 +843,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -1358,7 +957,7 @@
"plt.tight_layout()\n",
"\n",
"# Save the figure in SVG format in the \"./output/plots/correlations\" folder\n",
"plt.savefig('./EDA_plots/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')"
"plt.savefig('./output/plots/correlations/heatmaps_one_hot.svg', dpi=550, bbox_inches='tight')"
]
},
{
......@@ -1489,69 +1088,9 @@
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 33538 entries, 0 to 85164\n",
"Data columns (total 45 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Ed_Not Complete primary school 33538 non-null bool \n",
" 1 Ed_Primary education 33538 non-null bool \n",
" 2 Ed_Secondary Education 33538 non-null bool \n",
" 3 Ed_Secondary more technical education 33538 non-null bool \n",
" 4 Ed_Tertiary 33538 non-null bool \n",
" 5 Ed_Unknowledge 33538 non-null bool \n",
" 6 Social_protection_REDEF 33538 non-null int64 \n",
" 7 JobIn_Non-stable 33538 non-null bool \n",
" 8 JobIn_Stable 33538 non-null bool \n",
" 9 JobIn_Unemployed 33538 non-null bool \n",
" 10 JobIn_unkwnodledge 33538 non-null bool \n",
" 11 Hous_Institutional 33538 non-null bool \n",
" 12 Hous_Stable 33538 non-null bool \n",
" 13 Hous_Unstable 33538 non-null bool \n",
" 14 Hous_unknowledge 33538 non-null bool \n",
" 15 Alterations_early_childhood_develop_REDEF 33538 non-null int64 \n",
" 16 SocInc_Live with families or friends 33538 non-null bool \n",
" 17 SocInc_live alone 33538 non-null bool \n",
" 18 SocInc_live in institutions 33538 non-null bool \n",
" 19 Risk_stigma_REDEF 33538 non-null category\n",
" 20 Structural_conflic 33538 non-null float64 \n",
" 21 Age 33538 non-null float64 \n",
" 22 Sex_REDEF 33538 non-null int64 \n",
" 23 NumHijos 33538 non-null float64 \n",
" 24 Smoking_REDEF 33538 non-null int64 \n",
" 25 Biological_vulnerability_REDEF 33538 non-null int64 \n",
" 26 Opiaceos_DxCIE_REDEF 33538 non-null int64 \n",
" 27 Cannabis_DXCIE_REDEF 33538 non-null int64 \n",
" 28 BZD_DxCIE_REDEF 33538 non-null int64 \n",
" 29 Cocaina_DxCIE_REDEF 33538 non-null int64 \n",
" 30 Alucinogenos_DXCIE_REDEF 33538 non-null int64 \n",
" 31 Tabaco_DXCIE_REDEF 33538 non-null int64 \n",
" 32 Frec30_1 día/semana 33538 non-null bool \n",
" 33 Frec30_2-3 días‎/semana 33538 non-null bool \n",
" 34 Frec30_4-6 días/semana 33538 non-null bool \n",
" 35 Frec30_Desconocido 33538 non-null bool \n",
" 36 Frec30_Menos de 1 día‎/semana 33538 non-null bool \n",
" 37 Frec30_No consumio 33538 non-null bool \n",
" 38 Frec30_Todos los días 33538 non-null bool \n",
" 39 Años_consumo_droga 33538 non-null float64 \n",
" 40 OtrosDx_Psiquiatrico_REDEF 33538 non-null int64 \n",
" 41 Tx_previos_REDEF 33538 non-null int64 \n",
" 42 Adherencia_tto_recalc 33538 non-null float64 \n",
" 43 Pandemia_inicio_fin_tratamiento 33538 non-null object \n",
" 44 Situacion_tratamiento_REDEF 33538 non-null int64 \n",
"dtypes: bool(24), category(1), float64(5), int64(14), object(1)\n",
"memory usage: 6.2+ MB\n",
"None\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Work with columns of interest\n",
"cols_of_interest = corr_cols + ['Pandemia_inicio_fin_tratamiento'] + [target_var + \"_REDEF\"]\n",
......@@ -1561,7 +1100,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -1572,72 +1111,16 @@
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 33538 entries, 0 to 85164\n",
"Data columns (total 41 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Ed_Not Complete primary school 33538 non-null bool \n",
" 1 Ed_Primary education 33538 non-null bool \n",
" 2 Ed_Secondary Education 33538 non-null bool \n",
" 3 Ed_Secondary more technical education 33538 non-null bool \n",
" 4 Ed_Tertiary 33538 non-null bool \n",
" 5 Social_protection_REDEF 33538 non-null int64 \n",
" 6 JobIn_Non-stable 33538 non-null bool \n",
" 7 JobIn_Stable 33538 non-null bool \n",
" 8 JobIn_Unemployed 33538 non-null bool \n",
" 9 Hous_Institutional 33538 non-null bool \n",
" 10 Hous_Stable 33538 non-null bool \n",
" 11 Hous_Unstable 33538 non-null bool \n",
" 12 Alterations_early_childhood_develop_REDEF 33538 non-null int64 \n",
" 13 SocInc_Live with families or friends 33538 non-null bool \n",
" 14 SocInc_live alone 33538 non-null bool \n",
" 15 SocInc_live in institutions 33538 non-null bool \n",
" 16 Risk_stigma_REDEF 33538 non-null category\n",
" 17 Structural_conflic 33538 non-null float64 \n",
" 18 Age 33538 non-null float64 \n",
" 19 Sex_REDEF 33538 non-null int64 \n",
" 20 NumHijos 33538 non-null float64 \n",
" 21 Smoking_REDEF 33538 non-null int64 \n",
" 22 Biological_vulnerability_REDEF 33538 non-null int64 \n",
" 23 Opiaceos_DxCIE_REDEF 33538 non-null int64 \n",
" 24 Cannabis_DXCIE_REDEF 33538 non-null int64 \n",
" 25 BZD_DxCIE_REDEF 33538 non-null int64 \n",
" 26 Cocaina_DxCIE_REDEF 33538 non-null int64 \n",
" 27 Alucinogenos_DXCIE_REDEF 33538 non-null int64 \n",
" 28 Tabaco_DXCIE_REDEF 33538 non-null int64 \n",
" 29 Frec30_1 día/semana 33538 non-null bool \n",
" 30 Frec30_2-3 días‎/semana 33538 non-null bool \n",
" 31 Frec30_4-6 días/semana 33538 non-null bool \n",
" 32 Frec30_Menos de 1 día‎/semana 33538 non-null bool \n",
" 33 Frec30_No consumio 33538 non-null bool \n",
" 34 Frec30_Todos los días 33538 non-null bool \n",
" 35 Años_consumo_droga 33538 non-null float64 \n",
" 36 OtrosDx_Psiquiatrico_REDEF 33538 non-null int64 \n",
" 37 Tx_previos_REDEF 33538 non-null int64 \n",
" 38 Adherencia_tto_recalc 33538 non-null float64 \n",
" 39 Pandemia_inicio_fin_tratamiento 33538 non-null object \n",
" 40 Situacion_tratamiento_REDEF 33538 non-null int64 \n",
"dtypes: bool(20), category(1), float64(5), int64(14), object(1)\n",
"memory usage: 6.0+ MB\n",
"None\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(temp_bd.info())"
]
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -1653,135 +1136,25 @@
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 22861 entries, 0 to 85164\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Ed_Not Complete primary school 22861 non-null bool \n",
" 1 Ed_Primary education 22861 non-null bool \n",
" 2 Ed_Secondary Education 22861 non-null bool \n",
" 3 Ed_Secondary more technical education 22861 non-null bool \n",
" 4 Ed_Tertiary 22861 non-null bool \n",
" 5 Social_protection_REDEF 22861 non-null int64 \n",
" 6 JobIn_Non-stable 22861 non-null bool \n",
" 7 JobIn_Stable 22861 non-null bool \n",
" 8 JobIn_Unemployed 22861 non-null bool \n",
" 9 Hous_Institutional 22861 non-null bool \n",
" 10 Hous_Stable 22861 non-null bool \n",
" 11 Hous_Unstable 22861 non-null bool \n",
" 12 Alterations_early_childhood_develop_REDEF 22861 non-null int64 \n",
" 13 SocInc_Live with families or friends 22861 non-null bool \n",
" 14 SocInc_live alone 22861 non-null bool \n",
" 15 SocInc_live in institutions 22861 non-null bool \n",
" 16 Risk_stigma_REDEF 22861 non-null category\n",
" 17 Structural_conflic 22861 non-null float64 \n",
" 18 Age 22861 non-null float64 \n",
" 19 Sex_REDEF 22861 non-null int64 \n",
" 20 NumHijos 22861 non-null float64 \n",
" 21 Smoking_REDEF 22861 non-null int64 \n",
" 22 Biological_vulnerability_REDEF 22861 non-null int64 \n",
" 23 Opiaceos_DxCIE_REDEF 22861 non-null int64 \n",
" 24 Cannabis_DXCIE_REDEF 22861 non-null int64 \n",
" 25 BZD_DxCIE_REDEF 22861 non-null int64 \n",
" 26 Cocaina_DxCIE_REDEF 22861 non-null int64 \n",
" 27 Alucinogenos_DXCIE_REDEF 22861 non-null int64 \n",
" 28 Tabaco_DXCIE_REDEF 22861 non-null int64 \n",
" 29 Frec30_1 día/semana 22861 non-null bool \n",
" 30 Frec30_2-3 días‎/semana 22861 non-null bool \n",
" 31 Frec30_4-6 días/semana 22861 non-null bool \n",
" 32 Frec30_Menos de 1 día‎/semana 22861 non-null bool \n",
" 33 Frec30_No consumio 22861 non-null bool \n",
" 34 Frec30_Todos los días 22861 non-null bool \n",
" 35 Años_consumo_droga 22861 non-null float64 \n",
" 36 OtrosDx_Psiquiatrico_REDEF 22861 non-null int64 \n",
" 37 Tx_previos_REDEF 22861 non-null int64 \n",
" 38 Adherencia_tto_recalc 22861 non-null float64 \n",
" 39 Situacion_tratamiento_REDEF 22861 non-null int64 \n",
"dtypes: bool(20), category(1), float64(5), int64(14)\n",
"memory usage: 3.9 MB\n",
"None\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(conj_pre.info())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 10677 entries, 11 to 85156\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Ed_Not Complete primary school 10677 non-null bool \n",
" 1 Ed_Primary education 10677 non-null bool \n",
" 2 Ed_Secondary Education 10677 non-null bool \n",
" 3 Ed_Secondary more technical education 10677 non-null bool \n",
" 4 Ed_Tertiary 10677 non-null bool \n",
" 5 Social_protection_REDEF 10677 non-null int64 \n",
" 6 JobIn_Non-stable 10677 non-null bool \n",
" 7 JobIn_Stable 10677 non-null bool \n",
" 8 JobIn_Unemployed 10677 non-null bool \n",
" 9 Hous_Institutional 10677 non-null bool \n",
" 10 Hous_Stable 10677 non-null bool \n",
" 11 Hous_Unstable 10677 non-null bool \n",
" 12 Alterations_early_childhood_develop_REDEF 10677 non-null int64 \n",
" 13 SocInc_Live with families or friends 10677 non-null bool \n",
" 14 SocInc_live alone 10677 non-null bool \n",
" 15 SocInc_live in institutions 10677 non-null bool \n",
" 16 Risk_stigma_REDEF 10677 non-null category\n",
" 17 Structural_conflic 10677 non-null float64 \n",
" 18 Age 10677 non-null float64 \n",
" 19 Sex_REDEF 10677 non-null int64 \n",
" 20 NumHijos 10677 non-null float64 \n",
" 21 Smoking_REDEF 10677 non-null int64 \n",
" 22 Biological_vulnerability_REDEF 10677 non-null int64 \n",
" 23 Opiaceos_DxCIE_REDEF 10677 non-null int64 \n",
" 24 Cannabis_DXCIE_REDEF 10677 non-null int64 \n",
" 25 BZD_DxCIE_REDEF 10677 non-null int64 \n",
" 26 Cocaina_DxCIE_REDEF 10677 non-null int64 \n",
" 27 Alucinogenos_DXCIE_REDEF 10677 non-null int64 \n",
" 28 Tabaco_DXCIE_REDEF 10677 non-null int64 \n",
" 29 Frec30_1 día/semana 10677 non-null bool \n",
" 30 Frec30_2-3 días‎/semana 10677 non-null bool \n",
" 31 Frec30_4-6 días/semana 10677 non-null bool \n",
" 32 Frec30_Menos de 1 día‎/semana 10677 non-null bool \n",
" 33 Frec30_No consumio 10677 non-null bool \n",
" 34 Frec30_Todos los días 10677 non-null bool \n",
" 35 Años_consumo_droga 10677 non-null float64 \n",
" 36 OtrosDx_Psiquiatrico_REDEF 10677 non-null int64 \n",
" 37 Tx_previos_REDEF 10677 non-null int64 \n",
" 38 Adherencia_tto_recalc 10677 non-null float64 \n",
" 39 Situacion_tratamiento_REDEF 10677 non-null int64 \n",
"dtypes: bool(20), category(1), float64(5), int64(14)\n",
"memory usage: 1.8 MB\n",
"None\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(conj_post.info())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......@@ -1793,52 +1166,18 @@
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Ed_Not Complete primary school' 'Ed_Primary education'\n",
" 'Ed_Secondary Education' 'Ed_Secondary more technical education'\n",
" 'Ed_Tertiary' 'Social_protection_REDEF' 'JobIn_Non-stable' 'JobIn_Stable'\n",
" 'JobIn_Unemployed' 'Hous_Institutional' 'Hous_Stable' 'Hous_Unstable'\n",
" 'Alterations_early_childhood_develop_REDEF'\n",
" 'SocInc_Live with families or friends' 'SocInc_live alone'\n",
" 'SocInc_live in institutions' 'Risk_stigma_REDEF' 'Structural_conflic'\n",
" 'Age' 'Sex_REDEF' 'NumHijos' 'Smoking_REDEF'\n",
" 'Biological_vulnerability_REDEF' 'Opiaceos_DxCIE_REDEF'\n",
" 'Cannabis_DXCIE_REDEF' 'BZD_DxCIE_REDEF' 'Cocaina_DxCIE_REDEF'\n",
" 'Alucinogenos_DXCIE_REDEF' 'Tabaco_DXCIE_REDEF' 'Frec30_1 día/semana'\n",
" 'Frec30_2-3 días\\u200e/semana' 'Frec30_4-6 días/semana'\n",
" 'Frec30_Menos de 1 día\\u200e/semana' 'Frec30_No consumio'\n",
" 'Frec30_Todos los días' 'Años_consumo_droga' 'OtrosDx_Psiquiatrico_REDEF'\n",
" 'Tx_previos_REDEF' 'Adherencia_tto_recalc']\n"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(feat)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(22861, 39)\n",
"(10677, 39)\n",
"(22861,)\n",
"(10677,)\n",
"39\n"
]
}
],
"outputs": [],
"source": [
"print(X_pre.shape)\n",
"print(X_post.shape)\n",
......@@ -1887,7 +1226,7 @@
"axes[1].set_title(\"POST\")\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig('EDA_plots/features/mutual_info.svg', format='svg', dpi=1200)\n",
"plt.savefig('./output/plots/feature_importance/mutual_info.svg', format='svg', dpi=1200)\n",
"plt.show()"
]
},
......@@ -1926,7 +1265,7 @@
"axes[1].set_title(\"POST\")\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig('EDA_plots/features/ANOVA.svg', format='svg', dpi=1200)\n",
"plt.savefig('./output/plots/feature_importance/ANOVA.svg', format='svg', dpi=1200)\n",
"plt.show()"
]
},
......@@ -1958,7 +1297,7 @@
"axes[1].set_title(\"POST\")\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig('EDA_plots/features/var_threshold.svg', format='svg', dpi=1200)\n",
"plt.savefig('./output/plots/feature_importance/var_threshold.svg', format='svg', dpi=1200)\n",
"plt.show()"
]
},
......@@ -1971,7 +1310,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment