Uploading data_analysis correctly

a2cdcd08 · Laura Masa · 93989e08 · a2cdcd08 · a2cdcd08 · a2cdcd08
Commit a2cdcd08 authored Jul 09, 2024 by Laura Masa
92 changed files
--- a/data_analysis/data/data_01_neuro_diseases_final_disease_selected.tsv
+++ b/data_analysis/data/data_01_neuro_diseases_final_disease_selected.tsv
+cui	disease_name	gene_id	drug_id	symptom_id	Genes in LCC
+C0001206	Acromegaly	178.0	3.0	18.0	38.0
+C0001973	Alcoholic Intoxication, Chronic	1008.0	22.0	34.0	249.0
+C0002395	Alzheimer's Disease	5736.0	38.0	7.0	2697.0
+C0002622	Amnesia	200.0	41.0	14.0	74.0
+C0002736	Amyotrophic Lateral Sclerosis	1791.0	11.0	23.0	847.0
+C0003467	Anxiety	1335.0	0.0	16.0	684.0
+C0003537	Aphasia	94.0	3.0	8.0	27.0
+C0004135	Ataxia Telangiectasia	481.0	0.0	24.0	235.0
+C0004352	Autistic Disorder	1710.0	35.0	5.0	706.0
+C0005586	Bipolar Disorder	2040.0	78.0	12.0	727.0
+C0007282	Carotid Stenosis	163.0	2.0	3.0	23.0
+C0007766	Intracranial Aneurysm	399.0	5.0	13.0	124.0
+C0007785	Cerebral Infarction	891.0	15.0	32.0	444.0
+C0007786	Brain Ischemia	449.0	48.0	4.0	236.0
+C0007787	Transient Ischemic Attack	410.0	17.0	14.0	233.0
+C0007789	Cerebral Palsy	284.0	5.0	31.0	81.0
+C0007959	Charcot-Marie-Tooth Disease	349.0	1.0	19.0	96.0
+C0009171	Cocaine Abuse	182.0	97.0	16.0	57.0
+C0009207	Cockayne Syndrome	133.0	0.0	7.0	46.0
+C0009952	Febrile Convulsions	267.0	8.0	9.0	30.0
+C0010964	Dandy-Walker Syndrome	155.0	0.0	9.0	53.0
+C0011195	Dejerine-Sottas Disease (disorder)	235.0	0.0	22.0	60.0
+C0011206	Delirium	152.0	23.0	18.0	15.0
+C0011265	Presenile dementia	944.0	38.0	23.0	467.0
+C0011269	Dementia, Vascular	268.0	2.0	26.0	103.0
+C0011570	Mental Depression	2208.0	0.0	7.0	1047.0
+C0011581	Depressive disorder	2544.0	112.0	5.0	1266.0
+C0011633	Dermatomyositis	296.0	8.0	18.0	80.0
+C0013080	Down Syndrome	1066.0	1.0	3.0	511.0
+C0013264	Muscular Dystrophy, Duchenne	467.0	3.0	10.0	196.0
+C0013362	Dysarthria	531.0	7.0	10.0	260.0
+C0013384	Dyskinetic syndrome	400.0	31.0	26.0	145.0
+C0014038	Encephalitis	406.0	8.0	37.0	148.0
+C0014057	Japanese Encephalitis	142.0	0.0	21.0	50.0
+C0014060	Encephalitis, St. Louis	330.0	0.0	13.0	124.0
+C0014065	Congenital cerebral hernia	101.0	0.0	4.0	43.0
+C0014070	Encephalomyelitis	1062.0	0.0	32.0	578.0
+C0014544	Epilepsy	1727.0	48.0	19.0	828.0
+C0014553	Absence Epilepsy	131.0	15.0	11.0	28.0
+C0014556	Epilepsy, Temporal Lobe	470.0	11.0	14.0	181.0
+C0015469	Facial paralysis	186.0	1.0	12.0	25.0
+C0016667	Fragile X Syndrome	255.0	1.0	6.0	86.0
+C0016719	Friedreich Ataxia	111.0	4.0	12.0	22.0
+C0017205	Gaucher Disease	183.0	2.0	17.0	35.0
+C0017921	Glycogen storage disease type II	243.0	0.0	12.0	44.0
+C0018378	Guillain-Barre Syndrome	194.0	0.0	44.0	58.0
+C0018524	Hallucinations	192.0	30.0	10.0	83.0
+C0018784	Sensorineural Hearing Loss (disorder)	989.0	6.0	5.0	432.0
+C0019202	Hepatolenticular Degeneration	200.0	9.0	21.0	43.0
+C0019562	Von Hippel-Lindau Syndrome	240.0	0.0	4.0	91.0
+C0020179	Huntington Disease	1345.0	11.0	16.0	677.0
+C0020255	Hydrocephalus	555.0	3.0	15.0	267.0
+C0022336	Creutzfeldt-Jakob disease	180.0	0.0	24.0	53.0
+C0023264	Leigh Disease	285.0	0.0	18.0	70.0
+C0023524	Leukoencephalopathy, Progressive Multifocal	263.0	1.0	6.0	152.0
+C0024266	Lymphocytic Choriomeningitis	159.0	0.0	22.0	47.0
+C0024408	Machado-Joseph Disease	171.0	0.0	14.0	66.0
+C0024809	Marijuana Abuse	209.0	0.0	13.0	52.0
+C0024814	Marinesco-Sjogren syndrome	167.0	1.0	15.0	62.0
+C0025286	Meningioma	929.0	2.0	14.0	460.0
+C0025289	Meningitis	225.0	20.0	42.0	79.0
+C0025958	Microcephaly	1360.0	0.0	10.0	824.0
+C0026106	Mild Mental Retardation	374.0	0.0	2.0	120.0
+C0026654	Moyamoya Disease	151.0	2.0	13.0	16.0
+C0026769	Multiple Sclerosis	2878.0	21.0	28.0	1284.0
+C0026847	Spinal Muscular Atrophy	440.0	0.0	17.0	194.0
+C0026850	Muscular Dystrophy	453.0	1.0	33.0	124.0
+C0026896	Myasthenia Gravis	447.0	17.0	11.0	133.0
+C0027121	Myositis	301.0	4.0	24.0	109.0
+C0027126	Myotonic Dystrophy	204.0	3.0	8.0	72.0
+C0027809	Neurilemmoma	259.0	2.0	13.0	120.0
+C0027830	neurofibroma	157.0	0.0	12.0	65.0
+C0027831	Neurofibromatosis 1	433.0	0.0	15.0	183.0
+C0027832	Neurofibromatosis 2	160.0	0.0	12.0	82.0
+C0027859	Acoustic Neuroma	159.0	0.0	11.0	74.0
+C0027873	Neuromyelitis Optica	214.0	1.0	25.0	30.0
+C0028043	Nicotine Dependence	227.0	13.0	8.0	38.0
+C0028077	Night Blindness	201.0	1.0	15.0	16.0
+C0028738	Nystagmus	923.0	10.0	12.0	551.0
+C0029124	Optic Atrophy	648.0	0.0	15.0	335.0
+C0030567	Parkinson Disease	3240.0	47.0	32.0	1582.0
+C0032000	Pituitary Adenoma	211.0	7.0	15.0	88.0
+C0033375	Prolactinoma	243.0	5.0	11.0	87.0
+C0035258	Restless Legs Syndrome	174.0	17.0	28.0	28.0
+C0035372	Rett Syndrome	299.0	0.0	25.0	105.0
+C0036341	Schizophrenia	5398.0	71.0	18.0	2171.0
+C0037773	Spastic Paraplegia, Hereditary	209.0	0.0	7.0	41.0
+C0038220	Status Epilepticus	664.0	41.0	23.0	294.0
+C0038379	Strabismus	771.0	1.0	4.0	501.0
+C0038436	Post-Traumatic Stress Disorder	528.0	23.0	8.0	200.0
+C0038525	Subarachnoid Hemorrhage	659.0	14.0	14.0	342.0
+C0038868	Progressive supranuclear palsy	237.0	1.0	19.0	92.0
+C0039483	Giant Cell Arteritis	327.0	5.0	15.0	123.0
+C0040517	Gilles de la Tourette syndrome	232.0	17.0	11.0	37.0
+C0041341	Tuberous Sclerosis	378.0	4.0	18.0	159.0
+C0042170	Uveomeningoencephalitic Syndrome	124.0	5.0	14.0	15.0
+C0043124	West Nile Fever	89.0	1.0	25.0	18.0
+C0043459	Zellweger Syndrome	81.0	0.0	5.0	22.0
+C0080178	Spina Bifida	233.0	1.0	12.0	59.0
+C0085084	Motor Neuron Disease	270.0	3.0	35.0	118.0
+C0085655	Polymyositis	226.0	7.0	10.0	41.0
+C0085762	Alcohol abuse	226.0	22.0	29.0	47.0
+C0086769	Panic Attacks	63.0	27.0	23.0	20.0
+C0149940	Sciatic Neuropathy	121.0	2.0	7.0	44.0
+C0151311	Cranial nerve palsies	81.0	2.0	29.0	26.0
+C0151740	Intracranial Hypertension	72.0	12.0	14.0	27.0
+C0152020	Gastroparesis	102.0	4.0	24.0	17.0
+C0152025	Polyneuropathy	192.0	6.0	14.0	45.0
+C0153633	Malignant neoplasm of brain	330.0	22.0	36.0	161.0
+C0162309	Adrenoleukodystrophy	368.0	0.0	12.0	119.0
+C0162635	Angelman Syndrome	149.0	0.0	17.0	44.0
+C0162666	Mitochondrial Encephalomyopathies	60.0	1.0	15.0	19.0
+C0175754	Agenesis of corpus callosum	767.0	0.0	3.0	417.0
+C0206728	Plexiform Neurofibroma	56.0	1.0	13.0	20.0
+C0220756	Niemann-Pick Disease, Type C	230.0	1.0	19.0	119.0
+C0221056	Adult type dermatomyositis	256.0	8.0	18.0	67.0
+C0221406	Pituitary-dependent Cushing's disease	164.0	3.0	11.0	37.0
+C0234144	Dysgraphia	43.0	0.0	9.0	22.0
+C0236642	Pick Disease of the Brain	289.0	1.0	9.0	137.0
+C0238190	Inclusion Body Myositis (disorder)	101.0	1.0	11.0	35.0
+C0238288	Muscular Dystrophy, Facioscapulohumeral	187.0	0.0	14.0	48.0
+C0242350	Erectile dysfunction	322.0	35.0	2.0	72.0
+C0265219	Miller Dieker syndrome	221.0	0.0	20.0	99.0
+C0266463	Lissencephaly	98.0	0.0	15.0	30.0
+C0266464	Polymicrogyria	239.0	0.0	2.0	109.0
+C0266483	Pachygyria	156.0	0.0	15.0	39.0
+C0270824	Visual seizure	235.0	216.0	25.0	53.0
+C0270972	Cornelia De Lange Syndrome	78.0	0.0	10.0	22.0
+C0271270	Oculovestibuloauditory syndrome	95.0	0.0	13.0	33.0
+C0276226	Herpes encephalitis	68.0	2.0	2.0	16.0
+C0276496	Familial Alzheimer Disease (FAD)	336.0	38.0	7.0	165.0
+C0282527	Infantile Refsum Disease (disorder)	37.0	0.0	2.0	15.0
+C0338451	Frontotemporal dementia	464.0	4.0	9.0	203.0
+C0338508	Optic Atrophy, Autosomal Dominant	161.0	0.0	16.0	59.0
+C0349204	Nonorganic psychosis	528.0	0.0	9.0	184.0
+C0410189	Muscular Dystrophy, Emery-Dreifuss	76.0	0.0	7.0	15.0
+C0410207	Tubular Aggregate Myopathy	78.0	0.0	14.0	31.0
+C0431380	Cortical Dysplasia	139.0	2.0	6.0	31.0
+C0494463	Alzheimer Disease, Late Onset	529.0	38.0	7.0	241.0
+C0496899	Benign neoplasm of brain, unspecified	42.0	22.0	23.0	18.0
+C0497327	Dementia	1153.0	16.0	28.0	522.0
+C0520679	Sleep Apnea, Obstructive	610.0	4.0	26.0	249.0
+C0543859	Amyotrophic Lateral Sclerosis, Guam Form	40.0	11.0	26.0	19.0
+C0546126	Acute Confusional Senile Dementia	100.0	38.0	7.0	48.0
+C0577631	Carotid Atherosclerosis	263.0	2.0	9.0	75.0
+C0600427	Cocaine Dependence	300.0	97.0	16.0	65.0
+C0740391	Middle Cerebral Artery Occlusion	766.0	15.0	7.0	404.0
+C0740392	Infarction, Middle Cerebral Artery	160.0	15.0	7.0	62.0
+C0750900	Alzheimer's Disease, Focal Onset	100.0	38.0	7.0	48.0
+C0750901	Alzheimer Disease, Early Onset	207.0	38.0	7.0	95.0
+C0750974	Brain Tumor, Primary	137.0	22.0	16.0	64.0
+C0750977	Recurrent Brain Neoplasm	39.0	22.0	16.0	18.0
+C0750979	Primary malignant neoplasm of brain	42.0	22.0	16.0	20.0
+C0751265	Learning Disabilities	114.0	38.0	7.0	25.0
+C0751587	CADASIL Syndrome	53.0	0.0	7.0	16.0
+C0751690	Malignant Peripheral Nerve Sheath Tumor	332.0	0.0	9.0	172.0
+C0751713	Inclusion Body Myopathy, Sporadic	93.0	1.0	11.0	38.0
+C0751772	REM Sleep Behavior Disorder	60.0	4.0	13.0	17.0
+C0751781	Dentatorubral-Pallidoluysian Atrophy	123.0	3.0	18.0	37.0
+C0751967	Multiple Sclerosis, Relapsing-Remitting	249.0	7.0	3.0	61.0
+C0752120	Spinocerebellar Ataxia Type 1	126.0	2.0	13.0	33.0
+C0752125	Spinocerebellar Ataxia Type 7	94.0	2.0	13.0	28.0
+C0752166	Bardet-Biedl Syndrome	176.0	0.0	10.0	27.0
+C0752304	Hypoxic-Ischemic Encephalopathy	197.0	2.0	1.0	62.0
+C0752347	Lewy Body Disease	335.0	3.0	32.0	145.0
+C0917798	Cerebral Ischemia	121.0	48.0	4.0	49.0
+C0917816	Mental deficiency	150.0	2.0	1.0	27.0
+C1263846	Attention deficit hyperactivity disorder	1084.0	30.0	14.0	484.0
+C1269683	Major Depressive Disorder	1814.0	56.0	19.0	775.0
+C1306214	ACTH-Secreting Pituitary Adenoma	88.0	0.0	3.0	23.0
+C1510586	Autism Spectrum Disorders	1478.0	0.0	9.0	699.0
+C1839259	Bulbo-Spinal Atrophy, X-Linked	144.0	0.0	13.0	70.0
+C1868675	PARKINSON DISEASE 2, AUTOSOMAL RECESSIVE JUVENILE	89.0	37.0	2.0	27.0
+C1955869	Malformations of Cortical Development	80.0	2.0	6.0	26.0
+C2931689	Dystrophia myotonica 2	144.0	3.0	18.0	21.0
+C3658299	Zellweger Spectrum	35.0	0.0	5.0	17.0
--- a/data_analysis/results/expr_dea_groupwise/combined_results.csv
+++ b/data_analysis/results/expr_dea_groupwise/combined_results.csv
--- a/data_analysis/results/expr_dea_groupwise/significant_results.csv
+++ b/data_analysis/results/expr_dea_groupwise/significant_results.csv
+"gds_id","id_ref","gene_symbol","logFC","AveExpr","t","p_val","adj_p_val","B","significant"
+"GDS2519","207205_at","CEACAM4",-43.2474,167.0559,-5.0149,0,0.0486,1.8086,"u"
+"GDS4128","1554830_a_at","STEAP3",-80.5503,22.9122,-8.5324,0,0.0012,-4.2128,"u"
+"GDS4128","1555666_at","PTPRS",-37.5894,15.1636,-5.9976,0,0.032,-4.2814,"u"
+"GDS4128","1558561_at","HM13",-96.467,40.089,-5.6353,0,0.0383,-4.2958,"u"
+"GDS4128","1558778_s_at","MKL2",-45.8069,12.5022,-5.6737,0,0.0373,-4.2942,"u"
+"GDS4128","1561451_a_at","LOC101928551",-93.1816,24.9414,-9.7082,0,4e-04,-4.1937,"u"
+"GDS4128","1563255_at","FAM170B-AS1",-114.2319,40.098,-5.843,0,0.0333,-4.2874,"u"
+"GDS4128","1564679_at","ASB15",-112.9483,43.6432,-5.6903,0,0.037,-4.2935,"u"
+"GDS4128","1567022_at","OR5AK4P",-62.9342,29.9196,-5.4696,0,0.0486,-4.3028,"u"
+"GDS4128","1570114_at","MDGA2",-54.1825,17.6711,-6.1915,0,0.0252,-4.2743,"u"
+"GDS4128","1570433_at","TMPRSS2",-57.0986,24.1373,-7.1464,0,0.0091,-4.2444,"u"
+"GDS4128","204847_at","ZBTB11",-244.6604,290.0519,-5.7517,0,0.0346,-4.291,"u"
+"GDS4128","206754_s_at","CYP2B7P",-124.7891,39.8167,-6.8397,0,0.0121,-4.2532,"u"
+"GDS4128","207016_s_at","LOC101928635",-110.6628,34.8775,-5.8151,0,0.0333,-4.2885,"u"
+"GDS4128","209044_x_at","SF3B4",-146.8653,66.8142,-5.8298,0,0.0333,-4.2879,"u"
+"GDS4128","209176_at","SEC23IP",-59.0793,23.8628,-5.7662,0,0.0345,-4.2904,"u"
+"GDS4128","212728_at","DLG3",-234.6568,102.6718,-5.9427,0,0.032,-4.2835,"u"
+"GDS4128","213071_at","DPT",-70.2717,19.2877,-11.3337,0,1e-04,-4.1748,"u"
+"GDS4128","214547_at","ADCY10",-72.8003,25.7941,-7.2419,0,0.0086,-4.2419,"u"
+"GDS4128","216800_at","AK027069",-94.2253,35.3219,-8.9563,0,7e-04,-4.2053,"u"
+"GDS4128","217425_at","MC2R",-37.4118,15.7909,-6.4641,0,0.0195,-4.265,"u"
+"GDS4128","219722_s_at","GDPD3",-117.7731,50.0628,-6.0499,0,0.0317,-4.2795,"u"
+"GDS4128","219876_s_at","GOLGA2P5",-38.2726,16.5466,-6.4346,0,0.0195,-4.2659,"u"
+"GDS4128","220440_at","LGALS13",-81.4139,29.0418,-6.2259,0,0.0247,-4.2731,"u"
+"GDS4128","220640_at","CSNK1G1",-139.926,47.6199,-5.6534,0,0.0379,-4.295,"u"
+"GDS4128","222432_s_at","CCDC47",-350.3929,251.3091,-6.0249,0,0.0318,-4.2804,"u"
+"GDS4128","225241_at","CCDC80",-56.5799,27.3373,-5.7703,0,0.0345,-4.2902,"u"
+"GDS4128","225608_at","SNX29",-85.562,43.706,-5.8889,0,0.0327,-4.2856,"u"
+"GDS4128","226561_at","AGFG1",-33.2307,10.1255,-5.5661,0,0.043,-4.2987,"u"
+"GDS4128","228517_at","MEAF6",-53.4851,22.0799,-5.4509,0,0.0486,-4.3036,"u"
+"GDS4128","228773_at","LOC100506100",86.1618,94.8674,5.5247,0,0.0457,-4.3004,"o"
+"GDS4128","229133_s_at","ZNF397",-69.6686,23.8349,-6.5758,0,0.0171,-4.2613,"u"
+"GDS4128","230463_at","BG054960",-42.1868,21.9201,-6.3753,0,0.0195,-4.2679,"u"
+"GDS4128","230871_at","DHX30",-111.7965,47.467,-6.4034,0,0.0195,-4.267,"u"
+"GDS4128","231256_at","LOC727944",-57.0748,21.0099,-7.0539,0,0.0097,-4.247,"u"
+"GDS4128","232578_at","CLDN18",-42.0285,12.0969,-6.6498,0,0.0159,-4.259,"u"
+"GDS4128","232939_at","AU152763",-97.6924,35.0915,-5.4828,0,0.0486,-4.3023,"u"
+"GDS4128","234693_at","LOC105376689",-59.7809,28.2878,-5.4564,0,0.0486,-4.3034,"u"
+"GDS4128","235751_s_at","VMO1",-46.5896,12.4801,-5.926,0,0.032,-4.2841,"u"
+"GDS4128","237039_at","LOC100130502",-89.3097,26.8886,-9.4693,0,4e-04,-4.1971,"u"
+"GDS4128","237042_at","LOC100507024",-52.9344,26.2389,-5.8718,0,0.0327,-4.2862,"u"
+"GDS4128","238627_at","TRAPPC2L",-85.1377,42.3262,-6.8935,0,0.0119,-4.2516,"u"
+"GDS4128","240835_at","BE671305",-91.7474,102.1718,-5.9747,0,0.032,-4.2823,"u"
+"GDS4128","241155_at","AA704588",-64.6885,22.3755,-5.7152,0,0.0362,-4.2925,"u"
+"GDS4128","244069_at","BE220584",-81.1783,21.7971,-14.7565,0,0,-4.1513,"u"
+"GDS4128","244173_at","MIS18BP1",-51.5387,12.2448,-7.2964,0,0.0086,-4.2404,"u"
+"GDS4128","244548_at","AI189587",-54.2703,21.0028,-5.9189,0,0.032,-4.2844,"u"
--- a/data_analysis/results/expr_dea_groupwise_graphs/adj_p_val_distribution_all_gds_ids.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/adj_p_val_distribution_all_gds_ids.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS1917.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS1917.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS2519.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS2519.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS2795.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS2795.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4128.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4128.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4135.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4135.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4136.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4136.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4522.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4522.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4523.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS4523.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS810.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/heatmap_GDS810.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS1917.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS1917.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS2519.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS2519.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS2795.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS2795.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS4128.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS4128.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS4136.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS4136.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS4522.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS4522.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS4523.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS4523.png
--- a/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS810.png
+++ b/data_analysis/results/expr_dea_groupwise_graphs/volcano_plot_GDS810.png
--- a/data_analysis/results/expr_dea_perindividual/combined_individual_results.csv
+++ b/data_analysis/results/expr_dea_perindividual/combined_individual_results.csv
--- a/data_analysis/results/expr_dea_perindividual/individual_summary.csv
+++ b/data_analysis/results/expr_dea_perindividual/individual_summary.csv
+"gds_id","gsm_min","sig_gene_count"
+"GDS2519",5,1917
+"GDS2795",3,3121
+"GDS4128",2,374
+"GDS4136",4,7885
+"GDS810",4,4159
--- a/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS1917.png
+++ b/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS1917.png
--- a/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS2519.png
+++ b/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS2519.png
--- a/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS2795.png
+++ b/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS2795.png
--- a/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS4128.png
+++ b/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS4128.png
--- a/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS4136.png
+++ b/data_analysis/results/expr_dea_perindividual_graphs/heatmap_GDS4136.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_disease_state_counts.csv
+"","Var1","Freq"
+"1","c",55
+"2","d",50
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_disease_state_counts.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_disease_state_counts.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_by_disease_state.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_by_disease_state.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_stats.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_stats.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",174.880692883186,929.601049291251,0.312380952380952,15.6685714285714,50.3761904761905,118.235238095238,42470.7895238095
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_stats_postnorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_stats_postnorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",174.880692883186,929.601049291251,0.312380952380952,15.6685714285714,50.3761904761905,118.235238095238,42470.7895238095
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_stats_prenorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_stats_prenorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",174.880698998339,998.361671556367,0.1,15.7,50.2,118.6,158720
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_norm_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_norm_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_processed_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_processed_disease_state_counts.csv
+"","disease_state","freq"
+"1","c",55
+"2","d",50
--- a/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_raw_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2519/GDS2519_raw_disease_state_counts.csv
+"","disease_state","freq"
+"1","healthy control",22
+"2","neurodegenerative disease control",33
+"3","Parkinson's disease",50
--- a/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_disease_state_counts.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_disease_state_counts.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_expression_by_disease_state.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_expression_by_disease_state.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_expression_stats_postnorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_expression_stats_postnorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",215.442423566996,801.732437569486,0.29,21.64,71.9775,181.585,52550.39
--- a/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_expression_stats_prenorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_expression_stats_prenorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",215.442412079161,894.187411696145,0,21.2,71.2,183.9,145814
--- a/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_norm_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_norm_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_processed_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_processed_disease_state_counts.csv
+"","disease_state","freq"
+"1","c",10
+"2","d",10
--- a/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_raw_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS2795/GDS2795_raw_disease_state_counts.csv
+"","disease_state","freq"
+"1","neurofibrillary tangle",10
+"2","normal",10
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_disease_state_counts.csv
+"","Var1","Freq"
+"1","c",16
+"2","d",2
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_disease_state_counts.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_disease_state_counts.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_by_disease_state.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_by_disease_state.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_stats.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_stats.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",196.828769593056,1067.9662198029,0.322222222222222,14.3722222222222,43.8388888888889,108.075,40597.2611111111
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_stats_postnorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_stats_postnorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",196.828769593056,1067.9662198029,0.322222222222222,14.3722222222222,43.8388888888889,108.075,40597.2611111111
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_stats_prenorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_stats_prenorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",196.828768203057,1118.2246457,0.1,14.2,43.1,109.3,70634.7
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_norm_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_norm_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_processed_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_processed_disease_state_counts.csv
+"","disease_state","freq"
+"1","c",16
+"2","d",2
--- a/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_raw_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4128/GDS4128_raw_disease_state_counts.csv
+"","disease_state","freq"
+"1","Alzheimer’s disease-like",2
+"2","healthy",16
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_disease_state_counts.csv
+"","Var1","Freq"
+"1","d",18
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_disease_state_counts.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_disease_state_counts.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_by_disease_state.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_by_disease_state.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_stats.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_stats.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",146.82249472959,696.391251626308,0.222222222222222,15.2,56.55,124.481944291881,53931.4
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_stats_postnorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_stats_postnorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",146.82249472959,696.391251626308,0.222222222222222,15.2,56.55,124.481944291881,53931.4
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_stats_prenorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_stats_prenorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",146.822516326607,825.002914679964,0.1,15.4,55.4,127.4,209365
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_norm_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_norm_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_processed_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_processed_disease_state_counts.csv
+"","disease_state","freq"
+"1","d",18
--- a/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_raw_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4135/GDS4135_raw_disease_state_counts.csv
+"","disease_state","freq"
+"1","Braak stage I-II",6
+"2","Braak stage III-IV",6
+"3","Braak stage V-VI",6
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_disease_state_counts.csv
+"","Var1","Freq"
+"1","c",8
+"2","d",22
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_disease_state_counts.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_disease_state_counts.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_by_disease_state.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_by_disease_state.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_stats.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_stats.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",856.675968074092,4919.41233618724,1.08666666666667,69.8066666666667,221.59,563.95,299891.566666667
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_stats_postnorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_stats_postnorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",856.675968074092,4919.41233618724,1.08666666666667,69.8066666666667,221.59,563.95,299891.566666667
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_stats_prenorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_stats_prenorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",856.675969951608,5050.15888527268,0.3,69.5,220.8,565.7,640898
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_norm_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_norm_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_processed_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_processed_disease_state_counts.csv
+"","disease_state","freq"
+"1","c",8
+"2","d",22
--- a/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_raw_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS4136/GDS4136_raw_disease_state_counts.csv
+"","disease_state","freq"
+"1","control",8
+"2","incipient stage",7
+"3","moderate stage",8
+"4","severe stage",7
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_disease_state_counts.csv
+"","Var1","Freq"
+"1","c",9
+"2","d",22
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_disease_state_counts.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_disease_state_counts.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_by_disease_state.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_by_disease_state.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_stats.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_stats.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",677.781240828767,1680.87637113157,0.845161290322581,69.9887096774194,224.209677419355,603.822580645161,35924.035483871
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_stats_postnorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_stats_postnorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",677.781240828767,1680.87637113157,0.845161290322581,69.9887096774194,224.209677419355,603.822580645161,35924.035483871
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_stats_prenorm.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_stats_prenorm.csv
+"","mean","std","min","25%","50%","75%","max"
+"1",677.781250843645,1690.72429835133,0.4,69.4,224.2,605.1,78292.3
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_norm_expression_values_boxplot.png
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_norm_expression_values_boxplot.png
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_processed_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_processed_disease_state_counts.csv
+"","disease_state","freq"
+"1","c",9
+"2","d",22
--- a/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_raw_disease_state_counts.csv
+++ b/data_analysis/results/expr_descriptive_analysis/GDS810/GDS810_raw_disease_state_counts.csv
+"","disease_state","freq"
+"1","control",9
+"2","incipient AD",7
+"3","moderate AD",8
+"4","severe AD",7
--- a/data_analysis/results/expr_descriptive_analysis/boxplot_expression_values_postnormalization.png
+++ b/data_analysis/results/expr_descriptive_analysis/boxplot_expression_values_postnormalization.png
--- a/data_analysis/results/expr_descriptive_analysis/boxplot_expression_values_prenormalization.png
+++ b/data_analysis/results/expr_descriptive_analysis/boxplot_expression_values_prenormalization.png
--- a/data_analysis/results/expr_descriptive_analysis/number_of_gds_per_disease.png
+++ b/data_analysis/results/expr_descriptive_analysis/number_of_gds_per_disease.png
--- a/data_analysis/results/expr_descriptive_analysis/number_of_gsm_per_gds.png
+++ b/data_analysis/results/expr_descriptive_analysis/number_of_gsm_per_gds.png
--- a/data_analysis/scripts/descriptive_analysis.R
+++ b/data_analysis/scripts/descriptive_analysis.R
+library(DBI)
+library(reshape2)
+library(limma)
+library(ggplot2)
+library(dplyr)
+library(tidyverse) 
+library(gridExtra)
+output_dir="C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis"
+diseases <- read.delim("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/data/data_01_neuro_diseases_final_disease_selected.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
+diseases <- diseases %>%
+  rename(disease_id = cui)%>%
+  select(disease_id, disease_name)
+
+con <- dbConnect(MySQL(), 
+                 dbname = "disnet_biolayer", 
+                 host = "ares.ctb.upm.es", 
+                 port = 30604, 
+                 user = "gene_expression", 
+                 password = "gene_expression")
+
+
+##TO REPRESENT THE NUMBER OF DATASETS (GDS) PER DISEASE 
+
+all_dis_gds<-data.frame()
+# Get expr_disease_gds for the current gds_id
+query_dis_gds <- paste0("SELECT * FROM expr_disease_gds")
+dis_gds <- dbGetQuery(con, query_dis_gds)
+
+# Merge the dataframes all_dis_gds and all_diseases by disease_id
+merged_data <- merge(dis_gds, diseases, by = "disease_id")
+
+# Count the number of GDS for each disease_name
+gds_count <- aggregate(gds_id ~ disease_name, data = merged_data, FUN = length)
+
+gds_count <- gds_count %>% arrange(desc(gds_id))
+
+# Plot the results with blue bars and sorted by the number of GDS
+dis_gds_plot<-ggplot(gds_count, aes(x = reorder(disease_name, -gds_id), y = gds_id)) +
+  geom_bar(stat = "identity", fill = "#7AC5CD") +
+  xlab("Disease Name") +
+  ylab("Number of GDS") +
+  ggtitle("Number of GDS per Disease") +
+  theme(
+    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
+    axis.text.y= element_text(size=14),
+    axis.title.x = element_text(size = 14),
+    axis.title.y = element_text(size = 14),
+    panel.background = element_rect(fill = "white", color = "black"),
+    panel.grid.major.x = element_blank(), # Remove vertical grid lines
+    panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines
+    panel.grid.major.y = element_line(color = "gray90"), # Optional: Add horizontal grid lines
+    panel.grid.minor.y = element_blank(), # Optional: Remove minor horizontal grid lines
+    axis.line = element_line(color = "black"), # Add axis lines
+    plot.title = element_text(hjust = 0.5, size = 16,  color = "black",face = "bold")
+  ) +
+  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) # Adjust the y-axis to start at 0 and add some padding at the top
+
+dis_gds_plot
+ggsave("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis/number_of_gds_per_disease.png", plot = dis_gds_plot, width = 15, height = 12)
+
+##REPRESENT THE NUMBER OF samples per dataset (GSMs per GDS)
+query_gds_gsm <- paste0("SELECT gds_id,gsm_id FROM expr_processed_annot")
+gds_gsm <- dbGetQuery(con, query_gds_gsm)
+gds_count <- gds_gsm %>%
+  group_by(gds_id) %>%
+  summarise(num_gsm = n())
+gds_gsm_plot <- ggplot(gds_count, aes(x = as.factor(gds_id), y = num_gsm)) +
+  geom_bar(stat = "identity", fill = "#7AC5CD", width = 0.9)+
+  xlab("GDS ID") +
+  ylab("Number of GSM IDs") +
+  ggtitle("Number of samples per dataset (GSMs per GDS)") +
+  theme(
+    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
+    axis.text.y = element_text(size = 14),
+    axis.title.x = element_text(size = 14),
+    axis.title.y = element_text(size = 14),
+    panel.background = element_rect(fill = "white", color = "black"),
+    panel.grid.major.x = element_blank(), # Remove vertical grid lines
+    panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines
+    panel.grid.major.y = element_line(color = "gray90"), # Optional: Add horizontal grid lines
+    panel.grid.minor.y = element_blank(), # Optional: Remove minor horizontal grid lines
+    axis.line = element_line(color = "black"), # Add axis lines
+    plot.title = element_text(hjust = 0.5, size = 16,  color = "black",face = "bold")
+  ) +
+  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))+  
+  
+  scale_x_discrete(expand = expansion(add = c(0.9, 0.9)))
+gds_gsm_plot
+ggsave("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis/number_of_gsm_per_gds.png", plot = gds_gsm_plot, width = 15, height = 12)
+
+
+
+
+##THE REST OF DESCRIPTIVE ANALYSIS ITERING GDS_ID PER GDS_ID
+
+gds_ids <- dbGetQuery(con, "SELECT DISTINCT gds_id FROM expr_processed_annot WHERE flag = 1")
+
+
+all_expr_values <- data.frame()
+all_normalized_values <- data.frame()
+
+
+for (i in 1:nrow(gds_ids)) {
+  gds_id<-gds_ids$gds_id[i]
+  gds_output_dir <- file.path(output_dir, gds_id)
+  
+  # Check if the directory already exists
+  if (!dir.exists(gds_output_dir)) {
+    dir.create(gds_output_dir, recursive = TRUE)
+  } 
+  setwd(gds_output_dir)
+  
+  #extraction of data from expr_values table
+  query_expr_values <- paste0("SELECT gsm_id, id_ref, gene_symbol, value FROM expr_values WHERE gds_id = '", gds_id, "'")
+  expr_values <- dbGetQuery(con, query_expr_values)
+  
+  #extraction of data from expr_processed_annot table with flag = 1
+  query_expr_raw_annot <- paste0("SELECT gsm_id, disease_state FROM expr_raw_annot WHERE gds_id = '", gds_id, "'")
+  expr_raw_annot <- dbGetQuery(con, query_expr_raw_annot)
+  
+  #extraction of data from expr_processed_annot table with flag = 1
+  query_expr_processed_annot <- paste0("SELECT gsm_id, disease_state FROM expr_processed_annot WHERE gds_id = '", gds_id, "'")
+  expr_processed_annot <- dbGetQuery(con, query_expr_processed_annot)
+  
+  #to keep gene_symbol for reference at the end
+  gene_symbols <- unique(expr_values[, c("id_ref", "gene_symbol")])
+  
+  
+  if (nrow(expr_values) == 0) {
+    warning(paste("No data found for gds_id:", gds_id))
+    next
+  }
+  
+  if (!is.numeric(expr_values$value)) {
+    expr_values$value <- as.numeric(expr_values$value)
+  }
+  
+  #create the expression matrix without gene_symbol
+  expression <- dcast(expr_values, id_ref ~ gsm_id, value.var = "value", fun.aggregate = mean)
+  rownames(expression) <- expression$id_ref
+  expression <- expression[, -1]  # Remove the id_ref column
+  
+  if (ncol(expression) == 0) {
+    warning(paste("No expression data available for gds_id:", gds_id))
+    next
+  }
+  
+  #normalization of expression matrix
+  exprs_matrix <- as.matrix(expression)
+  exprs_normalized <- normalizeBetweenArrays(exprs_matrix, method = 'quantile')
+  exprs_normalized_df <- melt(exprs_normalized, varnames = c("id_ref", "gsm_id"), value.name = "value")
+  normalized_df <- merge(exprs_normalized_df, gene_symbols, by = "id_ref", all.x = TRUE)
+  
+  expr_values$gds_id <- gds_id
+  normalized_df$gds_id <- gds_id
+  
+  all_expr_values <- rbind(all_expr_values, expr_values)
+  all_normalized_values <- rbind(all_normalized_values, normalized_df)
+  
+  
+  expr_stats <- summarize(
+    expr_values,
+    mean = mean(value, na.rm = TRUE),
+    std = sd(value, na.rm = TRUE),
+    min = min(value, na.rm = TRUE),
+    `25%` = quantile(value, 0.25, na.rm = TRUE),
+    `50%` = median(value, na.rm = TRUE),
+    `75%` = quantile(value, 0.75, na.rm = TRUE),
+    max = max(value, na.rm = TRUE)
+  )
+  write.csv(expr_stats, file.path(gds_output_dir, paste0(gds_id, "_expression_stats_prenorm.csv")))
+  
+  
+  expr_stats_norm <- summarize(
+    normalized_df,
+    mean = mean(value, na.rm = TRUE),
+    std = sd(value, na.rm = TRUE),
+    min = min(value, na.rm = TRUE),
+    `25%` = quantile(value, 0.25, na.rm = TRUE),
+    `50%` = median(value, na.rm = TRUE),
+    `75%` = quantile(value, 0.75, na.rm = TRUE),
+    max = max(value, na.rm = TRUE)
+  )
+  write.csv(expr_stats_norm, file.path(gds_output_dir, paste0(gds_id, "_expression_stats_postnorm.csv")))
+  
+  # Count categories and generate plots only for columns with data
+  categorical_columns <- c('agent', 'disease_state', 'tissue', 'cell_type')
+  for (col in categorical_columns) {
+    if (col %in% colnames(expr_raw_annot) && !all(is.na(expr_raw_annot[[col]]))) {
+      cat_counts <- table(expr_raw_annot[[col]])
+      df_cat_counts <- as.data.frame(cat_counts)
+      colnames(df_cat_counts) <- c(col, "freq")
+      write.csv(df_cat_counts, file.path(gds_output_dir, paste0(gds_id, "_raw_", col, "_counts.csv")))
+    
+    if (col %in% colnames(expr_processed_annot) && !all(is.na(expr_processed_annot[[col]]))) {
+      cat_counts <- table(expr_processed_annot[[col]])
+      df_cat_counts <- as.data.frame(cat_counts)
+      colnames(df_cat_counts) <- c(col, "freq")
+      write.csv(df_cat_counts, file.path(gds_output_dir, paste0(gds_id, "_processed_", col, "_counts.csv")))
+      
+      plot <- ggplot(expr_processed_annot, aes(y = .data[[col]])) +
+        geom_bar() +
+        labs(title = paste0(str_to_title(col), " count for ", gds_id), hjust = 0.5) +
+        theme_minimal()+
+        theme(
+          plot.title = element_text(size = 18, face = "bold", hjust = 0.5),  
+          axis.title.x = element_text(size = 16),  
+          axis.title.y = element_text(size = 16),  
+          axis.text.x = element_text(size = 15),  
+          axis.text.y = element_text(size = 15)   
+        )
+      
+      ggsave(file.path(gds_output_dir, paste0(gds_id, "_", col, "_counts.png")), plot = plot, width = 10, height = 6,bg='white')
+    }
+  }
+  
+  
+    # Boxplot of expression by disease state
+    # Boxplot of expression by disease state
+    if ("disease_state" %in% colnames(expr_processed_annot)) {
+      # Merge expression and annotation dataframes
+      merged_df <- inner_join(normalized_df, expr_processed_annot %>% select(gsm_id, disease_state), by = "gsm_id")
+      
+      # Violin plot
+      violin_disease <- ggplot(merged_df, aes(x = disease_state, y = value)) +
+        geom_violin(aes(fill = disease_state), position = position_dodge(0.9)) +
+        scale_fill_manual(
+          values = c("c" = "#FFFFF0", "d" = "#838B83"),  # Specify colors for the groups
+          labels = c("c" = "Control", "d" = "Disease"), # Set the labels for the legend
+          name = "Disease State"  # Set the title for the legend
+        ) +
+        scale_x_discrete(
+          labels = c("c" = "Control", "d" = "Disease")  # Rename x-axis labels
+        ) +
+        labs(
+          title = paste0("Expression Values by Disease State for ", gds_id),
+          x = "Disease State", 
+          y = "Expression Value"
+        ) +
+        theme_minimal() +
+        theme(
+          plot.title = element_text(size = 18, face = "bold", hjust = 0.5),  
+          axis.title.x = element_text(size = 16),  
+          axis.title.y = element_text(size = 16),  
+          axis.text.x = element_text(size = 15),  
+          axis.text.y = element_text(size = 15),
+          legend.position = "none"  # Remove the legend
+        )
+      
+      # Save the plot
+      ggsave(
+        file.path(gds_output_dir, paste0(gds_id, "_expression_by_disease_state.png")), 
+        plot = violin_disease, 
+        width = 14, 
+        height = 8, 
+        bg = 'white'
+      )
+    }
+    
+  
+  
+  # Boxplot of expression values
+  boxplot_expression <- ggplot(expr_values, aes(x=gds_id, y = value)) +
+    geom_boxplot() +
+    labs(title = paste0("Distribution of Expression Values for ", gds_id),x='',
+         y = "Expression Value") +
+    theme_minimal()+
+    theme(
+      plot.title = element_text(size = 18, face = "bold", hjust = 0.5),  
+      axis.title.x = element_text(size = 16),  
+      axis.title.y = element_text(size = 16),  
+      axis.text.x = element_text(size = 15),  
+      axis.text.y = element_text(size = 15)   
+    )
+  
+  ggsave(file.path(gds_output_dir, paste0(gds_id, "_expression_values_boxplot.png")), plot = boxplot_expression, width = 14, height = 8, bg = "white")
+  
+  
+  # Boxplot of normalized expression values
+  boxplot_norm_expression <- ggplot(normalized_df, aes(x=gds_id, y = value)) +
+    geom_boxplot() +
+    labs(title = paste0("Distribution of Normalized Expression Values for ", gds_id), x='',
+         y = "Expression Value") +
+    theme_minimal()+
+    theme(
+      plot.title = element_text(size = 18, face = "bold", hjust = 0.5),  
+      axis.title.x = element_text(size = 16),  
+      axis.title.y = element_text(size = 16),  
+      axis.text.x = element_text(size = 15),  
+      axis.text.y = element_text(size = 15)   
+    )
+  
+  ggsave(file.path(gds_output_dir, paste0(gds_id, "_norm_expression_values_boxplot.png")), plot = boxplot_norm_expression, width = 14, height = 8, bg = "white")
+  
+  
+  }
+}
+
+
+
+plot_expr_values <- ggplot(all_expr_values, aes(x = as.factor(gds_id), y = value)) +
+  geom_boxplot() +
+  labs(title = "Distribution of Expression Values Before Normalization",
+       x = "GDS ID", y = "Expression Value") +
+  theme_minimal()+
+  theme(
+    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),  
+    axis.title.x = element_text(size = 14),  
+    axis.title.y = element_text(size = 14),  
+    axis.text.x = element_text(size = 12),  
+    axis.text.y = element_text(size = 12)   
+  )
+
+ggsave(file.path(output_dir, "boxplot_expression_values_prenormalization.png"), plot = plot_expr_values, width = 14, height = 8, bg = 'white')
+
+# Graficar caja y bigotes para valores después de la normalización
+plot_normalized_values <- ggplot(all_normalized_values, aes(x = as.factor(gds_id), y = value)) +
+  geom_boxplot() +
+  labs(title = "Distribution of Normalized Expression Values",
+       x = "GDS ID", y = "Expression Value") +
+  theme_minimal()+
+  theme(
+    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),  
+    axis.title.x = element_text(size = 14),  
+    axis.title.y = element_text(size = 14),  
+    axis.text.x = element_text(size = 12),  
+    axis.text.y = element_text(size = 12)   
+  )
+
+ggsave(file.path(output_dir, "boxplot_expression_values_postnormalization.png"), plot = plot_normalized_values, width = 14, height = 8, bg = 'white')
+
+
+
+
+
+
+
+
+
+
+#combined_plot <- grid.arrange(plot_expr_values, plot_normalized_values, nrow = 2)
+#ggsave(file.path(output_dir, "boxplot_expression_values_combined.png"), plot = combined_plot, width = 14, height = 8, bg = 'white')
+
--- a/data_analysis/scripts/differential_expresssion_analysis.R
+++ b/data_analysis/scripts/differential_expresssion_analysis.R
+#install.packages("DBI")
+#install.packages("RMySQL")
+library(DBI)
+library(RMySQL)
+library(limma)
+library(ggplot2)
+library(pheatmap)
+library(dplyr)
+library(reshape2)
+library(grid)
+library(gtable)
+#dbDisconnect(con)
+con <- dbConnect(MySQL(), 
+                 dbname = "disnet_biolayer", 
+                 host = "ares.ctb.upm.es", 
+                 port = 30604, 
+                 user = "gene_expression", 
+                 password = "gene_expression")
+
+
+
+graphs_dir <- "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise_graphs"
+gds_ids <- dbGetQuery(con, "SELECT DISTINCT gds_id FROM expr_processed_annot WHERE flag = 1")
+
+
+
+differential_expression_analysis <- function(gds_id, con) {
+  #extraction of data from expr_values table
+  query_expr_values <- paste0("SELECT gsm_id, id_ref, gene_symbol, value FROM expr_values WHERE gds_id = '", gds_id, "'")
+  expr_values <- dbGetQuery(con, query_expr_values,  columns = c("gsm_id" = "character", 
+                                                                 "id_ref" = "character", 
+                                                                 "gene_symbol" = "character", 
+                                                                 "value" = "numeric"))
+  
+  
+  
+  
+  if (nrow(expr_values) == 0) {
+    warning(paste("No data found for gds_id:", gds_id))
+    return(NULL)
+  }
+  
+  if (!is.numeric(expr_values$value)) {
+    expr_values$value <- as.numeric(expr_values$value)
+  }
+  
+  
+  #extraction of data from expr_processed_annot table with flag = 1
+  query_expr_processed_annot <- paste0("SELECT gsm_id, disease_state FROM expr_processed_annot WHERE gds_id = '", gds_id, "' AND flag = 1")
+  expr_processed_annot <- dbGetQuery(con, query_expr_processed_annot)
+  
+  if (nrow(expr_processed_annot) == 0) {
+    warning(paste("No data found in expr_processed_annot for gds_id:", gds_id))
+    return(NULL)
+  }
+  
+  #ensure there are enough disease state levels
+  if (length(unique(expr_processed_annot$disease_state)) < 2) {
+    warning(paste("Not enough disease state levels for gds_id:", gds_id))
+    return(NULL)
+  }
+  
+  #to keep gene_symbol for reference at the end
+  gene_symbols <- unique(expr_values[, c("id_ref", "gene_symbol")])
+  
+  #creation of the expression matrix without gene_symbol
+  expression <- reshape2::dcast(expr_values, id_ref ~ gsm_id, value.var = "value", fun.aggregate = mean)
+  rownames(expression) <- expression$id_ref
+  expression <- expression[, -1]  # Remove the id_ref column
+  
+  if (ncol(expression) == 0) {
+    warning(paste("No expression data available for gds_id:", gds_id))
+    return(NULL)
+  }
+  
+  #normalize expression matrix
+  exprs_matrix <- as.matrix(expression)
+  exprs_normalized <- normalizeBetweenArrays(exprs_matrix, method = 'quantile')
+  
+  #create design matrix
+  target <- expr_processed_annot
+  target$disease_state <- factor(target$disease_state)
+  
+  if (length(levels(target$disease_state)) < 2) {
+    warning(paste("Not enough disease state levels for gds_id:", gds_id))
+    return(NULL)
+  }
+  
+  design <- model.matrix(~ 0 + disease_state, data = target)
+  colnames(design) <- levels(target$disease_state)
+  
+  # Fit linear model
+  fit <- lmFit(exprs_normalized, design)
+  
+  #to handle NA coefficients by excluding probes with missing data
+  na_probes <- rownames(fit$coefficients)[apply(is.na(fit$coefficients), 1, any)]
+  if (length(na_probes) > 0) {
+    warning(paste("Partial NA coefficients for", length(na_probes), "probe(s). These will be excluded from the analysis."))
+    fit <- fit[-match(na_probes, rownames(fit$coefficients)), ]
+  }
+  
+  #create and apply contrasts
+  contrast_matrix <- makeContrasts(
+    control_vs_disease = c - d, 
+    levels = design
+  )
+  
+  fit2 <- contrasts.fit(fit, contrast_matrix)
+  fit2 <- eBayes(fit2)
+  
+  #handle zero sample variances by offsetting or excluding
+  zero_variances <- fit2$sigma == 0
+  if (any(zero_variances)) {
+    warning("Zero sample variances detected, have been offset away from zero or excluded")
+    fit2$sigma[zero_variances] <- min(fit2$sigma[!zero_variances]) * 1e-6
+  }
+  
+  # results
+  results <- topTable(fit2, adjust = "fdr", number = Inf)
+  results$significant <- ifelse(results$adj.P.Val < 0.05 & abs(results$logFC) > 1, 
+                                ifelse(results$logFC > 0, "o", "u"),
+                                "n")
+  
+  #check if results contain any significant genes
+  if (nrow(results) == 0) {
+    warning(paste("No significant results for gds_id:", gds_id))
+    return(NULL)
+  }
+  
+  #add gene symbols back to results to have a reference
+  results$id_ref <- rownames(results)
+  results <- merge(results, gene_symbols, by = "id_ref", all.x = TRUE)
+  
+  #add gds_id column to know from which dataset comes from 
+  results$gds_id <- gds_id
+  results <- results[, c("gds_id", "id_ref", "gene_symbol", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B", "significant")]
+  return(list(results = results, gene_symbols=gene_symbols, exprs_normalized = exprs_normalized,target=target))
+}
+
+
+
+# Function: individual_expression_analysis
+# Description: Performs individual expression analysis by calculating z-scores for each gene in each sample, 
+#              based on normalized gene expression data. It identifies significant gene expression changes 
+#              in individual samples compared to a control group.
+# Inputs:
+#   - gds_id: Identifier for the gene expression dataset.
+#   - target: Data frame containing sample metadata, including the disease state (control or diseased).
+#   - exprs_normalized: Matrix of normalized gene expression data, where rows represent genes and columns 
+#                        represent samples.
+#   - gene_symbols: Data frame containing gene symbols or identifiers corresponding to the rows of exprs_normalized.
+#   - output_dir: Directory path where the output results will be stored.
+# Outputs:
+#   - A data frame containing the following columns:
+#     - gds_id: Identifier for the gene expression dataset.
+#     - gsm_id: Identifier for the individual sample.
+#     - id_ref: Identifier for the gene with significant expression changes in the sample.
+#     - gene_symbol: Symbol or identifier for the gene with significant expression changes.
+#     - z_score: Z-score indicating the magnitude and direction of expression change for the gene in the sample.
+#   - If no individual significant results are found for a given gds_id, the function issues a warning message indicating this outcome.
+
+individual_expression_analysis <- function(gds_id, target, exprs_normalized, gene_symbols, output_dir) {
+  if (!is.matrix(exprs_normalized) || ncol(exprs_normalized) < 2 || nrow(exprs_normalized) < 2) {
+    warning(paste("exprs_normalized does not have at least dos dimensiones para gds_id:", gds_id))
+    return(NULL)
+  }
+  
+  if (sum(target$disease_state == 'c') == 0) {
+    warning("No control samples found.")
+    return(NULL)
+  }
+  
+  gsm_control <- subset(target, disease_state == 'c')
+  gsm_control_ids <- gsm_control$gsm_id
+  
+  exprs_control <- exprs_normalized[, gsm_control_ids]
+  
+  gene_means_control <- rowMeans(exprs_control, na.rm = TRUE)
+  gene_sds_control <- apply(exprs_control, 1, sd, na.rm = TRUE)
+  
+  z <- sweep(exprs_normalized, 1, gene_means_control, "-")  
+  z <- sweep(z, 1, gene_sds_control, "/")
+  
+  cutoff <- 2.5
+  
+  results_list <- list()
+  for (i in 1:ncol(exprs_normalized)) {
+    df <- data.frame(
+      gsm_id = colnames(exprs_normalized)[i],
+      id_ref = rownames(exprs_normalized),
+      z_score = z[, i]
+    )
+    df$significant <- ifelse(abs(df$z_score) > cutoff, 
+                             ifelse(df$z_score > 0, "o", "u"), 
+                             "n")
+    results_list[[length(results_list) + 1]] <- df
+  }
+  
+  individual_results <- do.call(rbind, results_list)
+  individual_results <- merge(individual_results, gene_symbols, by = "id_ref", all.x = TRUE)
+  individual_results$gds_id <- gds_id
+  individual_results <- individual_results[, c("gds_id", "gsm_id", "id_ref", "gene_symbol", "z_score", "significant")]
+  
+  return(individual_results)
+}
+
+
+
+
+
+randomized_model <- function(exprs_normalized, target) {
+  set.seed(123)  # Para la reproducibilidad
+  G <- nrow(exprs_normalized)  # Número de genes
+  n <- ncol(exprs_normalized)  # Número de muestras
+  
+  num_simulations <- 1000
+  z_cutoff <- 2.5
+  max_subjects_per_gene_random <- numeric(num_simulations)
+  
+  for (sim in 1:num_simulations) {
+    # Generar una matriz de Z-scores aleatorizados
+    random_z_scores <- matrix(rnorm(G * n), nrow = G, ncol = n)
+    
+    # Contar el número de sujetos con Z-score mayor que el umbral en cada gen
+    diseased_subjects <- which(target$disease_state == 'd')
+    counts_per_gene <- rowSums(random_z_scores[, diseased_subjects] > z_cutoff)
+    
+    # Almacenar el máximo número de sujetos por gen en esta simulación
+    max_subjects_per_gene_random[sim] <- max(counts_per_gene)
+  }
+  
+  # Determinar el umbral X (percentil 95 de las simulaciones)
+  X_rand <- quantile(max_subjects_per_gene_random, 0.95)
+  return(X_rand)
+}
+
+
+
+
+
+
+
+# This function generates a volcano plot from differential expression results and saves it as a PNG file.
+# 
+# Args:
+#   results: A data frame containing the results of differential expression analysis with columns for 
+#            log fold change (logFC), p-values (P.Value), and significance status (significant).
+#   gds_id: A string representing the GDS ID, which will be used in the plot title and filename.
+#   graphs_dir: A string representing the directory where the plot will be saved.
+# 
+# Output:
+#   A volcano plot is saved as a PNG file in the specified directory.
+
+create_volcano_plot <- function(results, gds_id, graphs_dir) {
+  
+  # Map the significant values to corresponding labels for better readability in the legend
+  results$significant_label <- factor(results$significant, levels = c("n", "o", "u"),
+                                      labels = c("not significant", "overexpressed", "underexpressed"))
+  
+  # Create the volcano plot using ggplot2
+  volcano_plot <- ggplot(results, aes(x = logFC, y = -log10(P.Value), color = significant_label)) +
+    geom_point() +  # Plot the points
+    theme_minimal() +  # Use a minimal theme for the plot
+    labs(
+      title = paste("Volcano Plot for", gds_id),  # Title of the plot
+      x = "Log Fold Change",  # X-axis label
+      y = "-log10 adj-p-value",  # Y-axis label
+      color = "Differential Expression"  # Legend title
+    ) +
+    scale_color_manual(
+      values = c("not significant" = "grey", "overexpressed" = "red", "underexpressed" = "blue")  # Custom colors
+    ) +
+    theme(
+      plot.title = element_text(size = 16, face = "bold", hjust = 0.5),  # Customize plot title
+      axis.title.x = element_text(size = 14),  # Customize X-axis title
+      axis.title.y = element_text(size = 14),  # Customize Y-axis title
+      axis.text.x = element_text(size = 12),  # Customize X-axis text
+      axis.text.y = element_text(size = 12),  # Customize Y-axis text
+      legend.position = "right",  # Position the legend on the right
+      legend.justification = "center",  # Center the legend
+      legend.title = element_text(size = 14),  # Customize legend title
+      legend.text = element_text(size = 12),  # Customize legend text
+      legend.box.background = element_rect(color = "black", linewidth = 0.5, linetype = "solid"),  # Legend box background
+      legend.box.margin = margin(5, 5, 5, 5),  # Margin around legend box
+      legend.box.spacing = unit(0.5, "cm"),  # Spacing within legend box
+      legend.direction = "vertical"  # Legend direction
+    )
+  
+  # Save the plot as a PNG file
+  ggsave(file.path(graphs_dir, paste0("volcano_plot_", gds_id, ".png")), plot = volcano_plot, bg = 'white')
+}
+
+
+create_heatmap <- function(exprs_normalized, results, gds_id, graphs_dir, target) {
+  # Seleccionar los 50 genes más significativos
+  top_genes <- results[order(results$adj.P.Val), ][1:50, ]
+  top_gene_names <- top_genes$id_ref
+  
+  if (length(top_gene_names) < 50) {
+    warning(paste("Less than 50 significant genes for gds_id:", gds_id))
+  }
+  
+  exprs_top <- exprs_normalized[top_gene_names, ]
+  
+  if (any(!is.finite(as.matrix(exprs_top)))) {
+    warning(paste("Non-finite values detected in the top expression data for gds_id:", gds_id))
+    return(NULL)
+  }
+  
+  if (!is.factor(target$disease_state)) {
+    target$disease_state <- factor(target$disease_state)
+  }
+  
+  annotation_col <- data.frame(DiseaseState = target$disease_state)
+  rownames(annotation_col) <- target$gsm_id
+  annotation_colors <- list(DiseaseState = c("c" = "white", "d" = "grey"))
+  
+  heatmap <- pheatmap(
+    exprs_top,
+    scale = "row",
+    clustering_distance_rows = "correlation",
+    clustering_distance_cols = "correlation",
+    main = paste("Heatmap of top 50 genes present in the most GSMs for", gds_id),
+    fontsize_row = 8,
+    fontsize_col = 4,
+    cluster_cols = FALSE, 
+    annotation_col = annotation_col,
+    annotation_colors = annotation_colors
+  )
+  
+  
+  ggsave(file.path(graphs_dir, paste0("heatmap_", gds_id, ".png")), plot = heatmap)
+}
+
+
+
+
+
+all_results <- list()
+all_individual_results <- list()
+summary_results <- list()
+for (gds_id in gds_ids$gds_id) {
+  de_analysis <- differential_expression_analysis(gds_id, con)
+
+  
+  if (!is.null(de_analysis)) {
+    create_volcano_plot(de_analysis$results, gds_id, graphs_dir)
+    #create_heatmap(de_analysis$exprs_normalized, de_analysis$results, gds_id, graphs_dir, de_analysis$target)
+    all_results[[length(all_results) + 1]] <- de_analysis$results
+  }
+  
+  
+  individual_results <- individual_expression_analysis(gds_id, de_analysis$target, de_analysis$exprs_normalized, de_analysis$gene_symbols, output_dir)
+  if (is.null(individual_results)) next
+  
+  #DEG only in diseased individuals
+  filtered_ind_results <- individual_results %>%
+    filter(significant != "n") %>%
+    inner_join(de_analysis$target %>% filter(disease_state == "d"), by = "gsm_id")
+  
+  #define X with the random model
+  X <- randomized_model(de_analysis$exprs_normalized, de_analysis$target)
+  
+  #calculate the number of individuals with DEG
+  gene_counts <- filtered_ind_results %>%
+    group_by(gene_symbol) %>%
+    summarize(num_subjects = n_distinct(gsm_id))
+  
+  #filter the genes that are DEG in a number of individuals > x
+  genes_above_threshold <- gene_counts %>%
+    filter(num_subjects >= X)
+  
+  # get results
+  final_results <- filtered_ind_results %>%
+    filter(gene_symbol %in% genes_above_threshold$gene_symbol)
+  
+  
+  summary_results[[length(summary_results) + 1]] <- data.frame(
+    gds_id = gds_id,
+    gsm_min = X,
+    sig_gene_count = nrow(genes_above_threshold)
+  )
+  
+  all_results[[length(all_results) + 1]] <- de_analysis$results
+  all_individual_results[[length(all_individual_results) + 1]] <- final_results
+}
+
+
+
+
+
+
+
+
+
+
+
+if (length(all_results) > 0) {
+  combined_results <- do.call(rbind, all_results)
+  colnames(combined_results)[colnames(combined_results) == "adj.P.Val"] <- "adj_p_val"
+  colnames(combined_results)[colnames(combined_results) == "P.Value"] <- "p_val"
+  combined_results_round <- combined_results %>% mutate(across(where(is.numeric), \(x) round(x, 4)))
+  
+  significant_results <- subset(combined_results_round, significant != 'n')
+ # write.csv(significant_results, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise/significant_results.csv", row.names = FALSE)
+  # write.csv(combined_results_round, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise/combined_results.csv", row.names = FALSE)
+}
+
+
+
+
+#upload group wise results to MySQL
+#dbWriteTable(con, "expr_dea_groupwise", combined_results_round, append = TRUE, row.names = FALSE)
+
+
+if (length(all_individual_results) > 0) {
+  combined_individual_results <- do.call(rbind, all_individual_results)
+  combined_individual_results_round <- combined_individual_results %>% 
+    mutate(across(where(is.numeric), \(x) round(x, 4)))
+  
+}
+
+
+individual_summary <- do.call(rbind, summary_results)
+#write.csv(individual_summary, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_perindividual/individual_summary.csv", row.names = FALSE)
+
+
+
+
+
+df_without_disease_state <- combined_individual_results_round %>% select(-disease_state)
+
+#write.csv(df_without_disease_state, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_perindividual/combined_individual_results.csv", row.names = FALSE)
+
+
+
+#dbWriteTable(con, "expr_dea_perindividual", df_without_disease_state, append = TRUE, row.names = FALSE)
+
+
+
+#Plot the adj p value of all the gds_id
+p <- ggplot(combined_results, aes(x = adj_p_val, color = as.factor(gds_id))) +
+  geom_histogram(binwidth = 0.01, fill = NA, position = "identity") +
+  labs(title = "Histogram of adjusted p-value for each GDS",
+       x = "Adjusted p-value",
+       y = "Frequency") +
+  theme_minimal() +
+  xlim(0, 1.3) +
+  scale_color_discrete(name = "GDS ID")
+
+ggsave(file.path(graphs_dir, "adj_p_val_distribution_all_gds_ids.png"), plot = p, bg = 'white')
+
+
+
+#initialize an empty list to store all results
+all_results <- list()
+all_individual_results <- list()
+
+for (gds_id in gds_ids$gds_id) {
+  de_analysis <- differential_expression_analysis(gds_id, con)
+  individual_results <- individual_expression_analysis(gds_id, de_analysis$target,de_analysis$exprs_normalized,de_analysis$gene_symbols, output_dir)
+  # disease_associated_genes <- randomized_model(gds_id, de_analysis$exprs_normalized, de_analysis$target)
+  if (!is.null(individual_results)) {
+    all_individual_results[[length(all_individual_results) + 1]] <- individual_results
+  }
+  if (!is.null(de_analysis)) {
+    # create_volcano_plot(de_analysis$results, gds_id, graphs_dir)
+    #create_heatmap(de_analysis$exprs_normalized, de_analysis$results, gds_id, graphs_dir, de_analysis$target)
+    all_results[[length(all_results) + 1]] <- de_analysis$results
+  }
+}
+