Commit a2cdcd08 authored by Laura Masa's avatar Laura Masa

Uploading data_analysis correctly

parent 93989e08
cui disease_name gene_id drug_id symptom_id Genes in LCC
C0001206 Acromegaly 178.0 3.0 18.0 38.0
C0001973 Alcoholic Intoxication, Chronic 1008.0 22.0 34.0 249.0
C0002395 Alzheimer's Disease 5736.0 38.0 7.0 2697.0
C0002622 Amnesia 200.0 41.0 14.0 74.0
C0002736 Amyotrophic Lateral Sclerosis 1791.0 11.0 23.0 847.0
C0003467 Anxiety 1335.0 0.0 16.0 684.0
C0003537 Aphasia 94.0 3.0 8.0 27.0
C0004135 Ataxia Telangiectasia 481.0 0.0 24.0 235.0
C0004352 Autistic Disorder 1710.0 35.0 5.0 706.0
C0005586 Bipolar Disorder 2040.0 78.0 12.0 727.0
C0007282 Carotid Stenosis 163.0 2.0 3.0 23.0
C0007766 Intracranial Aneurysm 399.0 5.0 13.0 124.0
C0007785 Cerebral Infarction 891.0 15.0 32.0 444.0
C0007786 Brain Ischemia 449.0 48.0 4.0 236.0
C0007787 Transient Ischemic Attack 410.0 17.0 14.0 233.0
C0007789 Cerebral Palsy 284.0 5.0 31.0 81.0
C0007959 Charcot-Marie-Tooth Disease 349.0 1.0 19.0 96.0
C0009171 Cocaine Abuse 182.0 97.0 16.0 57.0
C0009207 Cockayne Syndrome 133.0 0.0 7.0 46.0
C0009952 Febrile Convulsions 267.0 8.0 9.0 30.0
C0010964 Dandy-Walker Syndrome 155.0 0.0 9.0 53.0
C0011195 Dejerine-Sottas Disease (disorder) 235.0 0.0 22.0 60.0
C0011206 Delirium 152.0 23.0 18.0 15.0
C0011265 Presenile dementia 944.0 38.0 23.0 467.0
C0011269 Dementia, Vascular 268.0 2.0 26.0 103.0
C0011570 Mental Depression 2208.0 0.0 7.0 1047.0
C0011581 Depressive disorder 2544.0 112.0 5.0 1266.0
C0011633 Dermatomyositis 296.0 8.0 18.0 80.0
C0013080 Down Syndrome 1066.0 1.0 3.0 511.0
C0013264 Muscular Dystrophy, Duchenne 467.0 3.0 10.0 196.0
C0013362 Dysarthria 531.0 7.0 10.0 260.0
C0013384 Dyskinetic syndrome 400.0 31.0 26.0 145.0
C0014038 Encephalitis 406.0 8.0 37.0 148.0
C0014057 Japanese Encephalitis 142.0 0.0 21.0 50.0
C0014060 Encephalitis, St. Louis 330.0 0.0 13.0 124.0
C0014065 Congenital cerebral hernia 101.0 0.0 4.0 43.0
C0014070 Encephalomyelitis 1062.0 0.0 32.0 578.0
C0014544 Epilepsy 1727.0 48.0 19.0 828.0
C0014553 Absence Epilepsy 131.0 15.0 11.0 28.0
C0014556 Epilepsy, Temporal Lobe 470.0 11.0 14.0 181.0
C0015469 Facial paralysis 186.0 1.0 12.0 25.0
C0016667 Fragile X Syndrome 255.0 1.0 6.0 86.0
C0016719 Friedreich Ataxia 111.0 4.0 12.0 22.0
C0017205 Gaucher Disease 183.0 2.0 17.0 35.0
C0017921 Glycogen storage disease type II 243.0 0.0 12.0 44.0
C0018378 Guillain-Barre Syndrome 194.0 0.0 44.0 58.0
C0018524 Hallucinations 192.0 30.0 10.0 83.0
C0018784 Sensorineural Hearing Loss (disorder) 989.0 6.0 5.0 432.0
C0019202 Hepatolenticular Degeneration 200.0 9.0 21.0 43.0
C0019562 Von Hippel-Lindau Syndrome 240.0 0.0 4.0 91.0
C0020179 Huntington Disease 1345.0 11.0 16.0 677.0
C0020255 Hydrocephalus 555.0 3.0 15.0 267.0
C0022336 Creutzfeldt-Jakob disease 180.0 0.0 24.0 53.0
C0023264 Leigh Disease 285.0 0.0 18.0 70.0
C0023524 Leukoencephalopathy, Progressive Multifocal 263.0 1.0 6.0 152.0
C0024266 Lymphocytic Choriomeningitis 159.0 0.0 22.0 47.0
C0024408 Machado-Joseph Disease 171.0 0.0 14.0 66.0
C0024809 Marijuana Abuse 209.0 0.0 13.0 52.0
C0024814 Marinesco-Sjogren syndrome 167.0 1.0 15.0 62.0
C0025286 Meningioma 929.0 2.0 14.0 460.0
C0025289 Meningitis 225.0 20.0 42.0 79.0
C0025958 Microcephaly 1360.0 0.0 10.0 824.0
C0026106 Mild Mental Retardation 374.0 0.0 2.0 120.0
C0026654 Moyamoya Disease 151.0 2.0 13.0 16.0
C0026769 Multiple Sclerosis 2878.0 21.0 28.0 1284.0
C0026847 Spinal Muscular Atrophy 440.0 0.0 17.0 194.0
C0026850 Muscular Dystrophy 453.0 1.0 33.0 124.0
C0026896 Myasthenia Gravis 447.0 17.0 11.0 133.0
C0027121 Myositis 301.0 4.0 24.0 109.0
C0027126 Myotonic Dystrophy 204.0 3.0 8.0 72.0
C0027809 Neurilemmoma 259.0 2.0 13.0 120.0
C0027830 neurofibroma 157.0 0.0 12.0 65.0
C0027831 Neurofibromatosis 1 433.0 0.0 15.0 183.0
C0027832 Neurofibromatosis 2 160.0 0.0 12.0 82.0
C0027859 Acoustic Neuroma 159.0 0.0 11.0 74.0
C0027873 Neuromyelitis Optica 214.0 1.0 25.0 30.0
C0028043 Nicotine Dependence 227.0 13.0 8.0 38.0
C0028077 Night Blindness 201.0 1.0 15.0 16.0
C0028738 Nystagmus 923.0 10.0 12.0 551.0
C0029124 Optic Atrophy 648.0 0.0 15.0 335.0
C0030567 Parkinson Disease 3240.0 47.0 32.0 1582.0
C0032000 Pituitary Adenoma 211.0 7.0 15.0 88.0
C0033375 Prolactinoma 243.0 5.0 11.0 87.0
C0035258 Restless Legs Syndrome 174.0 17.0 28.0 28.0
C0035372 Rett Syndrome 299.0 0.0 25.0 105.0
C0036341 Schizophrenia 5398.0 71.0 18.0 2171.0
C0037773 Spastic Paraplegia, Hereditary 209.0 0.0 7.0 41.0
C0038220 Status Epilepticus 664.0 41.0 23.0 294.0
C0038379 Strabismus 771.0 1.0 4.0 501.0
C0038436 Post-Traumatic Stress Disorder 528.0 23.0 8.0 200.0
C0038525 Subarachnoid Hemorrhage 659.0 14.0 14.0 342.0
C0038868 Progressive supranuclear palsy 237.0 1.0 19.0 92.0
C0039483 Giant Cell Arteritis 327.0 5.0 15.0 123.0
C0040517 Gilles de la Tourette syndrome 232.0 17.0 11.0 37.0
C0041341 Tuberous Sclerosis 378.0 4.0 18.0 159.0
C0042170 Uveomeningoencephalitic Syndrome 124.0 5.0 14.0 15.0
C0043124 West Nile Fever 89.0 1.0 25.0 18.0
C0043459 Zellweger Syndrome 81.0 0.0 5.0 22.0
C0080178 Spina Bifida 233.0 1.0 12.0 59.0
C0085084 Motor Neuron Disease 270.0 3.0 35.0 118.0
C0085655 Polymyositis 226.0 7.0 10.0 41.0
C0085762 Alcohol abuse 226.0 22.0 29.0 47.0
C0086769 Panic Attacks 63.0 27.0 23.0 20.0
C0149940 Sciatic Neuropathy 121.0 2.0 7.0 44.0
C0151311 Cranial nerve palsies 81.0 2.0 29.0 26.0
C0151740 Intracranial Hypertension 72.0 12.0 14.0 27.0
C0152020 Gastroparesis 102.0 4.0 24.0 17.0
C0152025 Polyneuropathy 192.0 6.0 14.0 45.0
C0153633 Malignant neoplasm of brain 330.0 22.0 36.0 161.0
C0162309 Adrenoleukodystrophy 368.0 0.0 12.0 119.0
C0162635 Angelman Syndrome 149.0 0.0 17.0 44.0
C0162666 Mitochondrial Encephalomyopathies 60.0 1.0 15.0 19.0
C0175754 Agenesis of corpus callosum 767.0 0.0 3.0 417.0
C0206728 Plexiform Neurofibroma 56.0 1.0 13.0 20.0
C0220756 Niemann-Pick Disease, Type C 230.0 1.0 19.0 119.0
C0221056 Adult type dermatomyositis 256.0 8.0 18.0 67.0
C0221406 Pituitary-dependent Cushing's disease 164.0 3.0 11.0 37.0
C0234144 Dysgraphia 43.0 0.0 9.0 22.0
C0236642 Pick Disease of the Brain 289.0 1.0 9.0 137.0
C0238190 Inclusion Body Myositis (disorder) 101.0 1.0 11.0 35.0
C0238288 Muscular Dystrophy, Facioscapulohumeral 187.0 0.0 14.0 48.0
C0242350 Erectile dysfunction 322.0 35.0 2.0 72.0
C0265219 Miller Dieker syndrome 221.0 0.0 20.0 99.0
C0266463 Lissencephaly 98.0 0.0 15.0 30.0
C0266464 Polymicrogyria 239.0 0.0 2.0 109.0
C0266483 Pachygyria 156.0 0.0 15.0 39.0
C0270824 Visual seizure 235.0 216.0 25.0 53.0
C0270972 Cornelia De Lange Syndrome 78.0 0.0 10.0 22.0
C0271270 Oculovestibuloauditory syndrome 95.0 0.0 13.0 33.0
C0276226 Herpes encephalitis 68.0 2.0 2.0 16.0
C0276496 Familial Alzheimer Disease (FAD) 336.0 38.0 7.0 165.0
C0282527 Infantile Refsum Disease (disorder) 37.0 0.0 2.0 15.0
C0338451 Frontotemporal dementia 464.0 4.0 9.0 203.0
C0338508 Optic Atrophy, Autosomal Dominant 161.0 0.0 16.0 59.0
C0349204 Nonorganic psychosis 528.0 0.0 9.0 184.0
C0410189 Muscular Dystrophy, Emery-Dreifuss 76.0 0.0 7.0 15.0
C0410207 Tubular Aggregate Myopathy 78.0 0.0 14.0 31.0
C0431380 Cortical Dysplasia 139.0 2.0 6.0 31.0
C0494463 Alzheimer Disease, Late Onset 529.0 38.0 7.0 241.0
C0496899 Benign neoplasm of brain, unspecified 42.0 22.0 23.0 18.0
C0497327 Dementia 1153.0 16.0 28.0 522.0
C0520679 Sleep Apnea, Obstructive 610.0 4.0 26.0 249.0
C0543859 Amyotrophic Lateral Sclerosis, Guam Form 40.0 11.0 26.0 19.0
C0546126 Acute Confusional Senile Dementia 100.0 38.0 7.0 48.0
C0577631 Carotid Atherosclerosis 263.0 2.0 9.0 75.0
C0600427 Cocaine Dependence 300.0 97.0 16.0 65.0
C0740391 Middle Cerebral Artery Occlusion 766.0 15.0 7.0 404.0
C0740392 Infarction, Middle Cerebral Artery 160.0 15.0 7.0 62.0
C0750900 Alzheimer's Disease, Focal Onset 100.0 38.0 7.0 48.0
C0750901 Alzheimer Disease, Early Onset 207.0 38.0 7.0 95.0
C0750974 Brain Tumor, Primary 137.0 22.0 16.0 64.0
C0750977 Recurrent Brain Neoplasm 39.0 22.0 16.0 18.0
C0750979 Primary malignant neoplasm of brain 42.0 22.0 16.0 20.0
C0751265 Learning Disabilities 114.0 38.0 7.0 25.0
C0751587 CADASIL Syndrome 53.0 0.0 7.0 16.0
C0751690 Malignant Peripheral Nerve Sheath Tumor 332.0 0.0 9.0 172.0
C0751713 Inclusion Body Myopathy, Sporadic 93.0 1.0 11.0 38.0
C0751772 REM Sleep Behavior Disorder 60.0 4.0 13.0 17.0
C0751781 Dentatorubral-Pallidoluysian Atrophy 123.0 3.0 18.0 37.0
C0751967 Multiple Sclerosis, Relapsing-Remitting 249.0 7.0 3.0 61.0
C0752120 Spinocerebellar Ataxia Type 1 126.0 2.0 13.0 33.0
C0752125 Spinocerebellar Ataxia Type 7 94.0 2.0 13.0 28.0
C0752166 Bardet-Biedl Syndrome 176.0 0.0 10.0 27.0
C0752304 Hypoxic-Ischemic Encephalopathy 197.0 2.0 1.0 62.0
C0752347 Lewy Body Disease 335.0 3.0 32.0 145.0
C0917798 Cerebral Ischemia 121.0 48.0 4.0 49.0
C0917816 Mental deficiency 150.0 2.0 1.0 27.0
C1263846 Attention deficit hyperactivity disorder 1084.0 30.0 14.0 484.0
C1269683 Major Depressive Disorder 1814.0 56.0 19.0 775.0
C1306214 ACTH-Secreting Pituitary Adenoma 88.0 0.0 3.0 23.0
C1510586 Autism Spectrum Disorders 1478.0 0.0 9.0 699.0
C1839259 Bulbo-Spinal Atrophy, X-Linked 144.0 0.0 13.0 70.0
C1868675 PARKINSON DISEASE 2, AUTOSOMAL RECESSIVE JUVENILE 89.0 37.0 2.0 27.0
C1955869 Malformations of Cortical Development 80.0 2.0 6.0 26.0
C2931689 Dystrophia myotonica 2 144.0 3.0 18.0 21.0
C3658299 Zellweger Spectrum 35.0 0.0 5.0 17.0
This source diff could not be displayed because it is too large. You can view the blob instead.
"gds_id","id_ref","gene_symbol","logFC","AveExpr","t","p_val","adj_p_val","B","significant"
"GDS2519","207205_at","CEACAM4",-43.2474,167.0559,-5.0149,0,0.0486,1.8086,"u"
"GDS4128","1554830_a_at","STEAP3",-80.5503,22.9122,-8.5324,0,0.0012,-4.2128,"u"
"GDS4128","1555666_at","PTPRS",-37.5894,15.1636,-5.9976,0,0.032,-4.2814,"u"
"GDS4128","1558561_at","HM13",-96.467,40.089,-5.6353,0,0.0383,-4.2958,"u"
"GDS4128","1558778_s_at","MKL2",-45.8069,12.5022,-5.6737,0,0.0373,-4.2942,"u"
"GDS4128","1561451_a_at","LOC101928551",-93.1816,24.9414,-9.7082,0,4e-04,-4.1937,"u"
"GDS4128","1563255_at","FAM170B-AS1",-114.2319,40.098,-5.843,0,0.0333,-4.2874,"u"
"GDS4128","1564679_at","ASB15",-112.9483,43.6432,-5.6903,0,0.037,-4.2935,"u"
"GDS4128","1567022_at","OR5AK4P",-62.9342,29.9196,-5.4696,0,0.0486,-4.3028,"u"
"GDS4128","1570114_at","MDGA2",-54.1825,17.6711,-6.1915,0,0.0252,-4.2743,"u"
"GDS4128","1570433_at","TMPRSS2",-57.0986,24.1373,-7.1464,0,0.0091,-4.2444,"u"
"GDS4128","204847_at","ZBTB11",-244.6604,290.0519,-5.7517,0,0.0346,-4.291,"u"
"GDS4128","206754_s_at","CYP2B7P",-124.7891,39.8167,-6.8397,0,0.0121,-4.2532,"u"
"GDS4128","207016_s_at","LOC101928635",-110.6628,34.8775,-5.8151,0,0.0333,-4.2885,"u"
"GDS4128","209044_x_at","SF3B4",-146.8653,66.8142,-5.8298,0,0.0333,-4.2879,"u"
"GDS4128","209176_at","SEC23IP",-59.0793,23.8628,-5.7662,0,0.0345,-4.2904,"u"
"GDS4128","212728_at","DLG3",-234.6568,102.6718,-5.9427,0,0.032,-4.2835,"u"
"GDS4128","213071_at","DPT",-70.2717,19.2877,-11.3337,0,1e-04,-4.1748,"u"
"GDS4128","214547_at","ADCY10",-72.8003,25.7941,-7.2419,0,0.0086,-4.2419,"u"
"GDS4128","216800_at","AK027069",-94.2253,35.3219,-8.9563,0,7e-04,-4.2053,"u"
"GDS4128","217425_at","MC2R",-37.4118,15.7909,-6.4641,0,0.0195,-4.265,"u"
"GDS4128","219722_s_at","GDPD3",-117.7731,50.0628,-6.0499,0,0.0317,-4.2795,"u"
"GDS4128","219876_s_at","GOLGA2P5",-38.2726,16.5466,-6.4346,0,0.0195,-4.2659,"u"
"GDS4128","220440_at","LGALS13",-81.4139,29.0418,-6.2259,0,0.0247,-4.2731,"u"
"GDS4128","220640_at","CSNK1G1",-139.926,47.6199,-5.6534,0,0.0379,-4.295,"u"
"GDS4128","222432_s_at","CCDC47",-350.3929,251.3091,-6.0249,0,0.0318,-4.2804,"u"
"GDS4128","225241_at","CCDC80",-56.5799,27.3373,-5.7703,0,0.0345,-4.2902,"u"
"GDS4128","225608_at","SNX29",-85.562,43.706,-5.8889,0,0.0327,-4.2856,"u"
"GDS4128","226561_at","AGFG1",-33.2307,10.1255,-5.5661,0,0.043,-4.2987,"u"
"GDS4128","228517_at","MEAF6",-53.4851,22.0799,-5.4509,0,0.0486,-4.3036,"u"
"GDS4128","228773_at","LOC100506100",86.1618,94.8674,5.5247,0,0.0457,-4.3004,"o"
"GDS4128","229133_s_at","ZNF397",-69.6686,23.8349,-6.5758,0,0.0171,-4.2613,"u"
"GDS4128","230463_at","BG054960",-42.1868,21.9201,-6.3753,0,0.0195,-4.2679,"u"
"GDS4128","230871_at","DHX30",-111.7965,47.467,-6.4034,0,0.0195,-4.267,"u"
"GDS4128","231256_at","LOC727944",-57.0748,21.0099,-7.0539,0,0.0097,-4.247,"u"
"GDS4128","232578_at","CLDN18",-42.0285,12.0969,-6.6498,0,0.0159,-4.259,"u"
"GDS4128","232939_at","AU152763",-97.6924,35.0915,-5.4828,0,0.0486,-4.3023,"u"
"GDS4128","234693_at","LOC105376689",-59.7809,28.2878,-5.4564,0,0.0486,-4.3034,"u"
"GDS4128","235751_s_at","VMO1",-46.5896,12.4801,-5.926,0,0.032,-4.2841,"u"
"GDS4128","237039_at","LOC100130502",-89.3097,26.8886,-9.4693,0,4e-04,-4.1971,"u"
"GDS4128","237042_at","LOC100507024",-52.9344,26.2389,-5.8718,0,0.0327,-4.2862,"u"
"GDS4128","238627_at","TRAPPC2L",-85.1377,42.3262,-6.8935,0,0.0119,-4.2516,"u"
"GDS4128","240835_at","BE671305",-91.7474,102.1718,-5.9747,0,0.032,-4.2823,"u"
"GDS4128","241155_at","AA704588",-64.6885,22.3755,-5.7152,0,0.0362,-4.2925,"u"
"GDS4128","244069_at","BE220584",-81.1783,21.7971,-14.7565,0,0,-4.1513,"u"
"GDS4128","244173_at","MIS18BP1",-51.5387,12.2448,-7.2964,0,0.0086,-4.2404,"u"
"GDS4128","244548_at","AI189587",-54.2703,21.0028,-5.9189,0,0.032,-4.2844,"u"
This source diff could not be displayed because it is too large. You can view the blob instead.
"gds_id","gsm_min","sig_gene_count"
"GDS2519",5,1917
"GDS2795",3,3121
"GDS4128",2,374
"GDS4136",4,7885
"GDS810",4,4159
"","mean","std","min","25%","50%","75%","max"
"1",174.880692883186,929.601049291251,0.312380952380952,15.6685714285714,50.3761904761905,118.235238095238,42470.7895238095
"","mean","std","min","25%","50%","75%","max"
"1",174.880692883186,929.601049291251,0.312380952380952,15.6685714285714,50.3761904761905,118.235238095238,42470.7895238095
"","mean","std","min","25%","50%","75%","max"
"1",174.880698998339,998.361671556367,0.1,15.7,50.2,118.6,158720
"","disease_state","freq"
"1","healthy control",22
"2","neurodegenerative disease control",33
"3","Parkinson's disease",50
"","mean","std","min","25%","50%","75%","max"
"1",215.442423566996,801.732437569486,0.29,21.64,71.9775,181.585,52550.39
"","mean","std","min","25%","50%","75%","max"
"1",215.442412079161,894.187411696145,0,21.2,71.2,183.9,145814
"","disease_state","freq"
"1","neurofibrillary tangle",10
"2","normal",10
"","mean","std","min","25%","50%","75%","max"
"1",196.828769593056,1067.9662198029,0.322222222222222,14.3722222222222,43.8388888888889,108.075,40597.2611111111
"","mean","std","min","25%","50%","75%","max"
"1",196.828769593056,1067.9662198029,0.322222222222222,14.3722222222222,43.8388888888889,108.075,40597.2611111111
"","mean","std","min","25%","50%","75%","max"
"1",196.828768203057,1118.2246457,0.1,14.2,43.1,109.3,70634.7
"","disease_state","freq"
"1","Alzheimer’s disease-like",2
"2","healthy",16
"","mean","std","min","25%","50%","75%","max"
"1",146.82249472959,696.391251626308,0.222222222222222,15.2,56.55,124.481944291881,53931.4
"","mean","std","min","25%","50%","75%","max"
"1",146.82249472959,696.391251626308,0.222222222222222,15.2,56.55,124.481944291881,53931.4
"","mean","std","min","25%","50%","75%","max"
"1",146.822516326607,825.002914679964,0.1,15.4,55.4,127.4,209365
"","disease_state","freq"
"1","Braak stage I-II",6
"2","Braak stage III-IV",6
"3","Braak stage V-VI",6
"","mean","std","min","25%","50%","75%","max"
"1",856.675968074092,4919.41233618724,1.08666666666667,69.8066666666667,221.59,563.95,299891.566666667
"","mean","std","min","25%","50%","75%","max"
"1",856.675968074092,4919.41233618724,1.08666666666667,69.8066666666667,221.59,563.95,299891.566666667
"","mean","std","min","25%","50%","75%","max"
"1",856.675969951608,5050.15888527268,0.3,69.5,220.8,565.7,640898
"","disease_state","freq"
"1","control",8
"2","incipient stage",7
"3","moderate stage",8
"4","severe stage",7
"","mean","std","min","25%","50%","75%","max"
"1",677.781240828767,1680.87637113157,0.845161290322581,69.9887096774194,224.209677419355,603.822580645161,35924.035483871
"","mean","std","min","25%","50%","75%","max"
"1",677.781240828767,1680.87637113157,0.845161290322581,69.9887096774194,224.209677419355,603.822580645161,35924.035483871
"","mean","std","min","25%","50%","75%","max"
"1",677.781250843645,1690.72429835133,0.4,69.4,224.2,605.1,78292.3
"","disease_state","freq"
"1","control",9
"2","incipient AD",7
"3","moderate AD",8
"4","severe AD",7
library(DBI)
library(reshape2)
library(limma)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(gridExtra)
output_dir="C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis"
diseases <- read.delim("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/data/data_01_neuro_diseases_final_disease_selected.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
diseases <- diseases %>%
rename(disease_id = cui)%>%
select(disease_id, disease_name)
con <- dbConnect(MySQL(),
dbname = "disnet_biolayer",
host = "ares.ctb.upm.es",
port = 30604,
user = "gene_expression",
password = "gene_expression")
##TO REPRESENT THE NUMBER OF DATASETS (GDS) PER DISEASE
all_dis_gds<-data.frame()
# Get expr_disease_gds for the current gds_id
query_dis_gds <- paste0("SELECT * FROM expr_disease_gds")
dis_gds <- dbGetQuery(con, query_dis_gds)
# Merge the dataframes all_dis_gds and all_diseases by disease_id
merged_data <- merge(dis_gds, diseases, by = "disease_id")
# Count the number of GDS for each disease_name
gds_count <- aggregate(gds_id ~ disease_name, data = merged_data, FUN = length)
gds_count <- gds_count %>% arrange(desc(gds_id))
# Plot the results with blue bars and sorted by the number of GDS
dis_gds_plot<-ggplot(gds_count, aes(x = reorder(disease_name, -gds_id), y = gds_id)) +
geom_bar(stat = "identity", fill = "#7AC5CD") +
xlab("Disease Name") +
ylab("Number of GDS") +
ggtitle("Number of GDS per Disease") +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
axis.text.y= element_text(size=14),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_rect(fill = "white", color = "black"),
panel.grid.major.x = element_blank(), # Remove vertical grid lines
panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines
panel.grid.major.y = element_line(color = "gray90"), # Optional: Add horizontal grid lines
panel.grid.minor.y = element_blank(), # Optional: Remove minor horizontal grid lines
axis.line = element_line(color = "black"), # Add axis lines
plot.title = element_text(hjust = 0.5, size = 16, color = "black",face = "bold")
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) # Adjust the y-axis to start at 0 and add some padding at the top
dis_gds_plot
ggsave("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis/number_of_gds_per_disease.png", plot = dis_gds_plot, width = 15, height = 12)
##REPRESENT THE NUMBER OF samples per dataset (GSMs per GDS)
query_gds_gsm <- paste0("SELECT gds_id,gsm_id FROM expr_processed_annot")
gds_gsm <- dbGetQuery(con, query_gds_gsm)
gds_count <- gds_gsm %>%
group_by(gds_id) %>%
summarise(num_gsm = n())
gds_gsm_plot <- ggplot(gds_count, aes(x = as.factor(gds_id), y = num_gsm)) +
geom_bar(stat = "identity", fill = "#7AC5CD", width = 0.9)+
xlab("GDS ID") +
ylab("Number of GSM IDs") +
ggtitle("Number of samples per dataset (GSMs per GDS)") +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_rect(fill = "white", color = "black"),
panel.grid.major.x = element_blank(), # Remove vertical grid lines
panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines
panel.grid.major.y = element_line(color = "gray90"), # Optional: Add horizontal grid lines
panel.grid.minor.y = element_blank(), # Optional: Remove minor horizontal grid lines
axis.line = element_line(color = "black"), # Add axis lines
plot.title = element_text(hjust = 0.5, size = 16, color = "black",face = "bold")
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))+
scale_x_discrete(expand = expansion(add = c(0.9, 0.9)))
gds_gsm_plot
ggsave("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis/number_of_gsm_per_gds.png", plot = gds_gsm_plot, width = 15, height = 12)
##THE REST OF DESCRIPTIVE ANALYSIS ITERING GDS_ID PER GDS_ID
gds_ids <- dbGetQuery(con, "SELECT DISTINCT gds_id FROM expr_processed_annot WHERE flag = 1")
all_expr_values <- data.frame()
all_normalized_values <- data.frame()
for (i in 1:nrow(gds_ids)) {
gds_id<-gds_ids$gds_id[i]
gds_output_dir <- file.path(output_dir, gds_id)
# Check if the directory already exists
if (!dir.exists(gds_output_dir)) {
dir.create(gds_output_dir, recursive = TRUE)
}
setwd(gds_output_dir)
#extraction of data from expr_values table
query_expr_values <- paste0("SELECT gsm_id, id_ref, gene_symbol, value FROM expr_values WHERE gds_id = '", gds_id, "'")
expr_values <- dbGetQuery(con, query_expr_values)
#extraction of data from expr_processed_annot table with flag = 1
query_expr_raw_annot <- paste0("SELECT gsm_id, disease_state FROM expr_raw_annot WHERE gds_id = '", gds_id, "'")
expr_raw_annot <- dbGetQuery(con, query_expr_raw_annot)
#extraction of data from expr_processed_annot table with flag = 1
query_expr_processed_annot <- paste0("SELECT gsm_id, disease_state FROM expr_processed_annot WHERE gds_id = '", gds_id, "'")
expr_processed_annot <- dbGetQuery(con, query_expr_processed_annot)
#to keep gene_symbol for reference at the end
gene_symbols <- unique(expr_values[, c("id_ref", "gene_symbol")])
if (nrow(expr_values) == 0) {
warning(paste("No data found for gds_id:", gds_id))
next
}
if (!is.numeric(expr_values$value)) {
expr_values$value <- as.numeric(expr_values$value)
}
#create the expression matrix without gene_symbol
expression <- dcast(expr_values, id_ref ~ gsm_id, value.var = "value", fun.aggregate = mean)
rownames(expression) <- expression$id_ref
expression <- expression[, -1] # Remove the id_ref column
if (ncol(expression) == 0) {
warning(paste("No expression data available for gds_id:", gds_id))
next
}
#normalization of expression matrix
exprs_matrix <- as.matrix(expression)
exprs_normalized <- normalizeBetweenArrays(exprs_matrix, method = 'quantile')
exprs_normalized_df <- melt(exprs_normalized, varnames = c("id_ref", "gsm_id"), value.name = "value")
normalized_df <- merge(exprs_normalized_df, gene_symbols, by = "id_ref", all.x = TRUE)
expr_values$gds_id <- gds_id
normalized_df$gds_id <- gds_id
all_expr_values <- rbind(all_expr_values, expr_values)
all_normalized_values <- rbind(all_normalized_values, normalized_df)
expr_stats <- summarize(
expr_values,
mean = mean(value, na.rm = TRUE),
std = sd(value, na.rm = TRUE),
min = min(value, na.rm = TRUE),
`25%` = quantile(value, 0.25, na.rm = TRUE),
`50%` = median(value, na.rm = TRUE),
`75%` = quantile(value, 0.75, na.rm = TRUE),
max = max(value, na.rm = TRUE)
)
write.csv(expr_stats, file.path(gds_output_dir, paste0(gds_id, "_expression_stats_prenorm.csv")))
expr_stats_norm <- summarize(
normalized_df,
mean = mean(value, na.rm = TRUE),
std = sd(value, na.rm = TRUE),
min = min(value, na.rm = TRUE),
`25%` = quantile(value, 0.25, na.rm = TRUE),
`50%` = median(value, na.rm = TRUE),
`75%` = quantile(value, 0.75, na.rm = TRUE),
max = max(value, na.rm = TRUE)
)
write.csv(expr_stats_norm, file.path(gds_output_dir, paste0(gds_id, "_expression_stats_postnorm.csv")))
# Count categories and generate plots only for columns with data
categorical_columns <- c('agent', 'disease_state', 'tissue', 'cell_type')
for (col in categorical_columns) {
if (col %in% colnames(expr_raw_annot) && !all(is.na(expr_raw_annot[[col]]))) {
cat_counts <- table(expr_raw_annot[[col]])
df_cat_counts <- as.data.frame(cat_counts)
colnames(df_cat_counts) <- c(col, "freq")
write.csv(df_cat_counts, file.path(gds_output_dir, paste0(gds_id, "_raw_", col, "_counts.csv")))
if (col %in% colnames(expr_processed_annot) && !all(is.na(expr_processed_annot[[col]]))) {
cat_counts <- table(expr_processed_annot[[col]])
df_cat_counts <- as.data.frame(cat_counts)
colnames(df_cat_counts) <- c(col, "freq")
write.csv(df_cat_counts, file.path(gds_output_dir, paste0(gds_id, "_processed_", col, "_counts.csv")))
plot <- ggplot(expr_processed_annot, aes(y = .data[[col]])) +
geom_bar() +
labs(title = paste0(str_to_title(col), " count for ", gds_id), hjust = 0.5) +
theme_minimal()+
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15)
)
ggsave(file.path(gds_output_dir, paste0(gds_id, "_", col, "_counts.png")), plot = plot, width = 10, height = 6,bg='white')
}
}
# Boxplot of expression by disease state
# Boxplot of expression by disease state
if ("disease_state" %in% colnames(expr_processed_annot)) {
# Merge expression and annotation dataframes
merged_df <- inner_join(normalized_df, expr_processed_annot %>% select(gsm_id, disease_state), by = "gsm_id")
# Violin plot
violin_disease <- ggplot(merged_df, aes(x = disease_state, y = value)) +
geom_violin(aes(fill = disease_state), position = position_dodge(0.9)) +
scale_fill_manual(
values = c("c" = "#FFFFF0", "d" = "#838B83"), # Specify colors for the groups
labels = c("c" = "Control", "d" = "Disease"), # Set the labels for the legend
name = "Disease State" # Set the title for the legend
) +
scale_x_discrete(
labels = c("c" = "Control", "d" = "Disease") # Rename x-axis labels
) +
labs(
title = paste0("Expression Values by Disease State for ", gds_id),
x = "Disease State",
y = "Expression Value"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15),
legend.position = "none" # Remove the legend
)
# Save the plot
ggsave(
file.path(gds_output_dir, paste0(gds_id, "_expression_by_disease_state.png")),
plot = violin_disease,
width = 14,
height = 8,
bg = 'white'
)
}
# Boxplot of expression values
boxplot_expression <- ggplot(expr_values, aes(x=gds_id, y = value)) +
geom_boxplot() +
labs(title = paste0("Distribution of Expression Values for ", gds_id),x='',
y = "Expression Value") +
theme_minimal()+
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15)
)
ggsave(file.path(gds_output_dir, paste0(gds_id, "_expression_values_boxplot.png")), plot = boxplot_expression, width = 14, height = 8, bg = "white")
# Boxplot of normalized expression values
boxplot_norm_expression <- ggplot(normalized_df, aes(x=gds_id, y = value)) +
geom_boxplot() +
labs(title = paste0("Distribution of Normalized Expression Values for ", gds_id), x='',
y = "Expression Value") +
theme_minimal()+
theme(
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15)
)
ggsave(file.path(gds_output_dir, paste0(gds_id, "_norm_expression_values_boxplot.png")), plot = boxplot_norm_expression, width = 14, height = 8, bg = "white")
}
}
plot_expr_values <- ggplot(all_expr_values, aes(x = as.factor(gds_id), y = value)) +
geom_boxplot() +
labs(title = "Distribution of Expression Values Before Normalization",
x = "GDS ID", y = "Expression Value") +
theme_minimal()+
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12)
)
ggsave(file.path(output_dir, "boxplot_expression_values_prenormalization.png"), plot = plot_expr_values, width = 14, height = 8, bg = 'white')
# Graficar caja y bigotes para valores después de la normalización
plot_normalized_values <- ggplot(all_normalized_values, aes(x = as.factor(gds_id), y = value)) +
geom_boxplot() +
labs(title = "Distribution of Normalized Expression Values",
x = "GDS ID", y = "Expression Value") +
theme_minimal()+
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12)
)
ggsave(file.path(output_dir, "boxplot_expression_values_postnormalization.png"), plot = plot_normalized_values, width = 14, height = 8, bg = 'white')
#combined_plot <- grid.arrange(plot_expr_values, plot_normalized_values, nrow = 2)
#ggsave(file.path(output_dir, "boxplot_expression_values_combined.png"), plot = combined_plot, width = 14, height = 8, bg = 'white')
#install.packages("DBI")
#install.packages("RMySQL")
library(DBI)
library(RMySQL)
library(limma)
library(ggplot2)
library(pheatmap)
library(dplyr)
library(reshape2)
library(grid)
library(gtable)
#dbDisconnect(con)
con <- dbConnect(MySQL(),
dbname = "disnet_biolayer",
host = "ares.ctb.upm.es",
port = 30604,
user = "gene_expression",
password = "gene_expression")
graphs_dir <- "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise_graphs"
gds_ids <- dbGetQuery(con, "SELECT DISTINCT gds_id FROM expr_processed_annot WHERE flag = 1")
differential_expression_analysis <- function(gds_id, con) {
#extraction of data from expr_values table
query_expr_values <- paste0("SELECT gsm_id, id_ref, gene_symbol, value FROM expr_values WHERE gds_id = '", gds_id, "'")
expr_values <- dbGetQuery(con, query_expr_values, columns = c("gsm_id" = "character",
"id_ref" = "character",
"gene_symbol" = "character",
"value" = "numeric"))
if (nrow(expr_values) == 0) {
warning(paste("No data found for gds_id:", gds_id))
return(NULL)
}
if (!is.numeric(expr_values$value)) {
expr_values$value <- as.numeric(expr_values$value)
}
#extraction of data from expr_processed_annot table with flag = 1
query_expr_processed_annot <- paste0("SELECT gsm_id, disease_state FROM expr_processed_annot WHERE gds_id = '", gds_id, "' AND flag = 1")
expr_processed_annot <- dbGetQuery(con, query_expr_processed_annot)
if (nrow(expr_processed_annot) == 0) {
warning(paste("No data found in expr_processed_annot for gds_id:", gds_id))
return(NULL)
}
#ensure there are enough disease state levels
if (length(unique(expr_processed_annot$disease_state)) < 2) {
warning(paste("Not enough disease state levels for gds_id:", gds_id))
return(NULL)
}
#to keep gene_symbol for reference at the end
gene_symbols <- unique(expr_values[, c("id_ref", "gene_symbol")])
#creation of the expression matrix without gene_symbol
expression <- reshape2::dcast(expr_values, id_ref ~ gsm_id, value.var = "value", fun.aggregate = mean)
rownames(expression) <- expression$id_ref
expression <- expression[, -1] # Remove the id_ref column
if (ncol(expression) == 0) {
warning(paste("No expression data available for gds_id:", gds_id))
return(NULL)
}
#normalize expression matrix
exprs_matrix <- as.matrix(expression)
exprs_normalized <- normalizeBetweenArrays(exprs_matrix, method = 'quantile')
#create design matrix
target <- expr_processed_annot
target$disease_state <- factor(target$disease_state)
if (length(levels(target$disease_state)) < 2) {
warning(paste("Not enough disease state levels for gds_id:", gds_id))
return(NULL)
}
design <- model.matrix(~ 0 + disease_state, data = target)
colnames(design) <- levels(target$disease_state)
# Fit linear model
fit <- lmFit(exprs_normalized, design)
#to handle NA coefficients by excluding probes with missing data
na_probes <- rownames(fit$coefficients)[apply(is.na(fit$coefficients), 1, any)]
if (length(na_probes) > 0) {
warning(paste("Partial NA coefficients for", length(na_probes), "probe(s). These will be excluded from the analysis."))
fit <- fit[-match(na_probes, rownames(fit$coefficients)), ]
}
#create and apply contrasts
contrast_matrix <- makeContrasts(
control_vs_disease = c - d,
levels = design
)
fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2)
#handle zero sample variances by offsetting or excluding
zero_variances <- fit2$sigma == 0
if (any(zero_variances)) {
warning("Zero sample variances detected, have been offset away from zero or excluded")
fit2$sigma[zero_variances] <- min(fit2$sigma[!zero_variances]) * 1e-6
}
# results
results <- topTable(fit2, adjust = "fdr", number = Inf)
results$significant <- ifelse(results$adj.P.Val < 0.05 & abs(results$logFC) > 1,
ifelse(results$logFC > 0, "o", "u"),
"n")
#check if results contain any significant genes
if (nrow(results) == 0) {
warning(paste("No significant results for gds_id:", gds_id))
return(NULL)
}
#add gene symbols back to results to have a reference
results$id_ref <- rownames(results)
results <- merge(results, gene_symbols, by = "id_ref", all.x = TRUE)
#add gds_id column to know from which dataset comes from
results$gds_id <- gds_id
results <- results[, c("gds_id", "id_ref", "gene_symbol", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B", "significant")]
return(list(results = results, gene_symbols=gene_symbols, exprs_normalized = exprs_normalized,target=target))
}
# Function: individual_expression_analysis
# Description: Performs individual expression analysis by calculating z-scores for each gene in each sample,
# based on normalized gene expression data. It identifies significant gene expression changes
# in individual samples compared to a control group.
# Inputs:
# - gds_id: Identifier for the gene expression dataset.
# - target: Data frame containing sample metadata, including the disease state (control or diseased).
# - exprs_normalized: Matrix of normalized gene expression data, where rows represent genes and columns
# represent samples.
# - gene_symbols: Data frame containing gene symbols or identifiers corresponding to the rows of exprs_normalized.
# - output_dir: Directory path where the output results will be stored.
# Outputs:
# - A data frame containing the following columns:
# - gds_id: Identifier for the gene expression dataset.
# - gsm_id: Identifier for the individual sample.
# - id_ref: Identifier for the gene with significant expression changes in the sample.
# - gene_symbol: Symbol or identifier for the gene with significant expression changes.
# - z_score: Z-score indicating the magnitude and direction of expression change for the gene in the sample.
# - If no individual significant results are found for a given gds_id, the function issues a warning message indicating this outcome.
individual_expression_analysis <- function(gds_id, target, exprs_normalized, gene_symbols, output_dir) {
if (!is.matrix(exprs_normalized) || ncol(exprs_normalized) < 2 || nrow(exprs_normalized) < 2) {
warning(paste("exprs_normalized does not have at least dos dimensiones para gds_id:", gds_id))
return(NULL)
}
if (sum(target$disease_state == 'c') == 0) {
warning("No control samples found.")
return(NULL)
}
gsm_control <- subset(target, disease_state == 'c')
gsm_control_ids <- gsm_control$gsm_id
exprs_control <- exprs_normalized[, gsm_control_ids]
gene_means_control <- rowMeans(exprs_control, na.rm = TRUE)
gene_sds_control <- apply(exprs_control, 1, sd, na.rm = TRUE)
z <- sweep(exprs_normalized, 1, gene_means_control, "-")
z <- sweep(z, 1, gene_sds_control, "/")
cutoff <- 2.5
results_list <- list()
for (i in 1:ncol(exprs_normalized)) {
df <- data.frame(
gsm_id = colnames(exprs_normalized)[i],
id_ref = rownames(exprs_normalized),
z_score = z[, i]
)
df$significant <- ifelse(abs(df$z_score) > cutoff,
ifelse(df$z_score > 0, "o", "u"),
"n")
results_list[[length(results_list) + 1]] <- df
}
individual_results <- do.call(rbind, results_list)
individual_results <- merge(individual_results, gene_symbols, by = "id_ref", all.x = TRUE)
individual_results$gds_id <- gds_id
individual_results <- individual_results[, c("gds_id", "gsm_id", "id_ref", "gene_symbol", "z_score", "significant")]
return(individual_results)
}
randomized_model <- function(exprs_normalized, target) {
set.seed(123) # Para la reproducibilidad
G <- nrow(exprs_normalized) # Número de genes
n <- ncol(exprs_normalized) # Número de muestras
num_simulations <- 1000
z_cutoff <- 2.5
max_subjects_per_gene_random <- numeric(num_simulations)
for (sim in 1:num_simulations) {
# Generar una matriz de Z-scores aleatorizados
random_z_scores <- matrix(rnorm(G * n), nrow = G, ncol = n)
# Contar el número de sujetos con Z-score mayor que el umbral en cada gen
diseased_subjects <- which(target$disease_state == 'd')
counts_per_gene <- rowSums(random_z_scores[, diseased_subjects] > z_cutoff)
# Almacenar el máximo número de sujetos por gen en esta simulación
max_subjects_per_gene_random[sim] <- max(counts_per_gene)
}
# Determinar el umbral X (percentil 95 de las simulaciones)
X_rand <- quantile(max_subjects_per_gene_random, 0.95)
return(X_rand)
}
# This function generates a volcano plot from differential expression results and saves it as a PNG file.
#
# Args:
# results: A data frame containing the results of differential expression analysis with columns for
# log fold change (logFC), p-values (P.Value), and significance status (significant).
# gds_id: A string representing the GDS ID, which will be used in the plot title and filename.
# graphs_dir: A string representing the directory where the plot will be saved.
#
# Output:
# A volcano plot is saved as a PNG file in the specified directory.
create_volcano_plot <- function(results, gds_id, graphs_dir) {
# Map the significant values to corresponding labels for better readability in the legend
results$significant_label <- factor(results$significant, levels = c("n", "o", "u"),
labels = c("not significant", "overexpressed", "underexpressed"))
# Create the volcano plot using ggplot2
volcano_plot <- ggplot(results, aes(x = logFC, y = -log10(P.Value), color = significant_label)) +
geom_point() + # Plot the points
theme_minimal() + # Use a minimal theme for the plot
labs(
title = paste("Volcano Plot for", gds_id), # Title of the plot
x = "Log Fold Change", # X-axis label
y = "-log10 adj-p-value", # Y-axis label
color = "Differential Expression" # Legend title
) +
scale_color_manual(
values = c("not significant" = "grey", "overexpressed" = "red", "underexpressed" = "blue") # Custom colors
) +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Customize plot title
axis.title.x = element_text(size = 14), # Customize X-axis title
axis.title.y = element_text(size = 14), # Customize Y-axis title
axis.text.x = element_text(size = 12), # Customize X-axis text
axis.text.y = element_text(size = 12), # Customize Y-axis text
legend.position = "right", # Position the legend on the right
legend.justification = "center", # Center the legend
legend.title = element_text(size = 14), # Customize legend title
legend.text = element_text(size = 12), # Customize legend text
legend.box.background = element_rect(color = "black", linewidth = 0.5, linetype = "solid"), # Legend box background
legend.box.margin = margin(5, 5, 5, 5), # Margin around legend box
legend.box.spacing = unit(0.5, "cm"), # Spacing within legend box
legend.direction = "vertical" # Legend direction
)
# Save the plot as a PNG file
ggsave(file.path(graphs_dir, paste0("volcano_plot_", gds_id, ".png")), plot = volcano_plot, bg = 'white')
}
create_heatmap <- function(exprs_normalized, results, gds_id, graphs_dir, target) {
# Seleccionar los 50 genes más significativos
top_genes <- results[order(results$adj.P.Val), ][1:50, ]
top_gene_names <- top_genes$id_ref
if (length(top_gene_names) < 50) {
warning(paste("Less than 50 significant genes for gds_id:", gds_id))
}
exprs_top <- exprs_normalized[top_gene_names, ]
if (any(!is.finite(as.matrix(exprs_top)))) {
warning(paste("Non-finite values detected in the top expression data for gds_id:", gds_id))
return(NULL)
}
if (!is.factor(target$disease_state)) {
target$disease_state <- factor(target$disease_state)
}
annotation_col <- data.frame(DiseaseState = target$disease_state)
rownames(annotation_col) <- target$gsm_id
annotation_colors <- list(DiseaseState = c("c" = "white", "d" = "grey"))
heatmap <- pheatmap(
exprs_top,
scale = "row",
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
main = paste("Heatmap of top 50 genes present in the most GSMs for", gds_id),
fontsize_row = 8,
fontsize_col = 4,
cluster_cols = FALSE,
annotation_col = annotation_col,
annotation_colors = annotation_colors
)
ggsave(file.path(graphs_dir, paste0("heatmap_", gds_id, ".png")), plot = heatmap)
}
all_results <- list()
all_individual_results <- list()
summary_results <- list()
for (gds_id in gds_ids$gds_id) {
de_analysis <- differential_expression_analysis(gds_id, con)
if (!is.null(de_analysis)) {
create_volcano_plot(de_analysis$results, gds_id, graphs_dir)
#create_heatmap(de_analysis$exprs_normalized, de_analysis$results, gds_id, graphs_dir, de_analysis$target)
all_results[[length(all_results) + 1]] <- de_analysis$results
}
individual_results <- individual_expression_analysis(gds_id, de_analysis$target, de_analysis$exprs_normalized, de_analysis$gene_symbols, output_dir)
if (is.null(individual_results)) next
#DEG only in diseased individuals
filtered_ind_results <- individual_results %>%
filter(significant != "n") %>%
inner_join(de_analysis$target %>% filter(disease_state == "d"), by = "gsm_id")
#define X with the random model
X <- randomized_model(de_analysis$exprs_normalized, de_analysis$target)
#calculate the number of individuals with DEG
gene_counts <- filtered_ind_results %>%
group_by(gene_symbol) %>%
summarize(num_subjects = n_distinct(gsm_id))
#filter the genes that are DEG in a number of individuals > x
genes_above_threshold <- gene_counts %>%
filter(num_subjects >= X)
# get results
final_results <- filtered_ind_results %>%
filter(gene_symbol %in% genes_above_threshold$gene_symbol)
summary_results[[length(summary_results) + 1]] <- data.frame(
gds_id = gds_id,
gsm_min = X,
sig_gene_count = nrow(genes_above_threshold)
)
all_results[[length(all_results) + 1]] <- de_analysis$results
all_individual_results[[length(all_individual_results) + 1]] <- final_results
}
if (length(all_results) > 0) {
combined_results <- do.call(rbind, all_results)
colnames(combined_results)[colnames(combined_results) == "adj.P.Val"] <- "adj_p_val"
colnames(combined_results)[colnames(combined_results) == "P.Value"] <- "p_val"
combined_results_round <- combined_results %>% mutate(across(where(is.numeric), \(x) round(x, 4)))
significant_results <- subset(combined_results_round, significant != 'n')
# write.csv(significant_results, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise/significant_results.csv", row.names = FALSE)
# write.csv(combined_results_round, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_groupwise/combined_results.csv", row.names = FALSE)
}
#upload group wise results to MySQL
#dbWriteTable(con, "expr_dea_groupwise", combined_results_round, append = TRUE, row.names = FALSE)
if (length(all_individual_results) > 0) {
combined_individual_results <- do.call(rbind, all_individual_results)
combined_individual_results_round <- combined_individual_results %>%
mutate(across(where(is.numeric), \(x) round(x, 4)))
}
individual_summary <- do.call(rbind, summary_results)
#write.csv(individual_summary, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_perindividual/individual_summary.csv", row.names = FALSE)
df_without_disease_state <- combined_individual_results_round %>% select(-disease_state)
#write.csv(df_without_disease_state, file = "C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_dea_perindividual/combined_individual_results.csv", row.names = FALSE)
#dbWriteTable(con, "expr_dea_perindividual", df_without_disease_state, append = TRUE, row.names = FALSE)
#Plot the adj p value of all the gds_id
p <- ggplot(combined_results, aes(x = adj_p_val, color = as.factor(gds_id))) +
geom_histogram(binwidth = 0.01, fill = NA, position = "identity") +
labs(title = "Histogram of adjusted p-value for each GDS",
x = "Adjusted p-value",
y = "Frequency") +
theme_minimal() +
xlim(0, 1.3) +
scale_color_discrete(name = "GDS ID")
ggsave(file.path(graphs_dir, "adj_p_val_distribution_all_gds_ids.png"), plot = p, bg = 'white')
#initialize an empty list to store all results
all_results <- list()
all_individual_results <- list()
for (gds_id in gds_ids$gds_id) {
de_analysis <- differential_expression_analysis(gds_id, con)
individual_results <- individual_expression_analysis(gds_id, de_analysis$target,de_analysis$exprs_normalized,de_analysis$gene_symbols, output_dir)
# disease_associated_genes <- randomized_model(gds_id, de_analysis$exprs_normalized, de_analysis$target)
if (!is.null(individual_results)) {
all_individual_results[[length(all_individual_results) + 1]] <- individual_results
}
if (!is.null(de_analysis)) {
# create_volcano_plot(de_analysis$results, gds_id, graphs_dir)
#create_heatmap(de_analysis$exprs_normalized, de_analysis$results, gds_id, graphs_dir, de_analysis$target)
all_results[[length(all_results) + 1]] <- de_analysis$results
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment