Commit 65e44f58 authored by Belen Otero Carrasco

first code

parents
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
disease_id gene_id score
0 C0001080 2261 1.0
1 C0001815 3717 1.0
2 C0001815 4352 1.0
3 C0002066 3081 1.0
4 C0002986 2717 1.0
5 C0003076 5080 1.0
6 C0004135 472 1.0
7 C0004779 5727 1.0
8 C0005859 641 1.0
9 C0006413 4609 1.0
10 C0007959 4359 1.0
11 C0007965 1130 1.0
12 C0008029 6452 1.0
13 C0008525 1121 1.0
14 C0008533 2158 1.0
15 C0008928 860 1.0
16 C0010674 1080 1.0
17 C0011195 1959 0.93
18 C0011195 4359 1.0
19 C0011195 5376 1.0
20 C0013264 1756 1.0
21 C0013364 8518 1.0
22 C0013903 2121 0.96
23 C0013903 132884 1.0
24 C0015306 2131 1.0
25 C0015523 2160 1.0
26 C0015526 2161 0.96
27 C0016037 90 1.0
28 C0016395 64840 1.0
29 C0016667 2332 1.0
30 C0016719 2395 1.0
31 C0017495 5621 1.0
32 C0017920 2538 1.0
33 C0017922 178 1.0
34 C0017924 5837 1.0
35 C0017926 5213 0.94
36 C0018425 4942 1.0
37 C0018553 5728 1.0
38 C0018609 340024 1.0
39 C0019069 2157 1.0
40 C0019202 540 1.0
41 C0019562 7428 1.0
42 C0020074 4914 1.0
43 C0020725 79158 1.0
44 C0021171 8517 1.0
45 C0022336 5621 1.0
46 C0022350 1244 1.0
47 C0022387 3784 1.0
48 C0022521 27019 1.0
49 C0022595 488 1.0
50 C0022716 538 1.0
51 C0022797 54982 0.92
52 C0023195 3931 1.0
53 C0023374 3251 1.0
54 C0023467 1050 1.0
55 C0023467 2322 1.0
56 C0023817 4023 1.0
57 C0023931 1277 1.0
58 C0024591 6261 1.0
59 C0024796 2200 1.0
60 C0025202 673 1.0
61 C0025221 57152 1.0
62 C0025267 4221 1.0
63 C0025269 5979 1.0
64 C0026705 3423 1.0
65 C0027341 4010 1.0
66 C0027831 4763 1.0
67 C0027832 4771 1.0
68 C0028326 5781 1.0
69 C0031069 4210 1.0
70 C0031269 6794 1.0
71 C0031511 5979 1.0
72 C0032463 3717 1.0
73 C0033300 4000 1.0
74 C0033835 2778 1.0
75 C0033847 368 1.0
76 C0034960 5264 1.0
77 C0035372 4204 1.0
78 C0035828 3784 1.0
79 C0035934 1387 1.0
80 C0036391 3339 0.97
81 C0039292 19 1.0
82 C0039445 94 1.0
83 C0039445 2022 1.0
84 C0039585 367 1.0
85 C0039685 1482 0.96
86 C0039685 23414 0.98
87 C0040015 3690 1.0
88 C0041341 7248 1.0
89 C0041341 7249 1.0
90 C0043116 6606 1.0
91 C0043119 7486 1.0
92 C0043194 7454 1.0
93 C0043207 7466 1.0
94 C0043459 5189 1.0
95 C0079153 3848 1.0
96 C0079295 3852 0.91
97 C0079295 3861 0.93
98 C0079474 1294 1.0
99 C0079683 3914 0.92
100 C0080024 3815 1.0
101 C0085131 2720 1.0
102 C0085132 2990 1.0
103 C0085390 7157 1.0
104 C0085413 5310 1.0
105 C0085548 5314 1.0
106 C0085859 326 1.0
107 C0086647 6448 1.0
108 C0086648 4669 1.0
109 C0086651 2588 1.0
110 C0152109 6606 0.98
111 C0162309 215 1.0
112 C0162361 10804 1.0
113 C0162530 7390 1.0
114 C0162531 1371 1.0
115 C0162565 3145 1.0
116 C0162566 7389 1.0
117 C0162635 7337 1.0
118 C0175692 197131 1.0
119 C0175694 1717 1.0
120 C0175697 3664 1.0
121 C0175699 7291 1.0
122 C0175704 5781 1.0
123 C0206042 5621 1.0
124 C0220663 668 1.0
125 C0220668 2201 1.0
126 C0220685 1280 0.96
127 C0220704 6899 1.0
128 C0220710 34 1.0
129 C0220743 249 0.92
130 C0220754 686 1.0
131 C0220767 1947 1.0
132 C0221026 695 1.0
133 C0221036 55630 1.0
134 C0221043 6338 1.0
135 C0238198 3815 1.0
136 C0238357 6329 1.0
137 C0239849 26154 1.0
138 C0265205 7474 0.96
139 C0265234 2138 1.0
140 C0265252 6197 1.0
141 C0265259 3664 1.0
142 C0265260 8200 0.95
143 C0265264 6910 1.0
144 C0265269 2255 0.93
145 C0265289 1300 0.94
146 C0265306 2737 1.0
147 C0265326 5728 1.0
148 C0265354 55636 1.0
149 C0265961 2707 1.0
150 C0266526 4693 1.0
151 C0267662 1811 0.99
152 C0268113 7369 1.0
153 C0268120 353 0.99
154 C0268125 4860 0.93
155 C0268225 175 1.0
156 C0268237 1352 0.94
157 C0268238 51099 1.0
158 C0268250 2629 0.93
159 C0268251 2629 0.93
160 C0268255 427 1.0
161 C0268263 285362 1.0
162 C0268275 2760 0.92
163 C0268338 1281 1.0
164 C0268362 1277 0.96
165 C0268362 1278 0.95
166 C0268363 1277 0.96
167 C0268412 249 1.0
168 C0268413 249 0.94
169 C0268425 7840 1.0
170 C0268450 6559 1.0
171 C0268490 2184 1.0
172 C0268494 7299 1.0
173 C0268495 4948 1.0
174 C0268547 435 1.0
175 C0268548 383 1.0
176 C0268623 3242 0.94
177 C0270724 8398 1.0
178 C0270911 5376 1.0
179 C0270912 4359 1.0
180 C0270968 22954 1.0
181 C0271093 24 0.98
182 C0271568 2690 1.0
183 C0271829 5172 1.0
184 C0272302 23218 0.99
185 C0272375 462 1.0
186 C0282102 10682 1.0
187 C0339510 7439 1.0
188 C0339541 10002 0.95
189 C0340968 5313 1.0
190 C0341306 4645 1.0
191 C0342287 10560 1.0
192 C0342288 50943 1.0
193 C0342482 190 1.0
194 C0342637 846 1.0
195 C0342642 8074 0.94
196 C0342684 4935 0.99
197 C0342687 1621 0.91
198 C0342783 35 0.97
199 C0342788 6584 1.0
200 C0343068 114548 1.0
201 C0345893 657 1.0
202 C0345893 4089 1.0
203 C0346010 201163 1.0
204 C0349639 3845 1.0
205 C0349639 5781 1.0
206 C0349653 5373 1.0
207 C0391816 4286 0.93
208 C0391826 5728 1.0
209 C0393576 23230 1.0
210 C0393590 6575 0.97
211 C0393808 2705 1.0
212 C0393814 5376 1.0
213 C0393818 1959 0.99
214 C0398738 3689 1.0
215 C0406557 55612 1.0
216 C0409818 114548 1.0
217 C0410173 6445 1.0
218 C0410174 2218 1.0
219 C0410180 57190 1.0
220 C0410538 1311 1.0
221 C0432215 8838 1.0
222 C0432217 9451 1.0
223 C0432252 4041 1.0
224 C0432284 5159 0.94
225 C0521802 7018 0.91
226 C0524582 4591 1.0
227 C0543669 1441 0.93
228 C0549463 4068 1.0
229 C0574080 2593 0.94
230 C0587248 3265 1.0
231 C0599973 64093 0.96
232 C0687720 551 1.0
233 C0733682 5251 1.0
234 C0751122 6323 1.0
235 C0751383 1201 1.0
236 C0751587 4854 1.0
237 C0751748 2731 1.0
238 C0751753 1373 0.92
239 C0751783 7957 1.0
240 C0751783 378884 1.0
241 C0751785 1476 1.0
242 C0751951 6261 1.0
243 C0752123 6712 1.0
244 C0752124 773 1.0
245 C0795833 79813 0.99
246 C0795889 6567 1.0
247 C0795953 3897 1.0
248 C0796004 8085 1.0
249 C0796085 4810 1.0
250 C0796147 374654 0.95
251 C0796154 2719 1.0
252 C0812437 2697 1.0
253 C0877024 50485 1.0
254 C0878677 3920 1.0
255 C0917796 4541 1.0
256 C0950121 7490 1.0
257 C1142166 6331 1.0
258 C1261473 7157 1.0
259 C1263858 3908 1.0
260 C1275081 673 1.0
261 C1275126 7132 1.0
262 C1275808 8929 1.0
263 C1327915 4352 1.0
264 C1328840 355 1.0
265 C1328840 356 0.93
266 C1333990 4292 1.0
267 C1414216 55145 0.95
268 C1449563 4000 1.0
269 C1567742 1287 1.0
270 C1567744 1285 1.0
271 C1568247 4647 1.0
272 C1568248 7401 1.0
273 C1568249 25861 0.94
274 C1623209 57167 1.0
275 C1631597 6262 1.0
276 C1719788 3736 1.0
277 C1720416 773 1.0
278 C1720861 5468 0.94
279 C1721005 3851 0.94
280 C1785148 8626 0.99
281 C1832174 2202 1.0
282 C1832274 2617 0.97
283 C1832370 1674 1.0
284 C1832399 8898 0.97
285 C1832525 6444 0.94
286 C1832567 859 0.92
287 C1832702 8200 0.95
288 C1832916 775 1.0
289 C1833104 3630 0.98
290 C1833104 3767 1.0
291 C1833662 7415 1.0
292 C1834570 8910 1.0
293 C1835896 547 0.91
294 C1836336 121512 0.95
295 C1836383 2259 0.91
296 C1836727 6663 0.92
297 C1836876 3913 1.0
298 C1837355 57165 0.92
299 C1838062 9211 1.0
300 C1838103 80324 0.95
301 C1838244 7273 0.98
302 C1838457 675 0.92
303 C1838570 2055 0.93
304 C1838571 256471 0.95
305 C1839264 5354 1.0
306 C1840333 2625 1.0
307 C1841679 3209 1.0
308 C1842362 8546 0.94
309 C1842983 54332 0.91
310 C1843225 4747 0.94
311 C1843512 1282 0.92
312 C1845055 546 1.0
313 C1845862 6535 1.0
314 C1846545 841 0.92
315 C1846564 6687 0.92
316 C1846574 55775 0.96
317 C1846672 79147 0.95
318 C1847024 7299 0.91
319 C1847640 23400 1.0
320 C1847800 5077 1.0
321 C1847827 3981 0.99
322 C1847836 51151 0.95
323 C1848336 1184 0.94
324 C1848410 5429 0.94
325 C1848519 1910 1.0
326 C1848533 7274 1.0
327 C1848561 25974 0.95
328 C1848634 7399 0.98
329 C1849334 4920 0.99
330 C1849394 10002 1.0
331 C1849678 51 0.92
332 C1850343 701 0.96
333 C1850386 8139 1.0
334 C1850442 1203 0.91
335 C1850451 5538 0.94
336 C1850889 8291 1.0
337 C1850938 7078 1.0
338 C1852020 2202 0.99
339 C1852406 2263 0.97
340 C1852529 866 0.94
341 C1852759 5076 1.0
342 C1853249 10939 0.96
343 C1853733 30061 0.98
344 C1854065 114902 0.97
345 C1854467 3329 0.92
346 C1855465 24 1.0
347 C1855794 2304 0.93
348 C1856728 7476 0.93
349 C1856738 8200 0.92
350 C1856934 3691 1.0
351 C1857242 8443 0.91
352 C1857277 4036 0.95
353 C1857569 83959 1.0
354 C1857663 9896 0.95
355 C1858084 1301 0.95
356 C1858479 80208 0.96
357 C1858517 3508 1.0
358 C1858593 6443 0.94
359 C1858664 7036 0.99
360 C1858680 5274 0.93
361 C1859133 5191 0.98
362 C1859405 10436 0.95
363 C1859486 285440 1.0
364 C1859722 26276 1.0
365 C1859727 5167 0.98
366 C1859844 6121 0.94
367 C1861922 6662 1.0
368 C1862103 8200 1.0
369 C1862151 3549 0.97
370 C1864356 4882 0.96
371 C1864436 2261 1.0
372 C1865020 3757 0.91
373 C1865349 23474 1.0
374 C1865872 27130 0.93
375 C1866282 54982 0.91
376 C1866495 6557 0.91
377 C1866636 79628 0.96
378 C1866855 6683 0.99
379 C1866994 6926 1.0
380 C1868570 7021 0.96
381 C1869117 25953 1.0
382 C1869123 825 1.0
383 C1876161 1200 0.98
384 C1879286 6331 0.96
385 C1961835 2629 1.0
386 C1969785 203859 0.92
387 C1970011 9896 0.99
388 C1970431 6925 1.0
389 C2607929 5573 0.92
390 C2675204 26090 0.94
391 C2678061 2273 0.92
392 C2699746 3239 1.0
393 C2748515 9060 0.92
394 C2748536 83706 0.92
395 C2748572 3766 1.0
396 C2750737 4072 0.96
397 C2750785 4000 0.91
398 C2931008 4247 0.92
399 C2936332 6442 1.0
400 C2940786 7068 1.0
401 C3280428 23600 0.91
402 C3539495 23259 0.92
403 C3553774 22909 0.91
404 C3711389 58 0.92
405 C3887523 37 0.98
406 C3888198 7439 1.0
407 C4225349 1909 0.91
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
drug_id
0 CHEMBL578
2 CHEMBL577
4 CHEMBL1513
8 CHEMBL59
11 CHEMBL191
15 CHEMBL1168
17 CHEMBL1393
23 CHEMBL1017
25 CHEMBL1069
27 CHEMBL1042
38 CHEMBL1201203
39 CHEMBL1040
41 CHEMBL1560
43 CHEMBL502
45 CHEMBL1536
49 CHEMBL2356023
51 CHEMBL846
54 CHEMBL25
72 CHEMBL1434
81 CHEMBL1373
83 CHEMBL1175
88 CHEMBL1237
92 CHEMBL1581
102 CHEMBL611
103 CHEMBL2110563
105 CHEMBL2103737
109 CHEMBL1201236
157 CHEMBL1229517
0 CHEMBL641
1 CHEMBL81
9 CHEMBL118
23 CHEMBL960
39 CHEMBL1201572
45 CHEMBL1366
49 CHEMBL53
63 CHEMBL1908360
93 CHEMBL413
119 CHEMBL1201182
120 CHEMBL1138
130 CHEMBL672
146 CHEMBL557555
147 CHEMBL41
179 CHEMBL490
200 CHEMBL809
230 CHEMBL814
234 CHEMBL415
238 CHEMBL1628227
252 CHEMBL726
255 CHEMBL71
258 CHEMBL643
268 CHEMBL567
269 CHEMBL1423
312 CHEMBL54
317 CHEMBL831
319 CHEMBL715
331 CHEMBL716
349 CHEMBL85
354 CHEMBL1112
364 CHEMBL1621
367 CHEMBL1237021
372 CHEMBL1201154
377 CHEMBL564
388 CHEMBL741
403 CHEMBL807
438 CHEMBL49
450 CHEMBL11
453 CHEMBL445
455 CHEMBL21731
457 CHEMBL644
459 CHEMBL46516
461 CHEMBL26
462 CHEMBL908
463 CHEMBL2105760
464 CHEMBL42
465 CHEMBL1764
466 CHEMBL243712
468 CHEMBL6437
472 CHEMBL479
473 CHEMBL297302
552 CHEMBL563
558 CHEMBL154
575 CHEMBL424
597 CHEMBL15770
637 CHEMBL112
650 CHEMBL94081
651 CHEMBL622
655 CHEMBL1297
657 CHEMBL521
667 CHEMBL6
675 CHEMBL571
680 CHEMBL509
684 CHEMBL599
686 CHEMBL1070
687 CHEMBL527
688 CHEMBL122
693 CHEMBL154111
694 CHEMBL421
696 CHEMBL93645
699 CHEMBL139
707 CHEMBL898
710 CHEMBL1071
712 CHEMBL365795
714 CHEMBL1020
716 CHEMBL468
862 CHEMBL35
865 CHEMBL1577
866 CHEMBL349803
867 CHEMBL1148
869 CHEMBL1055
870 CHEMBL888
1021 CHEMBL629
1062 CHEMBL654
1065 CHEMBL621
1125 CHEMBL435
1126 CHEMBL406
1128 CHEMBL842
1131 CHEMBL389621
1387 CHEMBL650
1627 CHEMBL131
1927 CHEMBL635
2131 CHEMBL1451
2308 CHEMBL632
2461 CHEMBL1650
2656 CHEMBL384467
3001 CHEMBL1091
3118 CHEMBL1370
3135 CHEMBL1648
3137 CHEMBL45816
3138 CHEMBL1484
3139 CHEMBL193
3143 CHEMBL6966
3153 CHEMBL1491
3167 CHEMBL23
3179 CHEMBL742
3229 CHEMBL220492
3240 CHEMBL750
3244 CHEMBL108
3258 CHEMBL973
3285 CHEMBL301265
3289 CHEMBL493
3291 CHEMBL1009
3294 CHEMBL589
3295 CHEMBL1303
3316 CHEMBL219916
3392 CHEMBL945
3393 CHEMBL585
3413 CHEMBL549
3434 CHEMBL637
3563 CHEMBL46
3571 CHEMBL894
3763 CHEMBL657
4051 CHEMBL14370
4172 CHEMBL1372950
4296 CHEMBL896
4398 CHEMBL1101
4626 CHEMBL34259
4671 CHEMBL19019
4689 CHEMBL13
4703 CHEMBL649
4704 CHEMBL546
4705 CHEMBL27
4713 CHEMBL24
4715 CHEMBL423
4717 CHEMBL645
4718 CHEMBL642
4720 CHEMBL499
4724 CHEMBL839
4726 CHEMBL723
4729 CHEMBL27810
4771 CHEMBL802
4784 CHEMBL134
4790 CHEMBL2
4791 CHEMBL707
4832 CHEMBL405
4836 CHEMBL1574
4837 CHEMBL1201201
4844 CHEMBL1437
4877 CHEMBL428647
4899 CHEMBL40
4908 CHEMBL861
4909 CHEMBL45029
4910 CHEMBL448
4921 CHEMBL452
4949 CHEMBL856
4953 CHEMBL441
4955 CHEMBL267894
4957 CHEMBL447
5079 CHEMBL608
5172 CHEMBL796
5226 CHEMBL1336
5229 CHEMBL535
5234 CHEMBL477772
5235 CHEMBL1289601
5249 CHEMBL1289926
5284 CHEMBL185
5302 CHEMBL1773
5305 CHEMBL1324
5311 CHEMBL109
5442 CHEMBL264374
5445 CHEMBL111
5453 CHEMBL1200826
5475 CHEMBL1396
5481 CHEMBL2097081
5482 CHEMBL1873475
5511 CHEMBL862
5512 CHEMBL420
5546 CHEMBL16
5785 CHEMBL160
6380 CHEMBL92
6396 CHEMBL53463
6432 CHEMBL44657
6444 CHEMBL32
6457 CHEMBL1201496
6458 CHEMBL1487
6490 CHEMBL1064
6622 CHEMBL135
6638 CHEMBL691
6652 CHEMBL940
6657 CHEMBL294199
6662 CHEMBL1140
6689 CHEMBL573
6697 CHEMBL1489
6703 CHEMBL1201129
6713 CHEMBL225072
6770 CHEMBL659
6778 CHEMBL636
6815 CHEMBL503
6822 CHEMBL98
6829 CHEMBL465
6856 CHEMBL911
6903 CHEMBL1286
6933 CHEMBL553
6965 CHEMBL1431
7079 CHEMBL679
7113 CHEMBL1399
7115 CHEMBL595
7129 CHEMBL547
7131 CHEMBL45
7203 CHEMBL86304
7223 CHEMBL414357
7225 CHEMBL1201866
7245 CHEMBL773
7263 CHEMBL386630
7271 CHEMBL554
7273 CHEMBL266481
7295 CHEMBL395429
7347 CHEMBL1201293
7352 CHEMBL848
7379 CHEMBL600
7385 CHEMBL1201497
7555 CHEMBL1094
7582 CHEMBL94
7583 CHEMBL1535
7589 CHEMBL76
7693 CHEMBL1029
7694 CHEMBL972
7700 CHEMBL1624
7705 CHEMBL1544
7708 CHEMBL1542
7715 CHEMBL2216870
7716 CHEMBL1201570
7730 CHEMBL1464
7744 CHEMBL1276308
7746 CHEMBL121
7766 CHEMBL1622
7816 CHEMBL472
7820 CHEMBL314854
7822 CHEMBL1201222
7826 CHEMBL932
7827 CHEMBL19224
7831 CHEMBL1131
7834 CHEMBL1520
7835 CHEMBL1201563
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sqlalchemy import create_engine\n",
"from sklearn import preprocessing\n",
"import mysql.connector\n",
"from pandas import DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"cases_csbj = pd.read_csv(\"final_cases_csbj.tsv\", sep='\\t')\n",
"cases_csbj = cases_csbj.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"triplets_csbj = pd.read_excel(\"triplets_chembl_disnet.xlsx\",engine='openpyxl')\n",
"triplets_csbj =triplets_csbj.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"triplets_csbj =triplets_csbj.rename(columns={\"Original Condition CUI\": \"disease_id\"})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"join_csbj = cases_csbj.merge(triplets_csbj,how = \"inner\",on = [\"drug_id\",\"disease_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases = join_csbj[\"New Condition CUI\"]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases = pd.DataFrame(join_csbj_diseases).drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases =join_csbj_diseases.rename(columns={\"New Condition CUI\": \"disease_id\"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. DRUG - GENE - TARGET"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"dis_gen = pd.read_csv('dis_genes.tsv', sep='\\t')\n",
"dis_gen = dis_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"gen_dise_join = join_csbj_diseases.merge(dis_gen,how = \"inner\",on = \"disease_id\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"drug_gen = pd.read_csv('drug_gen.tsv', sep='\\t')\n",
"drug_gen = drug_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"gen_dise_join_dru = gen_dise_join.merge(drug_gen,how = \"inner\",on = \"gene_id\")"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_csbj = gen_dise_join_dru[gen_dise_join_dru[\"drug_id\"]== \"CHEMBL1581\"]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_csbj.to_csv(\"score_gdas_csbj.tsv\", sep='\\t')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. PATHWAYS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.1 Direct pathways"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"dis_path_direct = pd.read_csv('disease_pathway.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"direct_dise_pw = join_csbj_diseases.merge(dis_path_direct,how = \"inner\",on = \"disease_id\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.2 Pathways via genes"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"dis_gen_pw = pd.read_csv('dis_gen_pw.tsv', sep='\\t')\n",
"dis_gen_pw = dis_gen_pw.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_filter = join_csbj.drop(columns = [\"disease_id\",\"drug_id\",\"gene_id\",\"Original Condition\",\"Drugs\",\"New Condition\"])"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_filter = join_csbj_filter.rename(columns={\"New Condition CUI\": \"disease_id\"})"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"pws_via_gen = join_csbj_filter.merge(dis_gen_pw,how = \"inner\",on = [\"disease_id\",\"pathway_id\"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sqlalchemy import create_engine\n",
"from sklearn import preprocessing\n",
"import mysql.connector\n",
"from pandas import DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. DATA"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"duples_repodb = pd.read_csv(\"repoDB_all_disdru.tsv\", sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"duples_csbj = pd.read_csv(\"duplas_CSBJ.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"duples_csbj = duples_csbj.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"duples_csbj = duples_csbj.rename(columns={\"Disease CUI\": \"disease_id\"})"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>disease_id</th>\n",
" <th>pathway_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>C0020538</td>\n",
" <td>WP554</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>C0018799</td>\n",
" <td>WP1544</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>C0018799</td>\n",
" <td>WP1528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>C0027947</td>\n",
" <td>WP229</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>C0013369</td>\n",
" <td>WP229</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>659</th>\n",
" <td>C0268274</td>\n",
" <td>WP4153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>660</th>\n",
" <td>C0085131</td>\n",
" <td>WP4153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>661</th>\n",
" <td>C0036161</td>\n",
" <td>WP4153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>662</th>\n",
" <td>C0268275</td>\n",
" <td>WP4153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>663</th>\n",
" <td>C0162666</td>\n",
" <td>WP4236</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>664 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" disease_id pathway_id\n",
"0 C0020538 WP554\n",
"1 C0018799 WP1544\n",
"2 C0018799 WP1528\n",
"3 C0027947 WP229\n",
"4 C0013369 WP229\n",
".. ... ...\n",
"659 C0268274 WP4153\n",
"660 C0085131 WP4153\n",
"661 C0036161 WP4153\n",
"662 C0268275 WP4153\n",
"663 C0162666 WP4236\n",
"\n",
"[664 rows x 2 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dis_path_direct = pd.read_csv('disease_pathway.tsv', sep='\\t')\n",
"dis_path_direct"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"drug_gen = pd.read_csv('drug_gen.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"drug_gen = drug_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"dis_gen = pd.read_csv('dis_genes_gda.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"dis_gen = dis_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"dis_gen_sinfil = pd.read_csv('dis_genes.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"dis_gen_sinfil = dis_gen_sinfil.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### JOIN REPODB"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"dis_drug_gen_sinfil = dis_gen_sinfil.merge(drug_gen, how =\"inner\", on = \"gene_id\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"duplas_target_repodb = dis_drug_path_fil.merge(dis_drug_gen_sinfil, how = \"inner\", on =[\"disease_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"cases_repodb_target = duples_repodb.merge(duplas_target_repodb,how = \"inner\",on = [\"disease_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"cases_repodb_target.to_csv(\"cases_repodb_target.tsv\", sep =\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.27445783132529894"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cases_repodb_target[\"score\"].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### JOIN CSBJ"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"duplas_target_csbj = dis_drug_path_csbj_fil.merge(dis_drug_gen_sinfil, how = \"inner\", on =[\"disease_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"cases_csbj_target = duples_csbj.merge(duplas_target_csbj,how = \"inner\",on = [\"disease_id\",\"drug_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"cases_csbj_target.to_csv(\"cases_csbj_target.tsv\", sep =\"\\t\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sqlalchemy import create_engine\n",
"from sklearn import preprocessing\n",
"import mysql.connector\n",
"from pandas import DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"cases_csbj_target = pd.read_csv(\"cases_csbj_target.tsv\", sep =\"\\t\")\n",
"cases_csbj = cases_csbj_target.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(cases_csbj[\"disease_id\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"triplets_csbj = pd.read_excel(\"triplets_chembl_disnet.xlsx\",engine='openpyxl')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"triplets_csbj =triplets_csbj.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"triplets_csbj =triplets_csbj.rename(columns={\"Original Condition CUI\": \"disease_id\"})"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"join_csbj = cases_csbj.merge(triplets_csbj,how = \"inner\",on = [\"drug_id\",\"disease_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(join_csbj[\"disease_id\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_drug = join_csbj.drop([\"gene_id\",\"score\",\"Original Condition\",\"New Condition\",\"New Condition CUI\",\"Drugs\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_drug =join_csbj_drug.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases = join_csbj[\"New Condition CUI\"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases = pd.DataFrame(join_csbj_diseases).drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"join_csbj_diseases =join_csbj_diseases.rename(columns={\"New Condition CUI\": \"disease_id\"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. DRUG - GENE - TARGET"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"dis_gen = pd.read_csv('dis_genes.tsv', sep='\\t')\n",
"dis_gen = dis_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"gen_dise_join = join_csbj_diseases.merge(dis_gen,how = \"inner\",on = \"disease_id\")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"drug_gen = pd.read_csv('drug_gen.tsv', sep='\\t')\n",
"drug_gen = drug_gen.drop([\"Unnamed: 0\"],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"gen_dise_join_dru = gen_dise_join.merge(drug_gen,how = \"inner\",on = \"gene_id\")"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_csbj = gen_dise_join_dru.merge(join_csbj ,how = \"inner\",on = [\"drug_id\",\"disease_id\",\"score\",\"gene_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.28196850393700795"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_gdas_csbj[\"score\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"score_gdas_csbj.to_csv(\"score_gdas_csbj_target_filtergen.tsv\", sep='\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment