# Lung Cancer sequences analysis - SANKEY PATTERNS
--------------------------------------------------------------------------------


Author: Belén Otero Carrasco

Last updated 11 April 2024

--------------------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame
from scipy import stats
from sklearn.metrics import jaccard_score
from sklearn.metrics import pairwise_distances
from statsmodels.stats.diagnostic import lilliefors
from scipy.stats import mannwhitneyu, levene
import mysql.connector
import re

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [None]:
## dividir el script por los diferentes pasos que hay que seguir

# Patterns found in treatment lung cancer

In [54]:
pat_iden_5 = pd.read_csv(("patronesIdenticos5_treat.csv"),sep= ",")

In [55]:
pat_iden_5

Unnamed: 0,Patron,Proteina,Posiciones,classesProt
0,AVNWAGGLHHAKKSEASGFCYVNDIVLAILELLKYHQRVLYIDIDI...,Q13547,[131],"['hydrolase', 'nucleic acid binding', 'oxidore..."
1,AVNWAGGLHHAKKSEASGFCYVNDIVLAILELLKYHQRVLYIDIDI...,Q92769,[132],"['hydrolase', 'nucleic acid binding', 'oxidore..."
2,WLEAVIFLIGIIVANVPEGLLATVTVCLTLTAKRMARKNCLVKNLE...,P05023,[316],"['hydrolase', 'transporter']"
3,WLEAVIFLIGIIVANVPEGLLATVTVCLTLTAKRMARKNCLVKNLE...,P50993,[314],"['hydrolase', 'transporter']"
4,SMIDPPRAAVPDAVGKCRSAGIKVIMVTGDHPITAKAIAKGVGIIS...,P05023,[589],"['hydrolase', 'transporter']"
...,...,...,...,...
9858,YYS,P37231,[114],"['nucleic acid binding', 'receptor', 'transcri..."
9859,YYS,P42345,[1774],"['kinase', 'nucleic acid binding', 'transferase']"
9860,YYS,P42574,[202],"['enzyme modulator', 'hydrolase', 'protease', ..."
9861,YYV,P20618,[131],"['hydrolase', 'protease']"


In [56]:
len(pat_iden_5["Patron"].unique())

4170

In [57]:
len(pat_iden_5["Proteina"].unique())

52

In [58]:
pat_iden_5['Len_pattern'] = pat_iden_5['Patron'].apply(lambda x: len(x))

In [59]:
pat_iden_5['Len_pattern'].describe()

count    9863.000000
mean        3.912805
std         3.246529
min         3.000000
25%         3.000000
50%         4.000000
75%         4.000000
max        94.000000
Name: Len_pattern, dtype: float64

In [60]:
pat_iden_5['Len_pattern'].value_counts()

Len_pattern
3     4924
4     4101
5      494
6       78
7       38
10      30
8       26
9       24
14      18
11      16
13      12
15      12
18      10
12       8
16       8
20       6
33       6
23       4
48       4
40       4
25       2
24       2
62       2
54       2
50       2
47       2
39       2
35       2
32       2
31       2
30       2
29       2
28       2
17       2
27       2
19       2
26       2
22       2
88       2
94       2
Name: count, dtype: int64

In [61]:
pat_iden_5.groupby('Patron').size().sort_values(ascending=False)

Patron
AEA     9
YLN     7
DSK     7
VSV     7
SNL     7
       ..
ILGY    2
ILH     2
ILIL    2
ILKK    2
YYV     2
Length: 4170, dtype: int64

In [62]:
pat_iden_5["Patron"].describe()

count     9863
unique    4170
top        AEA
freq         9
Name: Patron, dtype: object

In [63]:
pat_iden_5.groupby('Proteina')['Patron'].count().sort_values(ascending=False)

Proteina
P42345    955
P22102    403
Q99460    374
P23921    351
Q99835    308
P31939    266
P11387    263
Q16881    256
P35354    243
P51649    242
P10696    239
P23219    234
P37231    212
P50993    212
Q969P6    202
P05091    200
Q9UGI9    197
Q99808    195
Q96SW2    193
P10276    192
P29466    189
P01106    188
P62508    184
P14324    181
P05023    178
O75469    175
Q71U36    174
P04637    167
P05121    165
P11474    158
O15379    153
P05412    151
P04818    145
Q9H4B7    140
Q04828    140
P07437    131
P24385    128
P28702    126
P42574    116
P20618    115
P28074    115
P12004    114
P48443    110
P19793    103
P01375     97
P10415     95
Q92769     84
P30085     82
Q13547     80
P00374     72
P62942     40
P54710     30
Name: Patron, dtype: int64

In [64]:
metrics_005 = pat_iden_5.groupby('Proteina')['Patron'].count().sort_values(ascending=False)

In [65]:
metrics_005.describe()

count     52.000000
mean     189.673077
std      133.308535
min       30.000000
25%      115.750000
50%      174.500000
75%      212.000000
max      955.000000
Name: Patron, dtype: float64

In [66]:
pat_iden_5_len4 = pat_iden_5[pat_iden_5["Len_pattern"]>= 4]

In [67]:
pat_iden_5_len4 = pat_iden_5_len4.drop_duplicates()

In [68]:
len(pat_iden_5_len4["Patron"].unique())

2368

In [69]:
pat_iden_5_bis = pd.read_csv(("patronesIdenticos5Treat.csv"),sep= ",")

In [70]:
pat_iden_5_bis['Len_pattern'] = pat_iden_5_bis['Patron'].apply(lambda x: len(x))

In [71]:
check_coin_prot = pat_iden_5_bis[pat_iden_5_bis["Len_pattern"]>10]

In [132]:
# Obtener valores únicos de la columna "Patron"
valores_unicos_patron = pat_iden_5_bis["Patron"].unique()

# Inicializar contadores para valores iguales y diferentes
iguales_count = 0
diferentes_count = 0

# Iterar sobre cada valor único en la columna "Patron"
for p in valores_unicos_patron:
    # Obtener subconjunto del DataFrame donde el valor en la columna "Patron" coincide con el valor actual de la iteración
    subset = pat_iden_5_bis[pat_iden_5_bis["Patron"] == p]
    
    # Verificar si los valores en la columna "classesProt" son iguales para este subconjunto
    if subset["classesProt"].nunique() == 1:
        iguales_count += 1
    else:
        diferentes_count += 1

# Calcular porcentaje de valores iguales y diferentes
total_patrones = len(valores_unicos_patron)
porcentaje_iguales = (iguales_count / total_patrones) * 100
porcentaje_diferentes = (diferentes_count / total_patrones) * 100

# Mostrar resultados
print(f"Porcentaje de patrones con valores iguales en 'classesProt': {porcentaje_iguales:.2f}%")
print(f"Porcentaje de patrones con valores diferentes en 'classesProt': {porcentaje_diferentes:.2f}%")


Porcentaje de patrones con valores iguales en 'classesProt': 10.36%
Porcentaje de patrones con valores diferentes en 'classesProt': 89.64%


In [140]:
# Obtener valores únicos de la columna "Patron"
valores_unicos_patron = check_coin_prot["Patron"].unique()

# Inicializar contadores para valores iguales y diferentes
iguales_count = 0
diferentes_count = 0

# Iterar sobre cada valor único en la columna "Patron"
for p in valores_unicos_patron:
    # Obtener subconjunto del DataFrame donde el valor en la columna "Patron" coincide con el valor actual de la iteración
    subset = check_coin_prot[check_coin_prot["Patron"] == p]
    
    # Verificar si los valores en la columna "classesProt" son iguales para este subconjunto
    if subset["classesProt"].nunique() == 1:
        iguales_count += 1
    else:
        diferentes_count += 1

# Calcular porcentaje de valores iguales y diferentes
total_patrones = len(valores_unicos_patron)
porcentaje_iguales = (iguales_count / total_patrones) * 100
porcentaje_diferentes = (diferentes_count / total_patrones) * 100

# Mostrar resultados
print(f"Porcentaje de patrones con valores iguales en 'classesProt': {porcentaje_iguales:.2f}%")
print(f"Porcentaje de patrones con valores diferentes en 'classesProt': {porcentaje_diferentes:.2f}%")

Porcentaje de patrones con valores iguales en 'classesProt': 100.00%
Porcentaje de patrones con valores diferentes en 'classesProt': 0.00%


In [None]:
# how many in diseases

In [None]:
# how many in other cancers

In [72]:
pat_iden_10 = pd.read_csv(("patronesIdenticos10_treat.csv"),sep= ",")

In [73]:
pat_iden_10

Unnamed: 0,Patron,Proteina,Posiciones,classesProt
0,CEGCKGFF,O75469,[57],"['nucleic acid binding', 'receptor', 'transcri..."
1,CEGCKGFF,P10276,[104],"['nucleic acid binding', 'receptor', 'transcri..."
2,CEGCKGFF,P19793,[151],"['nucleic acid binding', 'receptor', 'transcri..."
3,CEGCKGFF,P28702,[221],"['nucleic acid binding', 'receptor', 'transcri..."
4,CEGCKGFF,P37231,[155],"['nucleic acid binding', 'receptor', 'transcri..."
...,...,...,...,...
14325,YM,Q71U36,[311],['cytoskeletal protein']
14326,YM,Q92769,[358],"['hydrolase', 'nucleic acid binding', 'oxidore..."
14327,YM,Q99460,[204],['enzyme modulator']
14328,YM,Q99835,[129],"['enzyme modulator', 'receptor', 'signaling mo..."


In [74]:
len(pat_iden_10["Patron"].unique())

2034

In [75]:
len(pat_iden_10["Proteina"].unique())

52

In [76]:
pat_iden_10['Len_pattern'] = pat_iden_10['Patron'].apply(lambda x: len(x))

In [77]:
pat_iden_10

Unnamed: 0,Patron,Proteina,Posiciones,classesProt,Len_pattern
0,CEGCKGFF,O75469,[57],"['nucleic acid binding', 'receptor', 'transcri...",8
1,CEGCKGFF,P10276,[104],"['nucleic acid binding', 'receptor', 'transcri...",8
2,CEGCKGFF,P19793,[151],"['nucleic acid binding', 'receptor', 'transcri...",8
3,CEGCKGFF,P28702,[221],"['nucleic acid binding', 'receptor', 'transcri...",8
4,CEGCKGFF,P37231,[155],"['nucleic acid binding', 'receptor', 'transcri...",8
...,...,...,...,...,...
14325,YM,Q71U36,[311],['cytoskeletal protein'],2
14326,YM,Q92769,[358],"['hydrolase', 'nucleic acid binding', 'oxidore...",2
14327,YM,Q99460,[204],['enzyme modulator'],2
14328,YM,Q99835,[129],"['enzyme modulator', 'receptor', 'signaling mo...",2


In [78]:
pat_iden_10.groupby('Patron').size().sort_values(ascending=False)

Patron
HA     21
FM     21
MN     20
TC     18
GW     18
       ..
EVT     5
EVS     5
PRG     5
PRR     5
YYD     5
Length: 2034, dtype: int64

In [79]:
pat_iden_10["Patron"].describe()

count     14330
unique     2034
top          HA
freq         21
Name: Patron, dtype: object

In [80]:
pat_iden_10_len4 = pat_iden_10[pat_iden_10["Len_pattern"]>= 4]

In [81]:
pat_iden_10_len4 = pat_iden_10_len4.drop_duplicates()

In [82]:
len(pat_iden_10_len4["Patron"].unique())

47

In [83]:
pat_iden_10_len4

Unnamed: 0,Patron,Proteina,Posiciones,classesProt,Len_pattern
0,CEGCKGFF,O75469,[57],"['nucleic acid binding', 'receptor', 'transcri...",8
1,CEGCKGFF,P10276,[104],"['nucleic acid binding', 'receptor', 'transcri...",8
2,CEGCKGFF,P19793,[151],"['nucleic acid binding', 'receptor', 'transcri...",8
3,CEGCKGFF,P28702,[221],"['nucleic acid binding', 'receptor', 'transcri...",8
4,CEGCKGFF,P37231,[155],"['nucleic acid binding', 'receptor', 'transcri...",8
...,...,...,...,...,...
245,WTYE,O15379,[306],"['hydrolase', 'nucleic acid binding', 'oxidore...",4
246,WTYE,P05023,[905],"['hydrolase', 'transporter']",4
247,WTYE,P50993,[902],"['hydrolase', 'transporter']",4
248,WTYE,Q13547,[311],"['hydrolase', 'nucleic acid binding', 'oxidore...",4


In [84]:
pat_iden_10.groupby('Proteina')['Patron'].count().sort_values(ascending=False)

Proteina
P42345    987
P50993    591
P05023    575
P22102    504
Q99460    494
P23921    392
Q99835    385
P11387    377
P31939    349
Q969P6    345
Q16881    340
P28702    322
P35354    318
P23219    315
P51649    298
P48443    295
Q13547    291
P19793    290
Q92769    288
P10696    285
P05091    280
Q9UGI9    278
P11474    275
Q9H4B7    273
P07437    272
P62508    260
P37231    260
Q99808    254
P01106    253
P10276    251
Q71U36    238
O75469    234
Q96SW2    231
P05121    227
O15379    226
P14324    224
P29466    215
P04637    213
P05412    186
Q04828    177
P04818    174
P12004    156
P28074    151
P24385    151
P42574    149
P01375    138
P20618    127
P10415    109
P30085    109
P00374     99
P62942     66
P54710     33
Name: Patron, dtype: int64

In [85]:
metrics_010 = pat_iden_10.groupby('Proteina')['Patron'].count().sort_values(ascending=False)

In [86]:
metrics_010.describe()

count     52.000000
mean     275.576923
std      153.045597
min       33.000000
25%      183.750000
50%      260.000000
75%      315.750000
max      987.000000
Name: Patron, dtype: float64

In [87]:
pat_iden_10_bis = pd.read_csv(("patronesIdenticos10Treat.csv"),sep= ",")

In [88]:
pat_iden_10_bis['Len_pattern'] = pat_iden_10_bis['Patron'].apply(lambda x: len(x))

In [147]:
# Obtener valores únicos de la columna "Patron"
valores_unicos_patron = pat_iden_10_bis["Patron"].unique()

# Inicializar contadores para valores iguales y diferentes
iguales_count = 0
diferentes_count = 0

# Iterar sobre cada valor único en la columna "Patron"
for p in valores_unicos_patron:
    # Obtener subconjunto del DataFrame donde el valor en la columna "Patron" coincide con el valor actual de la iteración
    subset = pat_iden_10_bis[pat_iden_10_bis["Patron"] == p]
    
    # Verificar si los valores en la columna "classesProt" son iguales para este subconjunto
    if subset["classesProt"].nunique() == 1:
        iguales_count += 1
    else:
        diferentes_count += 1

# Calcular porcentaje de valores iguales y diferentes
total_patrones = len(valores_unicos_patron)
porcentaje_iguales = (iguales_count / total_patrones) * 100
porcentaje_diferentes = (diferentes_count / total_patrones) * 100

# Mostrar resultados
print(f"Porcentaje de patrones con valores iguales en 'classesProt': {porcentaje_iguales:.2f}%")
print(f"Porcentaje de patrones con valores diferentes en 'classesProt': {porcentaje_diferentes:.2f}%")


Porcentaje de patrones con valores iguales en 'classesProt': 0.29%
Porcentaje de patrones con valores diferentes en 'classesProt': 99.71%


In [148]:
check_coin_prot_10 = pat_iden_10_bis[pat_iden_10_bis["Len_pattern"]>4]

In [149]:
# Obtener valores únicos de la columna "Patron"
valores_unicos_patron = check_coin_prot_10["Patron"].unique()

# Inicializar contadores para valores iguales y diferentes
iguales_count = 0
diferentes_count = 0

# Iterar sobre cada valor único en la columna "Patron"
for p in valores_unicos_patron:
    # Obtener subconjunto del DataFrame donde el valor en la columna "Patron" coincide con el valor actual de la iteración
    subset = check_coin_prot_10[check_coin_prot_10["Patron"] == p]
    
    # Verificar si los valores en la columna "classesProt" son iguales para este subconjunto
    if subset["classesProt"].nunique() == 1:
        iguales_count += 1
    else:
        diferentes_count += 1

# Calcular porcentaje de valores iguales y diferentes
total_patrones = len(valores_unicos_patron)
porcentaje_iguales = (iguales_count / total_patrones) * 100
porcentaje_diferentes = (diferentes_count / total_patrones) * 100

# Mostrar resultados
print(f"Porcentaje de patrones con valores iguales en 'classesProt': {porcentaje_iguales:.2f}%")
print(f"Porcentaje de patrones con valores diferentes en 'classesProt': {porcentaje_diferentes:.2f}%")

Porcentaje de patrones con valores iguales en 'classesProt': 100.00%
Porcentaje de patrones con valores diferentes en 'classesProt': 0.00%


In [None]:
## numero de patrones encontrados, en 0.1 y 0.05, cuantos de esos patrones están en las proteinas del cancer de pulmon
## sacar un % o alguna metrica, tipo de proteinas que tienen patrones en comun 
## hacer lo mismo para el dataset de cancer 

Los 47 patrones con mas de 4 aa detectados se encuentran todos ellos en proteinas de others cancer en el caso de 0.1 

De los 47 patrones con mas de 4 aa detectados en treatment se encuentran 46 de ellos en proteinas de lung cancer en el caso de 0.1. No se encuentra el patron FFKRT

En 0.05, 2200 de 2368 patrones se encuentran en others cancer 

En 0.05,2071 de 2368 patrones se encuentran en lung cancer 

In [None]:
### Summary plot 

In [None]:
## sankey lung cancer

In [89]:
import holoviews as hv

In [90]:
import bokeh
print(bokeh.__version__)  # Verifica la versión instalada


1.4.0


In [257]:
hv.extension('bokeh')

In [102]:
phases=pd.read_excel("phases.xlsx")

In [103]:
phases

Unnamed: 0,Source,Target,Value
0,Ocurrence 0.05,Lost,1802
1,Ocurrence 0.05,>= 4 aa 0.05,2368
2,Ocurrence 0.10,Lost,1987
3,Ocurrence 0.10,>= 4 aa 0.10,47
4,>= 4 aa 0.05,Non-small cell lung cancer,2071
5,>= 4 aa 0.05,Lost,297
6,>= 4 aa 0.10,Non-small cell lung cancer,46
7,>= 4 aa 0.10,Lost,1


In [256]:
hv.Sankey(phases)

In [105]:
from holoviews.plotting.util import process_cmap
cmap_list = process_cmap("glasbey_light")

In [157]:
cmap = {
    "Ocurrence 0.05": cmap_list[3], 
    "Ocurrence 0.10": cmap_list[122],
    "Lost": cmap_list[255],
    ">= 4 aa 0.05": cmap_list[69],
    ">= 4 aa 0.10": cmap_list[102],
    "Non-small cell lung cancer": cmap_list[5],
    
}


In [158]:
sankey1 = hv.Sankey(phases, kdims=["Source", "Target"], vdims=["Value"])

sankey1.opts(cmap=cmap, label_text_font_size='0pt',
                                 edge_color='Target', edge_line_width=0.3,
                                 node_alpha=1.0, node_width=40, node_sort=True,
                                 width=1500, height=800, bgcolor="white")

In [None]:
## other types of cancer

In [30]:
prot_by_cancer_01=pd.read_excel("ProtByPatternCanc01_summary.xlsx")

In [202]:
# Función para obtener los identificadores únicos manteniendo el id completo
def obtener_ids_unicos_manteniendo_id_completo(row):
    ids = re.findall(r"'(.*?)'", row)
    return list(set(ids))

In [70]:
prot_by_cancer_01['ids_unicos'] = prot_by_cancer_01['desease_id'].apply(obtener_ids_unicos_manteniendo_id_completo)

In [71]:
prot_by_cancer_01

Unnamed: 0.1,Unnamed: 0,pattern,proteins,desease_id,protein_names,proteins_treat,names_Treat,ids_unicos
0,0,CEGCKGFF,"[['P22736', 283], ['Q07869', 118], ['Q03181', ...","[['C0006142', 'C0007102', 'C0346647'], ['C0006...","[['NR4A1_HUMAN'], ['PPARA_HUMAN'], ['PPARD_HUM...","{'O75469': [57], 'P10276': [104], 'P19793': [1...","[['NR1I2_HUMAN'], ['RARA_HUMAN'], ['RXRA_HUMAN...","[C0006142, C0346647, C0007102]"
1,1,CQYCR,"[['P10588', 107], ['Q07869', 152], ['Q03181', ...","[['C0006142', 'C0007102'], ['C0006142', 'C0007...","[['N/A'], ['PPARA_HUMAN'], ['PPARD_HUMAN'], ['...","{'P10276': [139], 'P19793': [186], 'P28702': [...","[['RARA_HUMAN'], ['RXRA_HUMAN'], ['RXRB_HUMAN'...","[C0006142, C0007102]"
2,2,FFKRT,"[['O95718', 125], ['O00482', 108], ['P22736', ...","[['C0006142', 'C0007102', 'C0346647'], ['C0006...","[['ERR2_HUMAN'], ['NR5A2_HUMAN'], ['NR4A1_HUMA...","{'P11474': [101], 'P19793': [157], 'P28702': [...","[['ERR1_HUMAN'], ['RXRA_HUMAN'], ['RXRB_HUMAN'...","[C0006142, C0346647, C0007102]"
3,3,ADLR,"[['Q93084', 160], ['P13639', 797], ['P35575', ...","[['C0006142', 'C0007102'], ['C0006142'], ['C00...","[['AT2A3_HUMAN'], ['N/A'], ['G6PC1_HUMAN'], ['...","{'P05023': [200], 'P07437': [247], 'P37231': [...","[['AT1A1_HUMAN'], ['TBB5_HUMAN'], ['PPARG_HUMA...","[C0006142, C0010606, C0346647, C0007102]"
4,4,AGLA,"[['P01019', 27], ['P30556', 155], ['Q13796', 7...","[['C0006142', 'C0346647'], ['C0006142', 'C0346...","[['ANGT_HUMAN'], ['AGTR1_HUMAN'], ['N/A'], ['B...","{'P05412': [192], 'P31939': [43], 'P54710': [3...","[['JUN_HUMAN'], ['PUR9_HUMAN'], ['ATNG_HUMAN']...","[C0007102, C0006142, C0010606, C0346647]"
5,5,AKLL,"[['P78325', 640], ['O95782', 905], ['Q01484', ...","[['C0006142'], ['C0006142'], ['C0006142'], ['C...","[['ADAM8_HUMAN'], ['AP2A1_HUMAN'], ['ANK2_HUMA...","{'P05091': [381], 'P19793': [415], 'P28702': [...","[['ALDH2_HUMAN'], ['RXRA_HUMAN'], ['RXRB_HUMAN...","[C0006142, C0010606, C0346647, C0007102]"
6,6,AVAG,"[['Q99758', 498], ['P05141', 221], ['P13569', ...","[['C0006142'], ['C0006142'], ['C0006142', 'C00...","[['ABCA3_HUMAN'], ['ADT2_HUMAN'], ['CFTR_HUMAN...","{'P05023': [445], 'P11474': [181], 'P22102': [...","[['AT1A1_HUMAN'], ['ERR1_HUMAN'], ['PUR2_HUMAN...","[C0006142, C0010606, C0346647, C0007102]"
7,7,AVQE,"[['P21926', 80], ['Q92985', 279], ['P49959', 4...","[['C0006142', 'C0007102'], ['C0006142'], ['C00...","[['CD9_HUMAN'], ['IRF7_HUMAN'], ['MRE11_HUMAN'...","{'P19793': [203], 'P22102': [169], 'P28702': [...","[['RXRA_HUMAN'], ['PUR2_HUMAN'], ['RXRB_HUMAN'...","[C0006142, C0010606, C0346647, C0007102]"
8,8,DTLS,"[['Q13085', 56], ['P02771', 292], ['P50851', 9...","[['C0006142', 'C0007102'], ['C0006142', 'C0007...","[['ACACA_HUMAN'], ['FETA_HUMAN'], ['LRBA_HUMAN...","{'P05121': [253], 'P10276': [420], 'P10696': [...","[['PAI1_HUMAN'], ['RARA_HUMAN'], ['PPBN_HUMAN'...","[C0006142, C0010606, C0346647, C0007102]"
9,9,EAEK,"[['P25054', 1551], ['Q99728', 220], ['P20248',...","[['C0006142', 'C0007102', 'C0346647'], ['C0006...","[['APC_HUMAN'], ['BARD1_HUMAN'], ['CCNA2_HUMAN...","{'P14324': [359], 'P31939': [520], 'P42345': [...","[['FPPS_HUMAN'], ['PUR9_HUMAN'], ['MTOR_HUMAN'...","[C0006142, C0010606, C0346647, C0007102]"


In [75]:
# Crear un nuevo DataFrame con filas expandidas
expanded_rows = []
for _, row in prot_by_cancer_01.iterrows():
    for id_unico in row['ids_unicos']:
        new_row = row.copy()
        new_row['name_disease'] = id_unico
        expanded_rows.append(new_row)

expanded_df = pd.DataFrame(expanded_rows)

In [76]:
expanded_df

Unnamed: 0.1,Unnamed: 0,pattern,proteins,desease_id,protein_names,proteins_treat,names_Treat,ids_unicos,name_disease
0,0,CEGCKGFF,"[['P22736', 283], ['Q07869', 118], ['Q03181', ...","[['C0006142', 'C0007102', 'C0346647'], ['C0006...","[['NR4A1_HUMAN'], ['PPARA_HUMAN'], ['PPARD_HUM...","{'O75469': [57], 'P10276': [104], 'P19793': [1...","[['NR1I2_HUMAN'], ['RARA_HUMAN'], ['RXRA_HUMAN...","[C0006142, C0346647, C0007102]",C0006142
0,0,CEGCKGFF,"[['P22736', 283], ['Q07869', 118], ['Q03181', ...","[['C0006142', 'C0007102', 'C0346647'], ['C0006...","[['NR4A1_HUMAN'], ['PPARA_HUMAN'], ['PPARD_HUM...","{'O75469': [57], 'P10276': [104], 'P19793': [1...","[['NR1I2_HUMAN'], ['RARA_HUMAN'], ['RXRA_HUMAN...","[C0006142, C0346647, C0007102]",C0346647
0,0,CEGCKGFF,"[['P22736', 283], ['Q07869', 118], ['Q03181', ...","[['C0006142', 'C0007102', 'C0346647'], ['C0006...","[['NR4A1_HUMAN'], ['PPARA_HUMAN'], ['PPARD_HUM...","{'O75469': [57], 'P10276': [104], 'P19793': [1...","[['NR1I2_HUMAN'], ['RARA_HUMAN'], ['RXRA_HUMAN...","[C0006142, C0346647, C0007102]",C0007102
1,1,CQYCR,"[['P10588', 107], ['Q07869', 152], ['Q03181', ...","[['C0006142', 'C0007102'], ['C0006142', 'C0007...","[['N/A'], ['PPARA_HUMAN'], ['PPARD_HUMAN'], ['...","{'P10276': [139], 'P19793': [186], 'P28702': [...","[['RARA_HUMAN'], ['RXRA_HUMAN'], ['RXRB_HUMAN'...","[C0006142, C0007102]",C0006142
1,1,CQYCR,"[['P10588', 107], ['Q07869', 152], ['Q03181', ...","[['C0006142', 'C0007102'], ['C0006142', 'C0007...","[['N/A'], ['PPARA_HUMAN'], ['PPARD_HUMAN'], ['...","{'P10276': [139], 'P19793': [186], 'P28702': [...","[['RARA_HUMAN'], ['RXRA_HUMAN'], ['RXRB_HUMAN'...","[C0006142, C0007102]",C0007102
...,...,...,...,...,...,...,...,...,...
45,45,VVEP,"[['Q15109', 240], ['P35869', 630], ['O15111', ...","[['C0006142', 'C0007102'], ['C0006142', 'C0007...","[['RAGE_HUMAN'], ['N/A'], ['IKKA_HUMAN'], ['CO...","{'P07437': [178], 'P11474': [202], 'P42345': [...","[['TBB5_HUMAN'], ['ERR1_HUMAN'], ['MTOR_HUMAN'...","[C0006142, C0010606, C0346647, C0007102]",C0007102
46,46,WTYE,"[['P35475', 578], ['P00403', 105], ['P08922', ...","[['C0006142'], ['C0006142', 'C0007102', 'C0346...","[['IDUA_HUMAN'], ['COX2_HUMAN'], ['N/A'], ['TR...","{'O15379': [306], 'P05023': [905], 'P50993': [...","[['HDAC3_HUMAN'], ['AT1A1_HUMAN'], ['N/A'], ['...","[C0006142, C0010606, C0346647, C0007102]",C0006142
46,46,WTYE,"[['P35475', 578], ['P00403', 105], ['P08922', ...","[['C0006142'], ['C0006142', 'C0007102', 'C0346...","[['IDUA_HUMAN'], ['COX2_HUMAN'], ['N/A'], ['TR...","{'O15379': [306], 'P05023': [905], 'P50993': [...","[['HDAC3_HUMAN'], ['AT1A1_HUMAN'], ['N/A'], ['...","[C0006142, C0010606, C0346647, C0007102]",C0010606
46,46,WTYE,"[['P35475', 578], ['P00403', 105], ['P08922', ...","[['C0006142'], ['C0006142', 'C0007102', 'C0346...","[['IDUA_HUMAN'], ['COX2_HUMAN'], ['N/A'], ['TR...","{'O15379': [306], 'P05023': [905], 'P50993': [...","[['HDAC3_HUMAN'], ['AT1A1_HUMAN'], ['N/A'], ['...","[C0006142, C0010606, C0346647, C0007102]",C0346647


In [77]:
value_counts = expanded_df['name_disease'].value_counts().reset_index()

In [78]:
value_counts
# breast cancer, colon, pancreas, head_neck (este es el orden segun aparecen)

Unnamed: 0,name_disease,count
0,C0006142,47
1,C0007102,47
2,C0346647,46
3,C0010606,38


In [5]:
prot_by_cancer_005 = pd.read_excel("ProtByPatternCanc005_summary.xlsx")

In [81]:
prot_by_cancer_005['ids_unicos'] = prot_by_cancer_005['desease_id'].apply(obtener_ids_unicos_manteniendo_id_completo)

In [82]:
# Crear un nuevo DataFrame con filas expandidas
expanded_rows = []
for _, row in prot_by_cancer_005.iterrows():
    for id_unico in row['ids_unicos']:
        new_row = row.copy()
        new_row['name_disease'] = id_unico
        expanded_rows.append(new_row)

expanded_df_005 = pd.DataFrame(expanded_rows)

In [83]:
value_counts_005 = expanded_df_005['name_disease'].value_counts().reset_index()

In [84]:
value_counts_005
# breast cancer, colon, pancreas, head_neck (este es el orden segun aparecen)


Unnamed: 0,name_disease,count
0,C0006142,2193
1,C0007102,2138
2,C0346647,2023
3,C0010606,1589


In [159]:
phases_cancer =pd.read_excel("phases_cancers.xlsx")

In [160]:
phases_cancer

Unnamed: 0,Source,Target,Value
0,Ocurrence 0.05,Lost,1802
1,Ocurrence 0.05,>= 4 aa 0.05,2368
2,Ocurrence 0.10,Lost,1987
3,Ocurrence 0.10,>= 4 aa 0.10,47
4,>= 4 aa 0.05,other types cancer_ 0.05,2200
5,>= 4 aa 0.05,Lost,168
6,>= 4 aa 0.10,other types cancer_0.10,47
7,>= 4 aa 0.10,Lost,0
8,other types cancer_ 0.05,Breast cancer,2193
9,other types cancer_ 0.05,Colon cancer,2138


In [258]:
hv.Sankey(phases_cancer)

In [197]:
cmapone = { 
    "Ocurrence 0.10": cmap_list[122],
    "Lost": cmap_list[255],
    ">= 4 aa 0.10": cmap_list[102],
    "other types cancer_ 0.10": cmap_list[29],
    "Breast cancer": cmap_list[6],
    "Colon cancer": cmap_list[7],
    "Pancreas cancer": cmap_list[9],
    "Head_neck cancer": cmap_list[14],
    "Ocurrence 0.05": cmap_list[3], 
    ">= 4 aa 0.05": cmap_list[69],
    "other types cancer_ 0.05": cmap_list[28]
      
    
}


In [198]:
sankey1 = hv.Sankey(phases_cancer, kdims=["Source", "Target"], vdims=["Value"])

sankey1.opts(cmap=cmapone, label_text_font_size='0pt',
                                 edge_color='Target', edge_line_width=0.3,
                                 node_alpha=1.0, node_width=40, node_sort=True,
                                 width=1500, height=800, bgcolor="white")

In [195]:
phases_cancer_005 =pd.read_excel("phases_cancers.xlsx", sheet_name=1)

In [164]:
phases_cancer_005

Unnamed: 0,Source,Target,Value
0,Ocurrence 0.05,Lost,1802
1,Ocurrence 0.05,>= 4 aa 0.05,2368
2,>= 4 aa 0.05,other types cancer_ 0.05,2200
3,>= 4 aa 0.05,Lost,168
4,other types cancer_ 0.05,Breast cancer,2193
5,other types cancer_ 0.05,Colon cancer,2138
6,other types cancer_ 0.05,Pancreas cancer,2023
7,other types cancer_ 0.05,Head_neck cancer,1589


In [182]:
cmapdos = {
    "Ocurrence 0.05": cmap_list[3], 
    "Lost": cmap_list[255],
    ">= 4 aa 0.05": cmap_list[69],
    "other types cancer_ 0.05": cmap_list[28],
    "Breast cancer": cmap_list[6],
    "Colon cancer": cmap_list[7],
    "Pancreas cancer": cmap_list[9],
    "Head_neck cancer": cmap_list[14]
    
}


In [183]:
sankey2 = hv.Sankey(phases_cancer_005, kdims=["Source", "Target"], vdims=["Value"])

sankey2.opts(cmap=cmapdos, label_text_font_size='0pt',
                                 edge_color='Target', edge_line_width=0.3,
                                 node_alpha=1.0, node_width=40, node_sort=True,
                                 width=1500, height=800, bgcolor="white")

In [186]:
phases_cancer_010 =pd.read_excel("phases_cancers.xlsx", sheet_name=2)

In [189]:
phases_cancer_010

Unnamed: 0,Source,Target,Value
0,Ocurrence 0.10,Lost,1987
1,Ocurrence 0.10,>= 4 aa 0.10,47
2,>= 4 aa 0.10,other types cancer_0.10,47
3,>= 4 aa 0.10,Lost,0
4,other types cancer_0.10,Breast cancer,47
5,other types cancer_0.10,Colon cancer,47
6,other types cancer_0.10,Pancreas cancer,46
7,other types cancer_0.10,Head_neck cancer,38


In [192]:
cmaptres = { 
    "Ocurrence 0.10": cmap_list[122],
    "Lost": cmap_list[255],
    ">= 4 aa 0.10": cmap_list[102],
    "other types cancer_0.10": cmap_list[29],
    "Breast cancer": cmap_list[6],
    "Colon cancer": cmap_list[7],
    "Pancreas cancer": cmap_list[9],
    "Head_neck cancer": cmap_list[14]
    
    
}


In [193]:
sankey3 = hv.Sankey(phases_cancer_010, kdims=["Source", "Target"], vdims=["Value"])

sankey3.opts(cmap=cmaptres, label_text_font_size='0pt',
                                 edge_color='Target', edge_line_width=0.3,
                                 node_alpha=1.0, node_width=40, node_sort=True,
                                 width=1500, height=800, bgcolor="white")

In [None]:
### Immune 

In [199]:
prot_by_immune_01 = pd.read_excel("ProtByPatternImmun01_summary.xlsx")

In [203]:
prot_by_immune_01['ids_unicos'] = prot_by_immune_01['desease_id'].apply(obtener_ids_unicos_manteniendo_id_completo)

In [209]:
# Crear un nuevo DataFrame con filas expandidas
expanded_rows = []
for _, row in prot_by_immune_01.iterrows():
    for id_unico in row['ids_unicos']:
        new_row = row.copy()
        new_row['name_disease'] = id_unico
        expanded_rows.append(new_row)
        
expanded_df_010_imm = pd.DataFrame(expanded_rows)

In [211]:
value_counts = expanded_df_010_imm['name_disease'].value_counts().reset_index()

In [212]:
value_counts
# Rheumatoid Arthritis,Diabetes Mellitus, Insulin-Dependent, Multiple sclerosis, Lupus

Unnamed: 0,name_disease,count
0,C0003873,47
1,C0011854,45
2,C0026769,45
3,C0024141,43


In [213]:
prot_by_immune_005 = pd.read_excel("ProtByPatternImmun005_summary.xlsx")

In [214]:
prot_by_immune_005['ids_unicos'] = prot_by_immune_005['desease_id'].apply(obtener_ids_unicos_manteniendo_id_completo)

In [217]:
# Crear un nuevo DataFrame con filas expandidas
expanded_rows = []
for _, row in prot_by_immune_005.iterrows():
    for id_unico in row['ids_unicos']:
        new_row = row.copy()
        new_row['name_disease'] = id_unico
        expanded_rows.append(new_row)
        
expanded_df_005_imm = pd.DataFrame(expanded_rows)

In [218]:
value_counts = expanded_df_005_imm['name_disease'].value_counts().reset_index()

In [219]:
value_counts
##  Rheumatoid Arthritis, Lupus, Diabetes Mellitus, Insulin-Dependent, Multiple sclerosis 

Unnamed: 0,name_disease,count
0,C0003873,2038
1,C0024141,1930
2,C0011854,1904
3,C0026769,1877


In [222]:
phases_immune =pd.read_excel("phases_immune.xlsx")

In [223]:
phases_immune

Unnamed: 0,Source,Target,Value
0,Ocurrence 0.05,Lost,1802
1,Ocurrence 0.05,>= 4 aa 0.05,2368
2,Ocurrence 0.10,Lost,1987
3,Ocurrence 0.10,>= 4 aa 0.10,47
4,>= 4 aa 0.05,immune diseases_0.05,2131
5,>= 4 aa 0.05,Lost,237
6,>= 4 aa 0.10,immune diseases_0.10,47
7,immune diseases_0.05,RA,2038
8,immune diseases_0.05,LU,1930
9,immune diseases_0.05,DB,1904


In [259]:
hv.Sankey(phases_immune)

In [239]:
cmapcuatro = { 
    "Ocurrence 0.10": cmap_list[122],
    "Lost": cmap_list[255],
    ">= 4 aa 0.10": cmap_list[102],
    "immune diseases_0.10": cmap_list[38],
     "Ocurrence 0.05": cmap_list[3], 
    ">= 4 aa 0.05": cmap_list[69],
    "immune diseases_0.05": cmap_list[40],
    "RA": cmap_list[20],
    "MS": cmap_list[21],
    "LU": cmap_list[22],
    "DB": cmap_list[23]
   
    
    
}


In [240]:
sankey4 = hv.Sankey(phases_immune, kdims=["Source", "Target"], vdims=["Value"])

sankey4.opts(cmap=cmapcuatro, label_text_font_size='0pt',
                                 edge_color='Target', edge_line_width=0.3,
                                 node_alpha=1.0, node_width=40, node_sort=True,
                                 width=1500, height=800, bgcolor="white")

In [None]:
## Rare 

In [241]:
prot_by_rare_01 = pd.read_excel("ProtByPatternRare01_summary.xlsx")

In [242]:
prot_by_rare_01['ids_unicos'] = prot_by_rare_01['desease_id'].apply(obtener_ids_unicos_manteniendo_id_completo)

In [243]:
# Crear un nuevo DataFrame con filas expandidas
expanded_rows = []
for _, row in prot_by_rare_01.iterrows():
    for id_unico in row['ids_unicos']:
        new_row = row.copy()
        new_row['name_disease'] = id_unico
        expanded_rows.append(new_row)
        
expanded_df_010_rare = pd.DataFrame(expanded_rows)

In [244]:
value_counts = expanded_df_010_rare['name_disease'].value_counts().reset_index()

In [245]:
value_counts
# X-Linked Emery-Dreifuss Muscular Dystrophy, Acromegaloid facial, Lown-Ganong-Levine, DERMODISTORTIVE URTICARIA

Unnamed: 0,name_disease,count
0,C0751337,27
1,C0796280,3
2,C0024054,1
3,C1852146,1


In [246]:
prot_by_rare_005 = pd.read_excel("ProtByPatternRare005_summary.xlsx")

In [247]:
prot_by_rare_005['ids_unicos'] = prot_by_rare_005['desease_id'].apply(obtener_ids_unicos_manteniendo_id_completo)

In [248]:
# Crear un nuevo DataFrame con filas expandidas
expanded_rows = []
for _, row in prot_by_rare_005.iterrows():
    for id_unico in row['ids_unicos']:
        new_row = row.copy()
        new_row['name_disease'] = id_unico
        expanded_rows.append(new_row)
        
expanded_df_005_rare = pd.DataFrame(expanded_rows)

In [249]:
value_counts = expanded_df_005_rare['name_disease'].value_counts().reset_index()

In [250]:
value_counts
#X-Linked Emery-Dreifuss Muscular Dystrophy, Acromegaloid facial,urticaria, lown, neonatal, locked

Unnamed: 0,name_disease,count
0,C0751337,1090
1,C0796280,56
2,C1852146,25
3,C0024054,17
4,C0268059,12
5,C0023944,9


In [262]:
phases_rare =pd.read_excel("phases_rare.xlsx")

In [263]:
phases_rare

Unnamed: 0,Source,Target,Value
0,Ocurrence 0.05,Lost,1802
1,Ocurrence 0.05,>= 4 aa 0.05,2368
2,Ocurrence 0.10,Lost,1987
3,Ocurrence 0.10,>= 4 aa 0.10,47
4,>= 4 aa 0.05,rare diseases_0.05,1121
5,>= 4 aa 0.05,Lost,1247
6,>= 4 aa 0.10,rare diseases_0.10,28
7,>= 4 aa 0.10,Lost,19
8,rare diseases_0.05,X-Linked,1090
9,rare diseases_0.05,Acromega,56


In [277]:
cmapcinco = { 
    "Ocurrence 0.10": cmap_list[122],
    "Lost": cmap_list[255],
    ">= 4 aa 0.10": cmap_list[102],
    "rare diseases_0.10": cmap_list[49],
     "Ocurrence 0.05": cmap_list[3], 
    ">= 4 aa 0.05": cmap_list[69],
    "rare diseases_0.05": cmap_list[42],
    "X-Linked": cmap_list[70],
    "Acromega": cmap_list[61],
    "urticaria": cmap_list[62],
    "Lown": cmap_list[103],    
    "Neonatal": cmap_list[64],
    "Locked": cmap_list[75]
    
}


In [278]:
sankey5 = hv.Sankey(phases_rare, kdims=["Source", "Target"], vdims=["Value"])

sankey5.opts(cmap=cmapcinco, label_text_font_size='0pt',
                                 edge_color='Target', edge_line_width=0.3,
                                 node_alpha=1.0, node_width=40, node_sort=True,
                                 width=1500, height=800, bgcolor="white")

In [279]:
hv.Sankey(phases_rare)