In [1]:
import csv
import random
import re
import sys
from itertools import combinations
from itertools import product

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from networkx.algorithms import bipartite
from scipy.stats import norm
from tabulate import tabulate
# sys.path entries must be directories (or zip archives), not individual .py files,
# so register the folder that contains functions_network_medicine_schizo.py.
sys.path.append('schizophrenia')
import functions_network_medicine_schizo

### Data

In [2]:
# Node tables (tab-separated files): proteins, genes, drugs and diseases.
# NOTE(review): these paths start at '../data' while the link tables are read
# from 'data/' — confirm which root directory is intended.
pro = pd.read_table('../data/nodes/pro.tsv')
gen = pd.read_table('../data/nodes/gen.tsv')
dru = pd.read_table('../data/nodes/dru.tsv')
dis = pd.read_table('../data/nodes/dis.tsv')

In [3]:
# Link tables (tab-separated files) describing the edges between node types.
pro_pro = pd.read_table('data/links/pro_pro.tsv')
dis_gen = pd.read_table('data/links/dis_gen.tsv')
dse_sym = pd.read_table('data/dse_sym_limpio.tsv')
dis_dru_the = pd.read_table('data/links/dis_dru_the.tsv')
gen_pro = pd.read_table('data/links/gen_pro.tsv')
dru_pro = pd.read_table('data/links/dru_pro.tsv')

In [4]:
# Precomputed shortest path lengths (SPL) between all PPI nodes;
# the 'Source' column is used as the row index.
spl = pd.read_csv('files/SPL PPI.csv', index_col='Source')

### Interactome

In [5]:
# Build the interactome: one edge per row of pro_pro, linking column 'prA' to column 'prB'.
G_ppi = nx.from_pandas_edgelist(pro_pro, source='prA', target='prB')

### Identification of disease module

In [None]:
# Per-disease results, keyed by disease ID
dis_gen_dict = {}  # Seed genes
dis_pro_ppi = {}  # Disease proteins found in the interactome
dis_lcc = {}  # Disease module (largest connected component)
dis_lcc_dp = {}  # Statistical validation with dp=True, indexed as [mean, std, z-score]
dis_lcc_ndp = {}  # Statistical validation with dp=False, indexed as [mean, std, z-score]

# Placeholder statistics for diseases without any protein in the interactome, so
# that the summary table below can still be built (NaN instead of an IndexError).
NO_MODULE_STATS = [np.nan, np.nan, np.nan]

dis_total = dis_dru_the['dis'].unique().tolist()

# The loop variable is deliberately not called `dis`, which would overwrite the
# disease node table loaded under that name.
for disease in dis_total:
    # Seed genes
    genes = functions_network_medicine_schizo.genes_dis(disease, dis_gen)
    dis_gen_dict[disease] = genes

    # Disease proteins
    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)

    # Disease proteins in interactome
    # NOTE(review): assumed to be a sized collection of protein IDs — confirm in the helper module.
    prots_interactome = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)
    dis_pro_ppi[disease] = prots_interactome

    # Subnetwork induced by the disease proteins
    SG_dis = G_ppi.subgraph(prots_interactome)

    if SG_dis:  # a graph is truthy when it has at least one node
        lcc = functions_network_medicine_schizo.lcc(SG_dis)
        dis_lcc[disease] = lcc

        # Statistical validation ndp
        dis_lcc_ndp[disease] = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=False)

        # Statistical validation dp
        dis_lcc_dp[disease] = functions_network_medicine_schizo.lcc_simulation(SG_dis, lcc, G_ppi, dp=True)
    else:
        dis_lcc[disease] = []
        dis_lcc_ndp[disease] = NO_MODULE_STATS
        dis_lcc_dp[disease] = NO_MODULE_STATS

# Sizes used by several columns of the summary table
n_ppi = {d: len(dis_pro_ppi[d]) for d in dis_total}
n_lcc = {d: len(dis_lcc[d]) for d in dis_total}

results = {
    "ID": dis_total,
    "Seed genes": [len(dis_gen_dict[d]) for d in dis_total],
    "Genes in PPI": [n_ppi[d] for d in dis_total],
    "Genes in LCC": [n_lcc[d] for d in dis_total],
    # Fraction of the interactome proteins that fall inside the module (NaN when there are none)
    "Relative LCC": [n_lcc[d] / n_ppi[d] if n_ppi[d] else np.nan for d in dis_total],
    "Mean random LCC (ndp)": [dis_lcc_ndp[d][0] for d in dis_total],
    "Std random LCC (ndp)": [dis_lcc_ndp[d][1] for d in dis_total],
    "Z-score (ndp)": [dis_lcc_ndp[d][2] for d in dis_total],
    "Mean random LCC (dp)": [dis_lcc_dp[d][0] for d in dis_total],
    "Std random LCCs (dp)": [dis_lcc_dp[d][1] for d in dis_total],
    "Z-score (dp)": [dis_lcc_dp[d][2] for d in dis_total]
}

dis_module_df = pd.DataFrame(results)
dis_module_df.to_csv("Module.csv", index=False)

### Disease filtering

In [None]:
# Keep the diseases whose module size is significant under both validations
# (z-score above 1.65 — presumably the one-sided 5% level; confirm).
significant_dp = dis_module_df['Z-score (dp)'].gt(1.65)
significant_ndp = dis_module_df['Z-score (ndp)'].gt(1.65)
significative = dis_module_df[significant_dp & significant_ndp]

In [42]:
# Keep only the diseases whose module (LCC) contains at least 15 interactome proteins.
# A boolean mask is used instead of dropping rows inside a loop: the positional
# counter from enumerate() does not match the index labels of an already filtered
# frame, so label-based lookups with it select the wrong row or raise KeyError.
MIN_LCC_SIZE = 15

significative = significative[significative["Genes in LCC"] >= MIN_LCC_SIZE].reset_index(drop=True)
significative.to_csv("Module signif.csv", index=False)

### Closest distance disease-drug

In [8]:
# Load the table of significant disease modules from 'Module signif.csv'.
significative = pd.read_csv('Module signif.csv', sep=",")

In [9]:
# Drugs that have at least one target in dru_pro AND at least one disease in
# dis_dru_the (i.e. drugs for which we know what diseases they treat).
drug_list_total = set(dru_pro["dru"]) & set(dis_dru_the["dru"])

# Drugs and their targets
# NOTE(review): assumes the helper module exposes `targets(drug_set, dru_pro)` — confirm.
targets_total = functions_network_medicine_schizo.targets(drug_list_total, dru_pro)

In [None]:
results = []  # one record per (disease, drug) pair with the observed closest distance

# NOTE(review): iterates over the significant diseases and rebuilds each module the
# same way as the random-distance computation; confirm this is the intended set.
for disease in significative["ID"]:
    genes = functions_network_medicine_schizo.genes_dis(disease, dis_gen)
    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)
    pro_ppi = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)
    SG_dis = G_ppi.subgraph(pro_ppi)  # disease subnetwork
    lcc = functions_network_medicine_schizo.lcc(SG_dis)  # disease module

    # Table with the drugs and their closest distance ('dc') to the disease module
    proximity_obs = functions_network_medicine_schizo.proximity(targets_total, spl, lcc, G_ppi)

    # NOTE(review): the name of the drug column depends on the helper module
    # version ('Drugs' in the English one, 'Fármacos' in the Spanish one) — confirm.
    drug_col = next((col for col in ("Drugs", "Fármacos") if col in proximity_obs.columns), None)
    if drug_col is None:
        raise KeyError(f"No drug column found in proximity output: {list(proximity_obs.columns)}")

    for drug, dc in zip(proximity_obs[drug_col], proximity_obs["dc"]):
        results.append({"ID": disease, "Drugs": drug, "Closest distance": dc})

# Explicit columns keep the CSV header stable even when no record was produced
df_distance = pd.DataFrame(results, columns=["ID", "Drugs", "Closest distance"])

df_distance.to_csv("Closest distance.csv", index=False)

### Proximity disease-drug

In [13]:
# Random target modules: the helper is called with 1000, the drug targets and the interactome.
n_random_modules = 1000
target_modules_list_random_drugs = functions_network_medicine_schizo.calculate_random_drug_target_modules(
    n_random_modules,
    targets_total,
    G_ppi,
)

In [None]:
# Start from an empty results file that only contains the header; the rows of
# each disease are appended as soon as they are computed, so partial results
# are kept on disk if the run is interrupted.
with open("Rand_closest_distance_rand.csv", 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "Drugs", "Dc_mean", "Dc_std"])

# Closest distance between random sets of disease module and target module
for disease in significative["ID"]:
    genes = functions_network_medicine_schizo.genes_dis(disease, dis_gen)
    prots = functions_network_medicine_schizo.pro_gen_dict(genes, gen_pro)
    pro_ppi = functions_network_medicine_schizo.gen_pro_PPI(prots, pro_pro)
    SG_dis = G_ppi.subgraph(pro_ppi)
    lcc = functions_network_medicine_schizo.lcc(SG_dis)

    # Uses the random target modules precomputed in `target_modules_list_random_drugs`
    proximity_rand = functions_network_medicine_schizo.proximity_random(
        targets_total, spl, lcc, G_ppi, 1000, target_modules_list_random_drugs
    )

    with open("Rand_closest_distance_rand.csv", 'a', newline='') as file:
        writer = csv.writer(file)
        for _, row in proximity_rand.iterrows():
            writer.writerow([disease, row["Drugs"], row["dc_mean"], row["dc_std"]])


In [16]:
# Random-expectation statistics per (disease, drug) pair, read from 'Rand_closest_distance_rand.csv'.
rand_prox = pd.read_csv("Rand_closest_distance_rand.csv", sep = ",")

In [51]:
# Observed closest distance per (disease, drug) pair, read from 'Closest distance.csv'.
distance=pd.read_csv("Closest distance.csv", sep=",")

In [57]:
# Pair every observed closest distance with its random-expectation statistics
# (join of `distance` and `rand_prox` on the ID and Drugs columns).
df_merged = distance.merge(rand_prox, on=['ID', 'Drugs'])

# z-score of the observed distance with respect to the random distribution;
# a zero Dc_std produces inf/NaN here.
observed_minus_expected = df_merged['Closest distance'].sub(df_merged['Dc_mean'])
df_merged['Dc_zscore'] = observed_minus_expected.div(df_merged['Dc_std'])


def _treatment_of(row):
    """Return the treatment label that the helper module assigns to one (disease, drug) row."""
    return functions_network_medicine_schizo.determine_treatment(row, dis_dru_the)


# New column indicating if each drug is used for the treatment of the disease
df_merged['Treatment'] = df_merged.apply(_treatment_of, axis=1)


In [62]:
# Persist the merged proximity table (without the index); the 'results' directory must already exist.
df_merged.to_csv("results/Proximity_results.csv", index = False)

In [64]:
# Reload the proximity table from the same location it was written to ('results/').
proximity = pd.read_csv("results/Proximity_results.csv", sep=",")

### Statistical analysis of the proximity results across all the diseases

In [117]:
# Keep only the columns needed for the summary
relevant_columns = ['Closest distance', 'Dc_zscore', 'Treatment']
df_relevant = proximity[relevant_columns]

# Descriptive statistics of both measures, split by the 'Treatment' label
by_treatment = df_relevant.groupby('Treatment')
describe_closest_distance = by_treatment['Closest distance'].describe()
describe_dc_zscore = by_treatment['Dc_zscore'].describe()


In [118]:
# Show the descriptive statistics of the closest distance for each 'Treatment' group.
print("\nDescribe Closest distance:")
describe_closest_distance.head()


Describe Closest distance:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
unknown,516993.0,1.741093,0.590947,0.0,1.4,2.0,2.0,4.0
yes,12162.0,1.473756,0.691416,0.0,1.0,1.6,2.0,3.0


In [119]:
# Show the descriptive statistics of the proximity z-score for each 'Treatment' group.
print("\nDescribe Dc_zscore:")
describe_dc_zscore.head()


Describe Dc_zscore:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
unknown,510797.0,-0.159867,4.989091,-187.636997,-0.70903,0.123404,0.733509,164.760255
yes,12053.0,-1.295405,8.074283,-163.309391,-1.741129,-0.465569,0.400364,104.948135
