import os

import networkx as nx
from deepsnap.hetero_graph import HeteroGraph
import pandas as pd
import numpy as np
import torch

# Relative locations of the node and link tables read by DISNETConstructor.
_NODES_DIR = 'graphData/data/nodes/'
_LINKS_DIR = 'graphData/data/links/'


def _read_tsv(path):
    """Load a tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep='\t')


class DISNETConstructor:
    """Build the DISNET heterogeneous graph (networkx + DeepSNAP) from TSV files.

    All input tables are read from paths relative to the current working
    directory (``graphData/...``). Every node table must expose an ``id``
    column; the second column of a node table is used as the display name by
    ``getNid`` and ``decodePredictions`` (presumably ``name`` -- verify
    against the data files).
    """

    # Node types, in the order in which global node ids are assigned.
    _BASE_NODE_TYPES = ('disorder', 'drug')
    _FULL_NODE_TYPES = ('disorder', 'drug', 'pathway', 'protein', 'drug-drug-interaction')

    # Width of the constant (all-ones) feature vector attached to every node.
    _FEATURE_DIM = 100

    # NOTE(review): hard-coded row limit applied to the disorder-drug table
    # after the RepoDB rows are removed. Presumably it trims the RepoDB-only
    # rows that survive `drop_duplicates(keep=False)` -- TODO confirm and
    # replace with a proper anti-join.
    _DIS_DRU_THE_ROW_LIMIT = 50355

    def __init__(self, device='cpu'):
        """Store the torch device on which the RepoDB index tensors are created."""
        self.device = device

    @staticmethod
    def getNodeInfo(full=True):
        """Read the node tables.

        Returns ``(dis, dru, pat, pro, ddi)`` when ``full`` is true, otherwise
        ``(dis, dru)``; each element is a DataFrame.
        """
        names = ('dis', 'dru', 'pat', 'pro', 'ddi') if full else ('dis', 'dru')
        return tuple(_read_tsv(_NODES_DIR + name + '.tsv') for name in names)

    @staticmethod
    def getEdgeInfo(full=True):
        """Read the link tables.

        Returns ``(dis_dru_the, dis_sym)`` when ``full`` is false; when it is
        true the tuple is extended with ``dis_pat, dis_pro, dru_dru, dru_pro,
        dru_sym_ind, dru_sym_sef, pro_pat, pro_pro, ddi_phe, ddi_dru``.
        """
        names = ['dis_dru_the', 'dis_sym']
        if full:
            names += ['dis_pat', 'dis_pro', 'dru_dru', 'dru_pro', 'dru_sym_ind',
                      'dru_sym_sef', 'pro_pat', 'pro_pro', 'ddi_phe', 'ddi_dru']
        return tuple(_read_tsv(_LINKS_DIR + name + '.tsv') for name in names)

    @staticmethod
    def _index_nodes(frames):
        """Assign a global node id to every row of every node table.

        ``frames`` maps node type -> DataFrame, in the order ids are assigned.
        Each frame gains the columns ``NID`` (row index within its own table),
        ``node_type`` and ``node_id`` (row position in the concatenated table).

        Returns ``(nodes_flat, id_maps)`` where ``nodes_flat`` is the
        concatenated table and ``id_maps[node_type]`` maps the ``id`` column
        to ``node_id``.
        """
        nodes_flat = pd.concat(list(frames.values()), keys=list(frames),
                               names=['node_type', 'NID']).reset_index()
        nodes_flat['node_id'] = nodes_flat.index

        id_maps = {}
        for ntype, frame in frames.items():
            frame['NID'] = frame.index
            frame['node_type'] = ntype
            of_type = nodes_flat.loc[nodes_flat['node_type'] == ntype]
            frame['node_id'] = of_type.reset_index(drop=True)['node_id']
            id_maps[ntype] = frame[['id', 'node_id']].set_index('id').to_dict()['node_id']
        return nodes_flat, id_maps

    def _repo_tensors(self, repo, drug_offset):
        """Convert mapped RepoDB links into a ``(disorder, drug)`` index tensor pair.

        Drug indices are made drug-local by subtracting ``drug_offset`` (the
        number of disorder nodes, which precede the drugs in the global
        numbering). Raises if any id could not be mapped (NaN cannot be cast
        to int32).
        """
        dis_idx = repo['disNID'].astype(np.int32).to_numpy()
        dru_idx = repo['druNID'].astype(np.int32).to_numpy() - drug_offset
        return (torch.tensor(dis_idx, dtype=torch.int32, device=self.device),
                torch.tensor(dru_idx, dtype=torch.int32, device=self.device))

    @staticmethod
    def _both_ways(frame, forward, backward, src, dst, weighted=False):
        """Return forward and reverse edge rows for one link table.

        Rows are ``[src, dst]`` lists, with the ``w`` column appended when
        ``weighted`` is true.
        """
        extra = ['w'] if weighted else []
        return {
            forward: frame[[src, dst] + extra].values.tolist(),
            backward: frame[[dst, src] + extra].values.tolist(),
        }

    def DISNETHeterographDeepSnap(self, full=False, withoutRepoDB=True):
        """Build the DeepSNAP ``HeteroGraph``.

        Args:
            full: when true, include pathway, protein and drug-drug-interaction
                nodes and all their link tables; otherwise only disorders,
                drugs and the ``dis_dru_the`` / ``dis_sym`` links.
            withoutRepoDB: when true, remove the RepoDB disorder-drug links
                from the graph and return them as index tensors.

        Returns:
            ``(hetero, repo_links)`` where ``repo_links`` is a pair of int32
            tensors ``(disorder index, drug-local index)`` on ``self.device``,
            or an empty dict when ``withoutRepoDB`` is false.
        """
        # -----------------------------
        # NODES DATA
        # -----------------------------
        ntypes = self._FULL_NODE_TYPES if full else self._BASE_NODE_TYPES
        frames = dict(zip(ntypes, self.getNodeInfo(full)))
        nodes_flat, id_maps = self._index_nodes(frames)
        dis_ids = id_maps['disorder']
        dru_ids = id_maps['drug']

        # Constant all-ones features, one row per node of each type.
        feats = {
            ntype: torch.ones((len(frame.index), self._FEATURE_DIM), dtype=torch.float32)
            for ntype, frame in frames.items()
        }

        # Add nodes to the graph. Columns are read by name so extra columns in
        # a node table cannot shift the node id.
        G = nx.DiGraph()
        for node_type, nid, node_id in zip(nodes_flat['node_type'], nodes_flat['NID'],
                                           nodes_flat['node_id']):
            G.add_node(node_id, node_type=node_type,
                       node_feature=feats[node_type][nid], node_label=nid)

        # -----------------------------
        # LINKS DATA
        # -----------------------------
        # (frame, source id column, new node-id column, id -> node_id map)
        if full:
            (dis_dru_the, dis_sym, dis_pat, dis_pro, dru_dru, dru_pro, dru_sym_ind,
             dru_sym_sef, pro_pat, pro_pro, ddi_phe, ddi_dru) = self.getEdgeInfo(full)
            pat_ids = id_maps['pathway']
            pro_ids = id_maps['protein']
            ddi_ids = id_maps['drug-drug-interaction']
            conversions = [
                (dis_pat, 'dis', 'disNID', dis_ids), (dis_pat, 'pat', 'patNID', pat_ids),
                (dis_pro, 'dis', 'disNID', dis_ids), (dis_pro, 'pro', 'proNID', pro_ids),
                (dru_dru, 'drA', 'druANID', dru_ids), (dru_dru, 'drB', 'druBNID', dru_ids),
                (dru_pro, 'dru', 'druNID', dru_ids), (dru_pro, 'pro', 'proNID', pro_ids),
                (dru_sym_ind, 'dru', 'druNID', dru_ids), (dru_sym_ind, 'sym', 'symNID', dis_ids),
                (dru_sym_sef, 'dru', 'druNID', dru_ids), (dru_sym_sef, 'sym', 'symNID', dis_ids),
                (pro_pat, 'pro', 'proNID', pro_ids), (pro_pat, 'pat', 'patNID', pat_ids),
                (pro_pro, 'prA', 'proANID', pro_ids), (pro_pro, 'prB', 'proBNID', pro_ids),
                (ddi_phe, 'ddi', 'ddiNID', ddi_ids), (ddi_phe, 'phe', 'pheNID', dis_ids),
                (ddi_dru, 'ddi', 'ddiNID', ddi_ids), (ddi_dru, 'dru', 'druNID', dru_ids),
            ]
        else:
            dis_dru_the, dis_sym = self.getEdgeInfo(full)
            conversions = []
        conversions += [
            (dis_dru_the, 'dis', 'disNID', dis_ids), (dis_dru_the, 'dru', 'druNID', dru_ids),
            (dis_sym, 'dis', 'disNID', dis_ids), (dis_sym, 'sym', 'symNID', dis_ids),
        ]

        # Convert ids to node ids
        for frame, column, target, mapping in conversions:
            frame[target] = frame[column].map(mapping)

        if withoutRepoDB:
            repo = _read_tsv('graphData/testData/drugdis_repodb_ALLlinks.tsv')
            repo['disNID'] = repo['dis'].map(dis_ids)
            repo['druNID'] = repo['dru'].map(dru_ids)

            # Delete the RepoDB edges from the graph: rows present in both
            # tables are dropped entirely by keep=False.
            dis_dru_the = pd.concat([dis_dru_the, repo]).drop_duplicates(keep=False)
            dis_dru_the = dis_dru_the[:self._DIS_DRU_THE_ROW_LIMIT]

            # Disorders come first in the global numbering, so their count is
            # the offset of the first drug node.
            dis_dru_the_repoDBAll = self._repo_tensors(repo, len(frames['disorder'].index))
        else:
            dis_dru_the_repoDBAll = {}

        # Edge rows per edge type; insertion order is the order edges are added
        # (a later type overwrites the attributes of an existing (u, v) edge).
        edges = {}
        edges.update(self._both_ways(dis_dru_the, 'dis_dru_the', 'dru_dis_the', 'disNID', 'druNID'))
        edges.update(self._both_ways(dis_sym, 'dis_sym', 'sym_dis', 'disNID', 'symNID'))
        if full:
            edges.update(self._both_ways(dis_pat, 'dis_pat', 'pat_dis', 'disNID', 'patNID'))
            edges.update(self._both_ways(dis_pro, 'dis_pro', 'pro_dis', 'disNID', 'proNID',
                                         weighted=True))
            edges.update(self._both_ways(dru_dru, 'druA_druB', 'druB_druA', 'druANID', 'druBNID'))
            edges.update(self._both_ways(dru_pro, 'dru_pro', 'pro_dru', 'druNID', 'proNID'))
            edges.update(self._both_ways(dru_sym_ind, 'dru_sym_ind', 'sym_dru_ind',
                                         'druNID', 'symNID'))
            edges.update(self._both_ways(dru_sym_sef, 'dru_sym_sef', 'sym_dru_sef',
                                         'druNID', 'symNID', weighted=True))
            edges.update(self._both_ways(pro_pat, 'pro_pat', 'pat_pro', 'proNID', 'patNID'))
            edges.update(self._both_ways(pro_pro, 'proA_proB', 'proB_proA', 'proANID', 'proBNID'))
            edges.update(self._both_ways(ddi_phe, 'ddi_phe', 'phe_ddi', 'ddiNID', 'pheNID'))
            edges.update(self._both_ways(ddi_dru, 'ddi_dru', 'dru_ddi', 'ddiNID', 'druNID'))

        for edge_t, rows in edges.items():
            for row in rows:
                # A third element, when present, is the edge weight.
                if len(row) > 2:
                    G.add_edge(int(row[0]), int(row[1]), edge_feature=row[2], edge_type=edge_t)
                else:
                    G.add_edge(int(row[0]), int(row[1]), edge_type=edge_t)

        # --------------------------
        # HETEROGRAPH
        # ------------------------
        hetero = HeteroGraph(G)

        message = 'The DISNET DEEPSNAP HETEROGRAPH'
        if full:
            message = message + ' (complete)'
        if withoutRepoDB:
            message = message + ' (without RepoDB edges)'
        print(message + ' has been generated! :)')

        return hetero, dis_dru_the_repoDBAll

    # ---------------------------------------------------------------------------------------------------------------------------------------------------
    # UTILITY FUNCTIONS
    # ---------------------------------------------------------------------------------------------------------------------------------------------------
    def allRepoDB(self):
        """Return all RepoDB disorder-drug links as index tensors.

        Returns a pair of int32 tensors ``(disorder index, drug-local index)``
        on ``self.device``, using the disorder/drug-only node numbering.
        """
        frames = dict(zip(self._BASE_NODE_TYPES, self.getNodeInfo(full=False)))
        _, id_maps = self._index_nodes(frames)

        # NOTE(review): DISNETHeterographDeepSnap reads this table from
        # 'graphData/testData/'; confirm which location is intended.
        repo = _read_tsv('graphData/drugdis_repodb_ALLlinks.tsv')

        # Convert ids to node ids
        repo['disNID'] = repo['dis'].map(id_maps['disorder'])
        repo['druNID'] = repo['dru'].map(id_maps['drug'])

        return self._repo_tensors(repo, len(frames['disorder'].index))

    def getNid(self, elem, type):
        """Look up a node by its ``id`` value.

        Args:
            elem: value of the ``id`` column to look up.
            type: ``'drug'`` or ``'disease'``.

        Returns:
            ``(index, second_column_value)`` where ``index`` is the row
            position in the node table, or ``None`` for any other ``type``.

        Raises:
            KeyError: if ``elem`` is not present in the table.
        """
        tables = {
            'drug': 'graphData/data/nodes/dru.tsv',
            'disease': 'graphData/data/nodes/dis.tsv',
        }
        path = tables.get(type)
        if path is None:
            return None

        nodes = _read_tsv(path)
        nodes['NID'] = nodes.index
        index = nodes[['id', 'NID']].set_index('id').to_dict()['NID'][elem]
        # Explicit positional access: integer keys on a labelled Series are
        # deprecated in pandas.
        return index, nodes.iloc[index].iloc[1]

    def decodePredictions(self, list, type, n, prepared=False, name=''):
        """Build, print and save a table of the ``n`` highest-scoring predictions.

        Args:
            list: when ``prepared`` is false, an iterable of
                ``(head, tails, preds)`` triples where each element of
                ``tails`` exposes ``.item()``; when ``prepared`` is true, a
                sequence of ``[head, tail, pred]`` items. Heads and tails are
                row positions in the disorder and drug node tables.
            type: edge type; only ``'dis_dru_the'`` is supported.
            n: number of top predictions to keep.
            prepared: see ``list``.
            name: extra tag inserted in the output file name.

        Returns:
            The DataFrame that was written to
            ``./results/<type>_<n><name>_table.csv``. As in the original
            implementation every column, including ``pred``, holds strings
            whenever the table is non-empty.

        Raises:
            ValueError: if ``type`` is not supported.
        """
        if type != 'dis_dru_the':
            raise ValueError("Unsupported edge type %r; only 'dis_dru_the' is available." % (type,))

        out_path = 'results/' + type + '_' + str(n) + name + '_' + 'table.csv'

        # Read the data
        h, t = self.getNodeInfo(full=False)

        if prepared:
            decoded = list
            vals = [item[2] for item in list]
        else:
            decoded = []
            vals = []
            for i, (head, tails, preds) in enumerate(list):
                decoded.extend([head, item.item()] for item in tails)
                vals.extend(preds)
                if i % 1000 == 0:
                    print(' Decoded', i, 'elements.')

        vals = np.array(vals)
        # Indices of the n largest scores, best first.
        topN = np.argsort(vals)[::-1][:n]

        complete = []
        for i in topN:
            headData = h.iloc[int(decoded[i][0])]
            tailData = t.iloc[int(decoded[i][1])]
            complete.append([headData.iloc[0], headData.iloc[1],
                             tailData.iloc[0], tailData.iloc[1], vals[i]])
        # reshape keeps the five-column layout even when nothing was selected.
        complete = np.array(complete).reshape(-1, 5)

        headType, tailType, _ = type.split('_', 2)
        df = pd.DataFrame(
            {headType: complete[:, 0],
             headType + ' name': complete[:, 1],
             tailType: complete[:, 2],
             tailType + ' name': complete[:, 3],
             'pred': complete[:, 4]
             })

        print('Table for', type, 'new edge predictions:')
        # Note: this changes pandas display options process-wide.
        pd.set_option('display.max_rows', n, 'display.max_columns', None, 'expand_frame_repr', False)
        # Show the first and last 50 rows; short tables are shown once, whole.
        preview = df if len(df) <= 100 else pd.concat([df[:50], df[-50:]])
        print(preview)

        print(' Table saved as:', out_path)
        # Make sure the output directory exists so the result is not lost.
        os.makedirs('./results', exist_ok=True)
        df.to_csv('./' + out_path)

        return df