# heterograph_construction.py

import networkx as nx
from deepsnap.hetero_graph import HeteroGraph
import pandas as pd
import numpy as np
import torch

"""
Class that contains all the information related to the DISNET's graph.
"""
class DISNETConstructor:
    """
    It instantiates the constructor.
    Input:
        device: Location of the graph data (cpu or gpu).
    """
    def __init__(self, device='cpu'):
        """Create the constructor and remember where tensors should live.

        Args:
            device: Passed as ``map_location`` / ``device`` to the torch calls
                made by the other methods (defaults to ``'cpu'``).
        """
        self.device = device

    """
    It reads the nodes' TSVs.
    Input:
        full: Whether to read all the nodes information or just the simple graph ones.
    Output:
        Pandas dataframes with the nodes' information.
    """
    @staticmethod
    def getNodeInfo(full=True):
        dis = pd.read_csv('data/nodes/dis.tsv', sep='\t')
        dru = pd.read_csv('data/nodes/dru.tsv', sep='\t')

        if full:
            pat = pd.read_csv('data/nodes/pat.tsv', sep='\t')
            pro = pd.read_csv('data/nodes/pro.tsv', sep='\t')
            ddi = pd.read_csv('data/nodes/ddi.tsv', sep='\t')

            return dis, dru, pat, pro, ddi

        else:
            return dis, dru

    """
    It reads the edges' TSVs.
    Input:
        full: Whether to read all the edges information or just the simple graph ones.
    Output:
        Pandas dataframes with the edges' information.
    """
    @staticmethod
    def getEdgeInfo(full=True):
        dis_dru_the = pd.read_csv('data/links/dis_dru_the.tsv', sep='\t')
        dis_sym = pd.read_csv('data/links/dis_sym.tsv', sep='\t')

        if full:
            dis_pat = pd.read_csv('data/links/dis_pat.tsv', sep='\t')
            dis_pro = pd.read_csv('data/links/dis_pro.tsv', sep='\t')
            dru_dru = pd.read_csv('data/links/dru_dru.tsv', sep='\t')
            dru_pro = pd.read_csv('data/links/dru_pro.tsv', sep='\t')
            dru_sym_ind = pd.read_csv('data/links/dru_sym_ind.tsv', sep='\t')
            dru_sym_sef = pd.read_csv('data/links/dru_sym_sef.tsv', sep='\t')
            pro_pat = pd.read_csv('data/links/pro_pat.tsv', sep='\t')
            pro_pro = pd.read_csv('data/links/pro_pro.tsv', sep='\t')
            ddi_phe = pd.read_csv('data/links/ddi_phe.tsv', sep='\t')
            ddi_dru = pd.read_csv('data/links/ddi_dru.tsv', sep='\t')

            return dis_dru_the, dis_sym, dis_pat, dis_pro, dru_dru, dru_pro, dru_sym_ind, dru_sym_sef, pro_pat, \
                pro_pro, ddi_phe, ddi_dru

        else:
            return dis_dru_the, dis_sym

    """
    It generates the DISNET's heterograph.
    Input:
        full: Whether or not to generate the full graph.
        withoutRepoDB: Whether or not to remove the RepoDB edges.
    Output:
        The graph and the removed (RepoDB) edges listed.
    """
    def DISNETHeterograph(self, full=False, withoutRepoDB=True):
        """Build the DISNET heterograph and wrap it in a DeepSNAP ``HeteroGraph``.

        Every node receives a global integer id. Node types are laid out one
        after another (disorder, drug and, when ``full``, pathway, protein and
        drug-drug-interaction); inside a type, nodes follow the row order of
        their TSV. Every edge table is inserted twice, once per direction,
        under two different edge types.

        Args:
            full: If True, build the complete graph; otherwise only disorders,
                drugs and the 'dis_dru_the' / 'dis_sym' links.
            withoutRepoDB: If True, the rows of
                'testData/drugdis_repodb_ALLlinks.tsv' are removed from the
                'dis_dru_the' edges before the graph is built.

        Returns:
            A tuple ``(hetero, repodb_edges)``. ``repodb_edges`` is a pair of
            int32 tensors on ``self.device`` holding, for every RepoDB row,
            the disorder index and the drug index (each relative to its own
            node type). It is an empty dict when ``withoutRepoDB`` is False.
        """
        # ------------------------------------------------------------------ nodes
        if full:
            ntypes = ['disorder', 'drug', 'pathway', 'protein', 'drug-drug-interaction']
        else:
            ntypes = ['disorder', 'drug']
        nodes = list(self.getNodeInfo(full))
        nsizes = {ntype: len(frame.index) for ntype, frame in zip(ntypes, nodes)}

        # One row per node; the row position in the concatenation is the global id.
        nodes_flat = pd.concat(nodes,
                               keys=ntypes,
                               names=['node_type', 'NID']).reset_index()
        nodes_flat['node_id'] = nodes_flat.index

        # Per node type: DISNET id -> global node id (used to translate the edge tables).
        id_maps = {}
        for ntype in ntypes:
            rows = nodes_flat[nodes_flat['node_type'] == ntype]
            id_maps[ntype] = dict(zip(rows['id'], rows['node_id']))

        # Node features: drugs are loaded from disk, every other type gets a
        # constant all-ones vector of length 100.
        feats = {ntype: torch.ones((nsizes[ntype], 100), dtype=torch.float32)
                 for ntype in ntypes if ntype != 'drug'}
        feats['drug'] = torch.load('data/features/dru.pt', map_location=self.device)

        G = nx.DiGraph()  # It is a directed graph.
        # Columns are addressed by name so extra columns in the TSVs cannot
        # shift the position of 'node_id'.
        for ntype, nid, node_id in zip(nodes_flat['node_type'].tolist(),
                                       nodes_flat['NID'].tolist(),
                                       nodes_flat['node_id'].tolist()):
            G.add_node(node_id, node_type=ntype, node_feature=feats[ntype][nid], node_label=nid)

        # ------------------------------------------------------------------ edges
        edge_frames = self.getEdgeInfo(full)
        dis_dru_the, dis_sym = edge_frames[:2]

        # Convert ids to NIDs
        dis_dru_the['disNID'] = dis_dru_the['dis'].map(id_maps['disorder'])
        dis_dru_the['druNID'] = dis_dru_the['dru'].map(id_maps['drug'])

        dis_sym['disNID'] = dis_sym['dis'].map(id_maps['disorder'])
        dis_sym['symNID'] = dis_sym['sym'].map(id_maps['disorder'])

        # Delete RepoDB cases.
        if withoutRepoDB:
            repodb = pd.read_csv('testData/drugdis_repodb_ALLlinks.tsv', sep='\t')
            repodb['disNID'] = repodb['dis'].map(id_maps['disorder'])
            repodb['druNID'] = repodb['dru'].map(id_maps['drug'])

            # Rows that occur more than once in the stacked table are dropped
            # (this removes the therapeutic edges that also appear in RepoDB),
            # and only rows coming from the original table are kept. The row
            # count of the original table replaces the former hard-coded
            # slice, which was only valid for one specific dataset.
            n_original = len(dis_dru_the.index)
            combined = pd.concat([dis_dru_the, repodb])
            keep = ~combined.duplicated(keep=False).to_numpy()
            keep[n_original:] = False
            dis_dru_the = combined[keep]

            # Disorders are the first node type, so their global id is already
            # their per-type index; drugs follow them, hence the offset.
            dis_dru_the_repoDBAll = (
                torch.tensor(repodb['disNID'].astype(np.int32).to_numpy(), dtype=torch.int32,
                             device=self.device),
                torch.tensor(repodb['druNID'].astype(np.int32).to_numpy() - nsizes['disorder'],
                             dtype=torch.int32, device=self.device))
        else:
            dis_dru_the_repoDBAll = {}

        def both_ways(frame, head, tail, weighted=False):
            # Rows as [head, tail(, w)] lists, plus the same rows reversed.
            extra = ['w'] if weighted else []
            return (frame[[head, tail] + extra].values.tolist(),
                    frame[[tail, head] + extra].values.tolist())

        # Create edges dictionary (edge type -> list of rows); insertion order
        # is the order in which the edges are added to the graph.
        edges = {}
        edges['dis_dru_the'], edges['dru_dis_the'] = both_ways(dis_dru_the, 'disNID', 'druNID')
        edges['dis_sym'], edges['sym_dis'] = both_ways(dis_sym, 'disNID', 'symNID')

        if full:
            (dis_pat, dis_pro, dru_dru, dru_pro, dru_sym_ind, dru_sym_sef,
             pro_pat, pro_pro, ddi_phe, ddi_dru) = edge_frames[2:]

            # (forward type, reverse type, table, (head column, head node type),
            #  (tail column, tail node type), table has a weight column 'w')
            specs = [
                ('dis_pat', 'pat_dis', dis_pat, ('dis', 'disorder'), ('pat', 'pathway'), False),
                ('dis_pro', 'pro_dis', dis_pro, ('dis', 'disorder'), ('pro', 'protein'), True),
                ('druA_druB', 'druB_druA', dru_dru, ('drA', 'drug'), ('drB', 'drug'), False),
                ('dru_pro', 'pro_dru', dru_pro, ('dru', 'drug'), ('pro', 'protein'), False),
                ('dru_sym_ind', 'sym_dru_ind', dru_sym_ind, ('dru', 'drug'), ('sym', 'disorder'), False),
                ('dru_sym_sef', 'sym_dru_sef', dru_sym_sef, ('dru', 'drug'), ('sym', 'disorder'), True),
                ('pro_pat', 'pat_pro', pro_pat, ('pro', 'protein'), ('pat', 'pathway'), False),
                ('proA_proB', 'proB_proA', pro_pro, ('prA', 'protein'), ('prB', 'protein'), False),
                ('ddi_phe', 'phe_ddi', ddi_phe, ('ddi', 'drug-drug-interaction'), ('phe', 'disorder'), False),
                ('ddi_dru', 'dru_ddi', ddi_dru, ('ddi', 'drug-drug-interaction'), ('dru', 'drug'), False),
            ]
            for forward, reverse, frame, (head_col, head_type), (tail_col, tail_type), weighted in specs:
                frame['headNID'] = frame[head_col].map(id_maps[head_type])
                frame['tailNID'] = frame[tail_col].map(id_maps[tail_type])
                edges[forward], edges[reverse] = both_ways(frame, 'headNID', 'tailNID', weighted)

        # Add the edges to the graph.
        # NOTE(review): nx.DiGraph keeps a single edge per ordered node pair, so
        # when two tables contain the same (source, target) pair the one added
        # later replaces the attributes of the earlier one.
        for edge_t, rows in edges.items():
            for edge in rows:
                # Tables without a weight column get a constant feature of 1.
                weight = edge[2] if len(edge) > 2 else 1
                G.add_edge(int(edge[0]), int(edge[1]), edge_feature=weight, edge_type=edge_t)

        # Generate DeepSnap heterograph from NetworkX heterograph.
        hetero = HeteroGraph(G)
        message = 'The DISNET HETEROGRAPH'

        if full:
            message = message + ' (complete)'

        if withoutRepoDB:
            message = message + ' (without RepoDB edges)'

        print(message + ' has been generated! :)')
        return hetero, dis_dru_the_repoDBAll

    # ---------------------------------------------------------------------------------------------------------------------------------------------------
    #                                                                          UTILITY FUNCTIONS
    # ---------------------------------------------------------------------------------------------------------------------------------------------------

    """
    It returns all the RepoDB edges.
    Output:
        List containing all the RepoDB edges.
    """
    def allRepoDB(self):
        """Return every RepoDB disorder-drug link as a pair of index tensors.

        Disorders and drugs are numbered like in the simple graph: disorders
        take the global ids ``0 .. len(dis) - 1`` in TSV row order and drugs
        continue right after them.

        Returns:
            A tuple ``(disorders, drugs)`` of int32 tensors on ``self.device``
            with one entry per row of 'drugdis_repodb_ALLlinks.tsv'. Both hold
            per-type indices, i.e. the row position of the node in its own
            node table.
        """
        # DataFrames of each type of nodes
        dis, dru = self.getNodeInfo(full=False)
        n_dis = len(dis.index)

        # DISNET id -> global node id (disorders first, drugs after them).
        dis_dict = dict(zip(dis['id'], range(n_dis)))
        dru_dict = dict(zip(dru['id'], range(n_dis, n_dis + len(dru.index))))

        # NOTE(review): DISNETHeterograph reads this file from 'testData/',
        # here it is read from the working directory - confirm which is meant.
        dis_dru_the_repoDB = pd.read_csv('drugdis_repodb_ALLlinks.tsv', sep='\t')

        # Convert ids to NIDs
        dis_nids = dis_dru_the_repoDB['dis'].map(dis_dict)
        dru_nids = dis_dru_the_repoDB['dru'].map(dru_dict)

        # The drug offset is the number of disorder rows (not the number of
        # distinct disorder ids), because that is how the global ids were
        # assigned above.
        return (
            torch.tensor(dis_nids.astype(np.int32).to_numpy(), dtype=torch.int32,
                         device=self.device),
            torch.tensor(dru_nids.astype(np.int32).to_numpy() - n_dis, dtype=torch.int32,
                         device=self.device))

    """
    It decodes the predictions to a format where the drugs and diseases are recognisable easily.
    Input:
        list: List of predictions.
        type: Type of the edges to be decoded.
        n: Number of predictions to save.
        prepared: Whether or not the predictions have been prepared before (just NIDs are provided)
        name: Name of the dataframe to be saved.
    """
    def decodePredictions(self, list, type, n=50, prepared=False, name=''):
        """Turn predicted edges into a readable table and save it as CSV.

        The ``n`` highest-scoring predictions are looked up in the node tables
        so that every row shows the id and name of both endpoints. The table
        is printed and written to
        ``results/<type>_<n><name>_table.csv`` (the folder is created when it
        does not exist).

        Args:
            list: The predictions. With ``prepared=False`` an iterable of
                ``(head, tails, preds)`` triples, where each element of
                ``tails`` exposes ``.item()``; with ``prepared=True`` a
                sequence of ``[head, tail, pred]`` entries. Heads and tails
                are row positions in the disorder / drug node tables.
            type: Edge type to decode; only ``'dis_dru_the'`` is supported.
            n: Number of predictions to keep.
            prepared: Whether ``list`` is already in the flat
                ``[head, tail, pred]`` form.
            name: Extra text inserted in the output file name.

        Returns:
            The DataFrame that was saved. Its values are produced through a
            single NumPy array of mixed content, so every column (including
            'pred') holds strings.

        Raises:
            ValueError: If ``type`` is not ``'dis_dru_the'``.
        """
        import os  # local import: only needed to create the output folder

        if type != 'dis_dru_the':
            # Node tables are only known for this edge type; failing here is
            # clearer than an unbound-variable error further down.
            raise ValueError('Unsupported edge type: ' + str(type))

        out_path = 'results/' + type + '_' + str(n) + name + '_' + 'table.csv'

        # Read the data (head nodes are disorders, tail nodes are drugs).
        h, t = self.getNodeInfo(full=False)

        # If the data is in the shape [head], [tails], [preds] instead of [heads], [tails], [preds] it is transformed.
        if not prepared:
            decoded = []
            vals = []
            for i, (head, tails, preds) in enumerate(list):
                decoded.extend([head, item.item()] for item in tails)
                vals.extend(preds)

                if i % 1000 == 0:
                    print('         Decoded', i, 'elements.')
        else:
            decoded = list
            vals = [item[2] for item in list]

        vals = np.array(vals)
        # Indices of the n best predictions, highest value first.
        topN = np.flip(np.argsort(vals))[:n]

        complete = []
        # Get name and DISNET id for every node.
        for i in topN:
            headData = h.iloc[int(decoded[i][0])]
            tailData = t.iloc[int(decoded[i][1])]

            # First two columns of the node tables (id and name), selected by
            # position with .iloc because the rows are labelled by column name.
            complete.append([headData.iloc[0], headData.iloc[1],
                             tailData.iloc[0], tailData.iloc[1], vals[i]])

        # Form the dataset. The reshape keeps the array two-dimensional when
        # there are no predictions at all.
        complete = np.array(complete).reshape(-1, 5)
        headType, tailType, _ = type.split('_', 2)
        df = pd.DataFrame(
            {headType: complete[:, 0],
             headType + ' name': complete[:, 1],
             tailType: complete[:, 2],
             tailType + ' name': complete[:, 3],
             'pred': complete[:, 4]
             })
        print('Table for', type, 'new edge predictions:')
        pd.set_option('display.max_rows', n, 'display.max_columns', None, 'expand_frame_repr', False)
        df2 = pd.concat([df[:n], df[-n:]])
        print(df2)

        os.makedirs('results', exist_ok=True)
        df.to_csv('./' + out_path)
        print('    Table saved as:', out_path)
        return df