"""Build molecular-graph datasets from drug SMILES and embed them with a graph autoencoder."""

import itertools
import os
import random

import mysql.connector
import numpy as np
import pandas as pd
import torch
from pysmiles import read_smiles
from torch_geometric.nn import GAE

from autoencoder import GCNEncoder, Trainer

# Kept as the LAST import, as in the original file, so that any name the
# wildcard exports still takes precedence exactly as before.
# `get_compounds` (used by getGraph) comes from here.
from pubchempy import *  # noqa: F401,F403

# It defines whether to execute on cpu or gpu.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Conversion from element symbol to the integer used as a node feature.
# 'Error' (-1) is the code used for elements that are not listed here.
_ELEMENT_CODES = {
    'Error': -1, 'Ag': 0, 'Al': 1, 'As': 2, 'Au': 3, 'B': 4, 'Ba': 5, 'Bi': 6,
    'Br': 7, 'C': 8, 'Ca': 9, 'Cl': 10, 'Co': 11, 'Cr': 12, 'Cu': 13, 'F': 14,
    'Fe': 15, 'Ga': 16, 'Gd': 17, 'H': 18, 'He': 19, 'Hg': 20, 'I': 21,
    'In': 22, 'K': 23, 'Kr': 24, 'La': 25, 'Li': 26, 'Lu': 27, 'Mg': 28,
    'Mn': 29, 'N': 30, 'Na': 31, 'O': 32, 'P': 33, 'Pt': 34, 'Ra': 35,
    'Rb': 36, 'S': 37, 'Sb': 38, 'Se': 39, 'Si': 40, 'Sm': 41, 'Sn': 42,
    'Sr': 43, 'Tc': 44, 'Ti': 45, 'Tl': 46, 'Xe': 47, 'Yb': 48, 'Zn': 49,
}


class DrugGraph:
    """
    Class that contains the data associated to a drug molecular graph.
    """

    def __init__(self, graph, feats, edge_index, neg_edge_index):
        """
        It instantiates the drug molecular graph.
        Input:
            graph: Graph (or 0 when the drug could not be converted).
            feats: Feats of the graph nodes; stored as the attribute `x`.
            edge_index: Two lists which contain the edges of the graph
                (first contains heads and second tails).
            neg_edge_index: Two lists which contain negative edges, not present
                in the graph (first contains heads and second tails).
        """
        self.graph = graph
        self.x = feats
        self.edge_index = edge_index
        self.neg_edge_index = neg_edge_index


def getSmiles():
    """
    It generates a TSV (data/druStruc.tsv) with the SMILES representations of
    the DISNET's graph drugs, with the columns 'id', 'name' and 'stru'.

    The connection settings can be overridden with the environment variables
    DISNET_DB_USER, DISNET_DB_PASSWORD, DISNET_DB_HOST, DISNET_DB_PORT and
    DISNET_DB_NAME; when they are not set the original values are used.
    """
    # NOTE(security): credentials should not live in source control. The
    # literals below are kept only as backward-compatible defaults; prefer
    # setting the environment variables and rotating this password.
    cnx = mysql.connector.connect(
        user=os.environ.get('DISNET_DB_USER', 'edsss_usr'),
        password=os.environ.get('DISNET_DB_PASSWORD', '1AftIRWJa93P'),
        host=os.environ.get('DISNET_DB_HOST', 'ares.ctb.upm.es'),
        port=os.environ.get('DISNET_DB_PORT', '30100'),
        database=os.environ.get('DISNET_DB_NAME', 'disnet_drugslayer'),
    )
    # try/finally so neither the cursor nor the connection leaks if the query fails.
    try:
        cursor = cnx.cursor()
        try:
            query = ("SELECT drug_id, drug_name, chemical_structure FROM drug;")
            cursor.execute(query)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        cnx.close()

    # Passing `columns=` (instead of assigning them afterwards) also works
    # when the query returns no rows.
    newDf = pd.DataFrame(rows, columns=['id', 'name', 'stru'])
    os.makedirs("data", exist_ok=True)
    newDf.to_csv("data/druStruc.tsv", sep='\t', index=False)


def getGraph(df, complete=False):
    """
    It transforms the SMILES representations to their graph representations.
    For a drug without a usable SMILES string a 0 is given instead of the graph.
    Input:
        df: Dataframe containing drug's name ('name') and SMILES ('stru').
            Its 'stru' column is updated in place (0 for drugs without graph,
            the recovered SMILES for drugs completed from PubChem) and the
            dataframe is written back to data/druStruc.tsv.
        complete: Decides if drugs without a usable SMILES should be completed
            or not (searching for them by name in PubChem).
    Output:
        One-dimensional object array with the directed graphs representing the
        drugs (0 where no graph could be built).
    """
    networks = []
    for name, smiles in zip(df['name'].tolist(), df['stru'].tolist()):
        # Stereochemical information will be discarded.
        usable = isinstance(smiles, str) and smiles != '0'
        recovered = False
        if not usable and complete:
            compounds = get_compounds(name, 'name')
            if len(compounds) > 0:
                smiles = compounds[0].canonical_smiles
                usable = True
                recovered = True

        if usable:
            # Always directed, so recovered drugs get the same two-way edge
            # lists as every other drug (the recovery path used to skip this).
            new = read_smiles(smiles).to_directed()
            if recovered:
                df.loc[df["name"] == name, "stru"] = smiles
        else:
            new = 0
            df.loc[df["name"] == name, "stru"] = 0
        networks.append(new)

    df.to_csv("data/druStruc.tsv", sep='\t', index=False)

    # Fill element by element so NumPy never tries to interpret a graph (which
    # supports len() and indexing) as a nested sequence.
    result = np.empty(len(networks), dtype=object)
    for position, network in enumerate(networks):
        result[position] = network
    return result


def _node_features(attrs):
    """Return [element, charge, aromatic, hcount, stereo] for one node's attribute dict."""
    return [
        # Elements missing from the table fall back to the 'Error' code.
        _ELEMENT_CODES.get(attrs.get('element'), _ELEMENT_CODES['Error']),
        attrs['charge'],
        attrs['aromatic'],
        attrs["hcount"],
        # Nodes without stereo information get 0.
        len(attrs.get('stereo', ())),
    ]


def _edge_lists(graph, num_nodes):
    """Return (edges, neg_edges), both in [heads, tails] format, for one graph."""
    positives = list(graph.edges)
    present = set(positives)

    # All possible node pairs minus those that appear in the original graph.
    # A set lookup keeps the same resulting order as removing them one by one.
    candidates = [pair for pair in itertools.combinations(range(num_nodes), 2)
                  if pair not in present]

    # If there are more negative edges than positive ones, the number of
    # negatives is reduced to match the positives.
    if len(positives) < len(candidates):
        candidates = random.sample(candidates, len(positives))

    edges = [[e[0] for e in positives], [e[1] for e in positives]]
    neg_edges = [[e[0] for e in candidates], [e[1] for e in candidates]]

    # If there are no negative edges, one needs to be introduced.
    if len(neg_edges[0]) == 0:
        neg_edges = [[0], [0]]
    return edges, neg_edges


def buildDataset(graphs):
    """
    It builds a dataset made of the drugs molecular structures.
    Input:
        graphs: Array containing all the graph objects representing the drugs
            (0 marks a drug without graph). Nodes are expected to be numbered
            0..n-1, since negative edges are generated from that range.
    Output:
        List containing all the DrugGraph objects of the drugs. Drugs without
        graph get a single node with all features set to -1, no edges and the
        placeholder negative edge (0, 0).
    """
    res = []
    for graph in graphs:
        if graph != 0:
            # Node features; all nodes follow the same 5-value format.
            feats2 = [_node_features(data[1]) for data in graph.nodes(data=True)]
            edges, neg_edges = _edge_lists(graph, len(feats2))
        else:
            # Error case.
            feats2 = [[-1, -1, -1, -1, -1]]
            edges = [[], []]
            neg_edges = [[0], [0]]

        res.append(DrugGraph(graph,
                             torch.tensor(feats2, device=device, dtype=torch.float),
                             torch.tensor(edges, device=device, dtype=torch.int64),
                             torch.tensor(neg_edges, device=device, dtype=torch.int64)))
    return res


def trainAE(model, optimizer, dataset, epochs):
    """
    It trains the autoencoder model.
    Input:
        model: Autoencoder model.
        optimizer: Optimizer to use along training.
        dataset: Dataset to train the autoencoder.
        epochs: Number of epochs to train the autoencoder.
    """
    trainer = Trainer(model, optimizer, dataset)
    trainer.fit(epochs)


def getEmbed(model, dataset):
    """
    It generates an embedding for every graph of the dataset. The embedding is
    the code of the autoencoder averaged over the nodes of the graph.
    Input:
        model: Autoencoder. It is switched to evaluation mode and moved to the
            CPU (torch modules are moved in place, so the caller's model is
            affected as well).
        dataset: Dataset to generate embeddings from.
    Output:
        List with one drug molecular structure embedding per dataset element.
    """
    embeddings = []
    model.eval()
    model = model.to('cpu')
    with torch.no_grad():
        for data in dataset:
            node_codes = model.encode(data.x.cpu(), data.edge_index.cpu())
            embeddings.append(torch.mean(node_codes, dim=0))
    return embeddings


def main():
    """Download the SMILES, train the autoencoder and store the drug embeddings."""
    getSmiles()
    df = pd.read_csv('data/druStruc.tsv', sep='\t')
    graphs = getGraph(df)
    dataset = buildDataset(graphs)

    # Those elements without errors are incorporated in the used dataset.
    dataset2 = [elem for elem in dataset if elem.graph != 0]

    # Training
    model = GAE(GCNEncoder(5, 32))
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    trainAE(model, optimizer, dataset2, 500)

    # Getting Embeddings
    embeddings = getEmbed(model, dataset)

    # Saving model. The directory is created first so a long training run is
    # not lost because the output folder is missing.
    os.makedirs("./models", exist_ok=True)
    torch.save(model.state_dict(), "./models/structureEmbedder")

    # Those drugs without embedding are assigned a predefined embedding.
    for i, elem in enumerate(dataset):
        if elem.graph == 0:
            embeddings[i] = torch.tensor([1] * 32, dtype=torch.float32)

    # Save the embeddings.
    df["embedding"] = embeddings
    os.makedirs("data/features", exist_ok=True)
    df.to_csv("data/features/dru.tsv", sep='\t', index=False)
    torch.save(embeddings, 'data/features/dru.pt')


if __name__ == '__main__':
    main()