diff --git a/documentation/w&bInstructions.md b/documentation/w&bInstructions.md
new file mode 100644
index 0000000000000000000000000000000000000000..fbddc9d540f351d6f7e78f4dea9da5a7e8b76081
--- /dev/null
+++ b/documentation/w&bInstructions.md
@@ -0,0 +1,42 @@
+# Instructions for running a hyperparameter search with Weights and Biases.
+1. Log into W&B.
+2. If you have already created the project, skip to the next step; if not, create it by going to the "Projects" tab and clicking "Create new project".
+3. Then click on the "Sweeps" tab (broom icon).
+4. If you have already created the sweep, skip to step 6; if not, create it by clicking "Create Sweep".
+5. In the sweep creation tab you need to define the parameters of the sweep; once the first run has been launched they cannot be changed. Saving your sweep configuration is highly recommended, since future hyperparameter searches are likely to be similar (see https://docs.wandb.ai/guides/sweeps/define-sweep-configuration and the example configuration after this list).
+    1. program -> Python script that the agent will call, i.e. the one that communicates with the W&B platform.
+    2. method -> Optimisation method (grid, random or bayes).
+    3. parameters -> Dictionary of the parameters to be optimised; the boundaries and the distribution of each one must be indicated.
+6. Copy the agent command; it will be similar to this: "wandb agent ayusoupm/dmsr/ih7wyixk".
+7. Paste the agent command into your terminal and the process will begin.
+8. It is highly recommended to run the agent command inside a detachable session such as screen (https://linux.die.net/man/1/screen), so the sweep keeps running if the terminal disconnects; an example command is given after this list.
+9. The results and logs of the sweep are accessible through the "Sweeps" tab.
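+
+A minimal example configuration for the sweep described above is shown below. The program and the metric name match what testRepoDBWeightsAndBiases.py expects and logs (rocAverage); the boundaries and distributions of the parameters are illustrative and should be adapted to each search.
+
+```yaml
+program: testRepoDBWeightsAndBiases.py
+method: bayes
+metric:
+  name: rocAverage
+  goal: maximize
+parameters:
+  epochs:
+    values: [100, 200, 300]
+  hidden_dim:
+    values: [32, 64, 128]
+  lr:
+    distribution: log_uniform_values
+    min: 0.0001
+    max: 0.01
+  weight_decay:
+    distribution: log_uniform_values
+    min: 0.00001
+    max: 0.001
+  dropout:
+    distribution: uniform
+    min: 0.0
+    max: 0.5
+```
+
+To run the agent in a detached screen session (step 8), a command like `screen -dmS sweepAgent wandb agent ayusoupm/dmsr/ih7wyixk` can be used; the session name (sweepAgent) is arbitrary.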
+""" + + +def randomEids(): + tensor1 = torch.randint(0, 30729, (5013,), device=torch.device(device)) + tensor2 = torch.randint(0, 3944, (5013,), device=torch.device(device)) + return (tensor1, tensor2) + + +""" +It plots the metrics for the results of the real edge and the random edge set. It joins them vertically and horizontally. +Input: + fpr: False positve rate. + tpr: True positive rate. + label1: Area Under the ROC curve. + recall: Recall. + precision: Precision. + label2: Area Under the PR curve. +""" + + +def plotMetrics(fpr, tpr, label1, recall, precision, label2): + # Vertical plotting. + fig, axs = plt.subplots(2, figsize=(6, 10)) + + axs[0].plot(fpr, tpr, label="AUC ROC = " + np.array2string(label1, formatter={'float_kind': lambda x: "%.2f" % x})) + axs[0].set_title('ROC Curve') + axs[0].legend(loc='lower right') + axs[0].plot([0, 1], [0, 1], 'r--') + axs[0].set_xlim([0, 1]) + axs[0].set_ylim([0, 1]) + axs[0].set_ylabel('True Positive Rate') + axs[0].set_xlabel('False Positive Rate') + + axs[1].set_title('Precision-Recall Curve') + axs[1].plot(recall, precision, + label="PRC = " + np.array2string(label2, formatter={'float_kind': lambda x: "%.2f" % x})) + axs[1].legend(loc='lower right') + axs[1].set_xlim([0, 1]) + axs[1].set_ylim([0, 1]) + axs[1].set_ylabel('Precision') + axs[1].set_xlabel('Recall') + + # Horizontal plotting. + fig2, axs2 = plt.subplots(1, 2, figsize=(12, 4)) + + axs2[0].plot(fpr, tpr, label="AUC ROC = " + np.array2string(label1, formatter={'float_kind': lambda x: "%.2f" % x})) + axs2[0].set_title('ROC Curve') + axs2[0].legend(loc='lower right') + axs2[0].plot([0, 1], [0, 1], 'r--') + axs2[0].set_xlim([0, 1]) + axs2[0].set_ylim([0, 1]) + axs2[0].set_ylabel('True Positive Rate') + axs2[0].set_xlabel('False Positive Rate') + + axs2[1].set_title('Precision-Recall Curve') + axs2[1].plot(recall, precision, + label="PRC = " + np.array2string(label2, formatter={'float_kind': lambda x: "%.2f" % x})) + axs2[1].legend(loc='lower right') + axs2[1].set_xlim([0, 1]) + axs2[1].set_ylim([0, 1]) + axs2[1].set_ylabel('Precision') + axs2[1].set_xlabel('Recall') + + fig.savefig('metrics/aucroc&prcRepoDBVertical.svg', format='svg', dpi=1200) + fig2.savefig('metrics/aucroc&prcRepoDBHorizontal.svg', format='svg', dpi=1200) + plt.close(fig) + plt.close(fig2) + plot_dist() + + +""" +It generates the metrics for the model. +Input: + model: Model to generate metrics of. +Output: + Area Under the ROC and PR curve. +""" + + +def metrics(model): + hetero, eids = constructor.DISNETHeterograph(full=False, withoutRepoDB=True) + dataset = GraphDataset( + [hetero], + task='link_pred', + edge_train_mode='disjoint', + edge_message_ratio=0.8 + ) + toInfer = DataLoader( + dataset, collate_fn=Batch.collate(), batch_size=1 + ) + + model = model.to(device) + model.eval() + + print("Started getting repoDB predictions at", datetime.now().strftime("%H:%M:%S")) + _, preds = getDecode(model, eids, toInfer) + print("Finished getting repoDB predictions at", datetime.now().strftime("%H:%M:%S")) + + print("Started getting random predictions at", datetime.now().strftime("%H:%M:%S")) + _, predsN = getDecode(model, randomEids(), toInfer, 'R') + print("Finished getting random predictions at", datetime.now().strftime("%H:%M:%S")) + + labels1 = torch.ones(len(preds)) + labels2 = torch.zeros(len(predsN)) + + # Join real and random edge results in one list to calculate metrics. 
+if __name__ == '__main__':
+    # Get the config of W&B.
+    config = wandb.config
+    # Metrics lists, one entry per trained model.
+    rocL, prcL = np.array([]), np.array([])
+    # Number of iterations.
+    k = 50
+
+    # Set of hyperparameters.
+    epochs = config.epochs
+    hidden_dim = config.hidden_dim
+    lr = config.lr
+    weight_decay = config.weight_decay
+    dropout = config.dropout
+
+    # Train and test k models and obtain their metrics.
+    for i in range(k):
+        model = main(epochs, hidden_dim, lr, weight_decay, dropout)
+        roc1, prc1 = metrics(model)
+        rocL = np.append(rocL, roc1)
+        prcL = np.append(prcL, prc1)
+        # Keep track of the evolution along the iterations. This log is optional.
+        wandb.log({'rocAverage': sum(rocL) / (i + 1), 'prcAverage': sum(prcL) / (i + 1)})
+
+    # Average of the metrics of all the generated models.
+    rocM = sum(rocL) / k
+    prcM = sum(prcL) / k
+
+    # Obtain 95% confidence intervals: with fewer than 30 samples the t-distribution
+    # is used; otherwise the normal distribution is used.
+    if k < 30:
+        r = st.t.interval(0.95, k - 1, loc=np.mean(rocL), scale=st.sem(rocL))
+        p = st.t.interval(0.95, k - 1, loc=np.mean(prcL), scale=st.sem(prcL))
+    else:
+        r = st.norm.interval(0.95, loc=np.mean(rocL), scale=st.sem(rocL))
+        p = st.norm.interval(0.95, loc=np.mean(prcL), scale=st.sem(prcL))
+
+    # Send the final averaged metrics to W&B. Required by the sweep.
+    wandb.log({'rocAverage': rocM, 'prcAverage': prcM})
+
+    # Report the mean plus or minus the half-width of the confidence interval.
+    print("AUCROC: ", rocM, "+-", rocM - r[0])
+    print("AUCPR: ", prcM, "+-", prcM - p[0])