# Recovered from git patch c504df721c70779312fd4a5042bd6e82634b93ae
# ("Extract hemograms concepts", Jorge Molina, 2021-02-19), which creates
# Hemograms/BertHemograms.py.
# NOTE(review): the patch text had lost its line breaks and indentation, so
# the block nesting below is reconstructed from syntax -- confirm it against
# the original file.
"""Hemogram concept extraction script.

Pipeline implemented by :func:`bertConcepts`:

1. read the ``DESCRIPCION`` column of the notes spreadsheet and split every
   note into sentences with NLTK's Spanish Punkt model;
2. load ``annotations.json`` and flatten the output of
   ``extractionOfConcepts`` into one list of annotation tuples;
3. group the tuples per document (:func:`dictConceptsDoc`);
4. for every ``DATE`` annotation, send the sentences of its document to the
   date-annotator service (:func:`checkDoses`) and print the candidate
   concept/metric combinations.
"""

# NOTE(review): copy, datetime and default_timer are no longer referenced
# below; the imports are kept so the module namespace is unchanged.
import copy
import datetime
import json
from datetime import date
from timeit import default_timer

import nltk
import pandas as pd
import requests

from ConceptExtractor_v3 import extractionOfConcepts

# Reference date sent along with every sentence; computed once at import time.
# str(date) is already ISO formatted, so this equals the original
# str(date.today()).split(" ")[0].
currentDate = date.today().isoformat()

# Endpoint that checkDoses() posts each sentence to.
DATE_ANNOTATOR_URL = "http://138.4.130.153:8088/jkes/annotator/dateAnnotator"


def dictConceptsDoc(listTuples):
    """Group annotation tuples by document id.

    Args:
        listTuples: sequence of annotation tuples whose element at index 4 is
            the document id (the caller documents the layout as
            ``[concept, Entity, offset, end, ID]``).

    Returns:
        dict mapping each document id to the list of its tuples. Ids appear in
        order of first occurrence and tuples keep their input order.
    """
    grouped = {}
    for annotation in listTuples:
        # Single pass instead of rescanning the whole list for every new id.
        grouped.setdefault(annotation[4], []).append(annotation)
    return grouped


def checkDoses(idDoc, sentDocs, url=DATE_ANNOTATOR_URL):
    """Annotate the sentences of one document and keep the non-empty results.

    Every sentence is posted on its own as ``{idDoc: [currentDate, sentence]}``.
    From each JSON reply the element ``['response'][1]`` is read, and every
    value in it that is not an empty list is collected.

    Args:
        idDoc: document id; used as the key of the request payload and of the
            returned dict.
        sentDocs: iterable with the sentences of the document.
        url: annotator endpoint; defaults to ``DATE_ANNOTATOR_URL``.

    Returns:
        ``{idDoc: results}`` where ``results`` lists the collected values in
        sentence order (empty when ``sentDocs`` is empty).

    Raises:
        Whatever ``requests.post`` / ``Response.json`` raise, plus
        ``KeyError`` / ``IndexError`` when the reply lacks ``['response'][1]``.
    """
    resultList = []
    for sentence in sentDocs:
        # NOTE(review): no timeout is passed, so an unresponsive service
        # stalls the run; the HTTP status is not checked before decoding.
        resp = requests.post(
            url=url,
            json={idDoc: [currentDate, sentence]},
            verify=False,
        )
        sentenceMetrics = resp.json()['response'][1]
        # presumably a dict keyed by annotation type -- verify against the
        # service; only key iteration and item lookup are relied on here.
        for key in sentenceMetrics:
            if sentenceMetrics[key] != []:
                resultList.append(sentenceMetrics[key])
    return {idDoc: resultList}


def bertConcepts():
    """Run the extraction pipeline and print the candidate hemogram strings.

    Reads ``breast_notes_100randomly_sampled.xlsx`` (sheet ``Sheet1``, column
    ``DESCRIPCION``), ``annotations.json`` and ``entity_cuis.json`` from the
    current working directory, needs the NLTK Spanish Punkt model, and calls
    the annotator service once per sentence for every ``DATE`` annotation.

    Returns:
        None. Results are only printed.
    """
    textDocs = pd.read_excel(
        'breast_notes_100randomly_sampled.xlsx', sheet_name='Sheet1'
    )['DESCRIPCION']
    spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    # Row position -> sentences of that note.
    sentsDoc = {
        i: spanish_tokenizer.tokenize(text) for i, text in enumerate(textDocs)
    }

    with open('annotations.json') as json_file:
        annotations = json.load(json_file)
    # NOTE(review): entity_cuis.json is loaded but its content is never used
    # in this function; kept so a missing file still fails as before.
    with open('entity_cuis.json') as file:
        dictAnnotationsEntities = json.load(file)

    # NOTE(review): the original called extractionOfConcepts(annotations)
    # twice and discarded the first result; a single call assumes the function
    # has no side effects -- confirm.
    resultado = extractionOfConcepts(annotations)
    anotaciones = [anotacion for lista in resultado for anotacion in lista]
    dictAnotacionesDoc = dictConceptsDoc(anotaciones)

    # anotaciones = [concept, Entity, offset, end, ID]
    for docAnnotations in dictAnotacionesDoc.values():
        for tupleAnot in docAnnotations:
            if tupleAnot[1] != 'DATE':
                continue
            idDoc = tupleAnot[4]
            # NOTE(review): assumes annotation ids are the integer row
            # positions used as sentsDoc keys -- otherwise this is a KeyError.
            dictMetrics = checkDoses(idDoc, sentsDoc[idDoc])
            for metricGroups in dictMetrics.values():
                for metrics in metricGroups:
                    for metricConcept in metrics:
                        for item in metricConcept:
                            if item[5] == 'NUMBER':
                                continue
                            # NOTE(review): sentsDoc[idDoc] is the sentence
                            # list built above and lists have no .find(), so
                            # these lookups raise AttributeError when reached.
                            # metrics[0] is also iterated as a sequence above,
                            # not used as a string. The intended text lookup
                            # cannot be derived from this file -- fix once the
                            # service's response schema is confirmed.
                            distanceMetric = sentsDoc[idDoc].find(metrics[0])
                            distanceConcept = sentsDoc[idDoc].find(tupleAnot[0])
                            if distanceMetric - distanceConcept < 2:
                                completeHemogram = tupleAnot[0] + metrics[0]
                                print(completeHemogram)
    print("wait")


# Run the pipeline only when executed as a script, so importing the module
# (e.g. to reuse dictConceptsDoc) does not trigger file and network access.
if __name__ == "__main__":
    bertConcepts()