"""Combine DATE annotations of notes with results from a remote date annotator.

The script reads the notes from an Excel sheet, splits each note into
sentences, groups the extracted annotations per document and, for every
document that contains a DATE annotation, posts the document's sentences to
the annotator service and prints the concept/metric combinations that pass
the distance check in ``bertConcepts``.
"""
import copy
import datetime
import json
from datetime import date
from timeit import default_timer

import nltk
import pandas as pd
import requests

from ConceptExtractor_v3 import extractionOfConcepts

# Endpoint that receives one sentence per request (see ``checkDoses``).
DATE_ANNOTATOR_URL = "http://138.4.130.153:8088/jkes/annotator/dateAnnotator"

# Reference date sent with every annotator request, formatted YYYY-MM-DD.
currentDate = date.today().isoformat()


def dictConceptsDoc(listTuples):
    """Group annotation tuples by document id.

    Args:
        listTuples: sequence of annotations; element 4 of each annotation is
            used as the document id.

    Returns:
        A dict mapping each document id to the list of its annotations, in
        the order they appear in ``listTuples``. Keys are ordered by first
        appearance. An empty input yields an empty dict.
    """
    grouped = {}
    for annotation in listTuples:
        grouped.setdefault(annotation[4], []).append(annotation)
    return grouped


def checkDoses(idDoc, sentDocs):
    """Send every sentence of a document to the date annotator service.

    One POST request is made per sentence, with the JSON body
    ``{idDoc: [currentDate, sentence]}``. From each reply, element 1 of the
    ``'response'`` field is read and every non-empty value found in it is
    collected.

    Args:
        idDoc: document id, used as the key of the request payload and of
            the returned dict.
        sentDocs: iterable with the sentences of the document.

    Returns:
        ``{idDoc: values}`` where ``values`` lists, in request order, every
        value of the per-sentence results that is not an empty list.

    Raises:
        requests.RequestException: if a request cannot be completed.
        KeyError: if a reply has no ``'response'`` field.
    """
    resultList = []
    for sentence in sentDocs:
        # NOTE(review): no timeout is set, so an unresponsive service blocks
        # this call indefinitely - consider passing ``timeout=``.
        resp = requests.post(
            url=DATE_ANNOTATOR_URL,
            json={idDoc: [currentDate, sentence]},
            verify=False,
        )
        # assumes response[1] is a mapping of lists - TODO confirm against
        # the service contract.
        sentenceMetrics = resp.json()['response'][1]
        for key in sentenceMetrics:
            if sentenceMetrics[key] != []:
                resultList.append(sentenceMetrics[key])
    return {idDoc: resultList}


def bertConcepts():
    """Print DATE concepts combined with the metrics found in their document.

    Steps:
        1. Read the ``DESCRIPCION`` column of sheet ``Sheet1`` from
           ``breast_notes_100randomly_sampled.xlsx`` and split every note
           into sentences with the Spanish punkt tokenizer.
        2. Load ``annotations.json``, run ``extractionOfConcepts`` on it and
           group the flattened annotations per document.
        3. For each document that has at least one ``'DATE'`` annotation,
           query the annotator once through ``checkDoses`` and print
           ``concept + metric`` for the entries that pass the distance check.

    Returns:
        None. Results are written to standard output.
    """
    fileDocs = 'breast_notes_100randomly_sampled.xlsx'
    textDocs = pd.read_excel(fileDocs, sheet_name='Sheet1')['DESCRIPCION']

    spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sentsDoc = {
        i: spanish_tokenizer.tokenize(text) for i, text in enumerate(textDocs)
    }

    with open('annotations.json') as json_file:
        annotations = json.load(json_file)

    resultado = extractionOfConcepts(annotations)
    anotaciones = [anotacion for lista in resultado for anotacion in lista]
    dictAnotacionesDoc = dictConceptsDoc(anotaciones)

    # Each annotation is laid out as [concept, Entity, offset, end, ID].
    for idDoc, docAnnotations in dictAnotacionesDoc.items():
        dateAnnotations = [
            tupleAnot for tupleAnot in docAnnotations if tupleAnot[1] == 'DATE'
        ]
        if not dateAnnotations:
            continue

        # The request depends only on the document, so it is issued once per
        # document rather than once per DATE annotation.
        dictMetrics = checkDoses(idDoc, sentsDoc[idDoc])
        # Positions are looked up in the raw note text: sentsDoc[idDoc] is a
        # list of sentences and has no ``find`` method.
        docText = textDocs[idDoc]

        for tupleAnot in dateAnnotations:
            for key in dictMetrics:
                for metrics in dictMetrics[key]:
                    for metricConcept in metrics:
                        for item in metricConcept:
                            if item[5] != 'NUMBER':
                                # NOTE(review): assumes metrics[0] is the
                                # metric text (a str) - verify against the
                                # annotator response schema.
                                distanceMetric = docText.find(metrics[0])
                                distanceConcept = docText.find(tupleAnot[0])
                                # NOTE(review): the difference is signed and
                                # ``find`` returns -1 when nothing matches,
                                # so this also passes when the metric comes
                                # first or is missing - confirm intent.
                                if distanceMetric - distanceConcept < 2:
                                    completeHemogram = tupleAnot[0] + metrics[0]
                                    print(completeHemogram)
    print("wait")


bertConcepts()