Commit c504df72 authored by Jorge Molina Gomez

Extract hemograms concepts

parent 0c5771fd
import json
import copy
import datetime
from datetime import date
import nltk
import requests
import pandas as pd
from ConceptExtractor_v3 import extractionOfConcepts
from timeit import default_timer
# Today's date as an ISO "YYYY-MM-DD" string; sent along with every annotator request.
currentDate = date.today().isoformat()
def dictConceptsDoc(listTuples):
    """Group annotation records by document identifier.

    Args:
        listTuples: Iterable of indexable records; element ``[4]`` of each
            record is used as the document identifier.

    Returns:
        A dict mapping every document identifier to the list of records that
        carry it. Keys appear in order of first occurrence and each list
        keeps the records in their input order.
    """
    conceptsByDoc = {}
    for annotation in listTuples:
        # Single pass: the first record of a document creates its bucket,
        # every later record of the same document is appended to it.
        conceptsByDoc.setdefault(annotation[4], []).append(annotation)
    return conceptsByDoc
def checkDoses(idDoc, sentDocs,
               url="http://138.4.130.153:8088/jkes/annotator/dateAnnotator",
               timeout=60):
    """Send each sentence of a document to the date annotator service.

    One POST request is issued per sentence with the JSON payload
    ``{idDoc: [currentDate, sentence]}``. From every reply, element ``[1]``
    of the ``'response'`` field is read and iterated by key; every value
    that is not an empty list is collected.

    Args:
        idDoc: Identifier of the document; used as the payload key and as
            the key of the returned dict.
        sentDocs: Iterable with the sentences of the document.
        url: Endpoint of the annotator service.
        timeout: Seconds to wait for the service before giving up, so an
            unresponsive server cannot block the run indefinitely.

    Returns:
        ``{idDoc: results}`` where ``results`` is the list of non-empty
        values returned by the service, in the order the sentences were sent.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            service answers with an HTTP error status.
    """
    resultList = []
    for sentence in sentDocs:
        resp = requests.post(url=url,
                             json={idDoc: [currentDate, sentence]},
                             verify=False,  # only relevant if an https URL is passed
                             timeout=timeout)
        # Fail on 4xx/5xx here instead of on a confusing JSON/KeyError below.
        resp.raise_for_status()
        annotations = resp.json()['response'][1]
        for key in annotations:
            value = annotations[key]
            # Skip the entries the service left empty.
            if value != []:
                resultList.append(value)
    return {idDoc: resultList}
def bertConcepts():
    """Pair DATE annotations with metrics returned by the date annotator.

    Steps:
      1. Read the ``DESCRIPCION`` column of ``Sheet1`` in
         ``breast_notes_100randomly_sampled.xlsx`` and split every note into
         sentences with NLTK's Spanish Punkt tokenizer.
      2. Load ``annotations.json``, run ``extractionOfConcepts`` on it and
         group the flattened annotations per document with
         ``dictConceptsDoc``.
      3. For every annotation whose entity is ``'DATE'``, pass the sentences
         of its document to ``checkDoses`` and print the annotation text
         joined with the metric for the results that pass the distance check.

    Returns nothing; results are printed to stdout.
    """
    notes = pd.read_excel('breast_notes_100randomly_sampled.xlsx',
                          sheet_name='Sheet1')['DESCRIPCION']
    spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    # Sentences of every note, keyed by the note's row position.
    sentsDoc = {i: spanish_tokenizer.tokenize(text) for i, text in enumerate(notes)}

    with open('annotations.json') as json_file:
        annotations = json.load(json_file)
    # NOTE(review): loaded but never used further down in this function.
    with open('entity_cuis.json') as file:
        dictAnnotationsEntities = json.load(file)

    # A single extraction pass; an earlier duplicate call whose result was
    # discarded has been dropped (assumes extractionOfConcepts has no side
    # effects on `annotations` -- TODO confirm).
    resultado = extractionOfConcepts(annotations)
    anotaciones = [anotacion for lista in resultado for anotacion in lista]
    dictAnotacionesDoc = dictConceptsDoc(anotaciones)

    # anotaciones = [concept, Entity, offset, end, ID]
    for docAnnotations in dictAnotacionesDoc.values():
        for tupleAnot in docAnnotations:
            if tupleAnot[1] != 'DATE':
                continue
            idDoc = tupleAnot[4]
            # NOTE(review): sentsDoc is keyed by row position; this lookup
            # presumes the annotation ID is that same integer -- verify.
            dictMetrics = checkDoses(idDoc, sentsDoc[idDoc])
            for docMetrics in dictMetrics.values():
                for metrics in docMetrics:
                    for metricConcept in metrics:
                        for item in metricConcept:
                            if item[5] != 'NUMBER':
                                # NOTE(review): sentsDoc[idDoc] is the list
                                # returned by tokenize(), and lists have no
                                # .find(), so these lines fail when reached.
                                # The raw note text was presumably intended;
                                # confirm against the annotator's response
                                # schema before changing it.
                                distanceMetric = sentsDoc[idDoc].find(metrics[0])
                                distanceConcept = sentsDoc[idDoc].find(tupleAnot[0])
                                if distanceMetric - distanceConcept < 2:
                                    # NOTE(review): metrics[0] is indexed like
                                    # a sequence above, so adding it to a
                                    # string looks wrong as well -- confirm.
                                    completeHemogram = tupleAnot[0] + metrics[0]
                                    print(completeHemogram)
    print("wait")
# Module-level entry point: the pipeline runs as soon as this file is executed or imported.
bertConcepts()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment