# BertHemograms.py
import json
import copy
import datetime
from datetime import date
import nltk
import requests
import pandas as pd
from ConceptExtractor_v3 import extractionOfConcepts
from timeit import default_timer
# Today's date as an ISO string (YYYY-MM-DD), captured once at import time.
# The original `str(date.today()).split(" ")[0]` included a no-op split:
# date strings contain no space (that idiom applies to datetime.now()).
currentDate = date.today().isoformat()

def dictConceptsDoc(listTuples):
    """Group annotation tuples by document id.

    Each tuple is (concept, entity, offset, end, idDoc); index 4 holds the
    document id.

    Parameters
    ----------
    listTuples : list of tuple
        Annotation tuples with the document id at index 4.

    Returns
    -------
    dict
        idDoc -> list of all tuples sharing that id.  Keys appear in order
        of first occurrence; each list preserves the input order — the
        same result the original O(n^2) nested rescan produced, but in a
        single pass (and without shadowing the builtin ``dict``).
    """
    grouped = {}
    for annotation in listTuples:
        grouped.setdefault(annotation[4], []).append(annotation)
    return grouped

def checkDoses(idDoc, sentDocs):
    """Run each sentence of a document through the remote date annotator.

    Posts every sentence (paired with today's date) to the jkes
    dateAnnotator service, takes the second element of each response, and
    flattens every non-empty value of those response dicts into one list.

    Parameters
    ----------
    idDoc : hashable
        Document identifier; used as the request payload key and as the
        key of the returned dict.
    sentDocs : iterable of str
        Sentences of the document to annotate.

    Returns
    -------
    dict
        ``{idDoc: [non-empty metric values from all sentence responses]}``
    """
    resultList = []
    for sentence in sentDocs:
        # NOTE(review): verify=False is irrelevant for plain http but kept
        # to match the original call; endpoint is an internal service.
        resp = requests.post(
            url="http://138.4.130.153:8088/jkes/annotator/dateAnnotator",
            json={idDoc: [currentDate, sentence]},
            verify=False,
        )
        metrics = resp.json()['response'][1]
        # Keep only values that are not the empty list, exactly as the
        # original `!= []` test did (other falsy values would still pass).
        for value in metrics.values():
            if value != []:
                resultList.append(value)
    return {idDoc: resultList}

def bertConcepts():
    """Link DATE-typed concept annotations to nearby hemogram metrics.

    Reads 100 sampled breast-cancer notes from an Excel sheet, splits each
    note into Spanish sentences, extracts concept annotations via
    ConceptExtractor_v3 and, for every DATE annotation, queries the remote
    date annotator (``checkDoses``).  When a returned metric not tagged
    'NUMBER' appears within 2 characters of the concept in the document
    text, prints the concatenated concept+metric string.
    """
    inicio = default_timer()  # NOTE(review): timer started but never reported
    fileDocs = 'breast_notes_100randomly_sampled.xlsx'
    textDocs = pd.read_excel(fileDocs, sheet_name='Sheet1')['DESCRIPCION']
    # Sentence-split every note with the Spanish Punkt model.
    spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sentsDoc = {i: spanish_tokenizer.tokenize(textDocs[i])
                for i in range(len(textDocs))}
    listHemogramConcepts = []
    with open('annotations.json') as json_file:
        annotations = json.load(json_file)
    # NOTE(review): loaded but currently unused downstream — confirm before
    # removing.  (A duplicate `extractionOfConcepts(annotations)` call whose
    # result was never used has been removed.)
    with open('entity_cuis.json') as file:
        dictAnnotationsEntities = json.load(file)
    resultado = extractionOfConcepts(annotations)
    # Flatten the per-document annotation lists into one list of tuples:
    # anotaciones = [concept, Entity, offset, end, ID]
    anotaciones = [anotacion for lista in resultado for anotacion in lista]
    dictAnotacionesDoc = dictConceptsDoc(anotaciones)
    for docId in dictAnotacionesDoc:
        for tupleAnot in dictAnotacionesDoc[docId]:
            if tupleAnot[1] != 'DATE':
                continue
            idDoc = tupleAnot[4]
            listHemogramConcepts.append(tupleAnot)
            dictMetrics = checkDoses(idDoc, sentsDoc[idDoc])
            # Renamed from `key`, which shadowed the outer loop variable.
            for metricsKey in dictMetrics:
                for metrics in dictMetrics[metricsKey]:
                    for metricConcept in metrics:
                        for item in metricConcept:
                            if item[5] != 'NUMBER':
                                # BUG FIX: the original called .find() on
                                # sentsDoc[idDoc], a *list* of sentences,
                                # which raises AttributeError.  Search the
                                # document text instead.  Assumes idDoc is
                                # a valid positional index into textDocs —
                                # TODO confirm against annotation IDs.
                                distanceMetric = textDocs[idDoc].find(metrics[0])
                                distanceConcept = textDocs[idDoc].find(tupleAnot[0])
                                if distanceMetric - distanceConcept < 2:
                                    completeHemogram = tupleAnot[0] + metrics[0]
                                    print(completeHemogram)
            print("wait")  # NOTE(review): debug marker kept from original


if __name__ == "__main__":
    bertConcepts()