Commit 1378bd54 authored by Jorge Molina Gomez

Proceso para extraer los hemogramas de umls_old_dx

parent 9396d39a
import copy
import pickle
from datetime import date
from utils_sql import *
import requests
# ISO-formatted (YYYY-MM-DD) date of the run, sent along with every annotation request.
currentDate = date.today().isoformat()
def checkDoses(idDoc, sent, timeout=60):
    """Ask the metrics annotation service which metrics appear in a text.

    The stripped text is posted to the ``dateAnnotator`` endpoint together
    with the document id and the module-level ``currentDate``. Every
    annotation whose sixth field is ``'METRIC'`` or ``'NUMBER'`` contributes
    its first field to the result.

    Args:
        idDoc: Identifier of the document the text belongs to.
        sent: Text fragment to annotate.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        list: The annotated values in the order the service returned them.
        The call is best-effort: if the request fails or the answer does not
        have the expected structure, whatever was collected so far (usually
        an empty list) is returned and a warning is logged.
    """
    import logging

    metricsList = []
    try:
        resp = requests.post(
            url="http://138.4.130.153:8088/jkes/annotator/dateAnnotator",
            json={str(idDoc): [currentDate, [sent.strip()]]},
            verify=False,
            # Without a timeout a stalled service would block the whole run.
            timeout=timeout,
        )
        annotations = resp.json()['response'][1]
        for group in annotations:
            for metricConcepts in annotations[group]:
                for metric in metricConcepts:
                    if metric[5] in ('METRIC', 'NUMBER'):
                        metricsList.append(metric[0])
    except (requests.RequestException, ValueError, LookupError, TypeError) as err:
        # Network errors, non-JSON bodies and unexpected response structures
        # are tolerated, but no longer silently.
        logging.getLogger(__name__).warning(
            "Metric annotation failed for document %s: %s", idDoc, err)
    return metricsList
def checkHemograms(dictHemograms):
    """Extract leucocyte, lymphocyte and hemoglobin mentions for every EHR.

    Args:
        dictHemograms: ``{EHR: [(docId, sentence, sentence_id, concepts,
            begin, end), ...]}``.

    Returns:
        dict: ``{EHR: {(docId, sentence_id): [leucocytes, lymphocytes,
        hemoglobin]}}`` as filled in by ``findHemograms``.

    Side effects:
        Prints progress, and pickles the results to ``hemograms_v2.p`` and
        the mentions without a metric to ``failsHemogram_v2.p``.
    """
    dictResult = {}
    dictFails = {}
    for count, key in enumerate(dictHemograms, start=1):
        print("Process: " + str(key))
        print(count)
        patient = {}
        fails = {}
        for concepts in dictHemograms[key]:
            # Fresh scratch lists per sentence: findHemograms stores copies,
            # so nothing may leak from one sentence into the next.
            patient, fails = findHemograms(concepts, patient, fails,
                                           [], [], [], [], [], [])
        dictResult[key] = patient
        dictFails[key] = fails
    with open("hemograms_v2.p", "wb") as resultFile:
        pickle.dump(dictResult, resultFile)
    with open("failsHemogram_v2.p", "wb") as failsFile:
        pickle.dump(dictFails, failsFile)
    print("finish")
    return dictResult
def _collectHemogramTerm(stem, label, idDoc, sentConcept, sent, offset, found, failed):
    """Record one hemogram term of a lower-cased fragment, with its metric if any."""
    if stem not in sent:
        return
    # Only the text after the term is searched for a metric.
    metric = checkDoses(idDoc, sent.split(stem, 1)[1])
    if metric:
        begin = offset + sent.find(stem)
        # The end offset assumes the full word (same length as the label).
        found.append((label + " " + metric[0], begin, begin + len(label)))
    else:
        failed.append((sentConcept, sent, label))
        found.append(label)


def findHemograms(concepts, patient, errors, listLeucocitos, listFLeucocitos, listLinfocitos, listFLinfocitos, listHemoglob, listFHemoglob):
    """Find leucocyte, lymphocyte and hemoglobin mentions in one sentence.

    The sentence is split on commas and each lower-cased fragment is searched
    for the stems ``leucoci``, ``linfoci`` and ``hemoglob``. For every hit the
    text after the stem is sent to ``checkDoses``; the first metric returned
    is attached to the mention.

    Args:
        concepts: Tuple whose first three items are the document id, the
            sentence text and the sentence id.
        patient: ``{(docId, sentence_id): [leucocytes, lymphocytes,
            hemoglobin]}`` for the current patient; updated in place.
        errors: Same keys as ``patient``; holds the non-empty failure lists.
        listLeucocitos, listLinfocitos, listHemoglob: Lists that receive the
            mentions: ``(label + metric, begin, end)`` tuples, or the bare
            label when no metric was found.
        listFLeucocitos, listFLinfocitos, listFHemoglob: Lists that receive
            ``(sentence, fragment, label)`` for mentions without a metric.

    Returns:
        tuple: ``(patient, errors)``. A sentence whose key is already in
        ``patient`` is not processed again.
    """
    idDoc, sentConcept, sentConceptId = concepts[0], concepts[1], concepts[2]
    mentionKey = (idDoc, sentConceptId)
    if mentionKey in patient:
        return patient, errors

    targets = (
        ('leucoci', 'Leucocitos', listLeucocitos, listFLeucocitos),
        ('linfoci', 'Linfocitos', listLinfocitos, listFLinfocitos),
        ('hemoglob', 'Hemoglobina', listHemoglob, listFHemoglob),
    )
    tamSentence = 0  # offset of the current fragment inside the sentence
    for sent in sentConcept.split(","):
        sent = sent.lower()
        for stem, label, found, failed in targets:
            _collectHemogramTerm(stem, label, idDoc, sentConcept, sent,
                                 tamSentence, found, failed)
        tamSentence += len(sent) + 1  # +1 for the comma removed by split

    # Store copies so the caller can reuse its scratch lists.
    patient[mentionKey] = [list(listLeucocitos), list(listLinfocitos), list(listHemoglob)]
    errors[mentionKey] = [list(failed)
                          for failed in (listFLeucocitos, listFLinfocitos, listFHemoglob)
                          if failed]
    return patient, errors
def _closeDbManager(mngr):
    """Close the cursor and then the connection of a ``{'cnx', 'cursor'}`` manager."""
    try:
        mngr['cursor'].close()
    finally:
        mngr['cnx'].close()


def insertHemograms(hemograms):
    """Insert the hemogram concepts of every patient into the databases.

    The next free ``hemogram_id`` is read from ``concept_extraction.hemogram``
    and then ``insertAnnotations`` is called once per EHR with connections to
    ``clarify_breast_annotations`` and ``concept_extraction``.

    Args:
        hemograms: ``{EHR: {(docId, sentence_id): [leucocytes, lymphocytes,
            hemoglobin]}}``. Nothing is done when it is empty or ``None``.
    """
    if not hemograms:
        return

    # NOTE(review): host and credentials are hard-coded in source control;
    # they should come from configuration or the environment.
    host, port, user, password = "138.4.130.153", 3306, "medaldeveloper", "currentClarif3D$B"

    queryLastId = "select max(hemogram_id) from concept_extraction.hemogram order by hemogram_id asc;"  # last id inserted in the table
    lastIdHemogram = 0
    breast_clarif_breast_mngr = generate_db_connection(host, port, user, password,
                                                       "clarify_breast_annotations")
    try:
        breast_clarif_breast_mngr['cursor'].execute(queryLastId)
        for row in breast_clarif_breast_mngr['cursor']:
            if row[0] is not None and int(row[0]) >= 0:
                lastIdHemogram = int(row[0]) + 1
    finally:
        _closeDbManager(breast_clarif_breast_mngr)

    breast_clarif_breast = generate_db_connection(host, port, user, password,
                                                  "clarify_breast_annotations")
    try:
        clarify_conceptExt = generate_db_connection(host, port, user, password,
                                                    "concept_extraction")
        try:
            for ehr in hemograms:
                nextId = insertAnnotations(ehr, hemograms[ehr], lastIdHemogram,
                                           breast_clarif_breast, clarify_conceptExt)
                # Carry the id counter over to the next EHR whenever
                # insertAnnotations reports it; otherwise keep the current one.
                if nextId is not None:
                    lastIdHemogram = nextId
        finally:
            _closeDbManager(clarify_conceptExt)
    finally:
        # Close the connections that were actually used for the inserts.
        _closeDbManager(breast_clarif_breast)
def _firstHemogramMention(mentions):
    """Return ``(text, begin, end)`` for the first mention of a concept list.

    An empty list yields the placeholder ``('None', None, None)``; a bare
    label (mention without metric) yields the label with no offsets.
    """
    if not mentions:
        return 'None', None, None
    first = mentions[0]
    if isinstance(first, tuple):
        return str(first[0]), first[1], first[2]
    return str(first), None, None


def insertAnnotations(ehr, concepts, lastIdHemogram, cursorBreast, cursorConcept, execute=False):
    """Insert the hemogram concepts of one patient and return the next free id.

    Args:
        ehr: Patient identifier.
        concepts: ``{(docId, sentence_id): [leucocytes, lymphocytes,
            hemoglobin]}``; each value is a list of mentions. It is not
            modified.
        lastIdHemogram: ``hemogram_id`` to use for the first row.
        cursorBreast: ``{'cnx', 'cursor'}`` manager for
            ``clarify_breast_annotations``.
        cursorConcept: ``{'cnx', 'cursor'}`` manager for
            ``concept_extraction``.
        execute: When false (the default) nothing is written, matching the
            original code in which every execute/commit call was commented
            out; the managers are not touched in that case.

    Returns:
        int: ``lastIdHemogram`` advanced by one per entry of ``concepts``.
    """
    insertBreast = ("insert into clarify_breast_annotations.hemogram "
                    "(EHR, leucocytes, lymphocytes, redBloodCells) "
                    "values (%s, %s, %s, %s)")
    insertConceptExt = ("insert into concept_extraction.hemogram "
                        "(hemogram_id, leucocytes, lymphocytes, red_blood_cells) "
                        "values (%s, %s, %s, %s)")
    # NOTE(review): the original statement also listed the columns negation
    # and speculation but supplied no values for them; they are left out so
    # the table defaults apply -- confirm against the schema.
    insertNote = ("insert into concept_extraction.note_hemogram "
                  "(note_id, sentence_id, hemogram_id, begin, end) "
                  "values (%s, %s, %s, %s, %s)")

    for docs, listConcepts in concepts.items():
        idDoc = docs[0]
        sentence_id = docs[1]
        mentions = [_firstHemogramMention(listConcepts[i]) for i in range(3)]
        leucocytes, lymphocytes, redBloodCells = (mention[0] for mention in mentions)
        if execute:
            # Values are passed as parameters so quotes in the text cannot
            # break the statement.
            cursorBreast['cursor'].execute(
                insertBreast, (str(ehr), leucocytes, lymphocytes, redBloodCells))
            cursorBreast['cnx'].commit()
            conceptCursor = cursorConcept['cursor']
            conceptCursor.execute(
                insertConceptExt,
                (str(lastIdHemogram), leucocytes, lymphocytes, redBloodCells))
            for _text, begin, end in mentions:
                # Mentions without offsets cannot be located in the note;
                # presumably they need no note row -- TODO confirm.
                if begin is not None:
                    conceptCursor.execute(
                        insertNote,
                        (str(idDoc), str(sentence_id), str(lastIdHemogram),
                         str(begin), str(end)))
            cursorConcept['cnx'].commit()
        # Advance only after the note rows so they reference the hemogram
        # row that was just written.
        lastIdHemogram += 1
    return lastIdHemogram
def dictConceptsEhr(docId, listEhr, sentence, sentence_id, concepts, begin, end, umlsBatch):
    """Group the annotated concepts by patient (EHR).

    Args:
        docId, listEhr, sentence, sentence_id, concepts, begin, end:
            Parallel sequences with one item per annotated concept.
        umlsBatch: Mapping or DataFrame whose ``'ehr'`` entry lists the EHRs
            to produce; it determines the keys and their order.

    Returns:
        dict: ``{EHR: [(docId, sentence, sentence_id, concept, begin, end),
        ...]}``. An EHR of ``umlsBatch`` that never occurs in ``listEhr``
        maps to an empty list.
    """
    # Single pass over the rows instead of rescanning them for every EHR.
    grouped = {}
    for row in zip(listEhr, docId, sentence, sentence_id, concepts, begin, end):
        grouped.setdefault(row[0], []).append(row[1:])

    conceptsByEhr = {}
    for ehr in umlsBatch['ehr']:
        if ehr not in conceptsByEhr:
            conceptsByEhr[ehr] = list(grouped.get(ehr, ()))
    return conceptsByEhr
def hemogramConcepts():
    """Main flow of the hemogram extraction process.

    Loads the per-patient concept dictionary from ``umlsDict_hemograms_v2.p``,
    extracts the hemogram mentions with ``checkHemograms`` and hands them to
    ``insertHemograms``.
    """
    # Disabled preparation steps that produced 'umlsDict_hemograms_v2.p',
    # kept for reference:
    # umls_hemograms_v2 = compose_dataframe_from_query(breast_clarif_mngr_umls, "umls_old_dx", None, None,
    #     "concept in ('Hemoglobina', 'Leucocitos', 'Linfocitos') and entity_flag = 'Hemogram'",
    #     None)
    # pickle.dump(umls_hemograms_v2, open("umls_hemograms_v2.p", "wb"))
    # umls_hemograms_v2 = pickle.load(open('umls_hemograms_v2.p', "rb"))
    # docId_hemogram = umls_hemograms_v2['document_id']
    # ehr_hemogram = umls_hemograms_v2['ehr']
    # concepts_hemogram = umls_hemograms_v2['concept']
    # sentence_hemogram = umls_hemograms_v2['sentence']
    # sentenceId_hemogram = umls_hemograms_v2['sentence_id']
    # beginConcept_hemogram = umls_hemograms_v2['begin']
    # endConcept_hemogram = umls_hemograms_v2['end']
    # dictHemograms_v2 = dictConceptsEhr(docId_hemogram, ehr_hemogram, sentence_hemogram, sentenceId_hemogram,
    #     concepts_hemogram, beginConcept_hemogram, endConcept_hemogram, umls_hemograms_v2)
    # pickle.dump(dictHemograms_v2, open("umlsDict_hemograms_v2.p", "wb"))

    # NOTE(review): unpickling runs arbitrary code from the file; only load
    # pickles produced by this pipeline.
    with open('umlsDict_hemograms_v2.p', "rb") as dictFile:
        dict_hemograms = pickle.load(dictFile)

    hemograms = checkHemograms(dict_hemograms)
    if hemograms is None:
        # checkHemograms always pickles its result to hemograms_v2.p; fall
        # back to that file when the result is not returned directly.
        with open("hemograms_v2.p", "rb") as resultFile:
            hemograms = pickle.load(resultFile)
    #hemograms = pickle.load(open('hemograms.p', "rb"))
    #failsHemograms = pickle.load(open('failsHemogram.p', "rb"))
    insertHemograms(hemograms)
# Entry point: the pipeline runs whenever this module is executed or imported
# (there is no __main__ guard).
hemogramConcepts()
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 2 11:07:37 2021
@author: ctb
"""
import mysql.connector
import pandas as pd
def generate_db_connection(ip, port, user, password, db_name):
    """Open a MySQL connection and a cursor on it.

    Args:
        ip: Host of the MySQL server.
        port: Port of the MySQL server.
        user: User name.
        password: Password of ``user``.
        db_name: Database to select.

    Returns:
        dict: ``{"cnx": connection, "cursor": cursor}``.
    """
    settings = {
        "host": ip,
        "port": port,
        "user": user,
        "password": password,
        "database": db_name,
        "auth_plugin": 'mysql_native_password',
    }
    connection = mysql.connector.connect(**settings)
    return {"cnx": connection, "cursor": connection.cursor()}
def get_columns_from_table(db_mngr, table_name):
    """Return the column names of a table.

    Args:
        db_mngr: Connection manager; its cursor is read from the key
            ``"db_crs"`` or, if absent, from ``"cursor"`` (the key produced
            by ``generate_db_connection``).
        table_name: Name of the table. It is concatenated into the statement,
            so it must come from trusted code.

    Returns:
        list: First field of every row returned by ``show columns``.
    """
    cursor = db_mngr["db_crs"] if "db_crs" in db_mngr else db_mngr["cursor"]
    cursor.execute("show columns from " + table_name)
    return [column[0] for column in cursor.fetchall()]
def compose_dataframe_from_query(db_mngr, table_name, batch_size=None, offset=None, where_clause=None, join_clause=None):
    """Build a DataFrame with the content of the table ``table_name``.

    Args:
        db_mngr: Connection manager; its cursor is read from the key
            ``"db_crs"`` or, if absent, from ``"cursor"`` (the key produced
            by ``generate_db_connection``).
        table_name: Table to read. Concatenated into the query, so it must
            come from trusted code.
        batch_size, offset: When both are given, only ``batch_size`` rows
            starting at row ``offset`` are returned.
        where_clause: Optional condition appended after ``where``; also
            concatenated verbatim.
        join_clause: Only checked for ``None``: any other value switches to
            the fixed inner join with ``document`` and its column set.

    Returns:
        pandas.DataFrame: One row per result row; an empty frame with the
        expected columns when the query returns nothing.
    """
    cursor = db_mngr["db_crs"] if "db_crs" in db_mngr else db_mngr["cursor"]

    if join_clause is not None:
        table_cols = ["document_id", "ehr", "concept", "subcategory", "begin", "end"]
        query = "Select u.document_id, u.ehr, u.concept, d.subcategory, u.begin, u.end from " + table_name + " u inner join document d on d.ID = u.document_id"
    else:
        table_cols = ["document_id", "ehr", "sentence", "sentence_id", "concept", "begin", "end"]
        query = "Select document_id, ehr, sentence, sentence_id, concept, begin, end from " + table_name

    if where_clause is not None:
        query += " where " + where_clause
    if batch_size is not None and offset is not None:
        query += " limit " + str(batch_size) + " offset " + str(offset)

    cursor.execute(query)
    # The constructor handles an empty result too, so no special case is needed.
    return pd.DataFrame(cursor.fetchall(), columns=table_cols)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment