# -*- coding: utf-8 -*-
"""
Created on Tue Feb  9 10:15:21 2021

@author: Lucia
"""

import json
import ConceptExtractor
import pandas as pd

#----------------- LOAD ANNOTATIONS -------------------------------------------
with open('annotations.json') as json_file:
    annotations = json.load(json_file)

#------------------ clarifyv2.document ----------------------------------------
tabla_documentos = pd.read_csv("documentos_clarifyv2.csv")


#------------------- FAMILY HISTORY -------------------------------------------
def antecedentes_familiares_tablas(annotations, tabla_documentos):
    """Fill the family_antecedents and note_family_antecendets tables.

    For every FAMILY concept, the up-to-three tokens that follow its first
    word are inspected. If a ``B_CANCER_CONCEPT`` token shows up before any
    ``B_FAMILY`` token, one family-antecedent record is produced linking the
    family member to that cancer concept.

    Parameters
    ----------
    annotations : list
        BERT annotations: one list of token dicts per document. The keys
        read here are ``"word"``, ``"entity"`` and ``"start"``.
    tabla_documentos : pandas.DataFrame
        Document table. Rows are looked up by label using the document index
        carried by each extracted concept; column 0 is used as the document
        id and column 1 as the EHR.

    Returns
    -------
    None
        Two CSV files are written to the working directory:
        ``family_antecedents.csv`` and ``note_family_antecendets.csv``.
    """
    # Extract the concepts and flatten the per-document lists.
    # Each entry is indexed as: [0] concept text, [1] entity label,
    # [2] start, [3] end, [4] index of the source document.
    resultado = ConceptExtractor.extractionOfConcepts(annotations)
    anotaciones = [anotacion for lista in resultado for anotacion in lista]

    # One label lookup per annotation; .iloc makes the positional column
    # access explicit instead of relying on integer keys falling back to
    # positions on a label-indexed Series.
    filas_documento = [tabla_documentos.loc[a[4]] for a in anotaciones]

    concepts = pd.DataFrame({
        # EHR is not consumed below; kept so the intermediate table matches
        # the original layout.
        'EHR': [fila.iloc[1] for fila in filas_documento],
        'document_id': [fila.iloc[0] for fila in filas_documento],
        'concept': [a[0] for a in anotaciones],
        'entity': [a[1] for a in anotaciones],
        'start': [a[2] for a in anotaciones],
        'end': [a[3] for a in anotaciones],
        'id_doc': [a[4] for a in anotaciones],
    })

    # Keep only the FAMILY concepts.
    family = concepts.loc[concepts['entity'] == 'FAMILY']

    # Number of tokens inspected after the first word of a family concept.
    ventana = 3

    # Output columns.
    family_antecedents_id = []
    family_member = []
    cancer_type_family_member = []
    begin = []
    end = []
    note_id = []

    for fila in family.itertuples(index=False):
        tokens = annotations[fila.id_doc]

        palabras = fila.concept.split()
        if not palabras:
            # Nothing to anchor on for an empty concept text.
            continue

        # Index of the token holding the first word of the family concept.
        # NOTE(review): this picks the first token with that word in the
        # document, which may not be this mention if the word is repeated.
        indice = next(
            (pos for pos, item in enumerate(tokens)
             if item["word"] == palabras[0]),
            None,
        )
        if indice is None:
            # The word was not found among the tokens: skip this concept
            # instead of failing on ``None + 1``.
            continue

        # Slicing keeps the look-ahead window inside the token list when the
        # family mention is close to the end of the document.
        for token in tokens[indice + 1:indice + 1 + ventana]:
            entidad = token.get('entity')

            if entidad == 'B_CANCER_CONCEPT':
                # A cancer concept follows the family member: record it.
                family_antecedents_id.append(len(family_antecedents_id))
                family_member.append(fila.concept)
                note_id.append(fila.document_id)
                begin.append(fila.start)
                end.append(fila.end)

                # Recover the full cancer concept that starts at this token.
                # None is stored when there is no match so that every output
                # column keeps the same length.
                inicio_token = token.get('start')
                cancer_type_family_member.append(next(
                    (a[0] for a in anotaciones
                     if a[1] == 'CANCER_CONCEPT'
                     and a[4] == fila.id_doc
                     and a[2] == inicio_token),
                    None,
                ))
                # Stop looking after the first cancer concept.
                break

            if entidad == 'B_FAMILY':
                # Another family member starts here; any cancer concept
                # further on is not attributed to the current one.
                break

    # TABLES
    family_antecedents = pd.DataFrame({
        'family_antecedents_id': family_antecedents_id,
        'family_member': family_member,
        'cancer_type_family_member': cancer_type_family_member,
    })
    family_antecedents.to_csv(r'family_antecedents.csv', index=False)

    note_family_antecendets = pd.DataFrame({
        'note_id': note_id,
        'family_antecedents_id': family_antecedents_id,
        'begin': begin,
        'end': end,
    })
    note_family_antecendets.to_csv(r'note_family_antecendets.csv', index=False)


antecedentes_familiares_tablas(annotations, tabla_documentos)