4.04 KB
Newer Older
Lucia Catalan Gris's avatar
Lucia Catalan Gris committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
# -*- coding: utf-8 -*-
"""
Created on Tue Feb  9 10:15:21 2021

@author: Lucia
"""
import json
import ConceptExtractor 
import pandas as pd

#----------------- LOAD ANNOTATIONS -------------------------------------------

# The JSON file is decoded as UTF-8 explicitly so that the result does not
# depend on the platform's default locale encoding (e.g. cp1252 on Windows).
with open('annotations.json', encoding='utf-8') as json_file:
    annotations = json.load(json_file)

#------------------ clarifyv2.document ----------------------------------------

# Document table. antecedentes_familiares_tablas looks rows up by label with
# .loc and reads position 0 as the document id and position 1 as the EHR.
tabla_documentos = pd.read_csv("documentos_clarifyv2.csv")
#------------------- ANTECEDENTES FAMILIARES ----------------------------------  

# Fills the family_antecedents and note_family_antecendets tables of concept_extraction.
# Input: BERT annotations (list of lists of dictionaries)
#        EHR and id of the documents the notes come from
# Output: two CSV files (family_antecedents.csv and note_family_antecendets.csv)
#         written with relative paths; no return statement is visible in this view.
# NOTE(review): several lines of this function are missing from this view of the
# file (see the notes below), so the function is not syntactically complete as shown.
def antecedentes_familiares_tablas(annotations, tabla_documentos):
    #Extract the annotations and flatten the nested lists into a single list
    resultado = ConceptExtractor.extractionOfConcepts(annotations)
    anotaciones = [anotacion for lista in resultado for anotacion in lista]
    # Each flattened annotation is read positionally:
    #   [0] concept, [1] entity, [2] start, [3] end, [4] id_doc.
    # id_doc is used as a row label into tabla_documentos, whose position 1 is
    # taken as the EHR and position 0 as the document id.
    concepts = pd.DataFrame({
            'EHR': [tabla_documentos.loc[anotaciones[i][4]][1] for i in range(0, len(anotaciones))],  
            'document_id': [tabla_documentos.loc[anotaciones[i][4]][0] for i in range(0, len(anotaciones))],
            'concept' : [anotaciones[i][0] for i in range(0, len(anotaciones))],
            'entity' : [anotaciones[i][1] for i in range(0, len(anotaciones))],
            'start':[anotaciones[i][2] for i in range(0, len(anotaciones))],
            'end': [anotaciones[i][3] for i in range(0, len(anotaciones))],
            'id_doc': [anotaciones[i][4] for i in range(0, len(anotaciones))]})
    #Keep only the FAMILY concepts
    Family = concepts.loc[concepts['entity'] == 'FAMILY']
    # Set when a second FAMILY token is seen inside the look-ahead window, so
    # that later cancer tokens in that window are not attached to this relative.
    another_family_flag = False
    # Running counter, incremented once per cancer token matched below.
    conteo = 0
    # Column accumulators for the two output tables.
    family_antecedents_id = []
    family_member = []
    cancer_type_family_member = []
    begin = []
    end = []
    note_id = []

    # Family.iloc[j][6] is the 'id_doc' column and Family.iloc[j][2] the
    # 'concept' column (positions follow the column order of `concepts`).
    for j in range(0, len(Family)): 
        #Index of the first word of the family concept
        # NOTE(review): next(..., None) yields None when no token matches, in
        # which case `indice + 1` below would raise TypeError.
        indice = next((pos for pos, item in enumerate(annotations[Family.iloc[j][6]]) if item["word"] == Family.iloc[j][2].split()[0]), None)
        # Look at the three tokens that follow the family word.
        # NOTE(review): `i` is not checked against the length of the token
        # list, so a family word near the end may raise IndexError.
        for i in range(indice + 1, indice + 4):
            #If a cancer concept is found
            if annotations[Family.iloc[j][6]][i].get('entity') == 'B_CANCER_CONCEPT' and another_family_flag == False:
                #id of the annotation
                conteo = conteo + 1
                # Find the extracted CANCER_CONCEPT annotation from the same
                # document whose start offset equals this token's start.
                for a in anotaciones:
                    if (a[1] == 'CANCER_CONCEPT') and (a[4] == Family.iloc[j][6]) and (a[2] == annotations[Family.iloc[j][6]][i].get('start')):
                    # NOTE(review): the body of this `if` is missing from this
                    # view; presumably it appends to the accumulator lists
                    # declared above - confirm against the full file.
                #Stops searching
            #if another FAMILY concept is found
            elif annotations[Family.iloc[j][6]][i].get('entity') == 'B_FAMILY':
                another_family_flag = True
        # Reset the flag before processing the next family concept.
        another_family_flag = False     
    # NOTE(review): the remaining columns and the closing `})` of this
    # DataFrame literal are missing from this view.
    family_antecedents = pd.DataFrame({'family_antecedents_id':family_antecedents_id,
                                   'family_member': family_member,
    family_antecedents.to_csv(r'family_antecedents.csv', index = False)   
    # NOTE(review): the remaining columns and the closing `})` of this
    # DataFrame literal are missing from this view.
    note_family_antecendets = pd.DataFrame({'note_id': note_id,
                                        'begin': begin,
    note_family_antecendets.to_csv(r'note_family_antecendets.csv', index = False) 

# Run the extraction at import/script time on the module-level `annotations`
# and `tabla_documentos`; the function writes its results to CSV files.
antecedentes_familiares_tablas(annotations, tabla_documentos)