# -*- coding: utf-8 -*-
"""
Family-antecedents extractor.

Created on Tue Feb 9 10:15:21 2021

@author: Lucia

Command-line usage::

    python antecedentesFamiliares.py annotations.json documentos_clarifyv2.csv
"""
import json
import os
import sys

import pandas as pd

import ConceptExtractor

# Number of tokens inspected after the first token of a FAMILY mention when
# looking for the cancer concept attached to that relative.
_LOOKAHEAD_TOKENS = 3


# ------------------- FAMILY ANTECEDENTS --------------------------------------

def _family_token_index(tokens, concept, start):
    """Return the position in *tokens* of the first word of *concept*.

    A token whose ``"word"`` equals the first word of the concept AND whose
    ``"start"`` equals *start* is preferred, so that a word repeated inside
    one document resolves to the right occurrence.  When no token matches on
    both, the first token matching on the word alone is returned.  ``None``
    is returned when the concept is empty or no token matches at all.
    """
    words = concept.split()
    if not words:
        return None
    first_word = words[0]

    fallback = None
    for pos, token in enumerate(tokens):
        if token.get("word") != first_word:
            continue
        if token.get("start") == start:
            return pos
        if fallback is None:
            fallback = pos
    return fallback


def _following_cancer_token(tokens, indice):
    """Return the cancer token that follows the family token at *indice*.

    Only the next ``_LOOKAHEAD_TOKENS`` tokens are inspected (fewer when the
    document ends earlier).  The search stops, returning ``None``, as soon as
    another family mention (``B_FAMILY``) begins or the window is exhausted.
    """
    window = tokens[indice + 1:indice + 1 + _LOOKAHEAD_TOKENS]
    for token in window:
        entity = token.get('entity')
        if entity == 'B_CANCER_CONCEPT':
            return token
        if entity == 'B_FAMILY':
            return None
    return None


def _cancer_concept_at(anotaciones, id_doc, start):
    """Return the CANCER_CONCEPT text starting at *start* in document *id_doc*.

    *anotaciones* holds ``(concept, entity, start, end, id_doc)`` sequences.
    ``None`` is returned when no extracted concept matches.
    """
    return next(
        (a[0] for a in anotaciones
         if a[1] == 'CANCER_CONCEPT' and a[4] == id_doc and a[2] == start),
        None,
    )


def antecedentes_familiares_tablas(annotations, tabla_documentos):
    """Fill the family_antecedents and note_family_antecendets tables.

    Parameters
    ----------
    annotations:
        BERT annotations: a list (one entry per document) of lists of token
        dictionaries.  The keys ``"word"``, ``"entity"`` and ``"start"`` of
        each token are read here.
    tabla_documentos:
        DataFrame looked up by document index; its first column is used as
        the document id and its second column as the EHR.

    Output
    ------
    Writes ``family_antecedents.csv`` and ``note_family_antecendets.csv`` in
    the current working directory.  Returns ``None``.
    """
    # Extract the concepts and flatten them into a single list of
    # (concept, entity, start, end, id_doc) sequences.
    resultado = ConceptExtractor.extractionOfConcepts(annotations)
    anotaciones = [anotacion for lista in resultado for anotacion in lista]

    # .iloc makes the positional column access explicit: plain `row[1]` on a
    # label-indexed Series relies on a deprecated pandas fallback.
    filas_documento = [tabla_documentos.loc[a[4]] for a in anotaciones]
    concepts = pd.DataFrame({
        'EHR': [fila.iloc[1] for fila in filas_documento],
        'document_id': [fila.iloc[0] for fila in filas_documento],
        'concept': [a[0] for a in anotaciones],
        'entity': [a[1] for a in anotaciones],
        'start': [a[2] for a in anotaciones],
        'end': [a[3] for a in anotaciones],
        'id_doc': [a[4] for a in anotaciones],
    })

    # Keep only the FAMILY concepts.
    family = concepts.loc[concepts['entity'] == 'FAMILY']

    family_antecedents_id = []
    family_member = []
    cancer_type_family_member = []
    begin = []
    end = []
    note_id = []

    for mention in family.itertuples(index=False):
        tokens = annotations[mention.id_doc]

        # Index of the first word of the family concept; skip the mention
        # when it cannot be located instead of failing on `None + 1`.
        indice = _family_token_index(tokens, mention.concept, mention.start)
        if indice is None:
            continue

        cancer_token = _following_cancer_token(tokens, indice)
        if cancer_token is None:
            continue

        # Annotation id: consecutive counter over the accepted mentions.
        family_antecedents_id.append(len(family_antecedents_id))
        family_member.append(mention.concept)
        note_id.append(mention.document_id)
        begin.append(mention.start)
        end.append(mention.end)
        # Always append (None when unresolved) so every column keeps the
        # same length and the DataFrame below can be built.
        cancer_type_family_member.append(
            _cancer_concept_at(anotaciones, mention.id_doc,
                               cancer_token.get('start'))
        )

    # TABLES
    family_antecedents = pd.DataFrame({
        'family_antecedents_id': family_antecedents_id,
        'family_member': family_member,
        'cancer_type_family_member': cancer_type_family_member,
    })
    family_antecedents.to_csv(r'family_antecedents.csv', index=False)

    note_family_antecendets = pd.DataFrame({
        'note_id': note_id,
        'family_antecedents_id': family_antecedents_id,
        'begin': begin,
        'end': end,
    })
    note_family_antecendets.to_csv(r'note_family_antecendets.csv', index=False)


# -------------------------- MAIN ---------------------------------------------

def main():
    """Command-line entry point.

    Expects two arguments: the path of the BERT annotations JSON file and the
    path of the documents CSV (EHR and id of the documents the notes come
    from; removal of this input is pending).  Produces the two CSV tables
    written by ``antecedentes_familiares_tablas``.
    """
    if len(sys.argv) < 3:
        print("Usage: python antecedentesFamiliares.py "
              "<annotations.json> <documents.csv>")
        return

    jsonRoute = sys.argv[1]
    documentRoute = sys.argv[2]

    if not os.path.exists(jsonRoute):
        print("First argument file doesn't exist")
        return

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(jsonRoute, encoding="utf-8") as json_file:
        annotations = json.load(json_file)

    if not os.path.exists(documentRoute):
        print("Second argument file doesn't exist")
        return

    tabla_documentos = pd.read_csv(documentRoute)
    antecedentes_familiares_tablas(annotations, tabla_documentos)


if __name__ == "__main__":
    main()