Upload New File

b2213934 · Lucia Catalan Gris · 8f4b70e8 · b2213934
Commit b2213934 authored Feb 19, 2021 by Lucia Catalan Gris
Hide whitespace changes
Inline Side-by-side

Showing with 127 additions and 0 deletions

Familiar_Antecedents_Extractor/antecedentesFamiliares.py Familiar_Antecedents_Extractor/antecedentesFamiliares.py +127 -0

No files found.
--- a/Familiar_Antecedents_Extractor/antecedentesFamiliares.py
+++ b/Familiar_Antecedents_Extractor/antecedentesFamiliares.py
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb  9 10:15:21 2021
+
+@author: Lucia
+"""
+import sys, os, json
+import ConceptExtractor 
+import pandas as pd
+    
+#------------------- ANTECEDENTES FAMILIARES ----------------------------------  
+
+# Fills the family_antecedents and note_family_antecendets tables of concept_extraction.
+# Input:  BERT annotations (list of lists of dictionaries)
+#         table with the EHR and id of the documents the notes come from
+# Output: two CSV files
+def antecedentes_familiares_tablas(annotations, tabla_documentos):
+    
+    #Estraemos anotaciones
+    resultado = ConceptExtractor.extractionOfConcepts(annotations)
+    anotaciones = [anotacion for lista in resultado for anotacion in lista]
+    concepts = pd.DataFrame({
+            'EHR': [tabla_documentos.loc[anotaciones[i][4]][1] for i in range(0, len(anotaciones))],  
+            'document_id': [tabla_documentos.loc[anotaciones[i][4]][0] for i in range(0, len(anotaciones))],
+            'concept' : [anotaciones[i][0] for i in range(0, len(anotaciones))],
+            'entity' : [anotaciones[i][1] for i in range(0, len(anotaciones))],
+            'start':[anotaciones[i][2] for i in range(0, len(anotaciones))],
+            'end': [anotaciones[i][3] for i in range(0, len(anotaciones))],
+            'id_doc': [anotaciones[i][4] for i in range(0, len(anotaciones))]})
+    
+    #Filtramos por FAMILY
+    Family = concepts.loc[concepts['entity'] == 'FAMILY']
+    #Variables
+    another_family_flag = False
+    conteo = 0
+    family_antecedents_id = []
+    family_member = []
+    cancer_type_family_member = []
+    begin = []
+    end = []
+    note_id = []
+
+    for j in range(0, len(Family)): 
+    
+        #Indice de la primera palabra del concepto de familia
+        indice = next((pos for pos, item in enumerate(annotations[Family.iloc[j][6]]) if item["word"] == Family.iloc[j][2].split()[0]), None)
+    
+        for i in range(indice + 1, indice + 4):
+        
+            #Si encuentro un concepto de cancer
+            if annotations[Family.iloc[j][6]][i].get('entity') == 'B_CANCER_CONCEPT' and another_family_flag == False:
+                #id de la anotacion
+                family_antecedents_id.append(conteo)
+                conteo = conteo + 1
+            
+                family_member.append(Family.iloc[j][2])
+                note_id.append(Family.iloc[j][1])            
+                begin.append(Family.iloc[j][4])
+                end.append(Family.iloc[j][5])
+            
+                for a in anotaciones:
+                    if (a[1] == 'CANCER_CONCEPT') and (a[4] == Family.iloc[j][6]) and (a[2] == annotations[Family.iloc[j][6]][i].get('start')):
+                        cancer_type_family_member.append(a[0])                
+                        break
+                #No busca mas
+                break
+        
+            #si encuentro otro concepto de FAMILY           
+            elif annotations[Family.iloc[j][6]][i].get('entity') == 'B_FAMILY':
+                another_family_flag = True
+                break
+        
+        another_family_flag = False     
+            
+        
+    #TABLAS
+    family_antecedents = pd.DataFrame({'family_antecedents_id':family_antecedents_id,
+                                   'family_member': family_member,
+                                   'cancer_type_family_member':cancer_type_family_member})
+    family_antecedents.to_csv(r'family_antecedents.csv', index = False)   
+    note_family_antecendets = pd.DataFrame({'note_id': note_id,
+                                        'family_antecedents_id':family_antecedents_id,
+                                        'begin': begin,
+                                        'end':end})
+    note_family_antecendets.to_csv(r'note_family_antecendets.csv', index = False) 
+
+
+#-------------------------- MAIN ----------------------------------------------
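+# Input:  command-line argument 1 - path to the JSON file with the BERT
+#         annotations (list of lists of dictionaries)
+#         command-line argument 2 - path to the CSV with the EHR and id of the
+#         documents the notes come from (pending removal)
+# Output: two CSV files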
+def main():
+    
+    jsonRoute = sys.argv[1]
+    documentRoute = sys.argv[2]
+    
+    if os.path.exists(jsonRoute):
+        
+        with open(jsonRoute) as json_file:
+            annotations = json.load(json_file)
+        
+        if os.path.exists(documentRoute):
+            tabla_documentos = pd.read_csv(documentRoute)
+            antecedentes_familiares_tablas(annotations, tabla_documentos) 
+            
+        else:
+            print("Second argument file doesn't exist")
+
+    else:
+        print("First argument file doesn't exist")
+
+# Run the extraction only when this file is executed as a script.
+if __name__ == "__main__":
+    main()   
+
+# The string literal below holds a hard-coded driver (fixed input file names).
+# As a bare string expression it is never executed.
+'''
+#----------------- EXTRAER ANOTACIONES ----------------------------------------
+
+with open('annotations.json') as json_file:
+   annotations = json.load(json_file)
+
+#------------------ clarifyv2.document ----------------------------------------
+
+tabla_documentos = pd.read_csv("documentos_clarifyv2.csv")
+
+antecedentes_familiares_tablas(annotations, tabla_documentos) 
+ 
+'''