Upload New File

4e0e41a7 · Lucia Catalan Gris · 37653974 · 4e0e41a7
Commit 4e0e41a7 authored Feb 19, 2021 by Lucia Catalan Gris
Show whitespace changes
Inline Side-by-side

Showing with 171 additions and 0 deletions

Dates_Extractor/Dates.py Dates_Extractor/Dates.py +171 -0

No files found.
--- a/Dates_Extractor/Dates.py
+++ b/Dates_Extractor/Dates.py
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Feb 10 18:29:03 2021
+@author: Lucia
+"""
+import ConceptExtractor 
+import pandas as pd
+import sys, os, json, dateparser
+#---------------------------- FUNCIONES ---------------------------------------
+def eliminar_letras_sobrantes(elementos_fecha):
+    """Strip stray letters from the pieces of a date and join them.
+
+    Spanish month names (matched ignoring case and surrounding whitespace)
+    and the separator tokens '/', '.' and '-' are kept; every other piece
+    is reduced to the digits it contains.
+
+    Args:
+        elementos_fecha: list of strings, the pieces of one date.
+
+    Returns:
+        The cleaned pieces joined with single spaces. The input list is
+        left untouched.
+    """
+    meses = ('enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio',
+             'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre')
+    separadores = ('/', '.', '-')
+    limpios = []
+    for elemento in elementos_fecha:
+        recortado = elemento.strip()
+        # Compare in lower case so 'Enero' or ' marzo ' are still recognised
+        # as months instead of being wiped out by the digit filter.
+        normalizado = recortado.lower()
+        if normalizado in meses or normalizado in separadores:
+            limpios.append(recortado)
+        else:
+            limpios.append(''.join(filter(str.isdigit, elemento)))
+    return ' '.join(limpios)
+def _parsear_estricto(texto):
+    """Return texto as 'yyyy-mm-dd', or None when dateparser rejects it."""
+    resultado = dateparser.parse(texto, languages=['es'],
+                                 settings={'STRICT_PARSING': True})
+    if resultado is None:
+        return None
+    return resultado.strftime('%Y-%m-%d')
+
+
+def parsearFecha(fecha):
+    """Parse a Spanish date expression into 'yyyy-mm-dd' strings.
+
+    The text is first given to dateparser unchanged. When that fails it is
+    split on '-', '.' or '/' (the first of these that occurs in the text,
+    tested in that order) or on whitespace when none occurs, stray letters
+    are removed from every piece and parsing is retried.
+
+    A text of the form 'a-b' in which either side contains '/' is treated
+    as a range: each side is parsed separately, and a side that is missing
+    trailing components (for instance the year) borrows them from the other.
+
+    Args:
+        fecha: string holding the date expression.
+
+    Returns:
+        A list of 'yyyy-mm-dd' strings: one item for a single date, up to
+        two for a range (an endpoint that cannot be parsed is left out) and
+        an empty list when nothing could be parsed.
+    """
+    directa = _parsear_estricto(fecha)
+    if directa is not None:
+        return [directa]
+
+    # Most common separators: - . /
+    if '-' in fecha:
+        elementos_fecha = fecha.split('-')
+        # Parenthesised on purpose: a range needs exactly two sides AND a
+        # '/' in at least one of them.
+        es_rango = (len(elementos_fecha) == 2
+                    and ('/' in elementos_fecha[0]
+                         or '/' in elementos_fecha[1]))
+        if es_rango:
+            inicio = elementos_fecha[0].split('/')
+            fin = elementos_fecha[1].split('/')
+            # Complete the shorter endpoint with the trailing components
+            # (month and/or year) of the complete one.
+            if len(fin) < 3 and len(inicio) == 3:
+                fin.extend(inicio[len(fin):])
+            elif len(inicio) < 3 and len(fin) == 3:
+                inicio.extend(fin[len(inicio):])
+            extremos = (_parsear_estricto(eliminar_letras_sobrantes(inicio)),
+                        _parsear_estricto(eliminar_letras_sobrantes(fin)))
+            return [extremo for extremo in extremos if extremo is not None]
+    elif '.' in fecha:
+        elementos_fecha = fecha.split('.')
+    elif '/' in fecha:
+        elementos_fecha = fecha.split('/')
+    else:
+        elementos_fecha = fecha.split()
+
+    # Not a range: retry once with the cleaned-up pieces.
+    limpia = _parsear_estricto(eliminar_letras_sobrantes(elementos_fecha))
+    return [] if limpia is None else [limpia]
+def fechas_tablas(annotations, tabla_documentos):
+    """Build the date and note_date tables from the DATE annotations.
+
+    Args:
+        annotations: BERT annotations handed to
+            ConceptExtractor.extractionOfConcepts (a list of lists of
+            dictionaries according to the original header comment).
+        tabla_documentos: DataFrame looked up with ``.loc`` using the value
+            at position 4 of every extracted annotation; its first column
+            is read as the document id and its second column as the EHR.
+
+    Returns:
+        A tuple ``(date, note_date)`` of DataFrames. ``date`` has the
+        columns date_id and note_date ('yyyy-mm-dd' strings); ``note_date``
+        has note_id, date_id, begin and end. A range annotation yields two
+        rows that share the same note_id, begin and end.
+    """
+    # Extract and flatten the annotations.
+    resultado = ConceptExtractor.extractionOfConcepts(annotations)
+    anotaciones = [anotacion for lista in resultado for anotacion in lista]
+    # One lookup per annotation; .iloc makes the positional column access
+    # explicit instead of relying on integer [] falling back to positions.
+    filas_documento = [tabla_documentos.loc[anotacion[4]]
+                       for anotacion in anotaciones]
+    concepts = pd.DataFrame({
+        'EHR': [fila.iloc[1] for fila in filas_documento],
+        'document_id': [fila.iloc[0] for fila in filas_documento],
+        'concept': [anotacion[0] for anotacion in anotaciones],
+        'entity': [anotacion[1] for anotacion in anotaciones],
+        'start': [anotacion[2] for anotacion in anotaciones],
+        'end': [anotacion[3] for anotacion in anotaciones],
+        'id_doc': [anotacion[4] for anotacion in anotaciones]})
+    # Keep only the DATE entities.
+    fechas_detectadas = concepts.loc[concepts['entity'] == 'DATE']
+
+    fechas_iso = []
+    note_id = []
+    begin = []
+    end = []
+    columnas = zip(fechas_detectadas['concept'],
+                   fechas_detectadas['document_id'],
+                   fechas_detectadas['start'],
+                   fechas_detectadas['end'])
+    for texto, documento, inicio, fin in columnas:
+        # parsearFecha returns zero, one or (for ranges) two dates; every
+        # date gets its own row with the offsets of the annotation.
+        for fecha in parsearFecha(texto):
+            fechas_iso.append(fecha)
+            note_id.append(documento)
+            begin.append(inicio)
+            end.append(fin)
+    # Date ids are consecutive, starting at 0, in order of appearance.
+    date_id = list(range(len(fechas_iso)))
+
+    date = pd.DataFrame({'date_id': date_id,
+                         'note_date': fechas_iso})
+    note_date = pd.DataFrame({'note_id': note_id,
+                              'date_id': date_id,
+                              'begin': begin,
+                              'end': end})
+    return date, note_date
+#-------------------------- MAIN ----------------------------------------------
+def main():
+    """Command-line entry point: write date.csv and note_date.csv.
+
+    Reads two paths from the command line: first the JSON file with the
+    annotations, then the CSV file with the documents table. The tables
+    returned by fechas_tablas are written to 'date.csv' and
+    'note_date.csv' in the current working directory.
+
+    Exits with a usage message when fewer than two arguments are given.
+    When one of the files does not exist a message is printed and nothing
+    is written.
+    """
+    if len(sys.argv) < 3:
+        # Fail with a readable message instead of a bare IndexError.
+        sys.exit('Usage: {} <annotations.json> <documents.csv>'
+                 .format(sys.argv[0]))
+    jsonRoute = sys.argv[1]
+    documentRoute = sys.argv[2]
+    if not os.path.exists(jsonRoute):
+        print("First argument file doesn't exist")
+        return
+    # JSON is UTF-8 by specification; do not depend on the platform default.
+    with open(jsonRoute, encoding='utf-8') as json_file:
+        annotations = json.load(json_file)
+    if not os.path.exists(documentRoute):
+        print("Second argument file doesn't exist")
+        return
+    tabla_documentos = pd.read_csv(documentRoute)
+    date, note_date = fechas_tablas(annotations, tabla_documentos)
+    date.to_csv(r'date.csv', index=False)
+    note_date.to_csv(r'note_date.csv', index=False, encoding='utf-8-sig')
+# Script entry point: run main() only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()            
\ No newline at end of file