Upload New File

3cb781cd · Lucia Catalan Gris · c040bbb2 · 3cb781cd
Commit 3cb781cd authored Feb 19, 2021 by Lucia Catalan Gris
Hide whitespace changes
Inline Side-by-side

Showing with 191 additions and 0 deletions

Stage_Tnm_Extractor/estadioTnm.py Stage_Tnm_Extractor/estadioTnm.py +191 -0

No files found.
--- a/Stage_Tnm_Extractor/estadioTnm.py
+++ b/Stage_Tnm_Extractor/estadioTnm.py
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb  8 09:49:51 2021
+
+@author: Lucia
+"""
+import sys, os, json
+import ConceptExtractor 
+import pandas as pd
+
+#---------------------------- FUNCIONES ---------------------------------------
+
+def stage_tablas(Estadio):       
+
+    note_id_stage = []
+    stage_id = []
+    begin_stage = []
+    end_stage = []
+    stage = []
+    conteo = 0
+
+    for j in range(0, len(Estadio)): 
+    
+        stage_id.append(conteo)
+        conteo = conteo + 1
+        note_id_stage.append(Estadio.iloc[j][1])            
+        begin_stage.append(Estadio.iloc[j][4])
+        end_stage.append(Estadio.iloc[j][5])
+        stage.append(str(Estadio.iloc[j][2].split()[1]))
+        
+    stage = pd.DataFrame({'stage_id':stage_id, 'stage': stage})
+    note_stage = pd.DataFrame({'note_id': note_id_stage,
+                               'stage_id':stage_id,
+                               'begin': begin_stage,
+                               'end':end_stage})
+    return stage, note_stage
+
+def tnm_tablas(Tnm, stage):
+
+    conteo = 0
+    note_id_tnm = []
+    tnm_id = []
+    begin_tnm = []
+    end_tnm = []
+    t = []
+    n = []
+    m = []
+    
+    for j in range(0, len(Tnm)): 
+    
+        tnm_id.append(conteo)
+        conteo = conteo + 1
+        
+        note_id_tnm.append(Tnm.iloc[j][1])            
+        begin_tnm.append(Tnm.iloc[j][4])
+        end_tnm.append(Tnm.iloc[j][5])
+    
+        for i in range(0,len(Tnm.iloc[j][2])-1):
+            if Tnm.iloc[j][2][i] == 't' and i < len(Tnm.iloc[j][2]):
+                t.append(Tnm.iloc[j][2][i+1])
+            elif Tnm.iloc[j][2][i] == 'n' and i < len(Tnm.iloc[j][2]):
+                n.append(Tnm.iloc[j][2][i+1])
+            elif Tnm.iloc[j][2][i] == 'm' and i < len(Tnm.iloc[j][2]):
+                m.append(Tnm.iloc[j][2][i+1])
+            
+        if len(t) < len(tnm_id):
+            t.append(None)
+        if len(n) < len(tnm_id):
+            n.append(None)
+        if len(m) < len(tnm_id):
+            m.append(None)
+            
+    control = 1
+    note_id_trad = []
+    begin_trad = []
+    end_trad = []
+    stage_id = []
+    contador_stage = stage['stage_id'].loc[stage.index[-1]]
+    traducciones = []
+    
+    for z in range(0, len(t)):
+        if m[z] == '0':
+            if t[z] == '4':
+                traducciones.append('iiib')             
+            elif n[z] == '3':
+                traducciones.append('iiic') 
+            elif n[z] == '0':
+                if t[z] == '1':
+                    traducciones.append('i')
+                elif t[z] == '2':
+                    traducciones.append('iia')
+                elif t[z] == '3':
+                    traducciones.append('iib')
+            elif t[z] == '0': 
+                if n[z] == '1':
+                    traducciones.append('iia')
+                elif n[z] == '2':
+                    traducciones.append('iiia')
+            elif (n[z] == '1' and t[z] == '1'):
+                traducciones.append('iia')
+            elif (n[z] == '1' and t[z] == '2'): 
+                traducciones.append('iib')
+            elif (n[z] == '2') and (t[z] == '1' or t[z] == '2'):
+                traducciones.append('iiia')
+            elif (t[z] == '3') and (n[z] == '1' or n[z] == '2'):
+                traducciones.append('iiia')
+        elif m[z] == '1':
+            traducciones.append('iv') 
+            
+        if control == len(traducciones):
+            stage_id.append(contador_stage)
+            contador_stage = contador_stage + 1
+            note_id_trad.append(note_id_tnm[z])
+            begin_trad.append(begin_tnm[z])
+            end_trad.append(end_tnm[z])
+            control = control + 1
+    
+    tnm = pd.DataFrame({'tnm_id':tnm_id, 't': t, 'n': n, 'm': m, 'stage_id':stage_id})       
+    note_tnm = pd.DataFrame({'note_id': note_id_tnm,
+                         'tnm_id':tnm_id,
+                         'begin': begin_tnm,
+                         'end':end_tnm})
+    
+    traduccion_stage = pd.DataFrame({'stage_id':note_id_trad, 'stage': traducciones})
+    traduccion_note_stage = pd.DataFrame({'note_id': note_id_trad,
+                               'stage_id':stage_id,
+                               'begin': begin_trad,
+                               'end':end_trad})
+    
+    return tnm, note_tnm, traduccion_stage, traduccion_note_stage
+
+def estadioTnm(annotations, tabla_documentos):
+    
+    resultado = ConceptExtractor.extractionOfConcepts(annotations)
+    anotaciones = [anotacion for lista in resultado for anotacion in lista]
+    concepts = pd.DataFrame({
+            'EHR': [tabla_documentos.loc[anotaciones[i][4]][1] for i in range(0, len(anotaciones))],  
+            'document_id': [tabla_documentos.loc[anotaciones[i][4]][0] for i in range(0, len(anotaciones))],
+            'concept' : [anotaciones[i][0] for i in range(0, len(anotaciones))],
+            'entity' : [anotaciones[i][1] for i in range(0, len(anotaciones))],
+            'start':[anotaciones[i][2] for i in range(0, len(anotaciones))],
+            'end': [anotaciones[i][3] for i in range(0, len(anotaciones))],
+            'id_doc': [anotaciones[i][4] for i in range(0, len(anotaciones))]})
+
+    Estadio = concepts.loc[concepts['entity'] == 'STADIO']
+    tablas_estadio = stage_tablas(Estadio)
+    stage = tablas_estadio[0]
+    note_stage = tablas_estadio[1]
+    
+    Tnm = concepts.loc[concepts['entity'] == 'TNM']
+    tablas_tnm = tnm_tablas(Tnm, stage)
+    tnm = tablas_tnm[0]
+    note_tnm = tablas_tnm[1]
+    stage = stage.append(tablas_tnm[2], ignore_index = True) 
+    note_stage = note_stage.append(tablas_tnm[3], ignore_index = True)
+    
+    return tnm, note_tnm, stage, note_stage
+    
+#-------------------------- MAIN ----------------------------------------------
+
+#Input: BERT annotations (list of lists of dictionaries)
+#       EHR and id of the documents the notes come from (removal pending)
+#Output: four csv files (tnm.csv, note_tnm.csv, stage.csv, note_stage.csv)
+def main():
+    
+    jsonRoute = sys.argv[1]
+    documentRoute = sys.argv[2]
+    
+    if os.path.exists(jsonRoute):
+        
+        with open(jsonRoute) as json_file:
+            annotations = json.load(json_file)
+        
+        if os.path.exists(documentRoute):
+            
+            tabla_documentos = pd.read_csv(documentRoute)
+            resultado = estadioTnm(annotations, tabla_documentos)
+            resultado[0].to_csv(r'tnm.csv', index = False)   
+            resultado[1].to_csv(r'note_tnm.csv', index = False)  
+            resultado[2].to_csv(r'stage.csv', index = False)   
+            resultado[3].to_csv(r'note_stage.csv', index = False) 
+            
+        else:
+            print("Second argument file doesn't exist")
+
+    else:
+        print("First argument file doesn't exist")
+
+# Run the extraction only when this file is executed as a script, not when
+# it is imported as a module.
+if __name__ == "__main__":
+    main()   
+