# -*- coding: utf-8 -*-
"""
Build the stage and TNM tables from the concepts extracted out of clinical notes.

Created on Mon Feb 8 09:49:51 2021

@author: Lucia
"""
# Provenance: Stage_Tnm_Extractor/estadioTnm.py, added by commit 3cb781c
# ("Upload New File", Lucia Catalan Gris, 2021-02-19).
import json
import os
import sys

import pandas as pd

# ---------------------------- FUNCTIONS --------------------------------------

# Positional layout of the concepts frame built in estadioTnm():
# EHR, document_id, concept, entity, start, end, id_doc.
_COL_NOTE_ID = 1
_COL_CONCEPT = 2
_COL_BEGIN = 4
_COL_END = 5

# Stage assigned to a (t, n) pair when m == '0'.  The t == '4' and n == '3'
# rules take precedence and are handled before this table is consulted.
_STAGE_BY_T_N = {
    ('1', '0'): 'i',
    ('2', '0'): 'iia',
    ('3', '0'): 'iib',
    ('0', '1'): 'iia',
    ('1', '1'): 'iia',
    ('2', '1'): 'iib',
    ('3', '1'): 'iiia',
    ('0', '2'): 'iiia',
    ('1', '2'): 'iiia',
    ('2', '2'): 'iiia',
    ('3', '2'): 'iiia',
}


def _first_value_after(text, letter):
    """Return the character right after the first *letter* in *text*, or None.

    The last character of *text* is never considered a match because nothing
    follows it.
    """
    pos = text.find(letter, 0, len(text) - 1)
    return text[pos + 1] if pos != -1 else None


def _tnm_to_stage(t, n, m):
    """Translate one (t, n, m) triple into a stage label, or None if unknown."""
    if m == '1':
        return 'iv'
    if m != '0':
        return None
    if t == '4':
        return 'iiib'
    if n == '3':
        return 'iiic'
    return _STAGE_BY_T_N.get((t, n))


def stage_tablas(Estadio):
    """Build the ``stage`` and ``note_stage`` tables from explicit stage mentions.

    Parameters
    ----------
    Estadio : pandas.DataFrame
        Rows of the concepts frame; columns are read by position
        (1 = note id, 2 = concept text, 4 = begin offset, 5 = end offset).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(stage, note_stage)``.  ``stage`` has columns ``stage_id`` and
        ``stage``; ``note_stage`` has ``note_id``, ``stage_id``, ``begin`` and
        ``end``.  Ids are assigned sequentially from 0 in row order.  The stage
        value is the second whitespace-separated token of the concept text, or
        ``None`` when the text has fewer than two tokens.
    """
    note_ids = []
    begins = []
    ends = []
    stages = []

    # itertuples gives true positional access and also copes with empty frames.
    for row in Estadio.itertuples(index=False, name=None):
        note_ids.append(row[_COL_NOTE_ID])
        begins.append(row[_COL_BEGIN])
        ends.append(row[_COL_END])
        tokens = str(row[_COL_CONCEPT]).split()
        # A mention without a second token carries no stage value; record None
        # instead of aborting the whole run with an IndexError.
        stages.append(str(tokens[1]) if len(tokens) > 1 else None)

    stage_ids = list(range(len(note_ids)))

    stage = pd.DataFrame({'stage_id': stage_ids, 'stage': stages})
    note_stage = pd.DataFrame({'note_id': note_ids,
                               'stage_id': stage_ids,
                               'begin': begins,
                               'end': ends})
    return stage, note_stage


def tnm_tablas(Tnm, stage):
    """Build the TNM tables and the stages derived from each TNM mention.

    Parameters
    ----------
    Tnm : pandas.DataFrame
        Rows of the concepts frame; columns are read by position
        (1 = note id, 2 = concept text, 4 = begin offset, 5 = end offset).
    stage : pandas.DataFrame
        Existing stage table (needs a ``stage_id`` column).  Derived stages
        receive ids that continue after its largest id, or start at 0 when it
        is empty.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(tnm, note_tnm, traduccion_stage, traduccion_note_stage)``.
        ``tnm`` has one row per mention with ``tnm_id``, ``t``, ``n``, ``m``
        and ``stage_id`` (missing when the triple has no translation).
        ``traduccion_stage`` / ``traduccion_note_stage`` contain only the
        mentions that could be translated into a stage.
    """
    next_stage_id = 0 if stage.empty else int(stage['stage_id'].max()) + 1

    tnm_ids = []
    note_ids = []
    begins = []
    ends = []
    t_values = []
    n_values = []
    m_values = []
    tnm_stage_ids = []

    trad_stage_ids = []
    trad_note_ids = []
    trad_begins = []
    trad_ends = []
    traducciones = []

    for tnm_id, row in enumerate(Tnm.itertuples(index=False, name=None)):
        concept = str(row[_COL_CONCEPT])
        # Exactly one value per component and row (first occurrence wins), so
        # the t/n/m lists always stay aligned with tnm_ids.
        t_val = _first_value_after(concept, 't')
        n_val = _first_value_after(concept, 'n')
        m_val = _first_value_after(concept, 'm')

        tnm_ids.append(tnm_id)
        note_ids.append(row[_COL_NOTE_ID])
        begins.append(row[_COL_BEGIN])
        ends.append(row[_COL_END])
        t_values.append(t_val)
        n_values.append(n_val)
        m_values.append(m_val)

        traduccion = _tnm_to_stage(t_val, n_val, m_val)
        if traduccion is None:
            tnm_stage_ids.append(None)
            continue

        tnm_stage_ids.append(next_stage_id)
        trad_stage_ids.append(next_stage_id)
        trad_note_ids.append(row[_COL_NOTE_ID])
        trad_begins.append(row[_COL_BEGIN])
        trad_ends.append(row[_COL_END])
        traducciones.append(traduccion)
        next_stage_id += 1

    tnm = pd.DataFrame({'tnm_id': tnm_ids,
                        't': t_values,
                        'n': n_values,
                        'm': m_values,
                        # Nullable integers keep the ids integral even when
                        # some mentions have no derived stage.
                        'stage_id': pd.array(tnm_stage_ids, dtype='Int64')})
    note_tnm = pd.DataFrame({'note_id': note_ids,
                             'tnm_id': tnm_ids,
                             'begin': begins,
                             'end': ends})

    traduccion_stage = pd.DataFrame({'stage_id': trad_stage_ids,
                                     'stage': traducciones})
    traduccion_note_stage = pd.DataFrame({'note_id': trad_note_ids,
                                          'stage_id': trad_stage_ids,
                                          'begin': trad_begins,
                                          'end': trad_ends})

    return tnm, note_tnm, traduccion_stage, traduccion_note_stage


def estadioTnm(annotations, tabla_documentos):
    """Extract concepts from *annotations* and build the TNM and stage tables.

    Parameters
    ----------
    annotations
        Annotations passed unchanged to ``ConceptExtractor.extractionOfConcepts``.
    tabla_documentos : pandas.DataFrame
        Document table, looked up by index label with the document key of each
        annotation; positional column 0 is stored as ``document_id`` and
        positional column 1 as ``EHR``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(tnm, note_tnm, stage, note_stage)``, where the stage tables hold the
        explicit stage mentions followed by the stages derived from TNM.
    """
    # Imported here so the table builders above can be imported and tested
    # without the extractor being installed.
    import ConceptExtractor

    resultado = ConceptExtractor.extractionOfConcepts(annotations)
    anotaciones = [anotacion for lista in resultado for anotacion in lista]

    # Each annotation is read as (concept, entity, start, end, document key).
    concepts = pd.DataFrame({
        'EHR': [tabla_documentos.loc[a[4]].iloc[1] for a in anotaciones],
        'document_id': [tabla_documentos.loc[a[4]].iloc[0] for a in anotaciones],
        'concept': [a[0] for a in anotaciones],
        'entity': [a[1] for a in anotaciones],
        'start': [a[2] for a in anotaciones],
        'end': [a[3] for a in anotaciones],
        'id_doc': [a[4] for a in anotaciones]})

    Estadio = concepts.loc[concepts['entity'] == 'STADIO']
    stage, note_stage = stage_tablas(Estadio)

    Tnm = concepts.loc[concepts['entity'] == 'TNM']
    tnm, note_tnm, traduccion_stage, traduccion_note_stage = tnm_tablas(Tnm, stage)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    stage = pd.concat([stage, traduccion_stage], ignore_index=True)
    note_stage = pd.concat([note_stage, traduccion_note_stage], ignore_index=True)

    return tnm, note_tnm, stage, note_stage


# -------------------------- MAIN ---------------------------------------------

def main():
    """Command-line entry point.

    Input: the BERT annotations (JSON file, list of lists of dictionaries) and
    a CSV with the EHR and id of the documents the notes come from (pending
    removal).
    Output: four CSV files written to the current directory: ``tnm.csv``,
    ``note_tnm.csv``, ``stage.csv`` and ``note_stage.csv``.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: estadioTnm.py <annotations.json> <documents.csv>")

    jsonRoute = sys.argv[1]
    documentRoute = sys.argv[2]

    if not os.path.exists(jsonRoute):
        print("First argument file doesn't exist")
        return

    with open(jsonRoute, encoding='utf-8') as json_file:
        annotations = json.load(json_file)

    if not os.path.exists(documentRoute):
        print("Second argument file doesn't exist")
        return

    tabla_documentos = pd.read_csv(documentRoute)
    tnm, note_tnm, stage, note_stage = estadioTnm(annotations, tabla_documentos)
    tnm.to_csv(r'tnm.csv', index=False)
    note_tnm.to_csv(r'note_tnm.csv', index=False)
    stage.to_csv(r'stage.csv', index=False)
    note_stage.to_csv(r'note_stage.csv', index=False)


if __name__ == "__main__":
    main()