# -*- coding: utf-8 -*-
"""
Created on Wed Jan 27 10:26:48 2021
Library
"""


def extractionOfConcepts(annotations):
    """Compose complete concepts from B / B+I token annotations.

    Each document is scanned token by token. A token whose ``entity``
    label starts with ``B`` opens a new concept, following tokens whose
    label starts with ``I`` are appended to it (joined with a single
    space), and a token whose label starts with ``O`` closes it. A concept
    that is still open when its document ends is emitted as well, and it
    is never continued by tokens of the next document.

    Args:
        annotations: List of documents, where each document is a list of
            dictionaries of the form
            ``{'word': ..., 'score': ..., 'entity': ..., 'index': ...,
            'start': ..., 'end': ...}``.

    Returns:
        A flat list of ``(concept, entity, start, end)`` tuples in order
        of appearance. ``entity`` is the label of the opening ``B`` token
        with its first two characters (e.g. ``"B-"``) removed, ``start``
        comes from the opening token and ``end`` from the last token that
        was merged into the concept.
    """
    entities = []

    for document in annotations:
        # State of the concept being assembled. It is reset for every
        # document because start/end offsets are only meaningful inside
        # the document they come from. An empty ``complete_word`` means
        # that no concept is currently open.
        complete_word = ''
        entity = ''
        start = 0
        end = 0

        for word in document:
            # A missing or empty label is handled like any other
            # unrecognised label: the token is skipped.
            label = word.get('entity') or ''
            prefix = label[:1]

            if prefix == 'B':
                # A new concept begins; emit the one that was open, if any.
                if complete_word:
                    entities.append((complete_word, entity, start, end))
                complete_word = word.get('word')
                start = word.get('start')
                end = word.get('end')
                entity = label[2:]

            elif prefix == 'I':
                # An I without a preceding B has nothing to extend: ignore.
                if complete_word:
                    complete_word = complete_word + ' ' + word.get('word')
                    end = word.get('end')

            elif prefix == 'O':
                # Outside token: the open concept (if any) is finished.
                if complete_word:
                    entities.append((complete_word, entity, start, end))
                    complete_word = ''

        # The document may end while a concept is still open (its last
        # token was a B or an I); emit it instead of losing it.
        if complete_word:
            entities.append((complete_word, entity, start, end))

    return entities