# -*- coding: utf-8 -*-
"""
Created on Wed Jan 27 10:26:48 2021

Library
"""
try:
    import textdistance
except ImportError:
    # Only the default similarity metric needs textdistance; callers that
    # pass their own ``similarity`` callable can work without it.
    textdistance = None


def _collect_concepts(annotations):
    """Merge the B / B+I token runs of every document into concept tuples.

    Returns a flat list of ``(concept, entity, start, end, num_doc)`` tuples
    in reading order, where ``num_doc`` is the index of the document inside
    ``annotations``.
    """
    concepts = []
    for num_doc, document in enumerate(annotations):
        text = ''
        label = ''
        start = 0
        end = 0
        for token in document:
            tag = token.get('entity')
            prefix = tag[0]
            if prefix in ('B', 'O'):
                # Both B and O terminate whatever concept is currently open.
                if text:
                    concepts.append((text, label, start, end, num_doc))
                    text = ''
                if prefix == 'B':
                    text = token.get('word')
                    # Drop the two-character tag prefix (e.g. "B-").
                    label = tag[2:]
                    start = token.get('start')
                    end = token.get('end')
            elif prefix == 'I' and text:
                # An I tag without a preceding B is ignored; otherwise it
                # extends the open concept and moves its end offset.
                text = text + ' ' + token.get('word')
                end = token.get('end')
        # A concept still open at the end of the document is closed here so
        # that it never continues into the next document.
        if text:
            concepts.append((text, label, start, end, num_doc))
    return concepts


def extractionOfConcepts(annotations, *, threshold=0.85, similarity=None):
    """Extract the B / B+I concepts and group the ones with similar text.

    Args:
        annotations: list of documents; each document is a list of token
            dictionaries of the form
            ``{'word': ..., 'score': ..., 'entity': ..., 'index': ...,
            'start': ..., 'end': ...}``. Only ``word``, ``entity``, ``start``
            and ``end`` are read. ``entity`` is a tag whose first character
            is ``B``, ``I`` or ``O``; tokens with any other first character
            are skipped.
        threshold: two concepts are grouped when their similarity is
            strictly greater than this value.
        similarity: callable ``(str, str) -> float`` used to compare concept
            texts. Defaults to
            ``textdistance.levenshtein.normalized_similarity``.

    Returns:
        A list of groups. Each group is a list of five-element tuples
        ``(concept, entity, start, end, num_doc)``: the first tuple is the
        seed of the group and the rest are the not-yet-grouped concepts
        similar to it. Concepts whose text is a single character never seed
        a group. Every tuple appears at most once in the whole result, and
        exact duplicate tuples are reported once.

    Raises:
        ImportError: if ``similarity`` is not given and ``textdistance`` is
            not installed.
    """
    if similarity is None:
        if textdistance is None:
            raise ImportError(
                "textdistance is required unless a 'similarity' callable "
                "is provided"
            )
        similarity = textdistance.levenshtein.normalized_similarity

    entities = _collect_concepts(annotations)
    # Shortest texts first, ties broken alphabetically. The sort is stable,
    # so equal texts keep their reading order.
    entities.sort(key=lambda concept: (len(concept[0]), concept[0]))

    groups = []
    # Set of tuples already placed in some group: constant-time membership
    # with the same value-equality semantics as a list lookup.
    grouped = set()
    for seed in entities:
        if len(seed[0]) == 1 or seed in grouped:
            continue
        grouped.add(seed)
        group = [seed]
        for candidate in entities:
            # Check membership first so the (comparatively expensive)
            # similarity is only computed for concepts still ungrouped.
            if candidate in grouped:
                continue
            if similarity(seed[0], candidate[0]) > threshold:
                group.append(candidate)
                grouped.add(candidate)
        groups.append(group)
    return groups