Commit 9396d39a authored by Lucia Catalan Gris

Upload New File

parent 0f20cbb3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 27 10:26:48 2021
Library
"""
import textdistance
# Extract the B or B+I concepts together with their entity type, start and end.
# Input: list of lists of dictionaries (one inner list per document), where each dictionary is:
# {'word': '', 'score': '', 'entity': '', 'index': '', 'start': '', 'end': ''}
# Output: list of lists of similar concepts (normalized Levenshtein similarity > 0.85),
# where each concept is a tuple with five elements:
# [(concept, entity, start, end, num_doc)]
def extractionOfConcepts(annotations, *, threshold=0.85, similarity=None):
    """Extract B/B+I concepts from token annotations and group similar ones.

    Args:
        annotations: Iterable of documents; each document is an iterable of
            token dictionaries that are read through the keys ``'word'``,
            ``'entity'``, ``'start'`` and ``'end'``. The first character of
            ``'entity'`` selects the branch (``'B'``, ``'I'`` or ``'O'``) and
            everything from index 2 onwards is stored as the entity type.
        threshold: Two concepts are grouped when their similarity is strictly
            greater than this value. Defaults to 0.85.
        similarity: Callable ``(str, str) -> float`` used to compare concept
            texts. When ``None``, the normalized Levenshtein similarity from
            the ``textdistance`` module imported by this file is used.

    Returns:
        A list of groups. Each group is a list of
        ``(concept, entity, start, end, num_doc)`` tuples whose first element
        is the group's seed, followed by every not-yet-grouped concept that is
        similar to the seed. Concepts whose text is a single character never
        seed a group.
    """
    if similarity is None:
        # Resolved at call time so callers may supply their own metric.
        similarity = textdistance.levenshtein.normalized_similarity

    entities = _collect_concepts(annotations)
    # Order by concept length, breaking ties alphabetically. This is the same
    # ordering the two consecutive stable sorts produced before.
    entities.sort(key=lambda item: (len(item[0]), item[0]))
    return _group_similar(entities, threshold, similarity)


def _collect_concepts(annotations):
    """Return one (concept, entity, start, end, num_doc) tuple per B/B+I span."""
    entities = []
    for num_doc, document in enumerate(annotations):
        # State of the concept currently being assembled; an empty ``text``
        # means that no concept is open.
        text = ''
        label = ''
        start = 0
        end = 0
        for word in document:
            prefix = word.get('entity')[0]
            if prefix == 'B':
                # A new B closes whatever concept was still open.
                if text:
                    entities.append((text, label, start, end, num_doc))
                text = word.get('word')
                label = word.get('entity')[2:]
                start = word.get('start')
                end = word.get('end')
            elif prefix == 'I':
                # An I without a preceding B is ignored.
                if text:
                    text = text + ' ' + word.get('word')
                    end = word.get('end')
            elif prefix == 'O':
                if text:
                    entities.append((text, label, start, end, num_doc))
                text = ''
                label = ''
                start = 0
                end = 0
        # Close the concept left open at the end of the document so that it
        # keeps this document's index and cannot be extended by the first
        # tokens of the next document.
        if text:
            entities.append((text, label, start, end, num_doc))
    return entities


def _group_similar(entities, threshold, similarity):
    """Greedily bucket concepts whose similarity to a seed exceeds threshold."""
    groups = []
    # Set of tuples already assigned to a group (constant-time membership).
    assigned = set()
    for seed in entities:
        # Single-character concepts are discarded; grouped ones are not reused.
        if len(seed[0]) == 1 or seed in assigned:
            continue
        assigned.add(seed)
        group = [seed]
        for other in entities:
            # Checking membership first avoids computing the metric for
            # concepts that could not be added anyway (including the seed).
            if other in assigned:
                continue
            if similarity(seed[0], other[0]) > threshold:
                group.append(other)
                assigned.add(other)
        groups.append(group)
    return groups
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment