# -*- coding: utf-8 -*-
"""
Created on Wed Jan 27 10:26:48 2021
Library

Builds concepts out of B/I/O-tagged token annotations and groups concepts
whose surface forms are near-duplicates (normalized Levenshtein similarity).
"""

try:
    import textdistance
except ImportError:
    # The package is only used for one similarity measure; without it the
    # pure-Python implementation in _normalized_similarity is used instead.
    textdistance = None


def _normalized_similarity(left, right):
    """Return the normalized Levenshtein similarity of two strings.

    The value is ``1 - distance / max(len(left), len(right))``; two empty
    strings are fully similar. Delegates to ``textdistance`` when it is
    installed, otherwise computes the same quantity in pure Python.
    """
    if textdistance is not None:
        return textdistance.levenshtein.normalized_similarity(left, right)

    longest = max(len(left), len(right))
    if longest == 0:
        return 1.0

    # Two-row dynamic programme: previous[j] holds the edit distance between
    # the already-consumed prefix of `left` and right[:j].
    previous = list(range(len(right) + 1))
    for row, left_char in enumerate(left, start=1):
        current = [row]
        for col, right_char in enumerate(right, start=1):
            current.append(min(
                previous[col] + 1,                            # deletion
                current[col - 1] + 1,                         # insertion
                previous[col - 1] + (left_char != right_char),  # substitution
            ))
        previous = current
    return 1 - previous[-1] / float(longest)


def _collect_concepts(annotations):
    """Merge each B token and its following I tokens into one concept tuple.

    Returns a flat list of ``(concept, entity, start, end, num_doc)`` tuples
    in order of appearance, where ``num_doc`` is the index of the document
    inside ``annotations``.
    """
    concepts = []

    for num_doc, document in enumerate(annotations):
        # State of the concept currently being built; an empty text means
        # "no concept open". It is reset per document so that a concept can
        # never continue into (or be attributed to) the next document.
        text, label, start, end = '', '', 0, 0

        for token in document:
            tag = token.get('entity')
            prefix = tag[0]

            if prefix == 'B':
                # A new concept begins: close the one that was open, if any.
                if text:
                    concepts.append((text, label, start, end, num_doc))
                text = token.get('word')
                start = token.get('start')
                end = token.get('end')
                label = tag[2:]  # drop the "B-" marker, keep the entity type

            elif prefix == 'I':
                # An I without a preceding B is ignored.
                if text:
                    text = text + ' ' + token.get('word')
                    end = token.get('end')

            elif prefix == 'O':
                # Outside token: close the open concept, if any.
                if text:
                    concepts.append((text, label, start, end, num_doc))
                    text = ''

            # Any other tag prefix leaves the open concept untouched.

        # The document ended while a concept was still open.
        if text:
            concepts.append((text, label, start, end, num_doc))

    return concepts


def _group_similar(concepts, similarity_threshold):
    """Greedily cluster concept tuples whose texts are near-duplicates.

    Concepts are visited from shortest to longest text (alphabetical order
    among equal lengths). Each not-yet-grouped concept opens a new group and
    absorbs every other not-yet-grouped concept whose similarity to it is
    strictly greater than ``similarity_threshold``.
    """
    # Single-character concepts are discarded. At the default threshold they
    # could never join a group anyway: their similarity to any longer text is
    # at most 0.5.
    ordered = sorted(
        (concept for concept in concepts if len(concept[0]) != 1),
        key=lambda concept: (len(concept[0]), concept[0]),
    )

    groups = []
    grouped = set()  # concept tuples already assigned to some group

    for seed in ordered:
        if seed in grouped:
            continue
        grouped.add(seed)
        group = [seed]

        for candidate in ordered:
            # Membership is checked first so no distance is computed for
            # concepts that cannot be added anyway (this also drops exact
            # duplicate tuples).
            if candidate in grouped:
                continue
            if _normalized_similarity(seed[0], candidate[0]) > similarity_threshold:
                group.append(candidate)
                grouped.add(candidate)

        groups.append(group)

    return groups


def extractionOfConcepts(annotations, similarity_threshold=0.85):
    """Extract the B or B+I concepts and group the similar ones.

    Parameters
    ----------
    annotations : list of list of dict
        One inner list per document, one dictionary per token:
        ``{'word': '', 'score': '', 'entity': '', 'index': '', 'start': '',
        'end': ''}``. Only ``word``, ``entity``, ``start`` and ``end`` are
        read. ``entity`` must be a non-empty string whose first character is
        the B/I/O marker and whose entity type starts at the third character
        (e.g. ``'B-DISEASE'``).
    similarity_threshold : float, optional
        Two concepts are grouped when the normalized Levenshtein similarity
        of their texts is strictly greater than this value. Defaults to 0.85.

    Returns
    -------
    list of list of tuple
        Groups of similar concepts. Every tuple has five elements:
        ``(concept, entity, start, end, num_doc)``. The first tuple of a
        group is the concept that opened it. Concepts made of a single
        character are left out, and exact duplicate tuples appear once.
    """
    concepts = _collect_concepts(annotations)
    return _group_similar(concepts, similarity_threshold)