# ConceptExtractor.py

# -*- coding: utf-8 -*-
"""
Created on Wed Jan 27 10:26:48 2021
Library
"""

import textdistance

# Extract the B or B+I concepts together with their entity, start and end.
# Input: list of documents, each one a list of dictionaries of the form:
# {'word' : '', 'score' : '', 'entity' : '', 'index' : '', 'start' : '', 'end' : ''}
# Output: list of lists of similar concepts (normalized Levenshtein similarity > 0.85),
#         where each concept is a tuple with five elements:
#                            [(concept, entity, start, end, document_number)]
def extractionOfConcepts(annotations, threshold=0.85):
    """Extract B / B+I concepts and group the ones with similar text.

    Args:
        annotations: list of documents; each document is a list of token
            dictionaries from which the keys 'word', 'entity', 'start' and
            'end' are read. The 'entity' value is expected to start with
            'B', 'I' or 'O'; tokens with any other prefix are ignored.
        threshold: two concepts are put in the same group when the
            normalized Levenshtein similarity of their texts is strictly
            greater than this value. Defaults to 0.85.

    Returns:
        A list of groups. Each group is a list of tuples
        (concept, entity, start, end, document_number), where
        document_number is the 0-based position of the document in
        ``annotations``. The first tuple of a group is the concept the
        others were compared against. Single-character concepts never
        start a group, and each distinct tuple appears in at most one
        group.
    """
    concepts = _collect_concepts(annotations)

    # Shortest concepts first, alphabetical among equal lengths. The sort is
    # stable, so equal texts keep their order of appearance.
    concepts.sort(key=lambda concept: (len(concept[0]), concept[0]))

    return _group_similar_concepts(concepts, threshold)


def _collect_concepts(annotations):
    """Join each B token with its following I tokens into one concept tuple."""
    concepts = []

    for num_doc, document in enumerate(annotations):
        # A non-empty `text` means a concept is currently open.
        text, entity, start, end = '', '', 0, 0

        for word in document:
            prefix = word.get('entity')[0]

            if prefix == 'I':
                # An I token only extends an open concept; an I without a
                # preceding B is dropped.
                if text:
                    text = text + ' ' + word.get('word')
                    end = word.get('end')
                continue

            if prefix not in ('B', 'O'):
                # Unknown tag: neither closes nor extends the open concept.
                continue

            # Both B and O close the concept that was open, if any.
            if text:
                concepts.append((text, entity, start, end, num_doc))
                text = ''

            if prefix == 'B':
                text = word.get('word')
                entity = word.get('entity')[2:]  # drop the "B-" prefix
                start = word.get('start')
                end = word.get('end')

        # A concept still open at the end of the document is closed here so
        # that it never leaks into the next document.
        if text:
            concepts.append((text, entity, start, end, num_doc))

    return concepts


def _group_similar_concepts(concepts, threshold):
    """Greedily cluster concepts whose similarity to a seed exceeds threshold."""
    groups = []
    # Tuples already placed in a group. A set gives O(1) membership tests;
    # this relies on the tuple fields being hashable (str / int / None).
    grouped = set()

    for seed in concepts:
        # Single-character concepts are discarded as seeds.
        if len(seed[0]) == 1 or seed in grouped:
            continue

        grouped.add(seed)
        group = [seed]

        for candidate in concepts:
            # Cheap membership test first, so the comparatively expensive
            # edit-distance computation is skipped for grouped concepts.
            if candidate in grouped:
                continue
            similarity = textdistance.levenshtein.normalized_similarity(
                seed[0], candidate[0]
            )
            if similarity > threshold:
                group.append(candidate)
                grouped.add(candidate)

        groups.append(group)

    return groups