ConceptExtractor.py 1.91 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 27 10:26:48 2021
Library
"""

# Extract the concepts (a single B token, or a B token followed by its I tokens)
# together with their entity, start and end.
# Input: list of lists of dictionaries, where each dictionary has the form:
# {'word': '', 'score': '', 'entity': '', 'index': '', 'start': '', 'end': ''}
# Output: list of tuples with four elements: [(concept, entity, start, end)]
def extractionOfConcepts(annotations):
    """Group B/I/O-tagged tokens into concepts.

    A concept starts at a token whose ``'entity'`` tag begins with ``'B'``
    and is extended by every following token whose tag begins with ``'I'``.
    It is closed by the next ``'B'`` token, by an ``'O'`` token, or by the
    end of the document it belongs to.

    Args:
        annotations: Iterable of documents. Each document is an iterable of
            dictionaries providing at least the keys ``'word'``,
            ``'entity'``, ``'start'`` and ``'end'``.

    Returns:
        A list of ``(concept, entity, start, end)`` tuples in order of
        appearance, where:
          * ``concept`` is the words of the B token and its I tokens joined
            with single spaces,
          * ``entity`` is the B token's tag with its first two characters
            removed (e.g. ``'B-LOC'`` -> ``'LOC'``),
          * ``start`` is the ``'start'`` of the B token,
          * ``end`` is the ``'end'`` of the last token of the concept.
    """
    entities = []

    for document in annotations:
        # State is reset for every document so that a concept can never
        # span two documents.
        complete_word = ''
        entity = ''
        start = 0
        end = 0

        for word in document:
            # Slicing (instead of indexing) tolerates an empty or missing
            # tag; such tokens are ignored like any other unknown tag.
            tag = word.get('entity') or ''
            prefix = tag[:1]

            if prefix == 'B':
                # If there was a previous concept started, finish it.
                if complete_word:
                    entities.append((complete_word, entity, start, end))

                # Start a new concept.
                complete_word = word.get('word')
                start = word.get('start')
                end = word.get('end')
                entity = tag[2:]

            elif prefix == 'I':
                # If there is no B before, ignore the token.
                if complete_word:
                    complete_word = complete_word + ' ' + word.get('word')
                    # The concept now ends where this token ends.
                    end = word.get('end')

            elif prefix == 'O':
                # If there was a previous concept started, finish it.
                if complete_word:
                    entities.append((complete_word, entity, start, end))
                    complete_word = ''

        # Flush a concept that is still open when the document ends;
        # otherwise a trailing B / B+I sequence would be lost.
        if complete_word:
            entities.append((complete_word, entity, start, end))

    return entities