Upload New File

9396d39a · Lucia Catalan Gris · 0f20cbb3 · 9396d39a
Commit 9396d39a authored Feb 19, 2021 by Lucia Catalan Gris
Show whitespace changes
Inline Side-by-side

Showing with 96 additions and 0 deletions

ConceptExtractor/ConceptExtractor.py ConceptExtractor/ConceptExtractor.py +96 -0

No files found.
--- a/ConceptExtractor/ConceptExtractor.py
+++ b/ConceptExtractor/ConceptExtractor.py
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jan 27 10:26:48 2021
+Library
+"""
+
+import textdistance
+
+# Extract the B or B+I concepts together with their entity type, start and end offsets.
+# Input: list of lists of dictionaries (one inner list per document), where each dictionary is:
+# {'word': '', 'score': '', 'entity': '', 'index': '', 'start': '', 'end': ''}
+# Output: list of lists of similar tuples (normalized Levenshtein similarity > 0.85), each with five elements:
+#                            [(concept, entity, start, end, num_doc)]
+def _collect_concepts(annotations):
+    """Rebuild the B / B+I concepts found in every document.
+
+    Returns a flat list of (concept, entity, start, end, num_doc) tuples,
+    where num_doc is the position of the document inside `annotations`.
+    """
+    concepts = []
+
+    for num_doc, document in enumerate(annotations):
+        # State of the concept currently being assembled; an empty
+        # complete_word means that no concept is open.
+        complete_word = ''
+        entity = ''
+        start = 0
+        end = 0
+
+        for word in document:
+            tag = word.get('entity')[0]
+
+            # Both B and O close whatever concept was open before them.
+            if tag in ('B', 'O') and complete_word:
+                concepts.append((complete_word, entity, start, end, num_doc))
+                complete_word = ''
+
+            if tag == 'B':
+                # Start a new concept; the label loses its 'B-' prefix.
+                complete_word = word.get('word')
+                entity = word.get('entity')[2:]
+                start = word.get('start')
+                end = word.get('end')
+            elif tag == 'I' and complete_word:
+                # Extend the open concept. An I without a preceding B is
+                # ignored.
+                complete_word = complete_word + ' ' + word.get('word')
+                end = word.get('end')
+
+        # A concept still open at the end of the document is closed here, so
+        # concepts never span two documents.
+        if complete_word:
+            concepts.append((complete_word, entity, start, end, num_doc))
+
+    return concepts
+
+
+def extractionOfConcepts(annotations, similarity_threshold=0.85):
+    """Extract the B / B+I concepts and group the ones with similar text.
+
+    Parameters
+    ----------
+    annotations : list of list of dict
+        One inner list per document. Each dictionary is read through the
+        keys 'word', 'entity', 'start' and 'end'; the first character of
+        'entity' is compared against 'B', 'I' and 'O'.
+    similarity_threshold : float, optional
+        Two concepts fall in the same group when the normalized Levenshtein
+        similarity of their texts is strictly greater than this value.
+        Defaults to 0.85.
+
+    Returns
+    -------
+    list of list of tuple
+        Each inner list is a group of similar concepts, and each concept is
+        a tuple (concept, entity, start, end, num_doc). Concepts whose text
+        is a single character are discarded.
+    """
+    # Concepts of length 1 are dropped before clustering. With the default
+    # threshold they could never join a group anyway (their similarity to any
+    # longer text is at most 0.5), so this only saves distance computations.
+    entities = [
+        concept for concept in _collect_concepts(annotations)
+        if len(concept[0]) != 1
+    ]
+
+    # Shortest concepts first; equal lengths are ordered alphabetically.
+    entities.sort(key=lambda concept: (len(concept[0]), concept[0]))
+
+    # Greedy grouping: every concept not yet assigned opens a group and pulls
+    # in all the unassigned concepts that are similar enough to it.
+    final_entities = []
+    assigned = set()
+    for seed in entities:
+        if seed in assigned:
+            continue
+        assigned.add(seed)
+        group = [seed]
+
+        for candidate in entities:
+            # Membership is checked first so that no distance is computed
+            # for concepts that already belong to a group.
+            if candidate in assigned:
+                continue
+            similarity = textdistance.levenshtein.normalized_similarity(
+                seed[0], candidate[0])
+            if similarity > similarity_threshold:
+                group.append(candidate)
+                assigned.add(candidate)
+
+        final_entities.append(group)
+
+    return final_entities