jkesExtractor.py 5.76 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
import sys, os, json
import ConceptExtractor 
import mysql.connector
import textdistance
import configparser
import inflect
import re
from ConceptExtractor import extractionOfConcepts
from umlsExtractor import umls_concept_extractor, get_words

# Parse config.ini; the database settings are read from its [ARES] section.
configuration = configparser.ConfigParser()
configuration.read('config.ini')

# Keyword arguments handed to mysql.connector.connect(): each connection
# parameter is filled from the [ARES] option paired with it below.
config = {
	parameter: configuration['ARES'][option]
	for parameter, option in (
		('user', 'DB_USER'),
		('password', 'DB_PASSWORD'),
		('port', 'DB_PORT'),
		('host', 'DB_HOST'),
		('db', 'DB_NAME'),
		('auth_plugin', 'DB_AUTH_PLUGIN'),
	)
}

def select_query_umls_jkes(tuplesConcepts):
	"""Return the CUIs stored in the JKES table ``umls_old_dx`` for one concept.

	Args:
		tuplesConcepts: Indexable concept record. Only its first element (the
			concept text) is used; it is matched exactly against the
			``concept`` column.

	Returns:
		List of CUI strings in cursor order. NULL and blank ``cui`` values
		are skipped.
	"""
	cnx = mysql.connector.connect(**config)
	try:
		cursor = cnx.cursor()
		# The concept is free text and may contain quotes, so it is bound as a
		# parameter instead of being concatenated into the statement.
		cursor.execute(
			"SELECT cui FROM umls_old_dx where concept=%s;",
			(tuplesConcepts[0],),
		)
		return [
			str(row[0])
			for row in cursor
			if row[0] is not None and row[0].strip() != ""
		]
	finally:
		# Release the connection even when the query fails.
		cnx.close()


def select_query_jkes(concepts):
	"""Return the CUIs of every ``umls_old_dx`` row whose sentence mentions a concept.

	Each concept is searched as a substring of the ``sentence`` column
	(``LIKE '%concept%'``) and the CUIs found for all concepts are gathered
	into a single list.

	Args:
		concepts: Iterable of concept strings to search for.

	Returns:
		List of CUI strings, concept by concept and in cursor order. NULL and
		blank ``cui`` values are skipped; duplicates are not removed. An empty
		``concepts`` yields an empty list.
	"""
	lUmls = []
	if not concepts:
		# Nothing to search for: do not open a connection.
		return lUmls

	cnx = mysql.connector.connect(**config)
	try:
		cursor = cnx.cursor()
		for concept in concepts:
			# The pattern is bound as a parameter so quotes in the concept
			# cannot break or alter the statement. '%' and '_' inside the
			# concept still act as LIKE wildcards.
			cursor.execute(
				"SELECT cui FROM umls_old_dx where sentence like %s;",
				("%" + concept + "%",),
			)
			# Accumulate across concepts; the list must not be reset per concept.
			lUmls.extend(
				str(row[0])
				for row in cursor
				if row[0] is not None and row[0].strip() != ""
			)
	finally:
		# Release the connection even when a query fails.
		cnx.close()
	return lUmls

def _jkes_expand_concept(concept, inflector):
	"""Return the form of ``concept`` to search in UMLS, or None if unchanged.

	The whole words "ca", "adenoca" and "car" (case-insensitive) are checked
	in that order and the first one present is expanded. When none is present
	the concept is passed to ``inflector.singular_noun``; a falsy result means
	nothing was singularized and None is returned.
	"""
	regex = r"(?i)(\bca\b)"
	regexAdenoca = r"(?i)(\badenoca\b)"
	regexCar = r"(?i)(\bcar\b)"

	if re.search(regex, concept):
		# "ca" expands to "carcinoma" when the concept mentions "infiltrante",
		# otherwise to "cancer".
		if "infiltrante" in concept:
			return re.sub(regex, "carcinoma", concept)
		return re.sub(regex, "cancer", concept)
	if re.search(regexAdenoca, concept):
		return re.sub(regexAdenoca, "adenocarcinoma", concept)
	if re.search(regexCar, concept):
		return re.sub(regexCar, "carcinoma", concept)

	singular = inflector.singular_noun(concept)
	if not singular:
		return None
	return singular


def jkes_concept_extractor(pathAnnotations):
	"""Resolve annotated concepts to CUIs, first through JKES and then through UMLS.

	The annotations file is loaded as JSON and turned into concept records by
	``extractionOfConcepts``. Each record is looked up in the JKES table via
	``select_query_umls_jkes``. Records without a JKES match whose entity
	(index 1) equals one of the values in ``entity_cuis.json`` are queued for
	a UMLS search, after expanding abbreviations / singularizing the text.

	Records are read by position: index 0 is the concept text, index 1 the
	entity, and indexes 4, 2 and 3 are carried through to the output unchanged
	(presumably document id and start/end offsets; confirm against
	``extractionOfConcepts``).

	Args:
		pathAnnotations: Path of the JSON annotations file.

	Returns:
		Tuple ``(listConceptsJkes, listConceptsUmls)``:
		  * ``listConceptsJkes``: unique tuples
		    ``(concept, cui, "JKES", record[4], record[2], record[3])``.
		  * ``listConceptsUmls``: the tuples returned by
		    ``umls_concept_extractor`` with their first field mapped back to
		    the concept text as annotated, followed by one
		    ``(record[4], record[2], record[3])`` tuple per occurrence.

	Raises:
		KeyError: If ``umls_concept_extractor`` returns a concept that was
			never queued for the UMLS search.
	"""
	with open(pathAnnotations) as json_file:
		annotations = json.load(json_file)
	data = extractionOfConcepts(annotations)

	dictUmls = {}            # record[4] -> {concept: (cuis, record[2], record[3])}
	conceptsSearchUmls = []  # unique search terms sent to UMLS
	dictTraduccion = {}      # concept as annotated -> expanded search term
	dictConceptDoc = {}      # concept as annotated -> list of occurrences
	jkes = 0
	umls = 0
	p = inflect.engine()

	with open('entity_cuis.json') as file:
		dictAnnotationsEntities = json.load(file)

	for tuplesConcepts in data:
		concept = tuplesConcepts[0]

		lUmls = select_query_umls_jkes(tuplesConcepts)
		if lUmls:
			# Add to the concepts already stored under this key; assigning a
			# fresh dict here would discard the earlier concepts of the key.
			dictUmls.setdefault(tuplesConcepts[4], {})[concept] = (
				lUmls, tuplesConcepts[2], tuplesConcepts[3])
			jkes += 1
			continue

		# "Already queued" is decided on dictConceptDoc, which is keyed by the
		# concept as annotated. conceptsSearchUmls holds expanded forms, so it
		# cannot tell whether this raw concept was seen before.
		if concept in dictConceptDoc:
			dictConceptDoc[concept].append(
				(tuplesConcepts[4], tuplesConcepts[2], tuplesConcepts[3]))
			continue

		# Only concepts whose entity is listed in entity_cuis.json go to UMLS.
		if not any(entity == tuplesConcepts[1]
				for entity in dictAnnotationsEntities.values()):
			continue

		searchTerm = _jkes_expand_concept(concept, p)
		if searchTerm is None:
			searchTerm = concept
		else:
			dictTraduccion[concept] = searchTerm
		if searchTerm not in conceptsSearchUmls:
			conceptsSearchUmls.append(searchTerm)

		dictConceptDoc[concept] = [
			(tuplesConcepts[4], tuplesConcepts[2], tuplesConcepts[3])]
		umls += 1

	listConceptsJkes = []
	for key, conceptsOfKey in dictUmls.items():
		for key2, (cuis, field2, field3) in conceptsOfKey.items():
			for cui in cuis:
				row = (key2, cui, "JKES", key, field2, field3)
				if row not in listConceptsJkes:
					listConceptsJkes.append(row)
					print(row)

	print("Checking UMLS CUIS")
	listCuisUmls = []
	# Skip the database round trip when there is nothing to search for.
	cuisUmls = select_query_jkes(conceptsSearchUmls) if conceptsSearchUmls else []
	for cui in cuisUmls:
		if cui not in listCuisUmls:
			listCuisUmls.append(cui)

	listConceptsUmls = umls_concept_extractor(conceptsSearchUmls, listCuisUmls)

	# Reverse lookup: search term -> concept as annotated. When several
	# concepts share a search term the first one recorded wins.
	# NOTE(review): results for such a shared term are all attributed to that
	# first concept; confirm whether they should be repeated for each one.
	originalByTerm = {}
	for original, term in dictTraduccion.items():
		originalByTerm.setdefault(term, original)

	for i, result in enumerate(listConceptsUmls):
		fields = list(result)
		fields[0] = originalByTerm.get(fields[0], fields[0])
		# Append every recorded occurrence of the concept to its result tuple.
		listConceptsUmls[i] = tuple(fields + list(dictConceptDoc[fields[0]]))

	print(jkes,umls)
	return listConceptsJkes,listConceptsUmls