# Script that extracts concepts and their CUIs from UMLS.

parent 0fb8dd0b
import es_core_news_md
import mysql.connector
import textdistance
import configparser
from mysql.connector import errorcode
from nltk.corpus import stopwords
# NLP pipeline, loaded once at import time and reused by get_words().
nlp = es_core_news_md.load()

# Database connection settings, read from the configuration file
# (section DEFAULT --> LOCAL, section TESTING --> ARES).
configuration = configparser.ConfigParser()
configuration.read('config.ini')
config2 = {
    connect_arg: configuration['DEFAULT'][option]
    for connect_arg, option in (
        ('user', 'DB_USER'),
        ('password', 'DB_PASSWORD'),
        ('port', 'DB_PORT'),
        ('host', 'DB_HOST'),
        ('db', 'DB_NAME'),
        ('auth_plugin', 'DB_AUTH_PLUGIN'),
    )
}
def get_words(concepts):
    """Tokenize a list of concepts into unique, lower-cased words.

    Args:
        concepts: Sequence of concept strings; each is passed through ``nlp``.

    Returns:
        List of lower-cased token texts in order of first appearance,
        skipping Spanish stopwords and words already collected.
    """
    # Build the stopword set once per call instead of once per token.
    spanish_stopwords = set(stopwords.words('spanish'))
    words = []
    seen = set()  # constant-time duplicate check; `words` keeps the order
    for concept in concepts:
        for token in nlp(concept):
            text = token.text.lower()
            if text in spanish_stopwords or text in seen:
                continue
            seen.add(text)
            words.append(text)
    return words
def search_umls(word):
    """Search UMLS for Spanish concepts whose text contains ``word``.

    Args:
        word: Substring to look for in the ``STR`` column of ``MRCONSO``.

    Returns:
        List of strings, each the concept text and its CUI joined by a tab,
        for every row where both values are non-blank. The list is empty
        when the connection or the query fails (the error is printed).
    """
    lUmls = []  # defined up front so a failed connection still returns a list
    cnx = None
    try:
        # Connect to the database and create the cursor.
        cnx = mysql.connector.connect(**config2)
        cursor = cnx.cursor()
        # The search pattern is bound as a parameter, never spliced into the SQL.
        query = "SELECT STR,CUI FROM MRCONSO where LAT='SPA' and STR like %s;"
        cursor.execute(query, ("%" + word + "%",))
        for row in cursor:
            if row[0].strip() != "" and row[1].strip() != "":
                lUmls.append(str(row[0]) + "\t" + str(row[1]))
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("No pudo conectarse a la BBDD, revisar usuario y password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("La BD introducida no existe")
        else:
            print(err)
    finally:
        # Release the connection on both the success and the error path.
        if cnx is not None:
            cnx.close()
    return lUmls
def search_umls_cui(cui):
    """Search UMLS for the Spanish concept texts attached to a CUI.

    Args:
        cui: Value matched exactly against the ``CUI`` column of ``MRCONSO``.

    Returns:
        List of non-blank concept strings for that CUI. The list is empty
        when the connection or the query fails (the error is printed).
    """
    lUmls = []  # defined up front so a failed connection still returns a list
    cnx = None
    try:
        # Connect to the database and create the cursor.
        cnx = mysql.connector.connect(**config2)
        cursor = cnx.cursor()
        # The CUI is bound as a parameter, never spliced into the SQL.
        query = "SELECT STR FROM MRCONSO where LAT='SPA' and CUI=%s;"
        cursor.execute(query, (cui,))
        for row in cursor:
            if row[0].strip() != "":
                lUmls.append(str(row[0]))
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("No pudo conectarse a la BBDD, revisar usuario y password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("La BD introducida no existe")
        else:
            print(err)
    finally:
        # Release the connection on both the success and the error path.
        if cnx is not None:
            cnx.close()
    return lUmls
def get_umls_concept_cui(words):
    """Collect, for every word, the UMLS concepts that contain it.

    Args:
        words: Sequence of words to look up with ``search_umls``.

    Returns:
        Dictionary keyed by word; each value is the list returned by
        ``search_umls`` for that word (concatenated if a word repeats).
    """
    matches_by_word = {}
    for word in words:
        found = search_umls(word)
        if word not in matches_by_word:
            matches_by_word[word] = found
            continue
        # Repeated word: append the new results to the ones already stored.
        matches_by_word[word] = matches_by_word[word] + found
    return matches_by_word
def similarities(str1, str2):
    """Return the normalized Levenshtein similarity between two strings.

    Args:
        str1: Concept text to compare.
        str2: Candidate entry; only the part before the first tab is compared.

    Returns:
        The similarity of the stripped, lower-cased texts, or 0 when the
        candidate is more than two characters longer than ``str1``.
    """
    source_text = str1.strip()
    candidate_text = str2.split("\t")[0].strip()
    # Candidates noticeably longer than the source are scored as 0.
    if len(source_text) + 2 < len(candidate_text):
        return 0
    return textdistance.levenshtein.normalized_similarity(
        source_text.lower(), candidate_text.lower()
    )
def get_similarity(concept, lUMLSConcepts):
    """Find the UMLS entry most similar to a given concept.

    Args:
        concept: Concept text to match.
        lUMLSConcepts: Candidate entries, each a concept and a CUI joined
            by a tab.

    Returns:
        A one-element list holding the tuple
        ``(concept, "Levenshtein:", best_concept, best_cui, best_score, "UMLS")``.
        When no candidate scores above 0, the concept and CUI are empty
        strings and the score is 0.
    """
    best_score = 0
    best_term = ""
    best_cui = ""
    for candidate in lUMLSConcepts:
        score = similarities(concept, candidate)
        if not score > best_score:
            continue  # ties keep the earlier candidate
        best_score = score
        fields = candidate.split("\t")
        best_term, best_cui = fields[0], fields[1]
    return [(concept, "Levenshtein:", best_term, best_cui, best_score, "UMLS")]
def similarity_concept(concepts, dictConcepts):
    """Return the most similar UMLS entry for each of the given concepts.

    Args:
        concepts: Sequence of concept strings.
        dictConcepts: Dictionary mapping a word to its list of UMLS
            candidate entries.

    Returns:
        List with the ``get_similarity`` result for every concept, in order.
    """
    best_matches = []
    for concept in concepts:
        # Gather the candidates of every word that makes up this concept.
        candidates = []
        for word in get_words([concept]):
            if word in dictConcepts:
                candidates.extend(dictConcepts[word])
        best_matches.extend(get_similarity(concept, candidates))
    return best_matches
def similarity_cui(concepts, jkesCuis):
    """Return, for each concept, the most similar entry among the given CUIs.

    Args:
        concepts: Sequence of concept strings.
        jkesCuis: Sequence of CUIs whose concept texts are the candidates.

    Returns:
        List with the ``get_similarity`` result for every concept, in order.
    """
    listConceptsJKES = []
    seen = set()
    for cui in jkesCuis:
        for term in search_umls_cui(cui):
            entry = term + "\t" + cui
            # Deduplicate on the stored form (concept + CUI); testing the
            # bare concept against stored entries would never match.
            if entry in seen:
                continue
            seen.add(entry)
            listConceptsJKES.append(entry)
    lSimilaritiesJKES = []
    for concept in concepts:
        lSimilaritiesJKES += get_similarity(concept, listConceptsJKES)
    return lSimilaritiesJKES
def get_final_concepts(lConceptsUMLS, lConceptsUMLSJKES):
    """Pick, position by position, the better-scoring of two result lists.

    Args:
        lConceptsUMLS: Result tuples whose element at index 4 is the score.
        lConceptsUMLSJKES: Result tuples of the same shape, aligned by
            position with ``lConceptsUMLS``.

    Returns:
        List with one tuple per position: the one from ``lConceptsUMLS``
        when its score is greater or equal, otherwise the other one.
    """
    return [
        umls_entry
        if umls_entry[4] >= lConceptsUMLSJKES[position][4]
        else lConceptsUMLSJKES[position]
        for position, umls_entry in enumerate(lConceptsUMLS)
    ]
def umls_concept_extractor(concepts, jkesCuis):
    """Main entry point: map each concept to its most similar UMLS entry.

    Args:
        concepts: Sequence of concept strings.
        jkesCuis: Sequence of CUIs used as a second source of candidates;
            may be empty.

    Returns:
        List of result tuples, one per concept. With CUIs given, each tuple
        is the better-scoring of the word-based match and the CUI-based
        match; with no CUIs, the word-based matches are returned as they are.
    """
    words = get_words(concepts)
    dictConcepts = get_umls_concept_cui(words)
    lSimilarConcepts = similarity_concept(concepts, dictConcepts)
    # Without CUIs there is nothing to compare against, so return the
    # word-based matches rather than leaving the result undefined.
    if not jkesCuis:
        return lSimilarConcepts
    lSimilaritiesJKES = similarity_cui(concepts, jkesCuis)
    return get_final_concepts(lSimilarConcepts, lSimilaritiesJKES)
def umls_concept_extractor2(concepts):
    """Map each concept to its most similar UMLS entry, printing progress.

    Args:
        concepts: Sequence of concept strings.

    Returns:
        List with the ``similarity_concept`` result for the given concepts.
    """
    print("Get words")
    tokenized = get_words(concepts)
    print("Tokenized Words")
    candidates_by_word = get_umls_concept_cui(tokenized)
    print("Dictionary concepts")
    return similarity_concept(concepts, candidates_by_word)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment