"""Extract UMLS concepts and their CUIs for a list of Spanish concepts.

Provenance: this module was introduced by commit
b595900fb4b7b97f59491311b2a9080571cb0854 (Javier Rodriguez Vidal,
Fri, 19 Feb 2021 11:13:13 +0000) as UMLS_Extractor/umlsExtractor.py,
with the subject (translated) "Script that extracts the concepts + CUI
from UMLS".

The pipeline tokenizes the input concepts, looks every token up in the
Spanish rows of the MRCONSO table, and keeps, for each input concept, the
candidate whose normalized Levenshtein similarity is highest.
"""
import configparser
import functools

import es_core_news_md
import mysql.connector
import textdistance
from mysql.connector import errorcode
from nltk.corpus import stopwords

nlp = es_core_news_md.load()

# Database connection settings, read from the configuration file
# (DEFAULT section --> LOCAL, TESTING section --> ARES).
configuration = configparser.ConfigParser()
configuration.read('config.ini')

config2 = {
    'user': configuration['DEFAULT']['DB_USER'],
    'password': configuration['DEFAULT']['DB_PASSWORD'],
    'port': configuration['DEFAULT']['DB_PORT'],
    'host': configuration['DEFAULT']['DB_HOST'],
    'db': configuration['DEFAULT']['DB_NAME'],
    'auth_plugin': configuration['DEFAULT']['DB_AUTH_PLUGIN'],
}


@functools.lru_cache(maxsize=1)
def _spanish_stopwords():
    """Return the NLTK Spanish stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('spanish'))


def _fetch_rows(query, params):
    """Run a parameterized SELECT and return all of its rows.

    A new connection is opened from ``config2`` for every call and is always
    closed, including when the query itself fails.

    Args:
        query: SQL text using ``%s`` placeholders.
        params: Tuple of values bound to the placeholders.

    Returns:
        A list of row tuples. On any ``mysql.connector.Error`` a diagnostic
        is printed and an empty list is returned.
    """
    cnx = None
    try:
        cnx = mysql.connector.connect(**config2)
        cursor = cnx.cursor()
        cursor.execute(query, params)
        return cursor.fetchall()
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("No pudo conectarse a la BBDD, revisar usuario y password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("La BD introducida no existe")
        else:
            print(err)
        return []
    finally:
        # Release the connection on the error path too.
        if cnx is not None:
            cnx.close()


def get_words(concepts):
    """Tokenize a list of concepts.

    Args:
        concepts: List of concept strings.

    Returns:
        The lower-cased tokens of all concepts, in order of first
        appearance, without duplicates and without Spanish stop words.
    """
    if not concepts:
        return []

    stop_words = _spanish_stopwords()
    words = []
    seen = set()  # O(1) duplicate check; ``words`` keeps the order

    for concept in concepts:
        for token in nlp(concept):
            text = token.text.lower()
            if text in stop_words or text in seen:
                continue
            seen.add(text)
            words.append(text)

    return words


def search_umls(word):
    """Search UMLS for Spanish concepts that contain a word.

    Args:
        word: Text that must appear inside the concept string (STR).

    Returns:
        A list of ``"concept\\tCUI"`` strings. Rows whose concept or CUI is
        blank are skipped. The list is empty when nothing matches or when
        the database reports an error.
    """
    # The pattern is bound as a parameter so quotes in ``word`` cannot
    # break or alter the statement.
    query = "SELECT STR,CUI FROM MRCONSO WHERE LAT='SPA' AND STR LIKE %s"
    rows = _fetch_rows(query, ("%" + word + "%",))

    return [
        str(row[0]) + "\t" + str(row[1])
        for row in rows
        if row[0].strip() != "" and row[1].strip() != ""
    ]


def search_umls_cui(cui):
    """Search UMLS for the Spanish concepts attached to a CUI.

    Args:
        cui: Concept unique identifier to look up.

    Returns:
        A list of concept strings, blank ones skipped. The list is empty
        when nothing matches or when the database reports an error.
    """
    query = "SELECT STR FROM MRCONSO WHERE LAT='SPA' AND CUI=%s"
    rows = _fetch_rows(query, (cui,))

    return [str(row[0]) for row in rows if row[0].strip() != ""]


def get_umls_concept_cui(words):
    """Look up in UMLS the concepts that contain each of the given words.

    Args:
        words: List of words to search for.

    Returns:
        A dictionary mapping each word to its list of ``"concept\\tCUI"``
        strings. A word that appears more than once is searched each time
        and its results are appended to the existing entry.
    """
    dictConcepts = {}

    for word in words:
        dictConcepts.setdefault(word, []).extend(search_umls(word))

    return dictConcepts


def similarities(str1, str2):
    """Return the similarity between a concept and a UMLS candidate.

    Args:
        str1: The concept to match.
        str2: A candidate; only the text before its first tab is compared.

    Returns:
        The normalized Levenshtein similarity of the two stripped,
        lower-cased strings, or 0 when the candidate is more than two
        characters longer than the concept.
    """
    concept = str1.strip()
    candidate = str2.split("\t")[0].strip()

    # Much longer candidates are rejected without computing the distance.
    if len(concept) + 2 < len(candidate):
        return 0

    return textdistance.levenshtein.normalized_similarity(
        concept.lower(), candidate.lower()
    )


def get_similarity(concept, lUMLSConcepts):
    """Return the UMLS candidate most similar to a given concept.

    Args:
        concept: The concept to match.
        lUMLSConcepts: List of ``"concept\\tCUI"`` candidate strings.

    Returns:
        A one-element list holding the tuple
        ``(concept, "Levenshtein:", umls_concept, cui, similarity, "UMLS")``.
        When no candidate scores above 0 the UMLS concept and CUI are empty
        strings and the similarity is 0. On ties the first candidate wins.
    """
    best_score = 0
    best_concept = ""
    best_cui = ""

    for candidate in lUMLSConcepts:
        score = similarities(concept, candidate)
        if score > best_score:
            best_score = score
            fields = candidate.split("\t")
            best_concept = fields[0]
            best_cui = fields[1]

    return [(concept, "Levenshtein:", best_concept, best_cui, best_score, "UMLS")]


def similarity_concept(concepts, dictConcepts):
    """Return the most similar UMLS concept, with its CUI, for each concept.

    Args:
        concepts: List of concept strings.
        dictConcepts: Dictionary mapping a word to its list of
            ``"concept\\tCUI"`` candidates.

    Returns:
        A list with one ``get_similarity`` tuple per input concept, in the
        same order as ``concepts``.
    """
    lSimilarConcepts = []

    for concept in concepts:
        candidates = []
        for word in get_words([concept]):
            candidates.extend(dictConcepts.get(word) or ())
        lSimilarConcepts.extend(get_similarity(concept, candidates))

    return lSimilarConcepts


def similarity_cui(concepts, jkesCuis):
    """Match each concept against the UMLS concepts of the given CUIs.

    Args:
        concepts: List of concept strings.
        jkesCuis: List of CUIs whose Spanish concepts are the candidates.

    Returns:
        A list with one ``get_similarity`` tuple per input concept, in the
        same order as ``concepts``.
    """
    listConceptsJKES = []
    seen = set()

    for cui in jkesCuis:
        for umls_concept in search_umls_cui(cui):
            entry = umls_concept + "\t" + cui
            # Compare the full entry, which is what the list stores;
            # testing the bare concept would never find a duplicate.
            if entry not in seen:
                seen.add(entry)
                listConceptsJKES.append(entry)

    lSimilaritiesJKES = []
    for concept in concepts:
        lSimilaritiesJKES.extend(get_similarity(concept, listConceptsJKES))

    return lSimilaritiesJKES


def get_final_concepts(lConceptsUMLS, lConceptsUMLSJKES):
    """Keep, position by position, the match with the higher similarity.

    Args:
        lConceptsUMLS: List of ``get_similarity`` tuples.
        lConceptsUMLSJKES: List of ``get_similarity`` tuples, at least as
            long as ``lConceptsUMLS``.

    Returns:
        A list as long as ``lConceptsUMLS``. On a tie the entry from
        ``lConceptsUMLS`` is kept.

    Raises:
        IndexError: If ``lConceptsUMLSJKES`` is shorter than
            ``lConceptsUMLS``.
    """
    listConcepts = []

    for i, umls_match in enumerate(lConceptsUMLS):
        jkes_match = lConceptsUMLSJKES[i]
        # Index 4 of each tuple is the similarity score.
        if umls_match[4] >= jkes_match[4]:
            listConcepts.append(umls_match)
        else:
            listConcepts.append(jkes_match)

    return listConcepts


def umls_concept_extractor(concepts, jkesCuis):
    """Main entry point: find the best UMLS match for each concept.

    Args:
        concepts: List of concept strings.
        jkesCuis: List of CUIs used as an additional candidate source; may
            be empty.

    Returns:
        A list with one ``get_similarity`` tuple per input concept. When
        ``jkesCuis`` is empty only the word-based matches are returned.
    """
    words = get_words(concepts)
    dictConcepts = get_umls_concept_cui(words)
    lSimilarConcepts = similarity_concept(concepts, dictConcepts)

    # Without CUIs there is nothing to compare against.
    if not jkesCuis:
        return lSimilarConcepts

    lSimilaritiesJKES = similarity_cui(concepts, jkesCuis)
    return get_final_concepts(lSimilarConcepts, lSimilaritiesJKES)


def umls_concept_extractor2(concepts):
    """Find the best word-based UMLS match per concept, printing progress.

    Args:
        concepts: List of concept strings.

    Returns:
        A list with one ``get_similarity`` tuple per input concept.
    """
    print("Get words")
    words = get_words(concepts)
    print("Tokenized Words")
    dictConcepts = get_umls_concept_cui(words)
    print("Dictionary concepts")
    lSimilarConcepts = similarity_concept(concepts, dictConcepts)

    return lSimilarConcepts