"""Match free-text Spanish concepts against Spanish UMLS terms (MRCONSO).

Each input concept is tokenized, every non-stopword token is looked up in the
MRCONSO table, and the candidate term with the highest normalized Levenshtein
similarity to the concept is reported together with its CUI.
"""

import configparser

import es_core_news_md
import mysql.connector
import textdistance
from mysql.connector import errorcode
from nltk.corpus import stopwords

nlp = es_core_news_md.load()

# Database connection settings, read from the configuration file.
# (Original note: DEFAULT --> LOCAL, TESTING --> ARES.)
configuration = configparser.ConfigParser()
configuration.read('config.ini')
config2 = {
    'user': configuration['DEFAULT']['DB_USER'],
    'password': configuration['DEFAULT']['DB_PASSWORD'],
    'port': configuration['DEFAULT']['DB_PORT'],
    'host': configuration['DEFAULT']['DB_HOST'],
    'db': configuration['DEFAULT']['DB_NAME'],
    'auth_plugin': configuration['DEFAULT']['DB_AUTH_PLUGIN'],
}

# A UMLS term may be at most this many characters longer than the concept it
# is compared with; longer terms get a similarity of 0.
_LENGTH_MARGIN = 2


def get_words(concepts):
    """Tokenize a list of concepts.

    Args:
        concepts: iterable of concept strings.

    Returns:
        List of lower-cased token texts, in order of first appearance, without
        duplicates and without Spanish stopwords.
    """
    # Build the stopword set once per call instead of re-reading the corpus
    # list for every token.
    spanish_stopwords = set(stopwords.words('spanish'))
    words = []
    seen = set()
    for concept in concepts:
        for token in nlp(concept):
            text = token.text.lower()
            if text not in spanish_stopwords and text not in seen:
                seen.add(text)
                words.append(text)
    return words


def _run_umls_query(query, params):
    """Run a parameterized SELECT against the configured database.

    Args:
        query: SQL text using ``%s`` placeholders.
        params: tuple of values bound to the placeholders.

    Returns:
        List of result rows. On a MySQL error a message is printed and an
        empty list is returned.
    """
    connection = None
    try:
        connection = mysql.connector.connect(**config2)
        cursor = connection.cursor()
        cursor.execute(query, params)
        return cursor.fetchall()
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("No pudo conectarse a la BBDD, revisar usuario y password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("La BD introducida no existe")
        else:
            print(err)
        return []
    finally:
        # Release the connection on the error path as well as on success.
        if connection is not None:
            connection.close()


def search_umls(word):
    """Find Spanish UMLS terms containing ``word`` and their CUIs.

    Args:
        word: substring to look for in the STR column.

    Returns:
        List of ``"<term>\\t<CUI>"`` strings; empty if the query fails.
    """
    # The value is bound as a parameter rather than concatenated into the SQL,
    # so quotes in ``word`` cannot break or alter the statement.
    rows = _run_umls_query(
        "SELECT STR,CUI FROM MRCONSO WHERE LAT='SPA' AND STR LIKE %s",
        ("%" + word + "%",),
    )
    return [
        str(term) + "\t" + str(cui)
        for term, cui in rows
        if term.strip() and cui.strip()
    ]


def search_umls_cui(cui):
    """Find the Spanish UMLS terms registered under a CUI.

    Args:
        cui: concept unique identifier to look up.

    Returns:
        List of term strings; empty if the query fails.
    """
    rows = _run_umls_query(
        "SELECT STR FROM MRCONSO WHERE LAT='SPA' AND CUI=%s",
        (cui,),
    )
    return [str(row[0]) for row in rows if row[0].strip()]


def get_umls_concept_cui(words):
    """Look up every word in UMLS.

    Args:
        words: iterable of words to search for.

    Returns:
        Dict mapping each word to its list of ``"<term>\\t<CUI>"`` strings.
        A word that appears more than once accumulates the results of every
        lookup.
    """
    concepts_by_word = {}
    for word in words:
        concepts_by_word.setdefault(word, []).extend(search_umls(word))
    return concepts_by_word


def similarities(str1, str2):
    """Return the similarity between a concept and a UMLS candidate.

    Args:
        str1: the concept text.
        str2: a ``"<term>\\t<CUI>"`` string; only the part before the first
            tab is compared.

    Returns:
        The normalized Levenshtein similarity of the stripped, lower-cased
        texts, or 0 when the term is more than ``_LENGTH_MARGIN`` characters
        longer than the concept.
    """
    concept = str1.strip()
    term = str2.split("\t")[0].strip()
    if len(concept) + _LENGTH_MARGIN < len(term):
        return 0
    return textdistance.levenshtein.normalized_similarity(
        concept.lower(), term.lower()
    )


def get_similarity(concept, lUMLSConcepts):
    """Find the UMLS candidate most similar to a concept.

    Args:
        concept: the concept text.
        lUMLSConcepts: list of ``"<term>\\t<CUI>"`` candidate strings.

    Returns:
        A one-element list holding the tuple
        ``(concept, "Levenshtein:", term, CUI, score, "UMLS")``. When no
        candidate scores above 0 the term and CUI are empty strings and the
        score is 0. On ties the earliest candidate wins.
    """
    best_score = 0
    best_term = ""
    best_cui = ""
    for candidate in lUMLSConcepts:
        score = similarities(concept, candidate)
        if score > best_score:
            best_score = score
            fields = candidate.split("\t")
            best_term = fields[0]
            best_cui = fields[1]
    return [(concept, "Levenshtein:", best_term, best_cui, best_score, "UMLS")]


def similarity_concept(concepts, dictConcepts):
    """Find the most similar UMLS term for each concept.

    Args:
        concepts: list of concept strings.
        dictConcepts: dict as returned by ``get_umls_concept_cui``.

    Returns:
        List with one ``get_similarity`` tuple per concept, in input order.
    """
    matches = []
    for concept in concepts:
        candidates = []
        for word in get_words([concept]):
            candidates += dictConcepts.get(word, [])
        matches += get_similarity(concept, candidates)
    return matches


def similarity_cui(concepts, jkesCuis):
    """Find the most similar term for each concept among the given CUIs.

    Args:
        concepts: list of concept strings.
        jkesCuis: list of CUIs whose terms are the only candidates.

    Returns:
        List with one ``get_similarity`` tuple per concept, in input order.
    """
    candidates = []
    seen = set()
    for cui in jkesCuis:
        for term in search_umls_cui(cui):
            entry = term + "\t" + cui
            # De-duplicate on the full entry; repeated entries cannot change
            # the best match, they only add comparisons.
            if entry not in seen:
                seen.add(entry)
                candidates.append(entry)
    matches = []
    for concept in concepts:
        matches += get_similarity(concept, candidates)
    return matches


def get_final_concepts(lConceptsUMLS, lConceptsUMLSJKES):
    """Pick, per position, the better of two match lists.

    Args:
        lConceptsUMLS: matches from the general UMLS search.
        lConceptsUMLSJKES: matches from the CUI-restricted search, paired with
            ``lConceptsUMLS`` by position.

    Returns:
        List with one tuple per entry of ``lConceptsUMLS``: the CUI-restricted
        match when its score (index 4) is strictly higher, otherwise the UMLS
        match. A position with no CUI-restricted counterpart keeps the UMLS
        match.
    """
    chosen = []
    for index, umls_match in enumerate(lConceptsUMLS):
        if (index < len(lConceptsUMLSJKES)
                and lConceptsUMLSJKES[index][4] > umls_match[4]):
            chosen.append(lConceptsUMLSJKES[index])
        else:
            chosen.append(umls_match)
    return chosen


def umls_concept_extractor(concepts, jkesCuis):
    """Main entry point: match concepts to UMLS terms.

    Args:
        concepts: list of concept strings.
        jkesCuis: list of CUIs for an additional restricted search; may be
            empty.

    Returns:
        List with one match tuple per concept. Without CUIs this is the plain
        UMLS match list; otherwise each position holds the better of the UMLS
        match and the CUI-restricted match.
    """
    words = get_words(concepts)
    concepts_by_word = get_umls_concept_cui(words)
    umls_matches = similarity_concept(concepts, concepts_by_word)
    if not jkesCuis:
        # Nothing to compare against, so the UMLS matches are the result.
        return umls_matches
    cui_matches = similarity_cui(concepts, jkesCuis)
    return get_final_concepts(umls_matches, cui_matches)


def umls_concept_extractor2(concepts):
    """Match concepts to UMLS terms, printing progress messages.

    Args:
        concepts: list of concept strings.

    Returns:
        List with one ``get_similarity`` tuple per concept.
    """
    print("Get words")
    words = get_words(concepts)
    print("Tokenized Words")
    concepts_by_word = get_umls_concept_cui(words)
    print("Dictionary concepts")
    return similarity_concept(concepts, concepts_by_word)