Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
C
ConceptExtractor
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Javier Rodriguez Vidal
ConceptExtractor
Commits
b595900f
Commit
b595900f
authored
Feb 19, 2021
by
Javier Rodriguez Vidal
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Script que extrae los conceptos + CUI de UMLS
parent
0fb8dd0b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
216 additions
and
0 deletions
+216
-0
UMLS_Extractor/umlsExtractor.py
UMLS_Extractor/umlsExtractor.py
+216
-0
No files found.
UMLS_Extractor/umlsExtractor.py
0 → 100644
View file @
b595900f
import
es_core_news_md
import
mysql.connector
import
textdistance
import
configparser
from
mysql.connector
import
errorcode
from
nltk.corpus
import
stopwords
# Spanish spaCy pipeline, loaded once at import time and used by the tokenizer.
nlp = es_core_news_md.load()

# Database connection settings, read from the configuration file.
# Per the original note: the DEFAULT section is the local database and a
# TESTING section points at ARES (only DEFAULT is read here).
configuration = configparser.ConfigParser()
configuration.read('config.ini')

# Map each connector keyword argument to the config.ini option that feeds it.
config2 = {
    connector_key: configuration['DEFAULT'][option_name]
    for connector_key, option_name in (
        ('user', 'DB_USER'),
        ('password', 'DB_PASSWORD'),
        ('port', 'DB_PORT'),
        ('host', 'DB_HOST'),
        ('db', 'DB_NAME'),
        ('auth_plugin', 'DB_AUTH_PLUGIN'),
    )
}
def get_words(concepts):
    """Tokenize a list of concepts into distinct, lower-cased words.

    Each concept is passed through the module-level spaCy pipeline ``nlp``.
    A token's lower-cased text is kept unless it appears in NLTK's Spanish
    stopword list or has already been collected.

    Args:
        concepts: Iterable of concept strings.

    Returns:
        List of the distinct lower-cased token texts, in first-seen order.
    """
    # Fetch the stopword list once; the original called stopwords.words()
    # again for every single token.
    spanish_stopwords = set(stopwords.words('spanish'))
    seen = set()  # O(1) duplicate check; `words` keeps the output order
    words = []
    for concept in concepts:
        for token in nlp(concept):
            text = token.text.lower()
            if text in spanish_stopwords or text in seen:
                continue
            seen.add(text)
            words.append(text)
    return words
def search_umls(word):
    """Find UMLS concepts whose text contains ``word``, with their CUI.

    Runs a ``LIKE '%word%'`` search over ``MRCONSO.STR`` restricted to
    ``LAT='SPA'``, using the connection settings in ``config2``.

    Args:
        word: Substring to look for in the concept text.
            NOTE(review): ``%`` and ``_`` inside ``word`` still act as LIKE
            wildcards, as they did before — escape them if that is unwanted.

    Returns:
        List of ``"concept<TAB>CUI"`` strings. If a database error occurs it
        is printed and the rows gathered so far (normally none) are returned,
        so callers always receive a list.
    """
    # Bound before the try block so the return below is valid on every path.
    lUmls = []
    cnx = None
    try:
        cnx = mysql.connector.connect(**config2)
        cursor = cnx.cursor()
        # The search term is bound as a parameter rather than concatenated
        # into the SQL text, so quotes in `word` cannot alter the statement.
        query = "SELECT STR,CUI FROM MRCONSO where LAT='SPA' and STR like %s;"
        cursor.execute(query, ("%" + word + "%",))
        for concept, cui in cursor:
            # strip() must be *called*; the original compared the bound
            # method itself to "", which is always unequal.
            if concept.strip() != "" and cui.strip() != "":
                lUmls.append(str(concept) + "\t" + str(cui))
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("No pudo conectarse a la BBDD, revisar usuario y password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("La BD introducida no existe")
        else:
            print(err)
    finally:
        # Release the connection whether or not the query succeeded.
        if cnx is not None:
            cnx.close()
    return lUmls
def search_umls_cui(cui):
    """Find the UMLS concept texts registered under a given CUI.

    Queries ``MRCONSO`` for rows with ``LAT='SPA'`` and the given ``CUI``,
    using the connection settings in ``config2``.

    Args:
        cui: Concept unique identifier to look up.

    Returns:
        List of concept strings. If a database error occurs it is printed
        and the rows gathered so far (normally none) are returned, so
        callers always receive a list.
    """
    # Bound before the try block so the return below is valid on every path.
    lUmls = []
    cnx = None
    try:
        cnx = mysql.connector.connect(**config2)
        cursor = cnx.cursor()
        # The CUI is bound as a parameter rather than concatenated into the
        # SQL text, so quotes in `cui` cannot alter the statement.
        query = "SELECT STR FROM MRCONSO where LAT='SPA' and CUI=%s;"
        cursor.execute(query, (cui,))
        for row in cursor:
            # strip() must be *called*; the original compared the bound
            # method itself to "", which is always unequal.
            if row[0].strip() != "":
                lUmls.append(str(row[0]))
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("No pudo conectarse a la BBDD, revisar usuario y password")
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print("La BD introducida no existe")
        else:
            print(err)
    finally:
        # Release the connection whether or not the query succeeded.
        if cnx is not None:
            cnx.close()
    return lUmls
def get_umls_concept_cui(words):
    """Look up every word in UMLS and index the matches by word.

    Args:
        words: Sequence of words to search for with ``search_umls``.

    Returns:
        Dict mapping each word to the list of ``"concept<TAB>CUI"`` strings
        returned for it. When a word occurs more than once, the results of
        each lookup are concatenated under the same key.
    """
    matches_by_word = {}
    for word in words:
        found = search_umls(word)
        matches_by_word[word] = (
            found if word not in matches_by_word
            else matches_by_word[word] + found
        )
    return matches_by_word
def similarities(str1, str2):
    """Return the normalized Levenshtein similarity between two strings.

    Only the part of ``str2`` before its first tab is compared, so a
    ``"concept<TAB>CUI"`` entry can be passed directly. Both sides are
    stripped and lower-cased before comparison.

    Args:
        str1: Concept text to match.
        str2: Candidate text, optionally followed by a tab and more fields.

    Returns:
        The value of ``textdistance.levenshtein.normalized_similarity`` for
        the two cleaned strings, or ``0`` when the candidate is more than two
        characters longer than ``str1`` (the comparison is skipped).
    """
    # Clean each side once; the original repeated the strip/split work and
    # carried two unused locals (jaccard, ratcliff).
    query = str1.strip()
    candidate = str2.split("\t")[0].strip()
    if len(query) + 2 < len(candidate):
        return 0
    return textdistance.levenshtein.normalized_similarity(
        query.lower(), candidate.lower()
    )
def get_similarity(concept, lUMLSConcepts):
    """Select the UMLS entry most similar to ``concept``.

    Args:
        concept: Concept text to match.
        lUMLSConcepts: Sequence of ``"concept<TAB>CUI"`` candidate strings.

    Returns:
        A one-element list holding the tuple
        ``(concept, "Levenshtein:", best_text, best_cui, best_score, "UMLS")``.
        When no candidate scores above 0 the text and CUI are empty strings
        and the score is 0. On ties the earliest candidate is kept.
    """
    best_score = 0
    best_text = ""
    best_cui = ""
    for entry in lUMLSConcepts:
        score = similarities(concept, entry)
        # Strictly greater: an equal score never displaces an earlier winner.
        if score > best_score:
            best_score = score
            fields = entry.split("\t")
            best_text, best_cui = fields[0], fields[1]
    return [(concept, "Levenshtein:", best_text, best_cui, best_score, "UMLS")]
def similarity_concept(concepts, dictConcepts):
    """Find, for each concept, its most similar UMLS entry and CUI.

    Each concept is tokenized with ``get_words``; the UMLS candidates stored
    in ``dictConcepts`` for those words are pooled and handed to
    ``get_similarity``.

    Args:
        concepts: Sequence of concept strings.
        dictConcepts: Dict mapping a word to its list of
            ``"concept<TAB>CUI"`` candidates.

    Returns:
        List with one ``get_similarity`` result tuple per concept, in the
        same order as ``concepts``.
    """
    best_matches = []
    for concept in concepts:
        candidates = []
        for word in get_words([concept]):
            if word in dictConcepts:
                candidates = candidates + dictConcepts[word]
        best_matches.extend(get_similarity(concept, candidates))
    return best_matches
def similarity_cui(concepts, jkesCuis):
    """Match each concept against the UMLS texts of a given set of CUIs.

    Every CUI is resolved to its concept texts with ``search_umls_cui``; the
    resulting ``"concept<TAB>CUI"`` entries form the candidate pool that
    ``get_similarity`` searches for each concept.

    Args:
        concepts: Sequence of concept strings.
        jkesCuis: Sequence of CUI strings to use as candidates.

    Returns:
        List with one ``get_similarity`` result tuple per concept, in the
        same order as ``concepts``.
    """
    listConceptsJKES = []
    seen = set()
    for cui in jkesCuis:
        # `or []` tolerates a lookup that yields nothing (or None on failure).
        for text in search_umls_cui(cui) or []:
            entry = text + "\t" + cui
            # Deduplicate on the stored entry. The original compared the bare
            # text against "text<TAB>CUI" entries, so it never matched and
            # duplicates were never filtered.
            if entry not in seen:
                seen.add(entry)
                listConceptsJKES.append(entry)

    lSimilaritiesJKES = []
    for concept in concepts:
        lSimilaritiesJKES += get_similarity(concept, listConceptsJKES)
    return lSimilaritiesJKES
def get_final_concepts(lConceptsUMLS, lConceptsUMLSJKES):
    """Merge two parallel result lists, keeping the higher-scoring entry.

    Entries are compared position by position on their element at index 4
    (the similarity score in the tuples built by ``get_similarity``).

    Args:
        lConceptsUMLS: Result tuples from the word-based UMLS search.
        lConceptsUMLSJKES: Result tuples from the CUI-based search; must be
            at least as long as ``lConceptsUMLS``.

    Returns:
        List as long as ``lConceptsUMLS``; each position holds the entry from
        ``lConceptsUMLS`` when its score is greater than or equal to the
        other one, otherwise the entry from ``lConceptsUMLSJKES``.
    """
    return [
        umls_entry
        if umls_entry[4] >= lConceptsUMLSJKES[position][4]
        else lConceptsUMLSJKES[position]
        for position, umls_entry in enumerate(lConceptsUMLS)
    ]
def umls_concept_extractor(concepts, jkesCuis):
    """Main entry point: map each concept to its best UMLS concept and CUI.

    The concepts are tokenized, every word is searched in UMLS, and the most
    similar UMLS entry is chosen per concept. When ``jkesCuis`` is given, a
    second match restricted to those CUIs is computed and, per concept, the
    higher-scoring of the two results is kept.

    Args:
        concepts: Sequence of concept strings.
        jkesCuis: Sequence of CUI strings; may be empty.

    Returns:
        List with one result tuple per concept, as produced by
        ``get_similarity``.
    """
    words = get_words(concepts)
    dictConcepts = get_umls_concept_cui(words)
    lSimilarConcepts = similarity_concept(concepts, dictConcepts)

    # With no CUIs there is nothing to compare against. The original left its
    # result variable unassigned on this path; return the word-based matches.
    if not jkesCuis:
        return lSimilarConcepts

    lSimilaritiesJKES = similarity_cui(concepts, jkesCuis)
    return get_final_concepts(lSimilarConcepts, lSimilaritiesJKES)
def umls_concept_extractor2(concepts):
    """Map each concept to its best UMLS match, printing progress messages.

    Runs the word-based pipeline only: tokenize, search UMLS for each word,
    then pick the most similar entry per concept.

    Args:
        concepts: Sequence of concept strings.

    Returns:
        List with one result tuple per concept, as produced by
        ``similarity_concept``.
    """
    print("Get words")
    tokens = get_words(concepts)

    print("Tokenized Words")
    matches_by_word = get_umls_concept_cui(tokens)

    print("Dictionary concepts")
    return similarity_concept(concepts, matches_by_word)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment