Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Sign in
Toggle navigation
H
Harmonize_Scripts
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Uncover
Harmonize_Scripts
Commits
2dd8e999
Commit
2dd8e999
authored
Apr 20, 2023
by
Pepe Márquez Romero
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
pass ULSS6
parent
37a069f3
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
3 additions
and
515 deletions
+3
-515
.Rhistory
.Rhistory
+0
-512
connection_parameters.R
connection_parameters.R
+2
-2
remote_execute_as_admin.R
remote_execute_as_admin.R
+1
-1
No files found.
.Rhistory
deleted
100644 → 0
View file @
37a069f3
install_dependencies
(
dep_list
)
codebook_file
<-
"20220315_Data Harmonisation.xlsb.xlsx"
codebook_demo
<-
read.xlsx
(
codebook_file
,
sheet
=
2
)
codebook_com_and_rf
<-
read.xlsx
(
codebook_file
,
sheet
=
3
)
codebook_home_med
<-
read.xlsx
(
codebook_file
,
sheet
=
4
)
codebook_si_sympt
<-
read.xlsx
(
codebook_file
,
sheet
=
5
)
codebook_treatments
<-
read.xlsx
(
codebook_file
,
sheet
=
6
)
codebook_labo
<-
read.xlsx
(
codebook_file
,
sheet
=
7
)
codebook_complications
<-
read.xlsx
(
codebook_file
,
sheet
=
8
)
codebook_imaging_data
<-
read.xlsx
(
codebook_file
,
sheet
=
9
)
codebook_lifestyle_diet
<-
read.xlsx
(
codebook_file
,
sheet
=
10
)
codebook_dates
<-
read.xlsx
(
codebook_file
,
sheet
=
11
)
codebook
<-
rbind
(
codebook_demo
,
codebook_com_and_rf
)
codebook
<-
rbind
(
codebook
,
codebook_home_med
)
codebook
<-
rbind
(
codebook
,
codebook_si_sympt
)
codebook
<-
rbind
(
codebook
,
codebook_treatments
)
codebook
<-
rbind
(
codebook
,
codebook_labo
)
codebook
<-
rbind
(
codebook
,
codebook_complications
)
codebook
<-
rbind
(
codebook
,
codebook_imaging_data
)
codebook_lifestyle_diet
<-
codebook_lifestyle_diet
[,
!
names
(
codebook_lifestyle_diet
)
%in%
c
(
"X2"
,
"X4"
,
"X10"
)]
codebook
<-
rbind
(
codebook
,
codebook_lifestyle_diet
)
codebook
<-
rbind
(
codebook
,
codebook_dates
)
codebook_col_names
<-
as.data.frame
(
codebook
$
Harmonised.variable.name
)
names
(
codebook_col_names
)
<-
c
(
"col_names"
)
setwd
(
paste
(
dir_name
,
"/harmonized_data"
,
sep
=
""
))
file_name
<-
readline
(
"Introduce the name of the file to check the values: "
)
harmonized_data
<-
""
if
(
grepl
(
".csv"
,
file_name
,
fixed
=
TRUE
)){
harmonized_data
<-
read.csv
(
file_name
)
}
else
if
(
grepl
(
".xlsx"
,
file_name
,
fixed
=
TRUE
)){
harmonized_data
<-
read.xlsx
(
file_name
)
}
#Test if column names are valid
check_column_names
<-
function
(
col_names
){
str_res
<-
"The column names:"
valid_colnames
<-
c
()
repeated_colnames
<-
c
()
for
(
i
in
1
:
(
nrow
(
col_names
))){
col_name
<-
col_names
[
i
,
1
]
number_of_column
<-
check_valid_name
(
col_name
)
if
(
number_of_column
==
0
){
str_res
<-
paste
(
str_res
,
col_name
,
sep
=
" "
)
}
else
if
(
number_of_column
==
1
){
valid_colnames
=
c
(
valid_colnames
,
col_name
)
}
else
{
repeated_colnames
=
c
(
repeated_colnames
,
col_name
)
}
}
str_res
<-
paste
(
str_res
,
"are not registered in the harmonized data codebook \n"
,
sep
=
" "
)
new_list
<-
list
(
"not_colnames"
=
str_res
,
"colnames"
=
valid_colnames
,
"repeated_colnames"
=
repeated_colnames
)
return
(
new_list
)
}
#Test if a single variable name is valid
check_valid_name
<-
function
(
col_name
){
valid
<-
0
if
(
col_name
%in%
codebook_col_names
$
col_names
){
valid
<-
length
(
grep
(
col_name
,
names
(
harmonized_data
)))
}
return
(
valid
)
}
check_valid_values_continuous
<-
function
(
colname
,
codebook_param
,
column
)
{
column
<-
column
[
column
!=
"."
]
possible_values_format
<-
codebook_param
$
Possible.values.format
[
codebook_param
$
Harmonised.variable.name
==
colname
]
possible_values_list
=
str_split
(
possible_values_format
,
"/"
)[[
1
]]
range_as_str
<-
str_trim
(
possible_values_list
[
1
])
missing_value_format
<-
str_trim
(
str_trim
(
possible_values_list
[
2
]))
separate_range
<-
str_split
(
range_as_str
,
"-"
)[[
1
]]
min_value
<-
strtoi
(
separate_range
[
1
])
max_value
<-
strtoi
(
separate_range
[
2
])
failing_values
<-
column
[
column
<
min_value
|
column
>
max_value
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
str_res
<-
""
if
(
number_of_failing_values
==
0
)
{
str_res
<-
"No failing values"
}
else
{
range_as_str
<-
paste
(
min_value
,
"-"
,
max_value
,
"(continuous)"
)
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values_categorical
<-
function
(
colname
,
codebook_param
,
column
)
{
column
<-
column
[
column
!=
"."
]
possible_values_format
<-
codebook_param
$
Possible.values.format
[
codebook_param
$
Harmonised.variable.name
==
colname
]
possible_values_list
<-
str_split
(
possible_values_format
,
"/"
)[[
1
]]
possible_values_list
<-
lapply
(
possible_values_list
,
str_trim
)
str_res
<-
""
min_value
<-
0
max_value
<-
0
if
(
length
(
possible_values_list
[[
1
]])
==
2
)
{
separate_range
<-
str_split
(
possible_values_list
[[
1
]][
1
],
"-"
)[[
1
]]
min_value
<-
strtoi
(
separate_range
[
1
])
max_value
<-
strtoi
(
separate_range
[
2
])
}
else
{
possible_values_list
<-
lapply
(
possible_values_list
,
strtoi
)[[
1
]]
min_value
<-
possible_values_list
[
1
]
max_value
<-
possible_values_list
[
length
(
possible_values_list
)
-
1
]
}
failing_values
<-
column
[
column
<
min_value
|
column
>
max_value
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
if
(
number_of_failing_values
==
0
)
{
str_res
<-
"No failing values"
}
else
{
range_as_str
<-
paste
(
min_value
,
"-"
,
max_value
,
" (categorical)"
)
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values_binary
<-
function
(
colname
,
column
)
{
column
<-
column
[
column
!=
"."
]
failing_values
<-
column
[
column
<
0
|
column
>
1
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
str_res
<-
""
if
(
number_of_failing_values
==
0
)
str_res
<-
"No failing values"
else
{
range_as_str
<-
"0-1 (binary)"
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values
<-
function
(
valid_colnames
,
codebook_param
){
res
<-
""
for
(
i
in
1
:
(
ncol
(
valid_colnames
))){
name
<-
names
(
valid_colnames
)[
i
]
if
(
grepl
(
"DAT"
,
name
,
fixed
=
TRUE
)){
next
}
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column
<-
valid_colnames
[,
i
]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type
<-
codebook_param
$
Variable.type
[
codebook_param
$
Harmonised.variable.name
==
name
]
if
(
is.na
(
column_type
)
)
{
variable
<-
paste
(
"Variable "
,
name
,
" wrong"
,
sep
=
" "
)
res
<-
paste
(
res
,
variable
,
sep
=
"\n"
)
next
}
result
=
switch
(
column_type
,
"Continuous"
=
check_valid_values_continuous
(
name
,
codebook_param
,
column
),
"Binary"
=
check_valid_values_binary
(
name
,
column
),
"Categorical"
=
check_valid_values_categorical
(
name
,
codebook_param
,
column
),
"Calendar date"
=
paste
(
"No failing values"
),
"ISO country code"
=
paste
(
"No failing values"
),
{
paste
(
"some column "
,
column_type
,
sep
=
" "
)
}
)
if
(
result
!=
"No failing values"
){
res
<-
paste
(
res
,
result
,
sep
=
"\n"
)
}
}
return
(
res
)
}
data_colnames
<-
as.data.frame
(
colnames
(
harmonized_data
))
check_valid_columns
<-
check_column_names
(
data_colnames
)
columns_not_valid
<-
check_valid_columns
$
not_colnames
valid_colnames_column
<-
as.data.frame
(
check_valid_columns
$
colnames
)
names
(
valid_colnames_column
)
=
c
(
"valid_colnames"
)
valid_colnames_with_data
<-
subset
(
harmonized_data
,
select
=
valid_colnames_column
$
valid_colnames
)
result
<-
""
result
<-
check_valid_values
(
valid_colnames_with_data
,
codebook
)
print
(
columns_not_valid
)
cat
(
result
)
check_valid_values_continuous
<-
function
(
colname
,
codebook_param
,
column
){
column
<-
column
[
column
!=
"."
]
possible_values_format
<-
codebook_param
$
Possible.values.format
[
codebook_param
$
Harmonised.variable.name
==
colname
]
possible_values_list
=
str_split
(
possible_values_format
,
"/"
)[[
1
]]
range_as_str
<-
str_trim
(
possible_values_list
[
1
])
missing_value_format
<-
str_trim
(
str_trim
(
possible_values_list
[
2
]))
separate_range
<-
str_split
(
range_as_str
,
"-"
)[[
1
]]
min_value
<-
strtoi
(
separate_range
[
1
])
max_value
<-
strtoi
(
separate_range
[
2
])
failing_values
<-
column
[
column
<
min_value
|
column
>
max_value
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
str_res
<-
""
if
(
number_of_failing_values
==
0
)
str_res
<-
"No failing values"
else
{
failing_values
<-
failing_values
[
!
is.na
(
failing_values
)]
failing_value_counts
<-
table
(
failing_values
)
str_res
<-
paste
(
colname
,
"has failing values:"
)
for
(
i
in
seq_along
(
failing_value_counts
))
{
value
<-
names
(
failing_value_counts
)[
i
]
count
<-
failing_value_counts
[
i
]
str_res
<-
paste
(
str_res
,
paste
(
value
,
"("
,
count
,
"times)"
,
collapse
=
" "
),
sep
=
" "
)
}
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
"(continuous)"
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values_categorical
<-
function
(
colname
,
codebook_param
,
column
)
{
column
<-
column
[
column
!=
"."
]
possible_values_format
<-
codebook_param
$
Possible.values.format
[
codebook_param
$
Harmonised.variable.name
==
colname
]
possible_values_list
<-
str_split
(
possible_values_format
,
"/"
)[[
1
]]
possible_values_list
<-
lapply
(
possible_values_list
,
str_trim
)
str_res
<-
""
min_value
<-
0
max_value
<-
0
if
(
length
(
possible_values_list
[[
1
]])
==
2
)
{
separate_range
<-
str_split
(
possible_values_list
[[
1
]][
1
],
"-"
)[[
1
]]
min_value
<-
strtoi
(
separate_range
[
1
])
max_value
<-
strtoi
(
separate_range
[
2
])
}
else
{
possible_values_list
<-
lapply
(
possible_values_list
,
strtoi
)[[
1
]]
min_value
<-
possible_values_list
[
1
]
max_value
<-
possible_values_list
[
length
(
possible_values_list
)
-
1
]
}
failing_values
<-
column
[
column
<
min_value
|
column
>
max_value
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
if
(
number_of_failing_values
==
0
)
{
str_res
<-
"No failing values"
}
else
{
range_as_str
<-
paste
(
min_value
,
"-"
,
max_value
,
" (categorical)"
)
failing_values
<-
failing_values
[
!
is.na
(
failing_values
)]
failing_value_counts
<-
table
(
failing_values
)
str_res
<-
paste
(
colname
,
"has failing values:"
)
for
(
i
in
seq_along
(
failing_value_counts
))
{
value
<-
names
(
failing_value_counts
)[
i
]
count
<-
failing_value_counts
[
i
]
str_res
<-
paste
(
str_res
,
paste
(
value
,
"("
,
count
,
"times)"
,
collapse
=
" "
),
sep
=
" "
)
}
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values_binary
<-
function
(
colname
,
column
)
{
column
<-
column
[
column
!=
"."
]
failing_values
<-
column
[
column
<
0
|
column
>
1
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
str_res
<-
""
if
(
number_of_failing_values
==
0
)
str_res
<-
"No failing values"
else
{
range_as_str
<-
"0-1 (binary)"
failing_values
<-
failing_values
[
!
is.na
(
failing_values
)]
failing_value_counts
<-
table
(
failing_values
)
str_res
<-
paste
(
colname
,
"has failing values:"
)
for
(
i
in
seq_along
(
failing_value_counts
))
{
value
<-
names
(
failing_value_counts
)[
i
]
count
<-
failing_value_counts
[
i
]
str_res
<-
paste
(
str_res
,
paste
(
value
,
"("
,
count
,
"times)"
,
collapse
=
" "
),
sep
=
" "
)
}
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values
<-
function
(
valid_colnames
,
codebook_param
){
res
<-
""
for
(
i
in
1
:
(
ncol
(
valid_colnames
))){
name
<-
names
(
valid_colnames
)[
i
]
if
(
grepl
(
"DAT"
,
name
,
fixed
=
TRUE
)){
next
}
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column
<-
valid_colnames
[,
i
]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type
<-
codebook_param
$
Variable.type
[
codebook_param
$
Harmonised.variable.name
==
name
]
if
(
is.na
(
column_type
)
)
{
variable
<-
paste
(
"Variable "
,
name
,
" wrong"
,
sep
=
" "
)
res
<-
paste
(
res
,
variable
,
sep
=
"\n"
)
next
}
result
=
switch
(
column_type
,
"Continuous"
=
check_valid_values_continuous
(
name
,
codebook_param
,
column
),
"Binary"
=
check_valid_values_binary
(
name
,
column
),
"Categorical"
=
check_valid_values_categorical
(
name
,
codebook_param
,
column
),
"Calendar date"
=
paste
(
"No failing values"
),
"ISO country code"
=
paste
(
"No failing values"
),
{
paste
(
"some column "
,
column_type
,
sep
=
" "
)
}
)
if
(
result
!=
"No failing values"
){
res
<-
paste
(
res
,
result
,
sep
=
"\n"
)
}
}
return
(
res
)
}
data_colnames
<-
as.data.frame
(
colnames
(
harmonized_data
))
check_valid_columns
<-
check_column_names
(
data_colnames
)
columns_not_valid
<-
check_valid_columns
$
not_colnames
valid_colnames_column
<-
as.data.frame
(
check_valid_columns
$
colnames
)
names
(
valid_colnames_column
)
=
c
(
"valid_colnames"
)
valid_colnames_with_data
<-
subset
(
harmonized_data
,
select
=
valid_colnames_column
$
valid_colnames
)
result
<-
""
result
<-
check_valid_values
(
valid_colnames_with_data
,
codebook
)
print
(
columns_not_valid
)
cat
(
result
)
check_valid_values_continuous
<-
function
(
colname
,
codebook_param
,
column
)
{
column
<-
column
[
column
!=
"."
]
possible_values_format
<-
codebook_param
$
Possible.values.format
[
codebook_param
$
Harmonised.variable.name
==
colname
]
possible_values_list
=
str_split
(
possible_values_format
,
"/"
)[[
1
]]
range_as_str
<-
str_trim
(
possible_values_list
[
1
])
missing_value_format
<-
str_trim
(
str_trim
(
possible_values_list
[
2
]))
separate_range
<-
str_split
(
range_as_str
,
"-"
)[[
1
]]
min_value
<-
strtoi
(
separate_range
[
1
])
max_value
<-
strtoi
(
separate_range
[
2
])
failing_values
<-
column
[
column
<
min_value
|
column
>
max_value
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
str_res
<-
""
if
(
number_of_failing_values
==
0
)
{
str_res
<-
"No failing values"
}
else
{
range_as_str
<-
paste
(
min_value
,
"-"
,
max_value
,
"(continuous)"
)
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values_categorical
<-
function
(
colname
,
codebook_param
,
column
)
{
column
<-
column
[
column
!=
"."
]
possible_values_format
<-
codebook_param
$
Possible.values.format
[
codebook_param
$
Harmonised.variable.name
==
colname
]
possible_values_list
<-
str_split
(
possible_values_format
,
"/"
)[[
1
]]
possible_values_list
<-
lapply
(
possible_values_list
,
str_trim
)
str_res
<-
""
min_value
<-
0
max_value
<-
0
if
(
length
(
possible_values_list
[[
1
]])
==
2
)
{
separate_range
<-
str_split
(
possible_values_list
[[
1
]][
1
],
"-"
)[[
1
]]
min_value
<-
strtoi
(
separate_range
[
1
])
max_value
<-
strtoi
(
separate_range
[
2
])
}
else
{
possible_values_list
<-
lapply
(
possible_values_list
,
strtoi
)[[
1
]]
min_value
<-
possible_values_list
[
1
]
max_value
<-
possible_values_list
[
length
(
possible_values_list
)
-
1
]
}
failing_values
<-
column
[
column
<
min_value
|
column
>
max_value
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
if
(
number_of_failing_values
==
0
)
{
str_res
<-
"No failing values"
}
else
{
range_as_str
<-
paste
(
min_value
,
"-"
,
max_value
,
" (categorical)"
)
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values_binary
<-
function
(
colname
,
column
)
{
column
<-
column
[
column
!=
"."
]
failing_values
<-
column
[
column
<
0
|
column
>
1
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
str_res
<-
""
if
(
number_of_failing_values
==
0
)
str_res
<-
"No failing values"
else
{
range_as_str
<-
"0-1 (binary)"
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values
<-
function
(
valid_colnames
,
codebook_param
){
res
<-
""
for
(
i
in
1
:
(
ncol
(
valid_colnames
))){
name
<-
names
(
valid_colnames
)[
i
]
if
(
grepl
(
"DAT"
,
name
,
fixed
=
TRUE
)){
next
}
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column
<-
valid_colnames
[,
i
]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type
<-
codebook_param
$
Variable.type
[
codebook_param
$
Harmonised.variable.name
==
name
]
if
(
is.na
(
column_type
)
)
{
variable
<-
paste
(
"Variable "
,
name
,
" wrong"
,
sep
=
" "
)
res
<-
paste
(
res
,
variable
,
sep
=
"\n"
)
next
}
result
=
switch
(
column_type
,
"Continuous"
=
check_valid_values_continuous
(
name
,
codebook_param
,
column
),
"Binary"
=
check_valid_values_binary
(
name
,
column
),
"Categorical"
=
check_valid_values_categorical
(
name
,
codebook_param
,
column
),
"Calendar date"
=
paste
(
"No failing values"
),
"ISO country code"
=
paste
(
"No failing values"
),
{
paste
(
"some column "
,
column_type
,
sep
=
" "
)
}
)
if
(
result
!=
"No failing values"
){
res
<-
paste
(
res
,
result
,
sep
=
"\n"
)
}
}
return
(
res
)
}
data_colnames
<-
as.data.frame
(
colnames
(
harmonized_data
))
check_valid_columns
<-
check_column_names
(
data_colnames
)
columns_not_valid
<-
check_valid_columns
$
not_colnames
valid_colnames_column
<-
as.data.frame
(
check_valid_columns
$
colnames
)
names
(
valid_colnames_column
)
=
c
(
"valid_colnames"
)
valid_colnames_with_data
<-
subset
(
harmonized_data
,
select
=
valid_colnames_column
$
valid_colnames
)
result
<-
""
result
<-
check_valid_values
(
valid_colnames_with_data
,
codebook
)
print
(
columns_not_valid
)
cat
(
result
)
check_valid_values_continuous
<-
function
(
colname
,
codebook_param
,
column
)
{
column
<-
column
[
column
!=
"."
]
possible_values_format
<-
codebook_param
$
Possible.values.format
[
codebook_param
$
Harmonised.variable.name
==
colname
]
possible_values_list
=
str_split
(
possible_values_format
,
"/"
)[[
1
]]
range_as_str
<-
str_trim
(
possible_values_list
[
1
])
missing_value_format
<-
str_trim
(
str_trim
(
possible_values_list
[
2
]))
separate_range
<-
str_split
(
range_as_str
,
"-"
)[[
1
]]
min_value
<-
strtoi
(
separate_range
[
1
])
max_value
<-
strtoi
(
separate_range
[
2
])
failing_values
<-
column
[
column
<
min_value
|
column
>
max_value
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
str_res
<-
""
if
(
number_of_failing_values
==
0
)
{
str_res
<-
"No failing values"
}
else
{
range_as_str
<-
paste
(
min_value
,
"-"
,
max_value
,
"(continuous)"
)
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values_categorical
<-
function
(
colname
,
codebook_param
,
column
)
{
column
<-
column
[
column
!=
"."
]
possible_values_format
<-
codebook_param
$
Possible.values.format
[
codebook_param
$
Harmonised.variable.name
==
colname
]
possible_values_list
<-
str_split
(
possible_values_format
,
"/"
)[[
1
]]
possible_values_list
<-
lapply
(
possible_values_list
,
str_trim
)
str_res
<-
""
min_value
<-
0
max_value
<-
0
if
(
length
(
possible_values_list
[[
1
]])
==
2
)
{
separate_range
<-
str_split
(
possible_values_list
[[
1
]][
1
],
"-"
)[[
1
]]
min_value
<-
strtoi
(
separate_range
[
1
])
max_value
<-
strtoi
(
separate_range
[
2
])
}
else
{
possible_values_list
<-
lapply
(
possible_values_list
,
strtoi
)[[
1
]]
min_value
<-
possible_values_list
[
1
]
max_value
<-
possible_values_list
[
length
(
possible_values_list
)
-
1
]
}
failing_values
<-
column
[
column
<
min_value
|
column
>
max_value
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
if
(
number_of_failing_values
==
0
)
{
str_res
<-
"No failing values"
}
else
{
range_as_str
<-
paste
(
min_value
,
"-"
,
max_value
,
" (categorical)"
)
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values_binary
<-
function
(
colname
,
column
)
{
column
<-
column
[
column
!=
"."
]
failing_values
<-
column
[
column
<
0
|
column
>
1
]
number_of_failing_values
<-
length
(
failing_values
[
!
is.na
(
failing_values
)])
str_res
<-
""
if
(
number_of_failing_values
==
0
)
str_res
<-
"No failing values"
else
{
range_as_str
<-
"0-1 (binary)"
str_res
<-
paste
(
colname
,
"has"
,
number_of_failing_values
,
"failing values"
)
str_res
<-
paste
(
str_res
,
"should be in range"
,
range_as_str
,
sep
=
" "
)
}
return
(
str_res
)
}
check_valid_values
<-
function
(
valid_colnames
,
codebook_param
){
res
<-
""
for
(
i
in
1
:
(
ncol
(
valid_colnames
))){
name
<-
names
(
valid_colnames
)[
i
]
if
(
grepl
(
"DAT"
,
name
,
fixed
=
TRUE
)){
next
}
#if("DMRBORN" == name | grepl("DAT", name, fixed=TRUE) | grepl("ISO", name , fixed=TRUE) | grepl("BEF", name, fixed=TRUE)){
# next
#}
column
<-
valid_colnames
[,
i
]
# Esto falla si tu codebook no es mismo que new_harmon.csv
column_type
<-
codebook_param
$
Variable.type
[
codebook_param
$
Harmonised.variable.name
==
name
]
if
(
is.na
(
column_type
)
)
{
variable
<-
paste
(
"Variable "
,
name
,
" wrong"
,
sep
=
" "
)
res
<-
paste
(
res
,
variable
,
sep
=
"\n"
)
next
}
result
=
switch
(
column_type
,
"Continuous"
=
check_valid_values_continuous
(
name
,
codebook_param
,
column
),
"Binary"
=
check_valid_values_binary
(
name
,
column
),
"Categorical"
=
check_valid_values_categorical
(
name
,
codebook_param
,
column
),
"Calendar date"
=
paste
(
"No failing values"
),
"ISO country code"
=
paste
(
"No failing values"
),
{
paste
(
"some column "
,
column_type
,
sep
=
" "
)
}
)
if
(
result
!=
"No failing values"
){
res
<-
paste
(
res
,
result
,
sep
=
"\n"
)
}
}
return
(
res
)
}
data_colnames
<-
as.data.frame
(
colnames
(
harmonized_data
))
check_valid_columns
<-
check_column_names
(
data_colnames
)
columns_not_valid
<-
check_valid_columns
$
not_colnames
valid_colnames_column
<-
as.data.frame
(
check_valid_columns
$
colnames
)
names
(
valid_colnames_column
)
=
c
(
"valid_colnames"
)
valid_colnames_with_data
<-
subset
(
harmonized_data
,
select
=
valid_colnames_column
$
valid_colnames
)
result
<-
""
result
<-
check_valid_values
(
valid_colnames_with_data
,
codebook
)
print
(
columns_not_valid
)
cat
(
result
)
connection_parameters.R
View file @
2dd8e999
...
...
@@ -143,7 +143,7 @@ urls <- c(
"https://opal.aulss6.veneto.it"
,
"https://uncover.itg.be"
)
#
users
<-
c
(
"user_analisis"
,
"user_analisis"
,
...
...
@@ -212,6 +212,6 @@ pass <- c(
"Ekfl07UUgz"
,
"Ekfl07UUgz"
,
"3^z4AV.)hG5~PT/]"
,
"
Ekfl07UUgz
"
,
"
7yuvFnCTrPxPDt8@
"
,
"3^z4AV.)hG5~PT/]"
)
remote_execute_as_admin.R
View file @
2dd8e999
# load opal library
library
(
opalr
)
o
<-
opal.login
(
'
pepe'
,
'7G9b!!2IyJs$'
,
url
=
'https://uncover.itg.be
'
)
o
<-
opal.login
(
'
opal_admin'
,
'nK5&@WC1A13M'
,
url
=
'https://opal.aulss6.veneto.it
'
)
dir_name
<-
readline
(
"Introduce the name of the directory please: "
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment