Commit 6d57f049 authored by pxp9's avatar pxp9

Pequeña mejora en la sustitución de Yes/No

parent 3570d2a0
# ---- Setup: load the raw hospital export and the harmonised variable tables ----

# Clear the workspace so objects from a previous run cannot leak into this one.
rm(list = ls())
# Two absolute setwd() calls in a row: the second supersedes the first, so the
# input file is read from the "outpatient" folder.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources")
setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient"))
# Change this to the name of the resource file that should be cleaned.
hospital <- data.frame(read.csv("konya_outpatient.csv", sep=","))
# Drop every column whose name contains "numeric", ignoring case (the default
# behaviour of dplyr's contains()). Written in base R because no library()
# call attaches dplyr anywhere in this script.
hospital <- hospital[, !grepl("numeric", tolower(colnames(hospital)), fixed = TRUE), drop = FALSE]
# hospital["NOT.HARMONISED"] <- NULL
#
# names <- colnames(hospital)
# for (i in 1:length(names)){
#
# if(grepl("NOT.HARMONISED", names[i])){
# hospital[names[i]] <- NULL
# print(paste("quito ", names[i]))
# }
#
# }
# hospital <- hospital[-1,]

# Each table below keeps a fixed window of rows and its first five columns.
# NOTE(review): "Com&RF.csv" is read with sep = "," while every other table
# uses sep = ";" - confirm that this is intended.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5]
Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5]
Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5]
Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5]
Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5]
Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5]
Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5]
SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5]
Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5]
LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5]

# Stack all tables in one call; the row order matches the original sequence of
# pairwise rbind() calls.
harmonised_data <- rbind(
  SiAndSympt, ComAndRF, Treatment, Dates, Demographics,
  Home_med, Imaging_data, Complications, Labo, LifestyleAndDiet
)
# Remove the per-table objects. "Treatment" is included here; the earlier list
# named "Complications" twice and left "Treatment" behind.
rm(list = c(
  "SiAndSympt",
  "Complications",
  "ComAndRF",
  "Dates",
  "Demographics",
  "Home_med",
  "Imaging_data",
  "Treatment",
  "Labo",
  "LifestyleAndDiet"
))

# Variables whose declared unit is No/Yes (both capitalisations of "missing"
# are accepted), plus a hand-maintained list of extra Yes/No columns.
# as.character() keeps the names intact even if the column was read as a
# factor, where c() would otherwise splice in integer codes.
noYesValues <- as.character(
  harmonised_data$Harmonised.variable.name[
    harmonised_data$Harmonised.data.format.unit %in%
      c("No/Yes / missing", "No/Yes / Missing")
  ]
)
noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO")
# Columns treated as categorical: the main loop does not run them through the
# numeric clean-up.
categoric_vars <- c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
# Columns that are recoded by personalizedFun() in the main loop.
personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")
# Report whether `x` can be read as a number.
#
# A decimal comma is accepted: the first "," in `x` is replaced by "." before
# the conversion is attempted.
#
# x: a single value (character or numeric); longer input is tested
#    element-wise.
# Returns TRUE when as.numeric() yields a non-NA value and FALSE otherwise,
# including for NA and for zero-length input.
is_number <- function(x) {
  if (length(x) == 0) {
    return(FALSE)
  }
  # Base sub() performs the same first-match replacement as
  # stringr::str_replace(), without needing a package that this script never
  # attaches.
  normalised <- sub(",", ".", x, fixed = TRUE)
  # as.numeric() warns for every unparseable string; that outcome is exactly
  # what is being tested for, so the warning is silenced for this call only.
  !is.na(suppressWarnings(as.numeric(normalised)))
}
# Normalise one raw cell of a Yes/No column.
#
# Matching ignores case and surrounding whitespace, so "yes", " Yes" and "YES"
# are all recognised. Numeric 0/1 and logical FALSE/TRUE are accepted as well.
#
# x: a single raw value.
# Returns "No" for 0 / "NO", "Yes" for 1 / "YES" / "SI", and "" for NA,
# zero-length input or anything else.
replaceNoYesValues <- function(x) {
  if (length(x) == 0 || is.na(x)) {
    return("")
  }
  # Logical cells would otherwise become the strings "TRUE"/"FALSE" and never
  # match the "0"/"1" codes.
  if (is.logical(x)) {
    x <- as.integer(x)
  }
  key <- toupper(trimws(as.character(x)))
  if (key %in% c("0", "NO")) {
    "No"
  } else if (key %in% c("1", "YES", "SI")) {
    "Yes"
  } else {
    ""
  }
}
# Clean one raw cell of a non-categorical column.
#
# x: a single raw value.
# Returns "" when is_number(x) is FALSE; otherwise `x` as a character string
# with its first "," replaced by ".".
fixNonCategoric <- function(x) {
  if (!is_number(x)) {
    return("")
  }
  # Base sub() instead of stringr::str_replace(): same first-match
  # replacement, and no dependency on a package this script never attaches.
  sub(",", ".", x, fixed = TRUE)
}
# Recode one raw cell of a column that needs a column-specific mapping.
#
# x:       a single raw value (numeric code or text).
# colname: name of the column the value comes from.
#
# Returns:
#   * `x` unchanged when `colname` has no mapping below (NA stays NA);
#   * "" when `x` is NA and the column has a mapping;
#   * the mapped label when the value is recognised;
#   * for unrecognised values: "" for DSXOS and CSXCOT, `x` unchanged for
#     every other mapped column.
# Matching is exact: case and surrounding whitespace are significant.
personalizedFun <- function(x, colname) {
  # Expand one output label into a named vector keyed by each accepted raw
  # spelling of that label.
  spellings <- function(label, values) {
    structure(rep(label, length(values)), names = values)
  }

  lookups <- list(
    DMRGENDR = c(
      spellings("Female", c("1", "F", "f", "Female")),
      spellings("Male", c("0", "M", "m", "Male"))
    ),
    # The odd spellings are mis-encoded exports of POSITIVE / NEGATIVE.
    # NOTE(review): "NMGAT?VM" follows the NEGATIVE pattern (compare
    # "NMGATIVM" and "NAGAT?VM") but used to sit in the Positive list, which
    # recoded negative results as "Positive". It is mapped to "Negative" here;
    # confirm against the source data.
    CSXCTR = c(
      spellings(
        "Positive",
        c("1", "positive", "PositivM", "POSITIVM", "POS?T?VM")
      ),
      spellings(
        "Negative",
        c(
          "0", "negative", "negativM", "NMGATIVM", "NAGATIVM", "NMGATIV",
          "negativeM", "NAGAT?VM", "NMGAT?V", "NMGAT?VM"
        )
      )
    ),
    SMXFEA = c("1" = "Yes", "0" = "No", "." = ""),
    DMRRETH1 = c(
      "1" = "Asian", "2" = "Black", "3" = "Hispanic",
      "4" = "White", "5" = "Multiracial", "6" = "Other"
    ),
    DMROCCU = c(
      "1" = "Unemployed", "2" = "Student", "3" = "Employed",
      "4" = "Self-employed", "5" = "Retired", "6" = ""
    ),
    DMRHREDU = c(
      "1" = "High School", "2" = "Bachelors",
      "3" = "Postgraduate", "4" = "Other"
    ),
    DSXOS = c(
      spellings("Recovered", c("0", "Recovered")),
      spellings("Deceased", c("1", "Deceased")),
      spellings("Transferred", c("2", "Transferred"))
    ),
    CSXCOT = c("1" = "PCR", "2" = "antigen", "3" = "other")
  )
  # Columns for which an unrecognised value is blanked instead of kept.
  blank_when_unknown <- c("DSXOS", "CSXCOT")

  if (!(colname %in% names(lookups))) {
    return(x)
  }
  if (is.na(x)) {
    return("")
  }

  # Compare as text, the same way `x == 1` compares a string cell with a
  # number. Logical cells are turned into 0/1 first so TRUE still matches 1.
  key <- as.character(if (is.logical(x)) as.integer(x) else x)
  lookup <- lookups[[colname]]

  if (key %in% names(lookup)) {
    lookup[[key]]
  } else if (colname %in% blank_when_unknown) {
    ""
  } else {
    x
  }
}
# Rewrite a dot-separated date ("dd.mm.yyyy") with slashes ("dd/mm/yyyy").
#
# x: a single raw value.
# Returns `x` untouched when it contains no literal "."; otherwise the value
# parsed with the format "%d.%m.%Y" and printed as "%d/%m/%Y" (NA when the
# value does not parse as such a date).
dotToBar <- function(x) {
  contains_dot <- grepl(".", x, fixed = TRUE)
  if (!contains_dot) {
    return(x)
  }
  parsed <- as.Date(x, format = "%d.%m.%Y")
  format(parsed, "%d/%m/%Y")
}
# ---- Clean every cell of `hospital` into `newDf` and write the result ----

# Drop a stale copy if one exists; an unconditional rm() warns on a fresh
# workspace.
if (exists("newDf")) {
  rm(newDf)
}
newDf <- hospital
names <- colnames(hospital)
total_cols <- ncol(hospital)
# seq_len() keeps both loops from running on empty data, where 1:0 would
# iterate over c(1, 0).
for (j in seq_len(total_cols)) {
  percentage <- trunc(j / total_cols * 100)
  mes <- paste0(toString(percentage), "% completed")
  print(mes)
  print(names[j])

  # These checks depend only on the column, so they are evaluated once per
  # column rather than once per cell.
  is_yes_no <- names[j] %in% noYesValues
  is_non_categoric <- !(names[j] %in% categoric_vars) &&
    names[j] != "DMRBORN" &&
    !grepl("DAT", names[j], fixed = TRUE)
  is_personalized <- names[j] %in% personalized

  for (i in seq_len(nrow(hospital))) {
    raw <- hospital[i, j]
    if (is_yes_no) {
      newDf[i, j] <- replaceNoYesValues(raw)
    } else if (is_non_categoric) {
      newDf[i, j] <- fixNonCategoric(raw)
    }
    # A personalised recode overrides whatever was written above.
    if (is_personalized) {
      newDf[i, j] <- personalizedFun(raw, names[j])
    }
    # Missing cells and the "." placeholder always end up blank.
    if (is.na(raw) || raw == ".") {
      newDf[i, j] <- ""
    }
  }
}
# Two absolute setwd() calls in a row: the second supersedes the first, so the
# cleaned file is written to the "outpatient/clean" folder.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources")
setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean"))
write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE)
# ---- Setup: load the raw hospital export and the harmonised variable tables ----

# Clear the workspace so objects from a previous run cannot leak into this one.
rm(list = ls())
# Two absolute setwd() calls in a row: the second supersedes the first, so the
# input file is read from the "outpatient" folder.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources")
setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient"))
# Change this to the name of the resource file that should be cleaned.
hospital <- data.frame(read.csv("konya_outpatient.csv", sep=","))
# Drop every column whose name contains "numeric", ignoring case (the default
# behaviour of dplyr's contains()). Written in base R because no library()
# call attaches dplyr anywhere in this script.
hospital <- hospital[, !grepl("numeric", tolower(colnames(hospital)), fixed = TRUE), drop = FALSE]
# hospital["NOT.HARMONISED"] <- NULL
#
# names <- colnames(hospital)
# for (i in 1:length(names)){
#
# if(grepl("NOT.HARMONISED", names[i])){
# hospital[names[i]] <- NULL
# print(paste("quito ", names[i]))
# }
#
# }
# hospital <- hospital[-1,]

# Each table below keeps a fixed window of rows and its first five columns.
# NOTE(review): "Com&RF.csv" is read with sep = "," while every other table
# uses sep = ";" - confirm that this is intended.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5]
Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5]
Dates <- data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5]
Demographics <- data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5]
Home_med <- data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5]
Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5]
Labo <- data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5]
SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5]
Treatment <- data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5]
LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5]

# Stack all tables in one call; the row order matches the original sequence of
# pairwise rbind() calls.
harmonised_data <- rbind(
  SiAndSympt, ComAndRF, Treatment, Dates, Demographics,
  Home_med, Imaging_data, Complications, Labo, LifestyleAndDiet
)
# Remove the per-table objects. "Treatment" is included here; the earlier list
# named "Complications" twice and left "Treatment" behind.
rm(list = c(
  "SiAndSympt",
  "Complications",
  "ComAndRF",
  "Dates",
  "Demographics",
  "Home_med",
  "Imaging_data",
  "Treatment",
  "Labo",
  "LifestyleAndDiet"
))

# Variables whose declared unit is No/Yes (both capitalisations of "missing"
# are accepted), plus a hand-maintained list of extra Yes/No columns.
# as.character() keeps the names intact even if the column was read as a
# factor, where c() would otherwise splice in integer codes.
noYesValues <- as.character(
  harmonised_data$Harmonised.variable.name[
    harmonised_data$Harmonised.data.format.unit %in%
      c("No/Yes / missing", "No/Yes / Missing")
  ]
)
noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO")
# Columns treated as categorical: the main loop does not run them through the
# numeric clean-up.
categoric_vars <- c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN", "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN", "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN", "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
# Columns that are recoded by personalizedFun() in the main loop.
personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")
# Report whether `x` can be read as a number.
#
# A decimal comma is accepted: the first "," in `x` is replaced by "." before
# the conversion is attempted.
#
# x: a single value (character or numeric); longer input is tested
#    element-wise.
# Returns TRUE when as.numeric() yields a non-NA value and FALSE otherwise,
# including for NA and for zero-length input.
is_number <- function(x) {
  if (length(x) == 0) {
    return(FALSE)
  }
  # Base sub() performs the same first-match replacement as
  # stringr::str_replace(), without needing a package that this script never
  # attaches.
  normalised <- sub(",", ".", x, fixed = TRUE)
  # as.numeric() warns for every unparseable string; that outcome is exactly
  # what is being tested for, so the warning is silenced for this call only.
  !is.na(suppressWarnings(as.numeric(normalised)))
}
# Normalise one raw cell of a Yes/No column.
#
# Matching ignores case and surrounding whitespace. Numeric 0/1 and logical
# FALSE/TRUE are accepted as well.
#
# x: a single raw value.
# Returns "No" for 0 / "NO", "Yes" for 1 / "YES" / "SI", and "" for NA,
# zero-length input or anything else.
replaceNoYesValues <- function(x) {
  if (length(x) == 0 || is.na(x)) {
    return("")
  }
  # toupper() would turn a logical cell into "TRUE"/"FALSE", which matches
  # neither code; map it to 0/1 first so TRUE still means Yes.
  if (is.logical(x)) {
    x <- as.integer(x)
  }
  key <- trimws(toupper(x))
  if (key %in% c("0", "NO")) {
    "No"
  } else if (key %in% c("1", "YES", "SI")) {
    "Yes"
  } else {
    ""
  }
}
# Clean one raw cell of a non-categorical column.
#
# x: a single raw value.
# Returns "" when is_number(x) is FALSE; otherwise `x` as a character string
# with its first "," replaced by ".".
fixNonCategoric <- function(x) {
  if (!is_number(x)) {
    return("")
  }
  # Base sub() instead of stringr::str_replace(): same first-match
  # replacement, and no dependency on a package this script never attaches.
  sub(",", ".", x, fixed = TRUE)
}
# Recode one raw cell of a column that needs a column-specific mapping.
#
# x:       a single raw value (numeric code or text).
# colname: name of the column the value comes from.
#
# Returns:
#   * for SMXFEA, whatever replaceNoYesValues(x) returns;
#   * `x` unchanged when `colname` has no mapping below (NA stays NA);
#   * "" when `x` is NA and the column has a mapping;
#   * the mapped label when the value is recognised;
#   * for unrecognised values: "" for DSXOS and CSXCOT, `x` unchanged for
#     every other mapped column.
# Matching in the lookup tables is exact: case and surrounding whitespace are
# significant.
personalizedFun <- function(x, colname) {
  # SMXFEA is a plain Yes/No column and shares the generic normaliser.
  if (colname %in% "SMXFEA") {
    return(replaceNoYesValues(x))
  }

  # Expand one output label into a named vector keyed by each accepted raw
  # spelling of that label.
  spellings <- function(label, values) {
    structure(rep(label, length(values)), names = values)
  }

  lookups <- list(
    DMRGENDR = c(
      spellings("Female", c("1", "F", "f", "Female")),
      spellings("Male", c("0", "M", "m", "Male"))
    ),
    # The odd spellings are mis-encoded exports of POSITIVE / NEGATIVE.
    # NOTE(review): "NMGAT?VM" follows the NEGATIVE pattern (compare
    # "NMGATIVM" and "NAGAT?VM") but used to sit in the Positive list, which
    # recoded negative results as "Positive". It is mapped to "Negative" here;
    # confirm against the source data.
    CSXCTR = c(
      spellings(
        "Positive",
        c("1", "positive", "PositivM", "POSITIVM", "POS?T?VM")
      ),
      spellings(
        "Negative",
        c(
          "0", "negative", "negativM", "NMGATIVM", "NAGATIVM", "NMGATIV",
          "negativeM", "NAGAT?VM", "NMGAT?V", "NMGAT?VM"
        )
      )
    ),
    DMRRETH1 = c(
      "1" = "Asian", "2" = "Black", "3" = "Hispanic",
      "4" = "White", "5" = "Multiracial", "6" = "Other"
    ),
    DMROCCU = c(
      "1" = "Unemployed", "2" = "Student", "3" = "Employed",
      "4" = "Self-employed", "5" = "Retired", "6" = ""
    ),
    DMRHREDU = c(
      "1" = "High School", "2" = "Bachelors",
      "3" = "Postgraduate", "4" = "Other"
    ),
    DSXOS = c(
      spellings("Recovered", c("0", "Recovered")),
      spellings("Deceased", c("1", "Deceased")),
      spellings("Transferred", c("2", "Transferred"))
    ),
    CSXCOT = c("1" = "PCR", "2" = "antigen", "3" = "other")
  )
  # Columns for which an unrecognised value is blanked instead of kept.
  blank_when_unknown <- c("DSXOS", "CSXCOT")

  if (!(colname %in% names(lookups))) {
    return(x)
  }
  if (is.na(x)) {
    return("")
  }

  # Compare as text, the same way `x == 1` compares a string cell with a
  # number. Logical cells are turned into 0/1 first so TRUE still matches 1.
  key <- as.character(if (is.logical(x)) as.integer(x) else x)
  lookup <- lookups[[colname]]

  if (key %in% names(lookup)) {
    lookup[[key]]
  } else if (colname %in% blank_when_unknown) {
    ""
  } else {
    x
  }
}
# Rewrite a dot-separated date ("dd.mm.yyyy") with slashes ("dd/mm/yyyy").
#
# x: a single raw value.
# Returns `x` untouched when it contains no literal "."; otherwise the value
# parsed with the format "%d.%m.%Y" and printed as "%d/%m/%Y" (NA when the
# value does not parse as such a date).
dotToBar <- function(x) {
  contains_dot <- grepl(".", x, fixed = TRUE)
  if (!contains_dot) {
    return(x)
  }
  parsed <- as.Date(x, format = "%d.%m.%Y")
  format(parsed, "%d/%m/%Y")
}
# ---- Clean every cell of `hospital` into `newDf` and write the result ----

# Drop a stale copy if one exists; an unconditional rm() warns on a fresh
# workspace.
if (exists("newDf")) {
  rm(newDf)
}
newDf <- hospital
names <- colnames(hospital)
total_cols <- ncol(hospital)
# seq_len() keeps both loops from running on empty data, where 1:0 would
# iterate over c(1, 0).
for (j in seq_len(total_cols)) {
  percentage <- trunc(j / total_cols * 100)
  mes <- paste0(toString(percentage), "% completed")
  print(mes)
  print(names[j])

  # These checks depend only on the column, so they are evaluated once per
  # column rather than once per cell.
  is_yes_no <- names[j] %in% noYesValues
  is_non_categoric <- !(names[j] %in% categoric_vars) &&
    names[j] != "DMRBORN" &&
    !grepl("DAT", names[j], fixed = TRUE)
  is_personalized <- names[j] %in% personalized

  for (i in seq_len(nrow(hospital))) {
    raw <- hospital[i, j]
    if (is_yes_no) {
      newDf[i, j] <- replaceNoYesValues(raw)
    } else if (is_non_categoric) {
      newDf[i, j] <- fixNonCategoric(raw)
    }
    # A personalised recode overrides whatever was written above.
    if (is_personalized) {
      newDf[i, j] <- personalizedFun(raw, names[j])
    }
    # Missing cells and the "." placeholder always end up blank.
    if (is.na(raw) || raw == ".") {
      newDf[i, j] <- ""
    }
  }
}
# Two absolute setwd() calls in a row: the second supersedes the first, so the
# cleaned file is written to the "outpatient/clean" folder.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources")
setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean"))
write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment