Commit 6d57f049 authored by pxp9

Pequeña mejora en la sustitución de Yes/No

parent 3570d2a0
# ---------------------------------------------------------------------------
# Setup: load the raw hospital resource and the harmonised data dictionaries,
# then derive the column-name lists that drive the cleaning rules.
# ---------------------------------------------------------------------------

# Start from an empty workspace so stale objects cannot leak into the run.
rm(list = ls())

# dplyr provides %>%, select() and contains(); stringr provides str_replace().
library(dplyr)
library(stringr)

# Directory holding the raw resource file. (An earlier setwd() to the parent
# "ressources" directory was immediately overridden by this one and has been
# dropped.)
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")

# Change this to the name of the resource file that should be cleaned.
hospital <- data.frame(read.csv("konya_outpatient.csv", sep = ","))

# Drop every column whose name contains "numeric".
hospital <- hospital %>% select(-contains("numeric"))

# Disabled: remove the NOT.HARMONISED columns and the first data row.
# hospital["NOT.HARMONISED"] <- NULL
#
# names <- colnames(hospital)
# for (i in 1:length(names)){
#
#   if(grepl("NOT.HARMONISED", names[i])){
#     hospital[names[i]] <- NULL
#     print(paste("quito ", names[i]))
#   }
#
# }
# hospital <- hospital[-1,]

# Directory holding the harmonised data dictionaries.
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")

# Only a fixed number of rows and the first five columns of each dictionary
# are kept.
# NOTE(review): Com&RF.csv is the only dictionary read with sep = ","; all
# the others use ";". Confirm that this matches the file on disk.
ComAndRF <- data.frame(read.csv("Com&RF.csv", sep = ","))[1:64, 1:5]
Complications <- data.frame(read.csv("Complications.csv", sep = ";"))[1:20, 1:5]
Dates <- data.frame(read.csv("Dates.csv", sep = ";"))[1:12, 1:5]
Demographics <- data.frame(read.csv("Demographics.csv", sep = ";"))[1:9, 1:5]
Home_med <- data.frame(read.csv("Home_med.csv", sep = ";"))[1:13, 1:5]
Imaging_data <- data.frame(read.csv("Imaging_data.csv", sep = ";"))[1:11, 1:5]
Labo <- data.frame(read.csv("Labo.csv", sep = ";"))[1:143, 1:5]
SiAndSympt <- data.frame(read.csv("Si&Sympt.csv", sep = ";"))[1:50, 1:5]
Treatment <- data.frame(read.csv("Treatment.csv", sep = ";"))[1:32, 1:5]
LifestyleAndDiet <- data.frame(read.csv("Lifestyle&Diet.csv", sep = ";"))[1:165, 1:5]

# Stack all dictionaries into one table, in the same order as before.
harmonised_data <- Reduce(
  rbind,
  list(
    SiAndSympt, ComAndRF, Treatment, Dates, Demographics,
    Home_med, Imaging_data, Complications, Labo, LifestyleAndDiet
  )
)

# Drop the per-topic tables now that they are merged. The previous list named
# "Complications" twice (triggering a warning) and never removed "Treatment".
rm(list = c(
  "SiAndSympt",
  "ComAndRF",
  "Treatment",
  "Dates",
  "Demographics",
  "Home_med",
  "Imaging_data",
  "Complications",
  "Labo",
  "LifestyleAndDiet"
))

# Variables whose harmonised format is Yes/No, plus a few added by hand.
yes_no_units <- c("No/Yes / missing", "No/Yes / Missing")
noYesValues <- harmonised_data$Harmonised.variable.name[
  harmonised_data$Harmonised.data.format.unit %in% yes_no_units
]
noYesValues <- c(
  noYesValues,
  "CSXCOTAB", "CSXCOTAG", "IMDIT", "RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO"
)

# Columns holding categorical values; the main loop does not run them through
# the numeric clean-up.
categoric_vars <- c(
  "CMXATH", "CMXNO", "SMXASAH", "CSXCOTAB", "CSXCOTAG", "IMDIT", "RFXHIV_RFXAIDS",
  "DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC",
  "TRXAV", "TRXRIB", "TRXLR", "TRXRM", "TRXIA", "TRXIB", "TRXCH", "TRXAB", "TRXCS",
  "TRXHEP", "TRXAF", "TRXCP", "TRXOT", "TRXECM", "TRXIV", "TRXNIV", "TRXNO", "TRXOX",
  "TRXRR", "TRXTR", "TRXVA", "TRXPE", "TRXPV", "TRXIT", "TRXNMB", "TRXAC", "TRXINA",
  "TRXIS", "TRXIM", "TRXVC", "TRXVD", "TRXZN",
  "CSXCOT", "CSXCTR", "SMXASAH", "SMXFEA", "SMXCOA", "SMXSTA", "SMXSBA", "SMXRNA",
  "SMXMYA", "SMXARA", "SMXCPA", "SMXAPA", "SMXINA", "SMXNAA", "SMXDIA", "SMXFAA",
  "SMXHEA", "SMXCNA", "SMXACA", "SMXSLA", "SMXTLA", "SMXSYA", "SMXWHA", "SMXLYA",
  "SMXANA", "SMXIWA", "SMXSRA", "SMXBLA", "CMXPRG", "CMXCVD", "CMXCMP", "CMXHT",
  "CMXDI", "CMXCKD", "CMXCLD", "CMXCPD", "CMXASM", "CMXCND", "CMXRHE", "CMXCCI",
  "CMXCBD", "CMXDE", "CMXPU", "CMXST", "CMXLY", "CMXAP", "RFXSM", "RFXFSM", "RFXOB",
  "RFXTB", "RFXIMD", "RFXHIV", "RFXAIDS", "RFXUI", "RFXHC", "RFXONC", "RFXMN",
  "HMRACI", "HMRARB", "HMRAHO", "HMRNS", "HMROS", "HMRCS", "HMRIS", "HMRAV", "HMRAB",
  "HMRCOV", "IMDXCT", "IMDXCTCR", "IMDXCTTE", "IMDXCTAB", "IMDXXR", "IMDXPN",
  "COXRD", "COXAR", "COXPM", "COXMOD", "COXPT", "COXEC", "COXSH", "COXIO", "COXPE",
  "COXST", "COXDIC", "COXRIO", "COXKF", "COXHF", "COXBC"
)

# Columns that get a column-specific recoding in personalizedFun().
# NOTE(review): personalizedFun() also has branches for DMRRETH1, DMROCCU and
# DMRHREDU, which are not listed here and so are never applied by the main
# loop. Confirm whether that is intended for this hospital.
personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")
# Tell whether a cell value can be read as a number.
#
# A decimal comma is accepted: the first "," is turned into "." before the
# value is parsed, so "3,5" counts as a number.
#
# @param x A single cell value (character or numeric). For a longer vector
#   the result is TRUE only when every element parses.
# @return TRUE when `x` parses as numeric; FALSE for zero-length input, NA,
#   empty strings and any non-numeric text.
is_number <- function(x) {
  if (length(x) == 0) {
    return(FALSE)
  }
  # Base sub() with fixed = TRUE replaces only the first comma, as the
  # previous stringr::str_replace() call did, without needing stringr.
  normalised <- sub(",", ".", x, fixed = TRUE)
  # Non-numeric text is expected here; NA is the answer, not a warning.
  parsed <- suppressWarnings(as.numeric(normalised))
  all(!is.na(parsed))
}
# Normalise a Yes/No cell to exactly "Yes", "No" or "".
#
# The value is upper-cased and stripped of surrounding whitespace before it
# is compared, so " yes", "Yes" and "YES" are all treated the same way.
#
# @param x A single cell value (character or numeric), possibly NA.
# @return "No" for 0 / "NO", "Yes" for 1 / "YES" / "SI", and "" for NA or
#   any other value.
replaceNoYesValues <- function(x) {
  if (is.na(x)) {
    return("")
  }
  token <- trimws(toupper(x))
  if (token %in% c("0", "NO")) {
    "No"
  } else if (token %in% c("1", "YES", "SI")) {
    "Yes"
  } else {
    ""
  }
}

# Clean a cell that belongs to a numeric (non-categorical) column.
#
# @param x A single cell value.
# @return "" when `x` is not a number according to is_number(); otherwise
#   `x` as text with its first decimal comma replaced by a dot.
fixNonCategoric <- function(x) {
  if (!is_number(x)) {
    return("")
  }
  # Same first-comma replacement the stringr call performed, in base R.
  sub(",", ".", x, fixed = TRUE)
}

# Recode a cell using the rules specific to its column.
#
# Handled columns: DMRGENDR, CSXCTR, SMXFEA, DMRRETH1, DMROCCU, DMRHREDU,
# DSXOS and CSXCOT. Any other column name returns `x` untouched.
#
# @param x A single cell value (character or numeric), possibly NA.
# @param colname Name of the column the cell comes from.
# @return The recoded label. For a handled column, NA becomes "". A value
#   with no matching rule is returned unchanged, except in DSXOS and CSXCOT
#   where it becomes "".
personalizedFun <- function(x, colname) {
  # SMXFEA is a plain Yes/No column and shares the generic normaliser.
  if (colname == "SMXFEA") {
    return(replaceNoYesValues(x))
  }

  # Build a named lookup vector: every key in `keys` maps to `label`.
  keys_to <- function(label, keys) {
    stats::setNames(rep(label, length(keys)), keys)
  }

  mapping <- switch(colname,
    DMRGENDR = c(
      keys_to("Female", c("1", "F", "f", "Female")),
      keys_to("Male", c("0", "M", "m", "Male"))
    ),
    CSXCTR = c(
      # The odd spellings are garbled forms found in the raw export.
      keys_to("Positive", c("1", "positive", "PositivM", "POSITIVM", "POS?T?VM")),
      # "NMGAT?VM" used to sit in the Positive branch; it follows the same
      # garbling as the other negative spellings, so it belongs here.
      keys_to("Negative", c(
        "0", "negative", "negativM", "NMGATIVM", "NAGATIVM", "NMGATIV",
        "negativeM", "NAGAT?VM", "NMGAT?V", "NMGAT?VM"
      ))
    ),
    DMRRETH1 = c(
      "1" = "Asian", "2" = "Black", "3" = "Hispanic",
      "4" = "White", "5" = "Multiracial", "6" = "Other"
    ),
    DMROCCU = c(
      "1" = "Unemployed", "2" = "Student", "3" = "Employed",
      "4" = "Self-employed", "5" = "Retired", "6" = ""
    ),
    DMRHREDU = c(
      "1" = "High School", "2" = "Bachelors", "3" = "Postgraduate", "4" = "Other"
    ),
    DSXOS = c(
      keys_to("Recovered", c("0", "Recovered")),
      keys_to("Deceased", c("1", "Deceased")),
      keys_to("Transferred", c("2", "Transferred"))
    ),
    CSXCOT = c("1" = "PCR", "2" = "antigen", "3" = "other"),
    NULL
  )

  # Column without a custom rule: leave the value alone.
  if (is.null(mapping)) {
    return(x)
  }
  if (is.na(x)) {
    return("")
  }

  # Numeric codes and their text forms share one key ("1" matches 1 and "1").
  key <- as.character(x)
  if (key %in% names(mapping)) {
    return(mapping[[key]])
  }

  # Only these two columns blank out values they do not recognise.
  if (colname %in% c("DSXOS", "CSXCOT")) {
    ""
  } else {
    x
  }
}

# Convert a dd.mm.yyyy date string to dd/mm/yyyy.
#
# @param x A single cell value.
# @return The date rewritten with "/" separators. `x` is returned unchanged
#   when it contains no "." or when it cannot be parsed as dd.mm.yyyy, so a
#   value such as a decimal number is no longer turned into NA.
dotToBar <- function(x) {
  if (!grepl(".", x, fixed = TRUE)) {
    return(x)
  }
  parsed <- as.Date(x, format = "%d.%m.%Y")
  if (is.na(parsed)) {
    return(x)
  }
  format(parsed, "%d/%m/%Y")
}
# ---------------------------------------------------------------------------
# Main pass: clean every cell of `hospital` into `newDf`, then export it.
# ---------------------------------------------------------------------------

# Work on a copy so every rule reads the untouched original value. (The
# former rm(newDf) only raised a warning in a fresh session and is not needed
# before a plain assignment.)
newDf <- hospital

col_names <- colnames(hospital)
n_cols <- ncol(hospital)
n_rows <- nrow(hospital)

# seq_len() keeps both loops from running on an empty data frame.
for (j in seq_len(n_cols)) {
  col_name <- col_names[j]

  # Progress report: percentage of columns reached, then the column name.
  percentage <- trunc(j / n_cols * 100)
  print(paste0(toString(percentage), "% completed"))
  print(col_name)

  # Which rules apply depends only on the column, so decide once per column.
  is_no_yes <- col_name %in% noYesValues
  is_numeric_like <- !(col_name %in% categoric_vars) &&
    col_name != "DMRBORN" &&
    !grepl("DAT", col_name, fixed = TRUE)
  is_personalized <- col_name %in% personalized

  for (i in seq_len(n_rows)) {
    value <- hospital[i, j]

    if (is_no_yes) {
      newDf[i, j] <- replaceNoYesValues(value)
    } else if (is_numeric_like) {
      newDf[i, j] <- fixNonCategoric(value)
    }

    # A column-specific recoding overrides the generic result above.
    if (is_personalized) {
      newDf[i, j] <- personalizedFun(value, col_name)
    }

    # Missing values and the "." placeholder always end up empty.
    if (is.na(value) || value == ".") {
      newDf[i, j] <- ""
    }
  }
}

# Output directory. (An earlier setwd() to ".../ressources/newRessources" was
# immediately overridden by this one and has been dropped.)
setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")
write.csv(x = newDf, file = "konya_outpatient.csv", row.names = FALSE)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment