Pequeña mejora en la sustitución de Yes/No

6d57f049 · pxp9 · 3570d2a0 · 6d57f049
Commit 6d57f049 authored Feb 02, 2023 by pxp9
Hide whitespace changes
Inline Side-by-side

Showing with 285 additions and 280 deletions

ressourceCleaner.R ressourceCleaner.R +285 -280

No files found.
--- a/ressourceCleaner.R
+++ b/ressourceCleaner.R
-
-rm(list=ls())
-
-setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources")
-setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient"))
-
-#Cambiarlo por el nombre del ressource que se desea limpiar
-hospital <- data.frame(read.csv("konya_outpatient.csv", sep=","))
-
-
-hospital <- hospital %>% select(-contains("numeric"))
-
-# hospital["NOT.HARMONISED"] <- NULL
-# 
-# names <- colnames(hospital)
-# for (i in 1:length(names)){
-# 
-#   if(grepl("NOT.HARMONISED", names[i])){
-#     hospital[names[i]] <- NULL
-#     print(paste("quito ", names[i]))
-#   }
-# 
-# }
-# hospital <- hospital[-1,]
-
-setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
-
-ComAndRF <- data.frame(read.csv("Com&RF.csv", sep=","))[1:64,1:5]
-Complications <- data.frame(read.csv("Complications.csv", sep=";"))[1:20,1:5]
-Dates <-  data.frame(read.csv("Dates.csv", sep=";"))[1:12,1:5]
-Demographics <-  data.frame(read.csv("Demographics.csv", sep=";"))[1:9,1:5]
-Home_med <-  data.frame(read.csv("Home_med.csv", sep=";"))[1:13,1:5]
-Imaging_data <-  data.frame(read.csv("Imaging_data.csv", sep=";"))[1:11,1:5]
-Labo <-  data.frame(read.csv("Labo.csv", sep=";"))[1:143,1:5]
-SiAndSympt <-  data.frame(read.csv("Si&Sympt.csv", sep=";"))[1:50,1:5]
-Treatment <-  data.frame(read.csv("Treatment.csv", sep=";"))[1:32,1:5]
-LifestyleAndDiet <-  data.frame(read.csv("Lifestyle&Diet.csv", sep=";"))[1:165,1:5]
-
-
-harmonised_data <- rbind(SiAndSympt,ComAndRF)
-harmonised_data <- rbind(harmonised_data,Treatment)
-
-harmonised_data <- rbind(harmonised_data,Dates)
-harmonised_data <- rbind(harmonised_data,Demographics)
-harmonised_data <- rbind(harmonised_data,Home_med)
-harmonised_data <- rbind(harmonised_data,Imaging_data)
-harmonised_data <- rbind(harmonised_data,Complications)
-harmonised_data <- rbind(harmonised_data,Labo)
-harmonised_data <- rbind(harmonised_data,LifestyleAndDiet)
-
-rm(list=c("SiAndSympt",
-          "Complications",
-          "ComAndRF",
-          "Dates",
-          "Demographics",
-          "Home_med",
-          "Imaging_data",
-          "Complications",
-          "Labo",
-          "LifestyleAndDiet"))
-
-noYesValues <- subset(harmonised_data, harmonised_data$Harmonised.data.format.unit == "No/Yes / missing" | harmonised_data$Harmonised.data.format.unit == "No/Yes / Missing")
-noYesValues <- noYesValues$Harmonised.variable.name
-noYesValues <- c(noYesValues,"CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO")
-
-categoric_vars = c("CMXATH", "CMXNO", "SMXASAH","CSXCOTAB","CSXCOTAG","IMDIT","RFXHIV_RFXAIDS","DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU", "DSXOS", "DSXHO", "DSXIC", "TRXAV","TRXRIB","TRXLR","TRXRM","TRXIA","TRXIB","TRXCH","TRXAB","TRXCS","TRXHEP","TRXAF","TRXCP","TRXOT","TRXECM","TRXIV","TRXNIV","TRXNO","TRXOX","TRXRR","TRXTR","TRXVA","TRXPE","TRXPV","TRXIT","TRXNMB","TRXAC","TRXINA","TRXIS","TRXIM","TRXVC","TRXVD","TRXZN",                         "CSXCOT","CSXCTR","SMXASAH","SMXFEA","SMXCOA","SMXSTA","SMXSBA","SMXRNA","SMXMYA","SMXARA","SMXCPA","SMXAPA","SMXINA","SMXNAA","SMXDIA","SMXFAA","SMXHEA","SMXCNA","SMXACA","SMXSLA","SMXTLA","SMXSYA","SMXWHA","SMXLYA","SMXANA","SMXIWA","SMXSRA","SMXBLA","CMXPRG","CMXCVD","CMXCMP","CMXHT","CMXDI","CMXCKD","CMXCLD","CMXCPD","CMXASM","CMXCND","CMXRHE","CMXCCI","CMXCBD","CMXDE","CMXPU","CMXST","CMXLY","CMXAP","RFXSM","RFXFSM","RFXOB","RFXTB","RFXIMD","RFXHIV","RFXAIDS","RFXUI","RFXHC","RFXONC","RFXMN",                         "HMRACI","HMRARB","HMRAHO","HMRNS","HMROS","HMRCS","HMRIS","HMRAV","HMRAB","HMRCOV","IMDXCT","IMDXCTCR","IMDXCTTE","IMDXCTAB","IMDXXR","IMDXPN",                         "COXRD","COXAR","COXPM","COXMOD","COXPT","COXEC","COXSH","COXIO","COXPE","COXST","COXDIC","COXRIO","COXKF","COXHF","COXBC")
-
-personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")
-
-is_number <- function(x){
-  res <- FALSE
-  
-  if(length(x)!=0){
-    x <- str_replace(x,",",".")
-    
-    aux <- as.numeric(x)
-    
-    if(!is.na(aux))
-      res <- TRUE
-  }
-  return(res)
-}
-
-
-replaceNoYesValues <- function(x){
-  
-  #Replace the value with Yes or No
-    if(is.na(x))
-      x <- ""
-    else if (x==0 | x =="No" | x ==" No" | x =="NO")
-      x <- "No"
-    else if (x==1 | x == "Yes" |  x == " Yes" | x=="SI")
-      x <- "Yes"
-    else
-      x <- ""
-    
-    return (x)
-  
-}
-
-fixNonCategoric <- function(x){
-  
-  if(!is_number(x)){
-    x <- ""
-  }else{
-    x <- str_replace(x,",",".")
-  }
-  
-  return(x)
-  
-}
-
-personalizedFun <- function(x, colname){
-  
-  if(colname == "DMRGENDR"){
-    if(is.na(x))
-      x <- ""
-    else if(x == 1 | x == "F"| x == "f" | x== "Female")
-      x <- "Female"
-    else if (x == 0 | x =="M" | x == "m" | x== "Male")
-      x <- "Male"
-  }
-  
-  if(colname == "CSXCTR"){
-    if(is.na(x))
-      x <- ""
-    else if(x == 1 | x == "positive" | x=="PositivM" | x=="POSITIVM" | x =="POS?T?VM" | x =="NMGAT?VM")
-      x <- "Positive"
-    else if (x == 0 | x == "negative" | x=="negativM" | x=="NMGATIVM" | x=="NAGATIVM" | x =="NMGATIV" | x =="negativeM" | x == "NAGAT?VM" | x =="NMGAT?V")
-      x <- "Negative"
-  }
-
-  if(colname == "SMXFEA"){
-    if(is.na(x))
-      x <- ""
-    else if(x == 1)
-      x <- "Yes"
-    else if (x == 0)
-      x <- "No"
-    else if (x == ".")
-      x <- ""
-  }
-  
-  if(colname == "DMRRETH1"){
-    if(is.na(x))
-      x <- ""
-    else if(x ==1)
-      x <- "Asian"
-    else if (x == 2)
-      x <- "Black"
-    else if (x == 3)
-      x <- "Hispanic"
-    else if (x == 4)
-      x <- "White"
-    else if (x == 5)
-      x <- "Multiracial"
-    else if (x == 6)
-      x <- "Other"
-  }
-  
-  if(colname == "DMROCCU"){
-    if(is.na(x))
-      x <- ""
-    else if(x ==1)
-      x <- "Unemployed"
-    else if (x == 2)
-      x <- "Student"
-    else if (x == 3)
-      x <- "Employed"
-    else if (x == 4)
-      x <- "Self-employed"
-    else if (x == 5)
-      x <- "Retired"
-    else if (x == 6)
-      x <- ""
-  }
-  
-  if(colname == "DMRHREDU"){
-    if(is.na(x))
-      x <- ""
-    else if(x ==1)
-      x <- "High School"
-    else if (x == 2)
-      x <- "Bachelors"
-    else if (x == 3)
-      x <- "Postgraduate"
-    else if (x == 4)
-      x <- "Other"
-  }
-  
-  if(colname =="DSXOS"){
-    
-    if(is.na(x))
-      x <- ""
-    else if (x==0 | x == "Recovered")
-      x <- "Recovered"
-    else if (x==1 | x == "Deceased")
-      x <- "Deceased"
-    else if (x==2 | x == "Transferred")
-      x <- "Transferred"
-    else
-      x <- ""
-  }
-  
-  if(colname =="CSXCOT"){
-    if(is.na(x))
-      x <- ""
-    else if (x==1 )
-      x <- "PCR"
-    else if (x==2 )
-      x <- "antigen"
-    else if (x==3 )
-      x <- "other"
-    else
-      x <- ""
-  }
-  
-
-  return(x)
-  
-}
-
-dotToBar <- function (x){
-  
-  if (grepl(".", x, fixed = TRUE))
-    res <- format(as.Date(x, format = "%d.%m.%Y"), "%d/%m/%Y")
-  else res <- x
-  return(res)
-  
-}
-
-
-
-rm(newDf)
-
-newDf <- hospital
-
-
-
-names <- colnames(hospital)
-
-
-for (j in 1:ncol(hospital)){
-  
-  percentage <- trunc(j/ncol(hospital)*100)
-  mes <- paste(toString(percentage),"% completed", sep="")
-  print(mes)
-  
-  print(names[j])
-  
-  for(i in 1:nrow(hospital)){
-    
-    if(names[j] %in% noYesValues){
-      newDf[i,j] <- replaceNoYesValues(hospital[i,j])
-    }else if(!(names[j] %in% categoric_vars) & names[j] != "DMRBORN" & !grepl("DAT",names[j], fixed=TRUE)){
-      newDf[i,j] <- fixNonCategoric(hospital[i,j])
-    }
-    
-    if(names[j] %in% personalized){
-      newDf[i,j] <- personalizedFun(hospital[i,j],names[j])
-    }
-    
-    if (is.na(hospital[i,j]))
-      newDf[i,j] <- ""
-    
-    else if (hospital[i,j] == ".")
-      newDf[i,j] <- ""
-    
-  }
-}
-
-setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources")
-setwd(("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean"))
-
-
-
-
-write.csv(x=newDf, file = "konya_outpatient.csv", row.names = FALSE)
-
-
+
+# Start from an empty workspace so objects left over from a previous run
+# cannot leak into this one (rm() does not detach packages).
+rm(list = ls())
+
+# `%>%`, select() and contains() come from dplyr; str_replace() comes from
+# stringr. Attach both explicitly so the script also runs in a fresh R session.
+library(dplyr)
+library(stringr)
+
+# The second setwd() overrides the first one: the raw input file is read from
+# the "outpatient" folder.
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources")
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient")
+
+# Change this to the file name of the resource that should be cleaned.
+# read.csv() already returns a data.frame, so no extra wrapper is needed.
+hospital <- read.csv("konya_outpatient.csv", sep = ",")
+
+# Drop every column whose name contains "numeric".
+hospital <- hospital %>% select(-contains("numeric"))
+
+# hospital["NOT.HARMONISED"] <- NULL
+# 
+# names <- colnames(hospital)
+# for (i in 1:length(names)){
+# 
+#   if(grepl("NOT.HARMONISED", names[i])){
+#     hospital[names[i]] <- NULL
+#     print(paste("quito ", names[i]))
+#   }
+# 
+# }
+# hospital <- hospital[-1,]
+
+# The harmonised-variable dictionaries are read from this folder.
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/harmonised_data")
+
+# One dictionary per topic. Only the first five columns and a fixed number of
+# rows are kept from each file. Com&RF.csv is comma-separated; every other
+# file uses ";".
+ComAndRF <- read.csv("Com&RF.csv", sep = ",")[1:64, 1:5]
+Complications <- read.csv("Complications.csv", sep = ";")[1:20, 1:5]
+Dates <- read.csv("Dates.csv", sep = ";")[1:12, 1:5]
+Demographics <- read.csv("Demographics.csv", sep = ";")[1:9, 1:5]
+Home_med <- read.csv("Home_med.csv", sep = ";")[1:13, 1:5]
+Imaging_data <- read.csv("Imaging_data.csv", sep = ";")[1:11, 1:5]
+Labo <- read.csv("Labo.csv", sep = ";")[1:143, 1:5]
+SiAndSympt <- read.csv("Si&Sympt.csv", sep = ";")[1:50, 1:5]
+Treatment <- read.csv("Treatment.csv", sep = ";")[1:32, 1:5]
+LifestyleAndDiet <- read.csv("Lifestyle&Diet.csv", sep = ";")[1:165, 1:5]
+
+# Stack all dictionaries into one table (same order as before).
+harmonised_data <- rbind(
+  SiAndSympt,
+  ComAndRF,
+  Treatment,
+  Dates,
+  Demographics,
+  Home_med,
+  Imaging_data,
+  Complications,
+  Labo,
+  LifestyleAndDiet
+)
+
+# The per-topic tables are no longer needed. "Complications" used to be listed
+# twice here while Treatment was never removed; each table is now removed once.
+rm(
+  SiAndSympt,
+  ComAndRF,
+  Treatment,
+  Dates,
+  Demographics,
+  Home_med,
+  Imaging_data,
+  Complications,
+  Labo,
+  LifestyleAndDiet
+)
+
+# Names of the variables whose harmonised format is No/Yes (both spellings of
+# the format label are accepted), plus a few variables added by hand.
+# %in% never yields NA, so rows with a missing format are simply skipped.
+is_no_yes_format <- harmonised_data$Harmonised.data.format.unit %in%
+  c("No/Yes / missing", "No/Yes / Missing")
+noYesValues <- harmonised_data$Harmonised.variable.name[is_no_yes_format]
+noYesValues <- c(
+  noYesValues,
+  "CSXCOTAB", "CSXCOTAG", "IMDIT", "RFXHIV_RFXAIDS", "SMXASAH", "CMXATH", "CMXNO"
+)
+
+# Columns treated as categorical: the cleaning loop does not run the numeric
+# clean-up on them.
+categoric_vars <- c(
+  "CMXATH", "CMXNO", "SMXASAH", "CSXCOTAB", "CSXCOTAG", "IMDIT",
+  "RFXHIV_RFXAIDS", "DMRGENDR", "DMRBORN", "DMRRETH1", "DMROCCU", "DMRHREDU",
+  "DSXOS", "DSXHO", "DSXIC",
+  "TRXAV", "TRXRIB", "TRXLR", "TRXRM", "TRXIA", "TRXIB", "TRXCH", "TRXAB",
+  "TRXCS", "TRXHEP", "TRXAF", "TRXCP", "TRXOT", "TRXECM", "TRXIV", "TRXNIV",
+  "TRXNO", "TRXOX", "TRXRR", "TRXTR", "TRXVA", "TRXPE", "TRXPV", "TRXIT",
+  "TRXNMB", "TRXAC", "TRXINA", "TRXIS", "TRXIM", "TRXVC", "TRXVD", "TRXZN",
+  "CSXCOT", "CSXCTR", "SMXASAH", "SMXFEA", "SMXCOA", "SMXSTA", "SMXSBA",
+  "SMXRNA", "SMXMYA", "SMXARA", "SMXCPA", "SMXAPA", "SMXINA", "SMXNAA",
+  "SMXDIA", "SMXFAA", "SMXHEA", "SMXCNA", "SMXACA", "SMXSLA", "SMXTLA",
+  "SMXSYA", "SMXWHA", "SMXLYA", "SMXANA", "SMXIWA", "SMXSRA", "SMXBLA",
+  "CMXPRG", "CMXCVD", "CMXCMP", "CMXHT", "CMXDI", "CMXCKD", "CMXCLD",
+  "CMXCPD", "CMXASM", "CMXCND", "CMXRHE", "CMXCCI", "CMXCBD", "CMXDE",
+  "CMXPU", "CMXST", "CMXLY", "CMXAP",
+  "RFXSM", "RFXFSM", "RFXOB", "RFXTB", "RFXIMD", "RFXHIV", "RFXAIDS", "RFXUI",
+  "RFXHC", "RFXONC", "RFXMN",
+  "HMRACI", "HMRARB", "HMRAHO", "HMRNS", "HMROS", "HMRCS", "HMRIS", "HMRAV",
+  "HMRAB", "HMRCOV",
+  "IMDXCT", "IMDXCTCR", "IMDXCTTE", "IMDXCTAB", "IMDXXR", "IMDXPN",
+  "COXRD", "COXAR", "COXPM", "COXMOD", "COXPT", "COXEC", "COXSH", "COXIO",
+  "COXPE", "COXST", "COXDIC", "COXRIO", "COXKF", "COXHF", "COXBC"
+)
+
+# Columns that are recoded by personalizedFun() in the cleaning loop.
+personalized <- c("DMRGENDR", "DSXOS", "CSXCTR", "SMXFEA", "CSXCOT")
+
+# Report whether a single cell value can be read as a number.
+#
+# A decimal comma is accepted: the first "," is turned into "." before the
+# value is handed to as.numeric().
+#
+# x: one cell value (character or numeric); may be NA or zero-length.
+# Returns TRUE when the value converts to a non-missing number and FALSE
+# otherwise (including NA, empty and zero-length input).
+is_number <- function(x){
+  if (length(x) == 0) {
+    return(FALSE)
+  }
+
+  # Base sub() with fixed = TRUE replaces the first literal comma, so this
+  # helper does not depend on stringr being attached.
+  normalised <- sub(",", ".", x[[1]], fixed = TRUE)
+
+  # Non-numeric text is an expected input here, so the "NAs introduced by
+  # coercion" warning is silenced instead of being raised once per cell.
+  parsed <- suppressWarnings(as.numeric(normalised))
+
+  !is.na(parsed)
+}
+
+
+# Normalise one raw cell of a No/Yes variable.
+#
+# x: a single cell value (character, numeric or NA); may be zero-length.
+# Returns "No" for 0 / "no", "Yes" for 1 / "yes" / "si", and "" for anything
+# else, including NA and zero-length input. Matching ignores letter case and
+# surrounding whitespace.
+replaceNoYesValues <- function(x){
+
+  # Nothing to interpret: `||` short-circuits, so is.na() is never evaluated
+  # on a zero-length value (which would make `if` fail).
+  if (length(x) == 0 || is.na(x[[1]])) {
+    return("")
+  }
+
+  token <- trimws(toupper(x[[1]]))
+
+  if (token %in% c("0", "NO")) {
+    "No"
+  } else if (token %in% c("1", "YES", "SI")) {
+    "Yes"
+  } else {
+    ""
+  }
+}
+
+# Clean one cell of a non-categorical column.
+#
+# x: a single cell value.
+# Returns "" when is_number() rejects the value; otherwise returns the value
+# with its first "," replaced by ".".
+fixNonCategoric <- function(x){
+  if (is_number(x)) {
+    return(str_replace(x, ",", "."))
+  }
+
+  ""
+}
+
+# Recode one raw cell of a column that has its own value dictionary.
+#
+# x:       a single cell value (character, numeric or NA).
+# colname: name of the column the cell belongs to.
+#
+# Returns the harmonised label for x. For a column with a dictionary, NA
+# becomes "". A value that is not in the dictionary is returned unchanged,
+# except for DSXOS and CSXCOT where it becomes "". SMXFEA is delegated to
+# replaceNoYesValues(). For any other column x is returned untouched.
+personalizedFun <- function(x, colname){
+
+  # SMXFEA is a plain No/Yes variable, so it shares the common normaliser.
+  if (colname == "SMXFEA") {
+    return(replaceNoYesValues(x))
+  }
+
+  # Accepted raw spellings (names) and the label each one maps to (values).
+  recodes <- list(
+    DMRGENDR = c(
+      "1" = "Female", "F" = "Female", "f" = "Female", "Female" = "Female",
+      "0" = "Male", "M" = "Male", "m" = "Male", "Male" = "Male"
+    ),
+    CSXCTR = c(
+      "1" = "Positive", "positive" = "Positive", "PositivM" = "Positive",
+      "POSITIVM" = "Positive", "POS?T?VM" = "Positive",
+      "0" = "Negative", "negative" = "Negative", "negativM" = "Negative",
+      "NMGATIVM" = "Negative", "NAGATIVM" = "Negative", "NMGATIV" = "Negative",
+      "negativeM" = "Negative", "NAGAT?VM" = "Negative", "NMGAT?V" = "Negative",
+      # "NMGAT?VM" shows the same garbling as "NMGATIVM" and "NAGAT?VM" (a
+      # mangled "NEGATIVE"), so it belongs with the negative results. It was
+      # previously listed among the positive spellings.
+      "NMGAT?VM" = "Negative"
+    ),
+    DMRRETH1 = c(
+      "1" = "Asian", "2" = "Black", "3" = "Hispanic", "4" = "White",
+      "5" = "Multiracial", "6" = "Other"
+    ),
+    DMROCCU = c(
+      "1" = "Unemployed", "2" = "Student", "3" = "Employed",
+      "4" = "Self-employed", "5" = "Retired", "6" = ""
+    ),
+    DMRHREDU = c(
+      "1" = "High School", "2" = "Bachelors", "3" = "Postgraduate",
+      "4" = "Other"
+    ),
+    DSXOS = c(
+      "0" = "Recovered", "Recovered" = "Recovered",
+      "1" = "Deceased", "Deceased" = "Deceased",
+      "2" = "Transferred", "Transferred" = "Transferred"
+    ),
+    CSXCOT = c("1" = "PCR", "2" = "antigen", "3" = "other")
+  )
+
+  # Columns where a value outside the dictionary is blanked instead of kept.
+  blank_when_unknown <- c("DSXOS", "CSXCOT")
+
+  mapping <- recodes[[colname]]
+
+  # No dictionary for this column: hand the value back as it came in.
+  if (is.null(mapping)) {
+    return(x)
+  }
+
+  if (is.na(x)) {
+    return("")
+  }
+
+  # Compare as text so that the number 1 and the string "1" hit the same entry.
+  key <- as.character(x)
+
+  if (key %in% names(mapping)) {
+    return(mapping[[key]])
+  }
+
+  if (colname %in% blank_when_unknown) {
+    return("")
+  }
+
+  x
+}
+
+# Rewrite a dotted date (day.month.year, e.g. "31.12.2020") with "/" as the
+# separator. A value that contains no "." is returned unchanged.
+#
+# x: a single cell value.
+dotToBar <- function (x){
+  has_dot <- grepl(".", x, fixed = TRUE)
+
+  if (!has_dot) {
+    return(x)
+  }
+
+  parsed <- as.Date(x, format = "%d.%m.%Y")
+  return(format(parsed, "%d/%m/%Y"))
+}
+
+
+
+# Work on a copy: every rule below reads the untouched raw value from
+# `hospital` and writes the cleaned value into `newDf`.
+# (The former rm(newDf) is gone: the object is assigned right here, and
+# removing a non-existent object only produced a warning.)
+newDf <- hospital
+
+col_names <- colnames(hospital)
+n_cols <- ncol(hospital)
+n_rows <- nrow(hospital)
+
+# seq_len() keeps the loops from running when there are no columns or rows
+# (1:0 would iterate over c(1, 0)).
+for (j in seq_len(n_cols)){
+
+  percentage <- trunc(j / n_cols * 100)
+  print(paste0(toString(percentage), "% completed"))
+
+  print(col_names[j])
+
+  # Which rules apply depends only on the column name, so decide once per
+  # column instead of once per cell.
+  is_no_yes <- col_names[j] %in% noYesValues
+  is_non_categoric <- !(col_names[j] %in% categoric_vars) &&
+    col_names[j] != "DMRBORN" &&
+    !grepl("DAT", col_names[j], fixed = TRUE)
+  is_personalized <- col_names[j] %in% personalized
+
+  for (i in seq_len(n_rows)){
+
+    raw <- hospital[i, j]
+
+    if (is_no_yes) {
+      newDf[i, j] <- replaceNoYesValues(raw)
+    } else if (is_non_categoric) {
+      newDf[i, j] <- fixNonCategoric(raw)
+    }
+
+    # Columns with their own dictionary override the result above.
+    if (is_personalized) {
+      newDf[i, j] <- personalizedFun(raw, col_names[j])
+    }
+
+    # Missing values and the "." placeholder are always blanked, whatever the
+    # rules above produced. `||` short-circuits, so NA is never compared.
+    if (is.na(raw) || raw == ".") {
+      newDf[i, j] <- ""
+    }
+  }
+}
+
+# The second setwd() overrides the first one: the cleaned file is written to
+# the "outpatient/clean" folder.
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/newRessources")
+setwd("C:/Users/Victor/Documents/TFG/r-analytics-master/ressources/new_res_baskent/outpatient/clean")
+
+write.csv(x = newDf, file = "konya_outpatient.csv", row.names = FALSE)
+
+