descriptive_analysis.R

library(DBI)
library(reshape2)
library(limma)
library(ggplot2)
library(dplyr)
library(tidyverse) 
library(gridExtra)
output_dir="C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis"
diseases <- read.delim("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/data/data_01_neuro_diseases_final_disease_selected.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
diseases <- diseases %>%
  rename(disease_id = cui)%>%
  select(disease_id, disease_name)

con <- dbConnect(MySQL(), 
                 dbname = "disnet_biolayer", 
                 host = "ares.ctb.upm.es", 
                 port = 30604, 
                 user = "gene_expression", 
                 password = "gene_expression")


##TO REPRESENT THE NUMBER OF DATASETS (GDS) PER DISEASE 

all_dis_gds<-data.frame()
# Get expr_disease_gds for the current gds_id
query_dis_gds <- paste0("SELECT * FROM expr_disease_gds")
dis_gds <- dbGetQuery(con, query_dis_gds)

# Merge the dataframes all_dis_gds and all_diseases by disease_id
merged_data <- merge(dis_gds, diseases, by = "disease_id")

# Count the number of GDS for each disease_name
gds_count <- aggregate(gds_id ~ disease_name, data = merged_data, FUN = length)

gds_count <- gds_count %>% arrange(desc(gds_id))

# Plot the results with blue bars and sorted by the number of GDS
dis_gds_plot<-ggplot(gds_count, aes(x = reorder(disease_name, -gds_id), y = gds_id)) +
  geom_bar(stat = "identity", fill = "#7AC5CD") +
  xlab("Disease Name") +
  ylab("Number of GDS") +
  ggtitle("Number of GDS per Disease") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
    axis.text.y= element_text(size=14),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white", color = "black"),
    panel.grid.major.x = element_blank(), # Remove vertical grid lines
    panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines
    panel.grid.major.y = element_line(color = "gray90"), # Optional: Add horizontal grid lines
    panel.grid.minor.y = element_blank(), # Optional: Remove minor horizontal grid lines
    axis.line = element_line(color = "black"), # Add axis lines
    plot.title = element_text(hjust = 0.5, size = 16,  color = "black",face = "bold")
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) # Adjust the y-axis to start at 0 and add some padding at the top

dis_gds_plot
ggsave("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis/number_of_gds_per_disease.png", plot = dis_gds_plot, width = 15, height = 12)

##REPRESENT THE NUMBER OF samples per dataset (GSMs per GDS)
query_gds_gsm <- paste0("SELECT gds_id,gsm_id FROM expr_processed_annot")
gds_gsm <- dbGetQuery(con, query_gds_gsm)
gds_count <- gds_gsm %>%
  group_by(gds_id) %>%
  summarise(num_gsm = n())
gds_gsm_plot <- ggplot(gds_count, aes(x = as.factor(gds_id), y = num_gsm)) +
  geom_bar(stat = "identity", fill = "#7AC5CD", width = 0.9)+
  xlab("GDS ID") +
  ylab("Number of GSM IDs") +
  ggtitle("Number of samples per dataset (GSMs per GDS)") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
    axis.text.y = element_text(size = 14),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white", color = "black"),
    panel.grid.major.x = element_blank(), # Remove vertical grid lines
    panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines
    panel.grid.major.y = element_line(color = "gray90"), # Optional: Add horizontal grid lines
    panel.grid.minor.y = element_blank(), # Optional: Remove minor horizontal grid lines
    axis.line = element_line(color = "black"), # Add axis lines
    plot.title = element_text(hjust = 0.5, size = 16,  color = "black",face = "bold")
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))+  
  
  scale_x_discrete(expand = expansion(add = c(0.9, 0.9)))
gds_gsm_plot
ggsave("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis/number_of_gsm_per_gds.png", plot = gds_gsm_plot, width = 15, height = 12)


##THE REST OF DESCRIPTIVE ANALYSIS ITERING GDS_ID PER GDS_ID

gds_ids <- dbGetQuery(con, "SELECT DISTINCT gds_id FROM expr_processed_annot WHERE flag = 1")


all_expr_values <- data.frame()
all_normalized_values <- data.frame()


for (i in 1:nrow(gds_ids)) {
  gds_id<-gds_ids$gds_id[i]
  gds_output_dir <- file.path(output_dir, gds_id)
  
  # Check if the directory already exists
  if (!dir.exists(gds_output_dir)) {
    dir.create(gds_output_dir, recursive = TRUE)
  } 
  setwd(gds_output_dir)
  
  #extraction of data from expr_values table
  query_expr_values <- paste0("SELECT gsm_id, id_ref, gene_symbol, value FROM expr_values WHERE gds_id = '", gds_id, "'")
  expr_values <- dbGetQuery(con, query_expr_values)
  
  #extraction of data from expr_processed_annot table with flag = 1
  query_expr_raw_annot <- paste0("SELECT gsm_id, disease_state FROM expr_raw_annot WHERE gds_id = '", gds_id, "'")
  expr_raw_annot <- dbGetQuery(con, query_expr_raw_annot)
  
  #extraction of data from expr_processed_annot table with flag = 1
  query_expr_processed_annot <- paste0("SELECT gsm_id, disease_state FROM expr_processed_annot WHERE gds_id = '", gds_id, "'")
  expr_processed_annot <- dbGetQuery(con, query_expr_processed_annot)
  
  #to keep gene_symbol for reference at the end
  gene_symbols <- unique(expr_values[, c("id_ref", "gene_symbol")])
  
  
  if (nrow(expr_values) == 0) {
    warning(paste("No data found for gds_id:", gds_id))
    next
  }
  
  if (!is.numeric(expr_values$value)) {
    expr_values$value <- as.numeric(expr_values$value)
  }
  
  #create the expression matrix without gene_symbol
  expression <- dcast(expr_values, id_ref ~ gsm_id, value.var = "value", fun.aggregate = mean)
  rownames(expression) <- expression$id_ref
  expression <- expression[, -1]  # Remove the id_ref column
  
  if (ncol(expression) == 0) {
    warning(paste("No expression data available for gds_id:", gds_id))
    next
  }
  
  #normalization of expression matrix
  exprs_matrix <- as.matrix(expression)
  exprs_normalized <- normalizeBetweenArrays(exprs_matrix, method = 'quantile')
  exprs_normalized_df <- melt(exprs_normalized, varnames = c("id_ref", "gsm_id"), value.name = "value")
  normalized_df <- merge(exprs_normalized_df, gene_symbols, by = "id_ref", all.x = TRUE)
  
  expr_values$gds_id <- gds_id
  normalized_df$gds_id <- gds_id
  
  all_expr_values <- rbind(all_expr_values, expr_values)
  all_normalized_values <- rbind(all_normalized_values, normalized_df)
  
  
  expr_stats <- summarize(
    expr_values,
    mean = mean(value, na.rm = TRUE),
    std = sd(value, na.rm = TRUE),
    min = min(value, na.rm = TRUE),
    `25%` = quantile(value, 0.25, na.rm = TRUE),
    `50%` = median(value, na.rm = TRUE),
    `75%` = quantile(value, 0.75, na.rm = TRUE),
    max = max(value, na.rm = TRUE)
  )
  write.csv(expr_stats, file.path(gds_output_dir, paste0(gds_id, "_expression_stats_prenorm.csv")))
  
  
  expr_stats_norm <- summarize(
    normalized_df,
    mean = mean(value, na.rm = TRUE),
    std = sd(value, na.rm = TRUE),
    min = min(value, na.rm = TRUE),
    `25%` = quantile(value, 0.25, na.rm = TRUE),
    `50%` = median(value, na.rm = TRUE),
    `75%` = quantile(value, 0.75, na.rm = TRUE),
    max = max(value, na.rm = TRUE)
  )
  write.csv(expr_stats_norm, file.path(gds_output_dir, paste0(gds_id, "_expression_stats_postnorm.csv")))
  
  # Count categories and generate plots only for columns with data
  categorical_columns <- c('agent', 'disease_state', 'tissue', 'cell_type')
  for (col in categorical_columns) {
    if (col %in% colnames(expr_raw_annot) && !all(is.na(expr_raw_annot[[col]]))) {
      cat_counts <- table(expr_raw_annot[[col]])
      df_cat_counts <- as.data.frame(cat_counts)
      colnames(df_cat_counts) <- c(col, "freq")
      write.csv(df_cat_counts, file.path(gds_output_dir, paste0(gds_id, "_raw_", col, "_counts.csv")))
    
    if (col %in% colnames(expr_processed_annot) && !all(is.na(expr_processed_annot[[col]]))) {
      cat_counts <- table(expr_processed_annot[[col]])
      df_cat_counts <- as.data.frame(cat_counts)
      colnames(df_cat_counts) <- c(col, "freq")
      write.csv(df_cat_counts, file.path(gds_output_dir, paste0(gds_id, "_processed_", col, "_counts.csv")))
      
      plot <- ggplot(expr_processed_annot, aes(y = .data[[col]])) +
        geom_bar() +
        labs(title = paste0(str_to_title(col), " count for ", gds_id), hjust = 0.5) +
        theme_minimal()+
        theme(
          plot.title = element_text(size = 18, face = "bold", hjust = 0.5),  
          axis.title.x = element_text(size = 16),  
          axis.title.y = element_text(size = 16),  
          axis.text.x = element_text(size = 15),  
          axis.text.y = element_text(size = 15)   
        )
      
      ggsave(file.path(gds_output_dir, paste0(gds_id, "_", col, "_counts.png")), plot = plot, width = 10, height = 6,bg='white')
    }
  }
  
  
    # Boxplot of expression by disease state
    # Boxplot of expression by disease state
    if ("disease_state" %in% colnames(expr_processed_annot)) {
      # Merge expression and annotation dataframes
      merged_df <- inner_join(normalized_df, expr_processed_annot %>% select(gsm_id, disease_state), by = "gsm_id")
      
      # Violin plot
      violin_disease <- ggplot(merged_df, aes(x = disease_state, y = value)) +
        geom_violin(aes(fill = disease_state), position = position_dodge(0.9)) +
        scale_fill_manual(
          values = c("c" = "#FFFFF0", "d" = "#838B83"),  # Specify colors for the groups
          labels = c("c" = "Control", "d" = "Disease"), # Set the labels for the legend
          name = "Disease State"  # Set the title for the legend
        ) +
        scale_x_discrete(
          labels = c("c" = "Control", "d" = "Disease")  # Rename x-axis labels
        ) +
        labs(
          title = paste0("Expression Values by Disease State for ", gds_id),
          x = "Disease State", 
          y = "Expression Value"
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 18, face = "bold", hjust = 0.5),  
          axis.title.x = element_text(size = 16),  
          axis.title.y = element_text(size = 16),  
          axis.text.x = element_text(size = 15),  
          axis.text.y = element_text(size = 15),
          legend.position = "none"  # Remove the legend
        )
      
      # Save the plot
      ggsave(
        file.path(gds_output_dir, paste0(gds_id, "_expression_by_disease_state.png")), 
        plot = violin_disease, 
        width = 14, 
        height = 8, 
        bg = 'white'
      )
    }
    
  
  # Boxplot of expression values
  boxplot_expression <- ggplot(expr_values, aes(x=gds_id, y = value)) +
    geom_boxplot() +
    labs(title = paste0("Distribution of Expression Values for ", gds_id),x='',
         y = "Expression Value") +
    theme_minimal()+
    theme(
      plot.title = element_text(size = 18, face = "bold", hjust = 0.5),  
      axis.title.x = element_text(size = 16),  
      axis.title.y = element_text(size = 16),  
      axis.text.x = element_text(size = 15),  
      axis.text.y = element_text(size = 15)   
    )
  
  ggsave(file.path(gds_output_dir, paste0(gds_id, "_expression_values_boxplot.png")), plot = boxplot_expression, width = 14, height = 8, bg = "white")
  
  
  # Boxplot of normalized expression values
  boxplot_norm_expression <- ggplot(normalized_df, aes(x=gds_id, y = value)) +
    geom_boxplot() +
    labs(title = paste0("Distribution of Normalized Expression Values for ", gds_id), x='',
         y = "Expression Value") +
    theme_minimal()+
    theme(
      plot.title = element_text(size = 18, face = "bold", hjust = 0.5),  
      axis.title.x = element_text(size = 16),  
      axis.title.y = element_text(size = 16),  
      axis.text.x = element_text(size = 15),  
      axis.text.y = element_text(size = 15)   
    )
  
  ggsave(file.path(gds_output_dir, paste0(gds_id, "_norm_expression_values_boxplot.png")), plot = boxplot_norm_expression, width = 14, height = 8, bg = "white")
  
  
  }
}


plot_expr_values <- ggplot(all_expr_values, aes(x = as.factor(gds_id), y = value)) +
  geom_boxplot() +
  labs(title = "Distribution of Expression Values Before Normalization",
       x = "GDS ID", y = "Expression Value") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),  
    axis.title.x = element_text(size = 14),  
    axis.title.y = element_text(size = 14),  
    axis.text.x = element_text(size = 12),  
    axis.text.y = element_text(size = 12)   
  )

ggsave(file.path(output_dir, "boxplot_expression_values_prenormalization.png"), plot = plot_expr_values, width = 14, height = 8, bg = 'white')

# Graficar caja y bigotes para valores después de la normalización
plot_normalized_values <- ggplot(all_normalized_values, aes(x = as.factor(gds_id), y = value)) +
  geom_boxplot() +
  labs(title = "Distribution of Normalized Expression Values",
       x = "GDS ID", y = "Expression Value") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),  
    axis.title.x = element_text(size = 14),  
    axis.title.y = element_text(size = 14),  
    axis.text.x = element_text(size = 12),  
    axis.text.y = element_text(size = 12)   
  )

ggsave(file.path(output_dir, "boxplot_expression_values_postnormalization.png"), plot = plot_normalized_values, width = 14, height = 8, bg = 'white')


#combined_plot <- grid.arrange(plot_expr_values, plot_normalized_values, nrow = 2)
#ggsave(file.path(output_dir, "boxplot_expression_values_combined.png"), plot = combined_plot, width = 14, height = 8, bg = 'white')