library(DBI) library(reshape2) library(limma) library(ggplot2) library(dplyr) library(tidyverse) library(gridExtra) output_dir="C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis" diseases <- read.delim("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/data/data_01_neuro_diseases_final_disease_selected.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE) diseases <- diseases %>% rename(disease_id = cui)%>% select(disease_id, disease_name) con <- dbConnect(MySQL(), dbname = "disnet_biolayer", host = "ares.ctb.upm.es", port = 30604, user = "gene_expression", password = "gene_expression") ##TO REPRESENT THE NUMBER OF DATASETS (GDS) PER DISEASE all_dis_gds<-data.frame() # Get expr_disease_gds for the current gds_id query_dis_gds <- paste0("SELECT * FROM expr_disease_gds") dis_gds <- dbGetQuery(con, query_dis_gds) # Merge the dataframes all_dis_gds and all_diseases by disease_id merged_data <- merge(dis_gds, diseases, by = "disease_id") # Count the number of GDS for each disease_name gds_count <- aggregate(gds_id ~ disease_name, data = merged_data, FUN = length) gds_count <- gds_count %>% arrange(desc(gds_id)) # Plot the results with blue bars and sorted by the number of GDS dis_gds_plot<-ggplot(gds_count, aes(x = reorder(disease_name, -gds_id), y = gds_id)) + geom_bar(stat = "identity", fill = "#7AC5CD") + xlab("Disease Name") + ylab("Number of GDS") + ggtitle("Number of GDS per Disease") + theme( axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14), axis.text.y= element_text(size=14), axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14), panel.background = element_rect(fill = "white", color = "black"), panel.grid.major.x = element_blank(), # Remove vertical grid lines panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines panel.grid.major.y = element_line(color = "gray90"), # Optional: Add horizontal grid lines panel.grid.minor.y = element_blank(), # Optional: Remove minor horizontal grid lines axis.line = element_line(color = "black"), # Add axis lines plot.title = element_text(hjust = 0.5, size = 16, color = "black",face = "bold") ) + scale_y_continuous(expand = expansion(mult = c(0, 0.1))) # Adjust the y-axis to start at 0 and add some padding at the top dis_gds_plot ggsave("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis/number_of_gds_per_disease.png", plot = dis_gds_plot, width = 15, height = 12) ##REPRESENT THE NUMBER OF samples per dataset (GSMs per GDS) query_gds_gsm <- paste0("SELECT gds_id,gsm_id FROM expr_processed_annot") gds_gsm <- dbGetQuery(con, query_gds_gsm) gds_count <- gds_gsm %>% group_by(gds_id) %>% summarise(num_gsm = n()) gds_gsm_plot <- ggplot(gds_count, aes(x = as.factor(gds_id), y = num_gsm)) + geom_bar(stat = "identity", fill = "#7AC5CD", width = 0.9)+ xlab("GDS ID") + ylab("Number of GSM IDs") + ggtitle("Number of samples per dataset (GSMs per GDS)") + theme( axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14), axis.text.y = element_text(size = 14), axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14), panel.background = element_rect(fill = "white", color = "black"), panel.grid.major.x = element_blank(), # Remove vertical grid lines panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines panel.grid.major.y = element_line(color = "gray90"), # Optional: Add horizontal grid lines panel.grid.minor.y = element_blank(), # Optional: Remove minor horizontal grid lines axis.line = element_line(color = "black"), # Add axis lines plot.title = element_text(hjust = 0.5, size = 16, color = "black",face = "bold") ) + scale_y_continuous(expand = expansion(mult = c(0, 0.1)))+ scale_x_discrete(expand = expansion(add = c(0.9, 0.9))) gds_gsm_plot ggsave("C:/Users/Laura/Documents/Master Cuatri 2/Practicas/DEA/exprs_dea/results/expr_descriptive_analysis/number_of_gsm_per_gds.png", plot = gds_gsm_plot, width = 15, height = 12) ##THE REST OF DESCRIPTIVE ANALYSIS ITERING GDS_ID PER GDS_ID gds_ids <- dbGetQuery(con, "SELECT DISTINCT gds_id FROM expr_processed_annot WHERE flag = 1") all_expr_values <- data.frame() all_normalized_values <- data.frame() for (i in 1:nrow(gds_ids)) { gds_id<-gds_ids$gds_id[i] gds_output_dir <- file.path(output_dir, gds_id) # Check if the directory already exists if (!dir.exists(gds_output_dir)) { dir.create(gds_output_dir, recursive = TRUE) } setwd(gds_output_dir) #extraction of data from expr_values table query_expr_values <- paste0("SELECT gsm_id, id_ref, gene_symbol, value FROM expr_values WHERE gds_id = '", gds_id, "'") expr_values <- dbGetQuery(con, query_expr_values) #extraction of data from expr_processed_annot table with flag = 1 query_expr_raw_annot <- paste0("SELECT gsm_id, disease_state FROM expr_raw_annot WHERE gds_id = '", gds_id, "'") expr_raw_annot <- dbGetQuery(con, query_expr_raw_annot) #extraction of data from expr_processed_annot table with flag = 1 query_expr_processed_annot <- paste0("SELECT gsm_id, disease_state FROM expr_processed_annot WHERE gds_id = '", gds_id, "'") expr_processed_annot <- dbGetQuery(con, query_expr_processed_annot) #to keep gene_symbol for reference at the end gene_symbols <- unique(expr_values[, c("id_ref", "gene_symbol")]) if (nrow(expr_values) == 0) { warning(paste("No data found for gds_id:", gds_id)) next } if (!is.numeric(expr_values$value)) { expr_values$value <- as.numeric(expr_values$value) } #create the expression matrix without gene_symbol expression <- dcast(expr_values, id_ref ~ gsm_id, value.var = "value", fun.aggregate = mean) rownames(expression) <- expression$id_ref expression <- expression[, -1] # Remove the id_ref column if (ncol(expression) == 0) { warning(paste("No expression data available for gds_id:", gds_id)) next } #normalization of expression matrix exprs_matrix <- as.matrix(expression) exprs_normalized <- normalizeBetweenArrays(exprs_matrix, method = 'quantile') exprs_normalized_df <- melt(exprs_normalized, varnames = c("id_ref", "gsm_id"), value.name = "value") normalized_df <- merge(exprs_normalized_df, gene_symbols, by = "id_ref", all.x = TRUE) expr_values$gds_id <- gds_id normalized_df$gds_id <- gds_id all_expr_values <- rbind(all_expr_values, expr_values) all_normalized_values <- rbind(all_normalized_values, normalized_df) expr_stats <- summarize( expr_values, mean = mean(value, na.rm = TRUE), std = sd(value, na.rm = TRUE), min = min(value, na.rm = TRUE), `25%` = quantile(value, 0.25, na.rm = TRUE), `50%` = median(value, na.rm = TRUE), `75%` = quantile(value, 0.75, na.rm = TRUE), max = max(value, na.rm = TRUE) ) write.csv(expr_stats, file.path(gds_output_dir, paste0(gds_id, "_expression_stats_prenorm.csv"))) expr_stats_norm <- summarize( normalized_df, mean = mean(value, na.rm = TRUE), std = sd(value, na.rm = TRUE), min = min(value, na.rm = TRUE), `25%` = quantile(value, 0.25, na.rm = TRUE), `50%` = median(value, na.rm = TRUE), `75%` = quantile(value, 0.75, na.rm = TRUE), max = max(value, na.rm = TRUE) ) write.csv(expr_stats_norm, file.path(gds_output_dir, paste0(gds_id, "_expression_stats_postnorm.csv"))) # Count categories and generate plots only for columns with data categorical_columns <- c('agent', 'disease_state', 'tissue', 'cell_type') for (col in categorical_columns) { if (col %in% colnames(expr_raw_annot) && !all(is.na(expr_raw_annot[[col]]))) { cat_counts <- table(expr_raw_annot[[col]]) df_cat_counts <- as.data.frame(cat_counts) colnames(df_cat_counts) <- c(col, "freq") write.csv(df_cat_counts, file.path(gds_output_dir, paste0(gds_id, "_raw_", col, "_counts.csv"))) if (col %in% colnames(expr_processed_annot) && !all(is.na(expr_processed_annot[[col]]))) { cat_counts <- table(expr_processed_annot[[col]]) df_cat_counts <- as.data.frame(cat_counts) colnames(df_cat_counts) <- c(col, "freq") write.csv(df_cat_counts, file.path(gds_output_dir, paste0(gds_id, "_processed_", col, "_counts.csv"))) plot <- ggplot(expr_processed_annot, aes(y = .data[[col]])) + geom_bar() + labs(title = paste0(str_to_title(col), " count for ", gds_id), hjust = 0.5) + theme_minimal()+ theme( plot.title = element_text(size = 18, face = "bold", hjust = 0.5), axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16), axis.text.x = element_text(size = 15), axis.text.y = element_text(size = 15) ) ggsave(file.path(gds_output_dir, paste0(gds_id, "_", col, "_counts.png")), plot = plot, width = 10, height = 6,bg='white') } } # Boxplot of expression by disease state # Boxplot of expression by disease state if ("disease_state" %in% colnames(expr_processed_annot)) { # Merge expression and annotation dataframes merged_df <- inner_join(normalized_df, expr_processed_annot %>% select(gsm_id, disease_state), by = "gsm_id") # Violin plot violin_disease <- ggplot(merged_df, aes(x = disease_state, y = value)) + geom_violin(aes(fill = disease_state), position = position_dodge(0.9)) + scale_fill_manual( values = c("c" = "#FFFFF0", "d" = "#838B83"), # Specify colors for the groups labels = c("c" = "Control", "d" = "Disease"), # Set the labels for the legend name = "Disease State" # Set the title for the legend ) + scale_x_discrete( labels = c("c" = "Control", "d" = "Disease") # Rename x-axis labels ) + labs( title = paste0("Expression Values by Disease State for ", gds_id), x = "Disease State", y = "Expression Value" ) + theme_minimal() + theme( plot.title = element_text(size = 18, face = "bold", hjust = 0.5), axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16), axis.text.x = element_text(size = 15), axis.text.y = element_text(size = 15), legend.position = "none" # Remove the legend ) # Save the plot ggsave( file.path(gds_output_dir, paste0(gds_id, "_expression_by_disease_state.png")), plot = violin_disease, width = 14, height = 8, bg = 'white' ) } # Boxplot of expression values boxplot_expression <- ggplot(expr_values, aes(x=gds_id, y = value)) + geom_boxplot() + labs(title = paste0("Distribution of Expression Values for ", gds_id),x='', y = "Expression Value") + theme_minimal()+ theme( plot.title = element_text(size = 18, face = "bold", hjust = 0.5), axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16), axis.text.x = element_text(size = 15), axis.text.y = element_text(size = 15) ) ggsave(file.path(gds_output_dir, paste0(gds_id, "_expression_values_boxplot.png")), plot = boxplot_expression, width = 14, height = 8, bg = "white") # Boxplot of normalized expression values boxplot_norm_expression <- ggplot(normalized_df, aes(x=gds_id, y = value)) + geom_boxplot() + labs(title = paste0("Distribution of Normalized Expression Values for ", gds_id), x='', y = "Expression Value") + theme_minimal()+ theme( plot.title = element_text(size = 18, face = "bold", hjust = 0.5), axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16), axis.text.x = element_text(size = 15), axis.text.y = element_text(size = 15) ) ggsave(file.path(gds_output_dir, paste0(gds_id, "_norm_expression_values_boxplot.png")), plot = boxplot_norm_expression, width = 14, height = 8, bg = "white") } } plot_expr_values <- ggplot(all_expr_values, aes(x = as.factor(gds_id), y = value)) + geom_boxplot() + labs(title = "Distribution of Expression Values Before Normalization", x = "GDS ID", y = "Expression Value") + theme_minimal()+ theme( plot.title = element_text(size = 16, face = "bold", hjust = 0.5), axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14), axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12) ) ggsave(file.path(output_dir, "boxplot_expression_values_prenormalization.png"), plot = plot_expr_values, width = 14, height = 8, bg = 'white') # Graficar caja y bigotes para valores despuĆ©s de la normalizaciĆ³n plot_normalized_values <- ggplot(all_normalized_values, aes(x = as.factor(gds_id), y = value)) + geom_boxplot() + labs(title = "Distribution of Normalized Expression Values", x = "GDS ID", y = "Expression Value") + theme_minimal()+ theme( plot.title = element_text(size = 16, face = "bold", hjust = 0.5), axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14), axis.text.x = element_text(size = 12), axis.text.y = element_text(size = 12) ) ggsave(file.path(output_dir, "boxplot_expression_values_postnormalization.png"), plot = plot_normalized_values, width = 14, height = 8, bg = 'white') #combined_plot <- grid.arrange(plot_expr_values, plot_normalized_values, nrow = 2) #ggsave(file.path(output_dir, "boxplot_expression_values_combined.png"), plot = combined_plot, width = 14, height = 8, bg = 'white')