library(tidyverse)
library(circlize)
library(ComplexHeatmap)
library(reshape2)
library(UpSetR)
library(ggplotify)
library(cowplot)
library(VennDiagram)
library(ggpubr)
library(readxl)
library(ggforce)
measure_col <- c("BH"="#D55E00", "ST"="#CC79A7", "sd.b"="#7570B3", "sd.m"="#E7298A", "mean"="#A6CEE3",
"mad"="#1F78B4", "dip"="#B2DF8A", "precision"="#FFFF00", "icc.b"="#FB9A99",
"icc.m"="#E31A1C", "refgene.pos"="#FDBF6F", "chr"="#A6761D", "cpg.loc"="#1B9E77",
"dhs"="#949494", "direction"="#7FFFD4", "probe.type" ="#FFE4E1")
method_col <- c("CAMT" = "#FF0000", "AdaPT" = "#E69F00", "IHW" = "#56B4E9",
"FDRreg" = "#009E73", "BL" = "#0072B2", BH="#D55E00", ST="#CC79A7")
EWAS51 is a dataset with binary phenotype (FASD and control). We sampled each phenotype randomly at sample size 10, 20, 30, 40, 50, 60, 70, 80, 90 and 100 by method IHW and CAMT. The power of each method with different covariates are displyed as follows.
selected_measure <- c("BH", "ST", "chr", "cpg.loc", "dhs", "dip", "direction", "mad", "mean",
"precision", "probe.type", "refgene.pos", "sd.b", "sd.m")
selected_samplesize <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
IHW_power <- readRDS("Data/EWAS51_IHW_downsample_power.RDS")
IHW_power <- IHW_power %>%
filter(measure %in% selected_measure, sample_size %in% selected_samplesize)
cdata <- plyr::ddply(IHW_power, c("measure", "sample_size"), summarise,
N = sum(!is.na(Power)),
mean = mean(Power, na.rm = TRUE),
sd = sd(Power, na.rm = TRUE),
se = sd / sqrt(N))
fs13 <- ggplot(data = cdata, aes(x = factor(sample_size), y = mean, colour = measure)) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se),
width = 0.8,
position = position_dodge(width = 0.5),
lwd = 0.5) +
geom_point(position = position_dodge(width = 0.5), size = 0.5, shape = 16) +
scale_color_manual(values = measure_col) +
xlab("Sample Size") + ylab("Percentage") +
theme_bw() +
theme(axis.title = element_text(size = 10),
axis.text = element_text(size = 8),
legend.text = element_text(size = 6),
legend.title = element_text(size = 8))
fs13
CAMT_power <- readRDS("Data/EWAS51_CAMT_downsample_power.RDS")
CAMT_power <- CAMT_power %>%
filter(measure %in% selected_measure, sample_size %in% selected_samplesize)
cdata <- plyr::ddply(CAMT_power, c("measure", "sample_size"), summarise,
N = sum(!is.na(Power)),
mean = mean(Power, na.rm = TRUE),
sd = sd(Power, na.rm = TRUE),
se = sd / sqrt(N))
f4 <- ggplot(data = cdata, aes(x = factor(sample_size), y = mean, colour = measure)) +
geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width = 0.8,
position = position_dodge(width = 0.5),
lwd = 0.5) +
geom_point(position = position_dodge(width = 0.5), size = 0.5, shape = 16) +
scale_color_manual(values = measure_col) +
xlab("Sample Size") + ylab("Percentage") +
theme_bw() +
theme(axis.title = element_text(size = 10),
axis.text = element_text(size = 8),
legend.text = element_text(size = 6),
legend.title = element_text(size = 8))
f4
selected_measure <- c("mean", "sd.b", "mad", "icc.m", "icc.b", "precision", "dhs", "direction",
"chr", "refgene.pos", "dip", "probe.type", "cpg.loc", "sd.m")
# read in adjusted p-values' data into a list
pvalues <- list()
for(i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
pvalues[[i]] <- readRDS(paste0("Data/EWAS45_", i, "_adjusted_pvalues.RDS"))
}
# calculate the number of significant DMPs
discovery <- as.data.frame(lapply(pvalues, function(x)(apply(x, 2, function(x)sum(x < 0.05)))))
discovery <- as.data.frame(t(discovery))
# extract BH and ST DMPs' numbers
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)
plot_data <- discovery %>%
select(selected_measure)
plot_data <- as.matrix((plot_data - ST)/ST)*100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))
col_fun = colorRamp2(c(-200, -100, 0, 200),
c("royalblue", "blue", "white", "red"),
space = "RGB")
panel_A <- Heatmap(plot_data,
name = "Color Key",
col = col_fun,
rect_gp = gpar(col = "grey", lwd = 0.25),
cell_fun = function(j, i, x, y, width, height, fill){
if(abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 8))
},
clustering_distance_rows = "euclidean",
clustering_distance_columns = "euclidean",
clustering_method_rows = "ward.D",
clustering_method_columns = "ward.D",
row_names_gp = gpar(fontsize = 8),
column_names_gp = gpar(fontsize = 8),
heatmap_legend_param = list(title_gp = gpar(fontsize = 8),
labels_gp = gpar(fontsize = 6),
labels = c("-200%", "-100%", 0, "100%", "200%")))
draw(panel_A)
panel_A <- grid.grabExpr(draw(panel_A))
# get the significance order for each cpg
ranks <- lapply(pvalues, function(x)as.data.frame(apply(x, 2, function(x)rank(x, ties.method = "random"))))
selected_method <- "IHW"
gold_standard <- read.csv("Data/GoldStandardCpG.T4.csv", header = TRUE, row.names = 1)
selected_cpg <- gold_standard$x[gold_standard$x %in% rownames(ranks[[1]])]
reference_line <- median(ranks[[selected_method]][selected_cpg, "ST"])
panel_B <- ranks[[selected_method]][selected_cpg, ] %>%
select(selected_measure) %>%
melt(value.name = "Significance_rank") %>%
group_by(variable) %>%
mutate(significance_rank_median = median(Significance_rank)) %>%
ggplot(aes(x = reorder(variable, Significance_rank, FUN = median), y = Significance_rank, color = variable, fill = variable)) +
geom_sina(size = 0.25, alpha = 0.2, shape = 16) +
geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
geom_segment(aes(x = variable, xend = variable,
y = reference_line, yend = significance_rank_median),
size = 0.25, color = "black") +
stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
scale_fill_manual(values = measure_col) +
scale_color_manual(values = measure_col) +
scale_x_discrete(limits = selected_measure) +
xlab("") + ylab("Significance Rank") + labs(fill = "") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
axis.title = element_text(size = 10),
legend.position = "none")
panel_B
selected_method <- c("AdaPT", "BL", "CAMT", "FDRreg")
fs14 <- as.data.frame(ranks[selected_method])[selected_cpg, ] %>%
melt(value.name = "Significance_rank") %>%
mutate(method = sapply(str_split(variable, fixed("."), n = 2), function(x)unlist(x)[1]),
measure = sapply(str_split(variable, fixed("."), n = 2), function(x)unlist(x)[2])) %>%
filter(measure %in% selected_measure) %>%
select(Significance_rank, method, measure) %>%
group_by(method, measure) %>%
mutate(significance_rank_median = median(Significance_rank)) %>%
ggplot(aes(x = reorder(measure, Significance_rank, FUN = median), y = Significance_rank, color = measure, fill = measure)) +
geom_sina(size = 0.25, alpha = 0.2, shape = 16, show.legend = FALSE) +
geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
geom_segment(aes(x = measure, xend = measure,
y = reference_line, yend = significance_rank_median),
size = 0.25, color = "black") +
stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
scale_fill_manual(values = measure_col) +
scale_color_manual(values = measure_col) +
scale_x_discrete(limits = selected_measure) +
facet_wrap(~ method, nrow = 2) +
xlab("") + ylab("Significance Rank") + labs(fill = "") +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 8),
axis.title.y = element_text(size = 10))
fs14
# choose the most powerful covariate mean
subset_rank <- as.data.frame(lapply(ranks, function(x)x[, "mean", drop = FALSE]))
colnames(subset_rank) <- names(ranks)
selected_method <- c("CAMT", "AdaPT", "IHW", "BL", "FDRreg")
panel_C <- subset_rank[selected_cpg, ] %>%
select(selected_method) %>%
melt(value.name = "Significance_rank") %>%
ggplot(aes(x = variable, y = Significance_rank)) +
geom_boxplot(aes(fill = variable), width = 0.72, fatten = 1) +
scale_fill_manual(values = method_col) +
scale_x_discrete(limits = selected_method) +
xlab("") + ylab("Significance Rank") + labs(fill = "method") +
theme_classic() +
theme(axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5, size = 8),
axis.title = element_text(size = 10),
legend.position = "none") +
geom_hline(yintercept = reference_line, linetype = "dashed", color = "black", size = 0.5)
panel_C
# organize figures into Figure 5
bottom_row <- plot_grid(panel_B, panel_C, nrow = 1, align = "h",
labels = c("B", "C"), label_size = 18)
f5 <- plot_grid(panel_A, bottom_row, ncol = 1,
labels = c("A", ""), label_size = 18)
f5
selected_measure <- c("mad", "mean", "sd.b", "precision", "sd.m", "dhs", "dip", "chr", "direction", "probe.type", "refgene.pos", "cpg.loc")
# read in adjusted p-values' data into a list
pvalues <- list()
for(i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
pvalues[[i]] <- readRDS(paste0("Data/EWAS27_", i, "_adjusted_pvalues.RDS"))
}
## heatmap
discovery <- as.data.frame(lapply(pvalues, function(x)(apply(x, 2, function(x)sum(x < 0.05)))))
discovery <- as.data.frame(t(discovery))
# extract BH and ST signal numbers
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)
plot_data <- discovery %>%
select(selected_measure)
plot_data <- as.matrix((plot_data - ST)/ST)*100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))
col_fun = colorRamp2(c(-100, 0, 200), c("blue", "white", "red"), space = "RGB")
fs15_1 <- Heatmap(plot_data,
name = "Color Key",
col = col_fun,
rect_gp = gpar(col = "grey", lwd = 0.25),
cell_fun = function(j, i, x, y, width, height, fill){
if(abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 8))
},
clustering_distance_rows = "euclidean",
clustering_distance_columns = "euclidean",
clustering_method_rows = "ward.D",
clustering_method_columns = "ward.D",
row_names_gp = gpar(fontsize = 8),
column_names_gp = gpar(fontsize = 8),
heatmap_legend_param = list(labels_gp = gpar(fontsize = 6),
title_gp = gpar(fontsize = 8),
at = c(-100, 0, 100, 200),
labels = c("-100%", 0, "100%", "200%")))
fs15_1 <- grid.grabExpr(draw(fs15_1))
## boxplot
ranks <- lapply(pvalues, function(x)as.data.frame(apply(x, 2, function(x)rank(x, ties.method = "average"))))
selected_method <- "IHW"
gold_standard <- read.csv("Data/GoldStandardCpG.T4.csv",
header = TRUE, row.names = 1)
selected_cpg <- gold_standard$x[gold_standard$x %in% rownames(ranks[[1]])]
reference_line <- median(ranks[[selected_method]][selected_cpg, "ST"])
fs15_2 <- ranks[[selected_method]][selected_cpg, ] %>%
select(selected_measure) %>%
melt(value.name = "Significance_rank") %>%
group_by(variable) %>%
mutate(significance_rank_median = median(Significance_rank)) %>%
ggplot(aes(x = reorder(variable, Significance_rank, FUN = median), y = Significance_rank, color = variable, fill = variable)) +
geom_sina(size = 0.25, alpha = 0.2, shape = 16) +
geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
geom_segment(aes(x = variable, xend = variable,
y = reference_line, yend = significance_rank_median),
size = 0.25, color = "black") +
stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
scale_fill_manual(values = measure_col) +
scale_color_manual(values = measure_col) +
scale_x_discrete(limits = selected_measure) +
xlab("") + ylab("Significance Rank") + labs(fill = "") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
axis.title = element_text(size = 10),
legend.position = "none")
subset_rank <- as.data.frame(lapply(ranks, function(x)x[, "mean", drop = FALSE]))
colnames(subset_rank) <- names(ranks)
selected_method <- c("CAMT", "AdaPT", "IHW", "BL", "FDRreg")
fs15_3 <- subset_rank[selected_cpg, ] %>%
select(selected_method) %>%
melt(value.name = "Significance_rank") %>%
ggplot(aes(x = variable, y = Significance_rank)) +
geom_boxplot(aes(fill = variable), width = 0.72, fatten = 1) +
scale_fill_manual(values = method_col) +
scale_x_discrete(limits = selected_method) +
xlab("") + ylab("Significance Rank") + labs(fill = "method") +
theme_classic() +
theme(axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5, size = 8),
axis.title = element_text(size = 10),
legend.position = "none") +
geom_hline(yintercept = reference_line, linetype = "dashed", color = "black", size = 0.5)
## put all figures together
bottom_row <- plot_grid(fs15_2, fs15_3, labels = c("B", "C"), align = "h", nrow = 1, label_size = 18)
fs15 <- plot_grid(fs15_1, bottom_row, labels = c("A", ""), ncol = 1, label_size = 18)
fs15
methy_data <- readRDS("Data/cg18092474.RDS")
# count samples within each group
table(methy_data$group)
##
## current smoker former smoker never
## 22 263 179
fs16 <- methy_data %>%
ggplot(aes(x = group, y = bVals, fill = group)) +
stat_boxplot(geom = "errorbar", width = 0.3) +
geom_boxplot(width = 0.6) +
scale_x_discrete(labels = c("current smoker (n=22)", "former smoker (n=263)", "never (n=179)")) +
annotate("text", label = "BH: 0.090 ST: 0.088\nIHW: 0.021 CAMT: 0.019",
x = 2, y = 0.9:1, color = "red") +
geom_segment(aes(x = 1, y = 0.9, xend = 3, yend = 0.9), size = 0.25) +
geom_segment(aes(x = 1, y = 0.8, xend = 1, yend = 0.9), size = 0.1) +
geom_segment(aes(x = 3, y = 0.85, xend = 3, yend = 0.9), size = 0.1) +
ylim(0, 1) +
xlab("Group") +
theme_bw() +
theme(legend.position = "none")
fs16
fs17 <- draw.pairwise.venn(area1 = length(IHW),
area2 = length(CAMT),
cross.area = length(intersect(IHW, CAMT)),
category = c("IHW", "CAMT"),
fill = c("pink", "skyblue"),
cat.pos = c(0, 0),
scaled = FALSE)