library(ggplot2)
library(tidyverse)
library(reshape2)
library(circlize)
library(ComplexHeatmap)
library(RColorBrewer)
library(cowplot)
library(ggplotify)
library(ggpubr)
library(ggforce)
method_col <- c("CAMT" = "#FF0000", "AdaPT" = "#E69F00", "IHW" = "#56B4E9",
"FDRreg" = "#009E73", "BL" = "#0072B2")
measure_col <- c("BH"="#1B9E77", "ST"="#D95F02", "sd.b"="#7570B3", "sd.m"="#E7298A", "mean"="#A6CEE3",
"mad"="#1F78B4", "dip"="#B2DF8A", "precision"="#FFFF00", "icc.b"="#FB9A99",
"icc.m"="#E31A1C", "refgene.pos"="#FDBF6F", "chr"="#A6761D", "cpg.loc"="#CAB2D6",
"dhs"="#949494", "direction"="#7FFFD4", "probe.type" ="#FFE4E1")
runtime <- readRDS("Data/runtime.RDs")
runtime <- mutate(runtime, log_runtime = log10(Run_Time)) %>%
group_by(measure, method) %>%
summarise(n = n(),
mean = mean(log_runtime),
sd = sd(log_runtime),
se = sd / sqrt(n))
selected_methods <- c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")
selected_measure <- c("dip", "icc.b", "icc.m", "mad", "mean", "precision", "sd.b", "sd.m",
"dhs", "direction", "probe.type", "cpg.loc", "refgene.pos", "chr")
fs6 <- runtime %>%
filter(method %in% selected_methods, measure %in% selected_measure) %>%
ggplot(aes(x = measure, y = mean, group = method, color = method)) +
geom_line(lwd = 0.4) +
geom_point(size = 0.5) +
geom_errorbar(aes(ymin = mean-1.96*se, ymax = mean+1.96*se), width = 0.15, lwd = 0.35) +
ylab("log10 Runtime(s)") + xlab("") +
theme_bw() +
theme(title = element_text(size = 10),
text = element_text(size = 8),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
axis.text.y = element_text(size = 8),
legend.position = "right",
legend.justification = "center",
legend.title = element_blank()) +
scale_x_discrete(limits = selected_measure) +
scale_color_manual(values = method_col)
fs6
For each data set, the number of ST detected significant differential CpGs is always no less than that of BH. So the result of ST is used as reference in the comparison with other covariate-adaptive FDR control methods.
reference_discovery <- read.csv("Data/reference_discovery.csv", header = TRUE)
fs7 <- reference_discovery %>%
ggplot(aes(x = BH + 1, y = ST + 1)) +
geom_point() +
geom_abline(slope = 1, color = "red") +
scale_x_continuous(trans = "log10") +
scale_y_continuous(trans = "log10") +
theme_bw() +
xlab("BH") + ylab("ST") +
theme(title = element_text(size = 10),
text = element_text(size = 8))
fs7
discovery_number <- read.csv("Data/discovery_number.csv", header = TRUE)
covariate_order <- c("mean", "sd.b", "mad", "precision", "sd.m", "icc.m", "cpg.loc",
"direction", "icc.b", "refgene.pos", "probe.type", "dip", "dhs", "chr")
plot_data <- left_join(discovery_number, reference_discovery, by = "dataset") %>%
filter(! measure %in% c("icc.b", "icc.m")) %>%
arrange(pi0) %>%
mutate(logFC = log2((discovery + 1)/(ST + 1)))
# prepare the matrix for heatmap
plot_data$measure <- factor(plot_data$measure, levels = covariate_order)
plot_data$dataset <- factor(plot_data$dataset, levels = unique(plot_data$dataset))
heatmap_data <- dcast(plot_data, dataset ~ method + measure, value.var = "logFC")
rownames(heatmap_data) <- heatmap_data$dataset
heatmap_data$dataset <- NULL
heatmap_data <- as.matrix(heatmap_data)
# prepare row and column annotation
annotation_row <- plot_data %>%
select(dataset, ST, pi0)
annotation_row <- annotation_row[! duplicated.data.frame(annotation_row), ]
# top annotation: ranks of discovered signal number recovered by different covariates within each dataset
top_anno <- plot_data %>%
group_by(dataset, method) %>%
mutate(rank = rank(discovery, ties.method = "average")) %>%
dcast(dataset ~ method + measure, value.var = "rank")
top_anno$dataset <- NULL
# heatmap
col_fun <- colorRamp2(c(-1, -0.5, 0, 0.5, 1), c("royalblue", "blue", "white", "red", "red3"), space = "RGB")
ST_color <- colorRamp2(c(0, 200, 1000, 10000), c("#DEEBF7", "#C6DBEF", "#6BAED6", "#08306B"))
pi0_color <- colorRamp2(c(0.6, 0.9, 0.95, 1), c("#F7FCF5", "#A1D99B", "#41AB5D", "#00441B"))
f3 <- Heatmap(heatmap_data,
name = "logFC",
rect_gp = gpar(col = "grey", lwd = 0.05),
cluster_rows = FALSE,
cluster_columns = FALSE,
col = col_fun,
left_annotation = rowAnnotation(
ST = annotation_row$ST,
pi0 = annotation_row$pi0,
col = list(ST = ST_color,
pi0 = pi0_color),
simple_anno_size = unit(0.25, "cm"),
annotation_name_gp = gpar(fontsize = 8),
annotation_legend_param = list(title_gp = gpar(fontsize = 9),
labels_gp = gpar(fontsize = 7))),
top_annotation = columnAnnotation(
rank = anno_boxplot(top_anno, outline = FALSE,
gp = gpar(fill = measure_col[covariate_order[-grep("icc", covariate_order)]]))),
row_names_gp = gpar(fontsize = 5),
column_names_gp = gpar(fontsize = 8),
column_split = rep(c("AdaPT", "BL", "CAMT", "FDRreg", "IHW"), each = 12),
column_gap = unit(0, "mm"),
column_labels = unlist(lapply(str_split(colnames(heatmap_data), pattern = "_"),
function(x)x[[2]])),
heatmap_legend_param = list(title_gp = gpar(fontsize = 9),
labels_gp = gpar(fontsize = 7)))
f3
# Method AdaPT's performance is very different from other methods, so an alternative range is set for it.
p1 <- plot_data %>%
filter(method == "AdaPT") %>%
ggplot(aes(x = measure, y = logFC)) +
geom_boxplot(aes(fill = measure), outlier.shape = NA, fatten = 1) +
scale_fill_manual(values = measure_col) +
coord_cartesian(ylim = c(-6, 6)) +
facet_wrap(~ method) +
theme_bw() +
xlab("") +
theme(title = element_text(size = 10),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "none")
p2 <- plot_data %>%
filter(method != "AdaPT") %>%
ggplot(aes(x = measure, y = logFC)) +
geom_boxplot(aes(fill = measure), outlier.shape = NA, fatten = 1) +
scale_fill_manual(values = measure_col) +
coord_cartesian(ylim = c(-2, 2)) +
facet_wrap(~ method, nrow = 1, scales = "free_y") +
theme_bw() +
xlab("") + ylab("") +
theme(title = element_text(size = 10),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.key.size = unit(0.3, "cm"),
legend.position = "right",
legend.justification = "center")
fs8_1 <- plot_grid(p1, p2, rel_widths = c(1, 4))
# select the most informative covariate (with min omnibus test p-value)
omnibus_result <- read.csv("Data/omnibus.result.csv", header = TRUE)
# select the most significant covariate for each dataset
selected <- omnibus_result %>%
group_by(dataset) %>%
arrange(factor(dataset), p.value, desc(stat.o)) %>%
filter(rank(p.value, ties.method = "first") == 1, p.value < 0.05) %>%
select(dataset, measure) %>%
mutate(id = paste0(dataset, "_", measure))
fs8_2 <- left_join(discovery_number, reference_discovery, by = "dataset") %>%
mutate(logFC = log2((discovery + 1)/(ST + 1))) %>%
mutate(id = paste0(dataset, "_", measure)) %>%
filter(id %in% selected$id) %>%
ggplot(aes(x = method, y = logFC, color = method)) +
geom_boxplot(outlier.shape = NA, fatten = 1, lwd = 0.3) +
geom_sina(size = 1, alpha = 0.5, shape = 16) +
coord_cartesian(ylim = c(-3.75, 2.5)) +
scale_color_manual(values = method_col) +
theme_bw() +
theme(legend.position = "none",
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8),
title = element_text(size = 10)) +
xlab("") + ylab("logFC")
fs8 <- plot_grid(fs8_1, fs8_2, labels = c("A", "B"), ncol = 1, label_size = 18)
fs8
EWAS33 <- readRDS("Data/dataset/EWAS33.RDS")
candidate_covariates <- c("sd.b", "mad", "mean", "precision")
EWAS33_data <- data.frame()
for(s in candidate_covariates){
df <- data.frame(covariate_value = EWAS33[[s]],
p_value = EWAS33$P.value)
df$covariate <- s
f <- ecdf(df$covariate_value)
df$percentile_covariate <- f(df$covariate_value)
EWAS33_data <- rbind(EWAS33_data, df)
}
fs9 <- ggplot(data = EWAS33_data, mapping = aes(x = percentile_covariate, y = -log10(p_value))) +
geom_hex(bins = 100) +
facet_wrap(~covariate, nrow = 1) +
theme_bw() +
theme(legend.key.size = unit(0.3, "cm"))
fs9
plotdata_subset <- discovery_number %>%
mutate(id = paste0(dataset, "_", measure)) %>%
filter(id %in% selected$id, measure != "icc.b", measure != "icc.m") %>%
left_join(reference_discovery, by = "dataset") %>%
filter(ST > 0) %>%
arrange(pi0) %>%
mutate(logFC = log2((discovery + 1)/(ST + 1)))
heatmapdata_subset <- dcast(plotdata_subset, dataset ~ method, value.var = "logFC")
rownames(heatmapdata_subset) <- heatmapdata_subset$dataset
heatmapdata_subset$dataset <- NULL
rownames(reference_discovery) <- reference_discovery$dataset
heatmapdata_subset$ST <- reference_discovery[rownames(heatmapdata_subset), "ST"]
heatmapdata_subset$pi0 <- reference_discovery[rownames(heatmapdata_subset), "pi0"]
heatmapdata_subset <- heatmapdata_subset[order(heatmapdata_subset$pi0), ]
fs10 <- Heatmap(as.matrix(heatmapdata_subset[, 1:5]),
name = "logFC",
rect_gp = gpar(col = "grey", lwd = 0.05),
col = col_fun,
cluster_rows = FALSE,
left_annotation = rowAnnotation(
ST = heatmapdata_subset$ST,
pi0 = heatmapdata_subset$pi0,
col = list(ST = ST_color,
pi0 = pi0_color),
simple_anno_size = unit(0.1, "inch"),
annotation_name_gp = gpar(fontsize = 6),
annotation_legend_param = list(title_gp = gpar(fontsize = 6),
labels_gp = gpar(fontsize = 5),
grid_height = unit(0.1, "inch"),
grid_width = unit(0.1, "inch"))),
row_names_gp = gpar(fontsize = 4),
column_names_gp = gpar(fontsize = 6),
column_names_rot = 45,
column_names_centered = TRUE,
show_row_dend = FALSE,
column_dend_height = unit(0.2, "inch"),
width = unit(1, "inch"),
height = unit(4, "inch"),
heatmap_legend_param = list(title_gp = gpar(fontsize = 6),
labels_gp = gpar(fontsize = 5),
grid_height = unit(0.1, "inch"),
grid_width = unit(0.1, "inch")))
fs10
Three groups (“p<=0.001”, “0.001<p<=0.01”, “p>0.01”) are classified according to omnibus test p-value. We explored the signal number changes accross the three groups.
# Method AdaPT's performance is very different from other methods, so an alternative range is set for it.
p1 <- left_join(plot_data, omnibus_result) %>%
filter(method == "AdaPT") %>%
mutate(group = factor(ifelse(p.value <= 0.001, "p<=0.001", ifelse(p.value <= 0.01, "0.001<p<=0.01", "p>0.01")),
levels = c("p<=0.001", "0.001<p<=0.01", "p>0.01"))) %>%
ggplot(aes(x = group, y = logFC)) +
geom_boxplot(aes(fill = group), outlier.shape = NA, fatten = 1) +
stat_summary(fun.y = mean, geom = "point", shape = 18, color = "black") +
coord_cartesian(ylim = c(-6, 6)) +
facet_wrap(~method, nrow = 1, scales = "free") +
theme_bw() +
theme(title = element_text(size = 10),
axis.text = element_text(size = 8),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "none")
p2 <- left_join(plot_data, omnibus_result) %>%
filter(method != "AdaPT") %>%
mutate(group = factor(ifelse(p.value <= 0.001, "p<=0.001", ifelse(p.value <= 0.01, "0.001<p<=0.01", "p>0.01")),
levels = c("p<=0.001", "0.001<p<=0.01", "p>0.01"))) %>%
ggplot(aes(x = group, y = logFC)) +
geom_boxplot(aes(fill = group), outlier.shape = NA, fatten = 1) +
stat_summary(fun.y = mean, geom = "point", shape = 18, color = "black") +
coord_cartesian(ylim = c(-2, 2)) +
facet_wrap(~method, nrow = 1, scales = "free") +
theme_bw() +
theme(title = element_blank(),
axis.text = element_text(size = 8),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "right")
fs11 <- plot_grid(p1, p2, rel_widths = c(1, 4.5))
fs11
CAMT_combined <- read.csv("Data/CAMT_combined.csv", header = TRUE)
CAMT_combined$fromST <- NULL
long_data <- melt(CAMT_combined, id.vars = "X", value.name = "discovery", variable.name = "measure")
selected <- omnibus_result %>%
group_by(dataset) %>%
arrange(factor(dataset), p.value, desc(stat.o)) %>%
filter(rank(p.value, ties.method = "first") == 1, p.value < 0.05) %>%
select(dataset, measure) %>%
left_join(long_data, by = c("dataset" = "X", "measure" = "measure")) %>%
select(dataset, measure, discovery) %>%
ungroup()
selected$measure <- "most.sig"
colnames(long_data) <- colnames(selected)
long_data <- rbind(long_data, selected)
long_data$discovery <- as.numeric(long_data$discovery)
long_data <- long_data %>%
left_join(reference_discovery) %>%
mutate(logFC = log2((1+discovery)/(1+ST)),
id = paste0(dataset, "_", measure))
wide_data <- dcast(long_data, dataset ~ measure, value.var = "logFC")
rownames(wide_data) <- wide_data$dataset
wide_data$dataset <- NULL
wide_data$icc.b <- NULL
wide_data$icc.m <- NULL
wide_data$most.sig[is.na(wide_data$most.sig)] <- 0
covariate_order <- c("mean", "sd.b", "mad", "precision", "sd.m", "cpg.loc",
"direction", "refgene.pos", "probe.type", "dip", "dhs", "chr",
"most.sig", "multiple")
wide_data <- wide_data[, covariate_order]
# heatmap top annotation
top_anno <- long_data %>%
group_by(measure) %>%
dcast(dataset ~ measure, value.var = "logFC")
rownames(top_anno) <- top_anno$dataset
top_anno <- top_anno[rownames(wide_data), colnames(wide_data)]
# row annotation
rownames(reference_discovery) <- reference_discovery$dataset
wide_data$ST <- reference_discovery[rownames(wide_data), "ST"]
wide_data$pi0 <- reference_discovery[rownames(wide_data), "pi0"]
wide_data <- wide_data[order(wide_data$pi0), ]
col_fun <- colorRamp2(c(-4, -2, 0, 2, 4), c("royalblue", "blue", "white", "red", "red3"),
space = "RGB")
measrue_color <- c("most.sig"="#FB9A99", "multiple"="#E31A1C", "sd.b"="#7570B3", "sd.m"="#E7298A",
"mean"="#A6CEE3", "mad"="#1F78B4", "dip"="#B2DF8A", "precision"="#FFFF00",
"refgene.pos"="#FDBF6F", "chr"="#A6761D", "cpg.loc"="#CAB2D6", "dhs"="#949494",
"direction"="#7FFFD4", "probe.type" ="#FFE4E1")
fs12 <- Heatmap(as.matrix(wide_data[, 1:14]),
col = col_fun,
border = "grey",
na_col = "lightgrey",
cluster_rows = FALSE,
cluster_columns = FALSE,
name = "logFC",
row_names_gp = gpar(fontsize = 4),
column_names_gp = gpar(fontsize = 7),
width = unit(2, "inch"),
height = unit(4, "inch"),
heatmap_legend_param = list(title_gp = gpar(fontsize = 6),
labels_gp = gpar(fontsize = 5),
grid_height = unit(0.1, "inch"),
grid_width = unit(0.13, "inch")),
top_annotation = columnAnnotation(
logFC = anno_boxplot(top_anno, outline = FALSE,
gp = gpar(fill = measrue_color[covariate_order])),
annotation_height = unit(0.7, "inch"),
annotation_name_gp = gpar(fontsize = 6)),
left_annotation = rowAnnotation(
ST = wide_data$ST,
pi0 = wide_data$pi0,
col = list(ST = ST_color,
pi0 = pi0_color),
simple_anno_size = unit(0.1, "inch"),
annotation_name_gp = gpar(fontsize = 6),
annotation_legend_param = list(title_gp = gpar(fontsize = 6),
labels_gp = gpar(fontsize = 5),
grid_height = unit(0.1, "inch"),
grid_width = unit(0.13, "inch")))
)
fs12