Further validation of the recovered signals

1. Down-sampling analysis
2. Increasing power to discover true signals
3. Some well-known smoking-associated CpGs were recovered

Load R packages and basic settings

library(tidyverse)
library(circlize)
library(ComplexHeatmap)
library(reshape2)
library(UpSetR)
library(ggplotify)
library(cowplot)
library(VennDiagram)
library(ggpubr)
library(readxl)
library(ggforce)

# Colour assigned to each measure (covariates plus the BH and ST baselines);
# passed to the manual colour/fill scales of the figures below.
measure_col <- c(
  "BH"          = "#D55E00",
  "ST"          = "#CC79A7",
  "sd.b"        = "#7570B3",
  "sd.m"        = "#E7298A",
  "mean"        = "#A6CEE3",
  "mad"         = "#1F78B4",
  "dip"         = "#B2DF8A",
  "precision"   = "#FFFF00",
  "icc.b"       = "#FB9A99",
  "icc.m"       = "#E31A1C",
  "refgene.pos" = "#FDBF6F",
  "chr"         = "#A6761D",
  "cpg.loc"     = "#1B9E77",
  "dhs"         = "#949494",
  "direction"   = "#7FFFD4",
  "probe.type"  = "#FFE4E1"
)

# Colour assigned to each FDR-control method (BH and ST reuse the colours above).
method_col <- c(
  "CAMT"   = "#FF0000",
  "AdaPT"  = "#E69F00",
  "IHW"    = "#56B4E9",
  "FDRreg" = "#009E73",
  "BL"     = "#0072B2",
  "BH"     = "#D55E00",
  "ST"     = "#CC79A7"
)

1. Down-sampling analysis

EWAS51 is a dataset with a binary phenotype (FASD and control). We randomly down-sampled each phenotype group to sample sizes of 10, 20, 30, 40, 50, 60, 70, 80, 90 and 100 and applied the IHW and CAMT methods. The power of each method with different covariates is displayed as follows.

1.1 Method IHW

# Measures (covariates plus the BH and ST baselines) and sample sizes retained
# for the down-sampling figures.
selected_measure <- c(
  "BH", "ST", "chr", "cpg.loc", "dhs", "dip", "direction", "mad", "mean",
  "precision", "probe.type", "refgene.pos", "sd.b", "sd.m"
)
selected_samplesize <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)

# Load the IHW down-sampling power table and keep only the selected rows.
IHW_power <- readRDS("Data/EWAS51_IHW_downsample_power.RDS")
IHW_power <- filter(
  IHW_power,
  measure %in% selected_measure,
  sample_size %in% selected_samplesize
)

# Per measure and sample size: number of non-missing power values, their mean,
# standard deviation, and standard error of the mean.
cdata <- plyr::ddply(
  IHW_power, c("measure", "sample_size"), summarise,
  N    = sum(!is.na(Power)),
  mean = mean(Power, na.rm = TRUE),
  sd   = sd(Power, na.rm = TRUE),
  se   = sd / sqrt(N)
)

# Mean power (+/- one standard error) by sample size, one colour per measure.
fs13 <- ggplot(cdata, aes(x = factor(sample_size), y = mean, colour = measure)) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    position = position_dodge(width = 0.5),
    width = 0.8,
    lwd = 0.5
  ) +
  geom_point(shape = 16, size = 0.5, position = position_dodge(width = 0.5)) +
  scale_color_manual(values = measure_col) +
  labs(x = "Sample Size", y = "Percentage") +
  theme_bw() +
  theme(
    axis.title   = element_text(size = 10),
    axis.text    = element_text(size = 8),
    legend.title = element_text(size = 8),
    legend.text  = element_text(size = 6)
  )

fs13

1.2 Method CAMT

# Load the CAMT down-sampling power table and keep only the selected measures
# and sample sizes.
CAMT_power <- filter(
  readRDS("Data/EWAS51_CAMT_downsample_power.RDS"),
  measure %in% selected_measure,
  sample_size %in% selected_samplesize
)

# Per measure and sample size: number of non-missing power values, their mean,
# standard deviation, and standard error of the mean.
cdata <- plyr::ddply(
  CAMT_power, c("measure", "sample_size"), summarise,
  N    = sum(!is.na(Power)),
  mean = mean(Power, na.rm = TRUE),
  sd   = sd(Power, na.rm = TRUE),
  se   = sd / sqrt(N)
)

# Mean power (+/- one standard error) by sample size, one colour per measure.
f4 <- ggplot(cdata, aes(x = factor(sample_size), y = mean, colour = measure)) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    position = position_dodge(width = 0.5),
    width = 0.8,
    lwd = 0.5
  ) +
  geom_point(shape = 16, size = 0.5, position = position_dodge(width = 0.5)) +
  scale_color_manual(values = measure_col) +
  labs(x = "Sample Size", y = "Percentage") +
  theme_bw() +
  theme(
    axis.title   = element_text(size = 10),
    axis.text    = element_text(size = 8),
    legend.title = element_text(size = 8),
    legend.text  = element_text(size = 6)
  )

f4

2. Increasing power to discover true signals

2.1 Overview of dataset EWAS45 across different methods and covariates

# Covariates shown in the EWAS45 figures.
selected_measure <- c("mean", "sd.b", "mad", "icc.m", "icc.b", "precision", "dhs", "direction",
                      "chr", "refgene.pos", "dip", "probe.type", "cpg.loc", "sd.m")

# Read each method's table of adjusted p-values into a list keyed by method name.
pvalues <- list()
for (i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")) {
  pvalues[[i]] <- readRDS(paste0("Data/EWAS45_", i, "_adjusted_pvalues.RDS"))
}

# Count the significant DMPs (adjusted p < 0.05) in every column of every
# method's table; after transposing, rows are methods and columns are the
# columns of the p-value tables.
discovery <- as.data.frame(lapply(pvalues, function(x) {
  apply(x, 2, function(p) sum(p < 0.05))
}))
discovery <- as.data.frame(t(discovery))

# Baseline (covariate-free) discovery counts.
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)

# all_of() makes it explicit that `selected_measure` is an external character
# vector and not a column name.
plot_data <- discovery %>%
  select(all_of(selected_measure))

# Percentage change in discoveries relative to ST. Each method (row) is
# normalised by its own ST count; this equals dividing by `ST` when that count
# is the same for every method, and avoids silent recycling when it is not.
st_by_method <- discovery$ST
plot_data <- as.matrix((plot_data - st_by_method) / st_by_method) * 100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))

col_fun <- colorRamp2(c(-200, -100, 0, 200),
                      c("royalblue", "blue", "white", "red"),
                      space = "RGB")
panel_A <- Heatmap(plot_data,
        name = "Color Key",
        col = col_fun,
        rect_gp = gpar(col = "grey", lwd = 0.25),
        # Only label cells whose change exceeds 1% in absolute value.
        cell_fun = function(j, i, x, y, width, height, fill) {
          if (abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 8))
        },
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        row_names_gp = gpar(fontsize = 8),
        column_names_gp = gpar(fontsize = 8),
        # `at` is given explicitly so the five labels always line up with five
        # legend breaks instead of depending on the automatically chosen ones.
        heatmap_legend_param = list(title_gp = gpar(fontsize = 8),
                                    labels_gp = gpar(fontsize = 6),
                                    at = c(-200, -100, 0, 100, 200),
                                    labels = c("-200%", "-100%", 0, "100%", "200%")))
draw(panel_A)

# Capture the drawn heatmap as a grob so it can be combined with the ggplot
# panels later.
panel_A <- grid.grabExpr(draw(panel_A))

2.2 Boxplot of gold standard CpGs’ significance orders by covariate under different methods

# Rank every CpG by adjusted p-value within each column of each method's table
# (rank 1 = smallest p-value).
# NOTE(review): ties.method = "random" draws on the RNG, so tied ranks change
# between runs unless a seed is set earlier; the EWAS27 section uses "average".
ranks <- lapply(pvalues, function(x) {
  as.data.frame(apply(x, 2, function(p) rank(p, ties.method = "random")))
})
selected_method <- "IHW"

# stringsAsFactors = FALSE keeps the CpG IDs as character on every R version;
# a factor would index the rank tables by integer code instead of by row name.
gold_standard <- read.csv("Data/GoldStandardCpG.T4.csv", header = TRUE,
                          row.names = 1, stringsAsFactors = FALSE)
# Gold-standard CpGs that are present in the rank tables.
selected_cpg <- gold_standard$x[gold_standard$x %in% rownames(ranks[[1]])]

# Median ST rank of the gold-standard CpGs, drawn as the dashed reference line.
reference_line <- median(ranks[[selected_method]][selected_cpg, "ST"])

panel_B <- ranks[[selected_method]][selected_cpg, ] %>%
  # all_of(): `selected_measure` is an external character vector.
  select(all_of(selected_measure)) %>%
  melt(value.name = "Significance_rank") %>%
  group_by(variable) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(variable, Significance_rank, FUN = median), y = Significance_rank, color = variable, fill = variable)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  # Vertical stem from the reference line to each covariate's median rank.
  geom_segment(aes(x = variable, xend = variable,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(fun = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none")
panel_B

# Same gold-standard rank plot for the remaining methods, one facet per method.
selected_method <- c("AdaPT", "BL", "CAMT", "FDRreg")
fs14 <- as.data.frame(ranks[selected_method])[selected_cpg, ] %>%
  melt(value.name = "Significance_rank") %>%
  # Column names have the form "<method>.<measure>"; split at the first dot
  # only, because measure names such as "sd.b" contain dots themselves.
  mutate(method  = str_split_fixed(as.character(variable), fixed("."), n = 2)[, 1],
         measure = str_split_fixed(as.character(variable), fixed("."), n = 2)[, 2]) %>%
  filter(measure %in% selected_measure) %>%
  select(Significance_rank, method, measure) %>%
  group_by(method, measure) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(measure, Significance_rank, FUN = median), y = Significance_rank, color = measure, fill = measure)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16, show.legend = FALSE) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  # Vertical stem from the reference line to each covariate's median rank.
  geom_segment(aes(x = measure, xend = measure,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(fun = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  facet_wrap(~ method, nrow = 2) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

fs14

2.3 Boxplot of gold standard CpGs’ significance orders by different methods

# Keep only the ranks obtained with the covariate "mean" (chosen as the most
# powerful covariate), one column per method.
subset_rank <- as.data.frame(lapply(ranks, function(x) x[, "mean", drop = FALSE]))
colnames(subset_rank) <- names(ranks)

# Box plot of the gold-standard CpGs' ranks for each method; the dashed line
# is the reference rank computed earlier.
selected_method <- c("CAMT", "AdaPT", "IHW", "BL", "FDRreg")
panel_C <- subset_rank[selected_cpg, ] %>%
  # all_of(): `selected_method` is an external character vector.
  select(all_of(selected_method)) %>%
  melt(value.name = "Significance_rank") %>%
  ggplot(aes(x = variable, y = Significance_rank)) +
  geom_boxplot(aes(fill = variable), width = 0.72, fatten = 1) +
  scale_fill_manual(values = method_col) +
  scale_x_discrete(limits = selected_method) +
  xlab("") + ylab("Significance Rank") + labs(fill = "method") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none") +
  geom_hline(yintercept = reference_line, linetype = "dashed", color = "black", size = 0.5)

panel_C

# Assemble Figure 5: heatmap (A) on top, the two rank plots (B, C) side by
# side underneath.
bottom_row <- plot_grid(
  panel_B, panel_C,
  labels = c("B", "C"),
  label_size = 18,
  align = "h",
  nrow = 1
)
f5 <- plot_grid(
  panel_A, bottom_row,
  labels = c("A", ""),
  label_size = 18,
  ncol = 1
)
f5

2.4 Validation in another data set EWAS27

# Covariates shown in the EWAS27 validation figures.
selected_measure <- c("mad", "mean", "sd.b", "precision", "sd.m", "dhs", "dip", "chr",  "direction", "probe.type", "refgene.pos", "cpg.loc")

# Read each method's table of adjusted p-values into a list keyed by method name.
pvalues <- list()
for (i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")) {
  pvalues[[i]] <- readRDS(paste0("Data/EWAS27_", i, "_adjusted_pvalues.RDS"))
}

## heatmap
# Count the significant DMPs (adjusted p < 0.05) in every column of every
# method's table; after transposing, rows are methods.
discovery <- as.data.frame(lapply(pvalues, function(x) {
  apply(x, 2, function(p) sum(p < 0.05))
}))
discovery <- as.data.frame(t(discovery))

# Baseline (covariate-free) discovery counts.
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)

# all_of() makes it explicit that `selected_measure` is an external character
# vector and not a column name.
plot_data <- discovery %>%
  select(all_of(selected_measure))

# Percentage change in discoveries relative to ST. Each method (row) is
# normalised by its own ST count; this equals dividing by `ST` when that count
# is the same for every method, and avoids silent recycling when it is not.
st_by_method <- discovery$ST
plot_data <- as.matrix((plot_data - st_by_method) / st_by_method) * 100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))
col_fun <- colorRamp2(c(-100, 0, 200), c("blue", "white", "red"), space = "RGB")
fs15_1 <- Heatmap(plot_data,
        name = "Color Key",
        col = col_fun,
        rect_gp = gpar(col = "grey", lwd = 0.25),
        # Only label cells whose change exceeds 1% in absolute value.
        cell_fun = function(j, i, x, y, width, height, fill) {
          if (abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 8))
        },
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        row_names_gp = gpar(fontsize = 8),
        column_names_gp = gpar(fontsize = 8),
        heatmap_legend_param = list(labels_gp = gpar(fontsize = 6),
                                    title_gp = gpar(fontsize = 8),
                                    at = c(-100, 0, 100, 200),
                                    labels = c("-100%", 0, "100%", "200%")))
# Capture the drawn heatmap as a grob so it can be combined with the ggplot
# panels later.
fs15_1 <- grid.grabExpr(draw(fs15_1))

## boxplot
# Rank every CpG by adjusted p-value within each column of each method's table
# (rank 1 = smallest p-value); tied p-values share their average rank.
ranks <- lapply(pvalues, function(x) {
  as.data.frame(apply(x, 2, function(p) rank(p, ties.method = "average")))
})

selected_method <- "IHW"
# stringsAsFactors = FALSE keeps the CpG IDs as character on every R version;
# a factor would index the rank tables by integer code instead of by row name.
gold_standard <- read.csv("Data/GoldStandardCpG.T4.csv",
                          header = TRUE, row.names = 1,
                          stringsAsFactors = FALSE)
# Gold-standard CpGs that are present in the rank tables.
selected_cpg <- gold_standard$x[gold_standard$x %in% rownames(ranks[[1]])]

# Median ST rank of the gold-standard CpGs, drawn as the dashed reference line.
reference_line <- median(ranks[[selected_method]][selected_cpg, "ST"])

fs15_2 <- ranks[[selected_method]][selected_cpg, ] %>%
  # all_of(): `selected_measure` is an external character vector.
  select(all_of(selected_measure)) %>%
  melt(value.name = "Significance_rank") %>%
  group_by(variable) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(variable, Significance_rank, FUN = median), y = Significance_rank, color = variable, fill = variable)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  # Vertical stem from the reference line to each covariate's median rank.
  geom_segment(aes(x = variable, xend = variable,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(fun = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none")

# Keep only the ranks obtained with the covariate "mean", one column per method.
subset_rank <- as.data.frame(lapply(ranks, function(x) x[, "mean", drop = FALSE]))
colnames(subset_rank) <- names(ranks)

# Box plot of the gold-standard CpGs' ranks for each method; the dashed line
# is the reference rank computed earlier.
selected_method <- c("CAMT", "AdaPT", "IHW", "BL", "FDRreg")
fs15_3 <- subset_rank[selected_cpg, ] %>%
  # all_of(): `selected_method` is an external character vector.
  select(all_of(selected_method)) %>%
  melt(value.name = "Significance_rank") %>%
  ggplot(aes(x = variable, y = Significance_rank)) +
  geom_boxplot(aes(fill = variable), width = 0.72, fatten = 1) +
  scale_fill_manual(values = method_col) +
  scale_x_discrete(limits = selected_method) +
  xlab("") + ylab("Significance Rank") + labs(fill = "method") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none") +
  geom_hline(yintercept = reference_line, linetype = "dashed", color = "black", size = 0.5)

## put all figures together
# Heatmap (A) on top, the two rank plots (B, C) side by side underneath.
bottom_row <- plot_grid(
  fs15_2, fs15_3,
  nrow = 1,
  align = "h",
  labels = c("B", "C"),
  label_size = 18
)
fs15 <- plot_grid(
  fs15_1, bottom_row,
  ncol = 1,
  labels = c("A", ""),
  label_size = 18
)
fs15

3. Some well-known smoking-associated CpGs were recovered

3.1 Compare covariate-adaptive FDR methods’ result with reported smoking-related CpGs

# Covariates reported in the EWAS20 smoking comparison.
selected_measure <- c("mean", "sd.b", "mad", "precision", "dhs", "chr", "refgene.pos",
                      "dip", "probe.type", "cpg.loc", "sd.m")

# Read each method's table of adjusted p-values into a list keyed by method name.
pvalues <- list()
for (i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")) {
  pvalues[[i]] <- readRDS(paste0("Data/EWAS20_", i, "_adjusted_pvalues.RDS"))
}

# Probe IDs are taken from the first table and applied to all of them, so
# every table must have one row per probe ID.
raw_probes <- rownames(pvalues[[1]])
stopifnot(all(vapply(pvalues, nrow, integer(1)) == length(raw_probes)))

# For every method and every column, the IDs of the probes with adjusted
# p < 0.05. The inner lapply() always returns a named list; apply() would
# collapse the result to a matrix or vector whenever all columns happen to
# have the same number of hits. which() drops NA p-values instead of emitting
# NA probe IDs.
sig_probes <- lapply(pvalues, function(p) {
  p_mat <- as.matrix(p)
  hits <- lapply(seq_len(ncol(p_mat)), function(j) {
    raw_probes[which(p_mat[, j] < 0.05)]
  })
  names(hits) <- colnames(p_mat)
  hits
})

3.1.1 Method IHW

# Significant probes found by IHW, one element per column of its p-value table.
result <- sig_probes[["IHW"]]

# compare with the results of GSE50660-originated paper
paper_results <- read_xlsx(
  path = "Data/Known smoking-associated CpGs.xlsx",
  sheet = "GSE50660"
)

# Probes shared with the paper's list, then "<recovered>/<reported>" per column.
overlap <- lapply(result, intersect, y = paper_results$CpGs)
reccurence <- as.data.frame(lapply(overlap, function(shared) {
  sprintf("%s/%s", length(shared), length(paper_results$CpGs))
}))

reccurence <- reccurence[, c("BH", "ST", selected_measure)]
reccurence

##      BH    ST  mean  sd.b   mad precision   dhs   chr refgene.pos   dip
## 1 27/30 27/30 29/30 28/30 26/30     27/30 27/30 27/30       27/30 27/30
##   probe.type cpg.loc  sd.m
## 1      27/30   27/30 27/30

# extra findings by covariates
# First check whether BH and ST flag exactly the same probes; only BH is used
# as the baseline when the extra findings are computed below.
identical(result[["BH"]], result[["ST"]])

## [1] TRUE

# Set the BH discoveries aside as the baseline and keep only the
# covariate-specific results.
BH <- result[["BH"]]
result[c("BH", "ST")] <- NULL

# Probes found with a covariate that are neither BH discoveries nor in the
# GSE50660 paper's list.
extra_findings <- lapply(result, function(found) {
  setdiff(found, union(BH, paper_results$CpGs))
})
extra_findings <- extra_findings[selected_measure]
extra_findings

## $mean
##  [1] "cg04885881" "cg12856965" "cg26337070" "cg03147185" "cg13185177"
##  [6] "cg19719391" "cg00300637" "cg17924476" "cg24688690" "cg03774957"
## [11] "cg08149865" "cg17476951" "cg19254163" "cg07986378" "cg08730245"
## [16] "cg10322443" "cg18092474" "cg01207684" "cg16519923" "cg06972908"
## [21] "cg25683268" "cg15159987"
## 
## $sd.b
##  [1] "cg04885881" "cg22740783" "cg19827923" "cg19719391" "cg17924476"
##  [6] "cg08149865" "cg17476951" "cg19254163" "cg07986378" "cg18092474"
## [11] "cg06972908" "cg08149682" "cg14712058" "cg15159987"
## 
## $mad
##  [1] "cg04885881" "cg14179389" "cg22740783" "cg26337070" "cg19827923"
##  [6] "cg19719391" "cg17924476" "cg22132788" "cg08149865" "cg02526790"
## [11] "cg17476951" "cg13151811" "cg07986378" "cg08730245" "cg18092474"
## [16] "cg01207684" "cg06972908" "cg25683268" "cg08149682" "cg14712058"
## [21] "cg15159987"
## 
## $precision
## [1] "cg19827923" "cg19719391" "cg07986378" "cg14712058" "cg15159987"
## 
## $dhs
## character(0)
## 
## $chr
## character(0)
## 
## $refgene.pos
## [1] "cg06972908"
## 
## $dip
## character(0)
## 
## $probe.type
## character(0)
## 
## $cpg.loc
## character(0)
## 
## $sd.m
## character(0)

# see if the extra findings are in the list of other two papers
race_results <- read_xlsx(
  path = "Data/Known smoking-associated CpGs.xlsx",
  sheet = "different_population"
)
# Extra findings that also appear in the "different_population" sheet.
overlap_1 <- lapply(extra_findings, intersect, y = race_results$CpGs)
overlap_1

## $mean
## [1] "cg19254163" "cg07986378" "cg15159987"
## 
## $sd.b
## [1] "cg19827923" "cg19254163" "cg07986378" "cg15159987"
## 
## $mad
## [1] "cg19827923" "cg22132788" "cg07986378" "cg15159987"
## 
## $precision
## [1] "cg19827923" "cg07986378" "cg15159987"
## 
## $dhs
## character(0)
## 
## $chr
## character(0)
## 
## $refgene.pos
## character(0)
## 
## $dip
## character(0)
## 
## $probe.type
## character(0)
## 
## $cpg.loc
## character(0)
## 
## $sd.m
## character(0)

# Extra findings that also appear in the "multiple_reported" sheet.
multiple_results <- read_xlsx(
  path = "Data/Known smoking-associated CpGs.xlsx",
  sheet = "multiple_reported"
)
overlap_2 <- lapply(extra_findings, intersect, y = multiple_results$CpGs)
overlap_2

## $mean
## [1] "cg04885881"
## 
## $sd.b
## [1] "cg04885881"
## 
## $mad
## [1] "cg04885881" "cg22132788"
## 
## $precision
## character(0)
## 
## $dhs
## character(0)
## 
## $chr
## character(0)
## 
## $refgene.pos
## character(0)
## 
## $dip
## character(0)
## 
## $probe.type
## character(0)
## 
## $cpg.loc
## character(0)
## 
## $sd.m
## character(0)

# Pool the extra IHW findings over all covariates into one de-duplicated
# vector of CpG IDs.
IHW <- unique(unlist(extra_findings)) # save for later use

3.1.2 Method CAMT

# Significant probes found by CAMT, one element per column of its p-value table.
result <- sig_probes[["CAMT"]]

# compare with the results of GSE50660-originated paper
# Probes shared with the paper's list, then "<recovered>/<reported>" per column.
overlap <- lapply(result, intersect, y = paper_results$CpGs)
reccurence <- as.data.frame(lapply(overlap, function(shared) {
  sprintf("%s/%s", length(shared), length(paper_results$CpGs))
}))
reccurence <- reccurence[, c("BH", "ST", selected_measure)]
reccurence

##      BH    ST  mean  sd.b   mad precision   dhs   chr refgene.pos   dip
## 1 27/30 27/30 29/30 27/30 26/30     27/30 27/30 27/30       27/30 27/30
##   probe.type cpg.loc  sd.m
## 1      27/30   27/30 27/30

# extra findings by covariates
# First check whether BH and ST flag exactly the same probes; only BH is used
# as the baseline when the extra findings are computed below.
identical(result[["BH"]], result[["ST"]])

## [1] TRUE

# Set the BH discoveries aside as the baseline and keep only the
# covariate-specific results.
BH <- result[["BH"]]
result[c("BH", "ST")] <- NULL

# Probes found with a covariate that are neither BH discoveries nor in the
# GSE50660 paper's list.
extra_findings <- lapply(result, function(found) {
  setdiff(found, union(BH, paper_results$CpGs))
})
extra_findings <- extra_findings[selected_measure]
extra_findings

## $mean
##  [1] "cg04885881" "cg12856965" "cg22740783" "cg26337070" "cg03147185"
##  [6] "cg19827923" "cg00501876" "cg16845623" "cg13185177" "cg19719391"
## [11] "cg00300637" "cg17924476" "cg24688690" "cg03774957" "cg17979719"
## [16] "cg08149865" "cg17476951" "cg19254163" "cg20492912" "cg07986378"
## [21] "cg08730245" "cg10322443" "cg02342791" "cg18092474" "cg01207684"
## [26] "cg16519923" "cg06972908" "cg25683268" "cg08149682" "cg14712058"
## [31] "cg15159987"
## 
## $sd.b
##  [1] "cg22740783" "cg19827923" "cg19719391" "cg17924476" "cg08149865"
##  [6] "cg19254163" "cg07986378" "cg06972908" "cg08149682" "cg14712058"
## [11] "cg15159987"
## 
## $mad
## [1] "cg19719391" "cg19254163" "cg07986378" "cg06972908" "cg08149682"
## [6] "cg14712058" "cg15159987"
## 
## $precision
##  [1] "cg22740783" "cg26337070" "cg19719391" "cg17924476" "cg25949550"
##  [6] "cg07986378" "cg18092474" "cg01207684" "cg06972908" "cg08149682"
## [11] "cg14712058" "cg15159987"
## 
## $dhs
## [1] "cg22740783" "cg01207684"
## 
## $chr
## [1] "cg17924476" "cg24688690" "cg19254163" "cg13151811" "cg18092474"
## 
## $refgene.pos
## [1] "cg13151811" "cg07986378" "cg06972908"
## 
## $dip
## [1] "cg06972908"
## 
## $probe.type
## [1] "cg13151811" "cg06972908"
## 
## $cpg.loc
## [1] "cg07986378" "cg06972908"
## 
## $sd.m
## [1] "cg13151811" "cg14712058"

# see if the extra findings are in the list of other two papers
# Extra findings that also appear in the "different_population" sheet.
overlap_1 <- lapply(extra_findings, intersect, y = race_results$CpGs)
overlap_1

## $mean
## [1] "cg19827923" "cg00501876" "cg19254163" "cg07986378" "cg15159987"
## 
## $sd.b
## [1] "cg19827923" "cg19254163" "cg07986378" "cg15159987"
## 
## $mad
## [1] "cg19254163" "cg07986378" "cg15159987"
## 
## $precision
## [1] "cg25949550" "cg07986378" "cg15159987"
## 
## $dhs
## character(0)
## 
## $chr
## [1] "cg19254163"
## 
## $refgene.pos
## [1] "cg07986378"
## 
## $dip
## character(0)
## 
## $probe.type
## character(0)
## 
## $cpg.loc
## [1] "cg07986378"
## 
## $sd.m
## character(0)

# Extra findings that also appear in the "multiple_reported" sheet.
overlap_2 <- lapply(extra_findings, intersect, y = multiple_results$CpGs)
overlap_2

## $mean
## [1] "cg04885881"
## 
## $sd.b
## character(0)
## 
## $mad
## character(0)
## 
## $precision
## [1] "cg25949550"
## 
## $dhs
## character(0)
## 
## $chr
## character(0)
## 
## $refgene.pos
## character(0)
## 
## $dip
## character(0)
## 
## $probe.type
## character(0)
## 
## $cpg.loc
## character(0)
## 
## $sd.m
## character(0)

# Pool the extra CAMT findings over all covariates into one de-duplicated
# vector of CpG IDs.
CAMT <- unique(unlist(extra_findings)) # save for later use

3.2 Compare the methylation level of cg18092474 between groups

# Methylation values for cg18092474 together with each sample's group.
methy_data <- readRDS(file = "Data/cg18092474.RDS")

# count samples within each group
table(methy_data[["group"]])

## 
## current smoker  former smoker          never 
##             22            263            179

# Box plot of cg18092474 beta values per group, with a bracket and the
# adjusted p-values from the four methods drawn above the boxes.
fs16 <- methy_data %>%
  ggplot(aes(x = group, y = bVals, fill = group)) +
  stat_boxplot(geom = "errorbar", width = 0.3) +
  geom_boxplot(width = 0.6) +
  scale_x_discrete(labels = c("current smoker (n=22)", "former smoker (n=263)", "never (n=179)")) +
  # The original `y = 0.9:1` evaluates to 0.9, so the literal is used here.
  annotate("text", label = "BH: 0.090       ST: 0.088\nIHW: 0.021       CAMT: 0.019",
           x = 2, y = 0.9, color = "red") +
  # annotate() draws each bracket segment once; geom_segment() with constant
  # aesthetics would redraw it for every row of `methy_data`.
  annotate("segment", x = 1, xend = 3, y = 0.9, yend = 0.9, size = 0.25) +
  annotate("segment", x = 1, xend = 1, y = 0.8, yend = 0.9, size = 0.1) +
  annotate("segment", x = 3, xend = 3, y = 0.85, yend = 0.9, size = 0.1) +
  ylim(0, 1) +
  xlab("Group") +
  theme_bw() +
  theme(legend.position = "none")
fs16

3.3 Intersection of methods IHW and CAMT

# Venn diagram of the pooled extra findings of IHW and CAMT.
# Start from a blank page: draw.pairwise.venn() draws onto the current grid
# page, so without this the diagram can be overlaid on the previous figure.
grid::grid.newpage()
fs17 <- draw.pairwise.venn(area1 = length(IHW),
                           area2 = length(CAMT),
                           cross.area = length(intersect(IHW, CAMT)),
                           category = c("IHW", "CAMT"),
                           fill = c("pink", "skyblue"),
                           cat.pos = c(0, 0),
                           scaled = FALSE)