Hi!
I've been analyzing DNA-methylation data from 48 samples from two different treatment groups, and have used BiSeq to try to find significantly differentially methylated CpGs. However, I have two issues that I would like your support on.
First, I find that the sill of the variogram for the re-sampled (null-grouped) data is larger than 1. More specifically around 1.2. My statistical understanding is not that great, so I was wondering if this is an issue? I seem to run into an error if I put it above 1 when smoothing the variogram.
Secondly, I find that the output of the trimClusters function has CpGs with negative p.li values. My understanding is that these are P-values, and should therefore not be negative. Any idea what might be the cause?
I hope you can help! I have attached the code that was used in my analysis as well as session-info below. Let me know if you need more!
library(BiSeq)
library(stringr)
library(betareg)
out_dir <- "biseq_output"
## -------------------------------------------------------------------------------------------
col_data <- readxl::read_xlsx("metadata_RRBS.xlsx")
col_data$Sample.ID <- col_data$`Sample ID (Library ID)`
files <- gtools::mixedsort(list.files("scripts", pattern = "*cov.gz", full.names = TRUE))
names(files) <- str_remove_all(string = basename(files),
pattern = "_S[[:digit:]]+_R[[:digit:]]+_001.UMI_trimmed.fq_trimmed_bismark_bt2.deduplicated.bismark.cov.gz")
## -------------------------------------------------------------------------------------------
rrbs_raw <- readBismark(unname(files[col_data$Sample.ID]), col_data)
saveRDS(rrbs_raw, file=file.path(out_dir, "rrbs_raw.RDS"))
cov_stat <- do.call(covStatistics(rrbs_raw), what = "rbind")
colnames(cov_stat) <- rrbs_raw$Sample.ID
write.csv(cov_stat, file = file.path(out_dir, "coverage_statistics.csv"))
pdf(file.path(out_dir, "coverage_boxplot_pre_smooth.pdf"))
covBoxplots(rrbs_raw, col = "cornflowerblue", las = 2)
dev.off()
## -------------------------------------------------------------------------------------------
rrbs_clust_unlim <- clusterSites(object = rrbs_raw,
groups = as.factor(colData(rrbs_raw)$Condition1),
perc.samples = 4/5,
min.sites = 20,
max.dist = 100)
clusters <- clusterSitesToGR(rrbs_clust_unlim)
saveRDS(clusters, file = file.path(out_dir, "clusters.RDS"))
## -------------------------------------------------------------------------------------------
ind_cov <- totalReads(rrbs_clust_unlim) > 0
quant <- quantile(totalReads(rrbs_clust_unlim)[ind_cov], 0.9)
rrbs_clust_lim <- limitCov(rrbs_clust_unlim, maxCov = quant)
pdf(file.path(out_dir, "coverage_boxplot_post_smooth.pdf"))
covBoxplots(rrbs_clust_lim, col = "cornflowerblue", las = 2)
dev.off()
## -------------------------------------------------------------------------------------------
predicted_meth <- predictMeth(object = rrbs_clust_lim, mc.cores = 80)
saveRDS(predicted_meth, file.path(out_dir, "predicted_meth.RDS"))
pdf(file.path(out_dir, "smoothed_methylation_plot.pdf"))
plotMeth(object.raw = rrbs_raw[,1],
object.rel = predicted_meth[,1],
region = clusters[1],
lwd.lines = 2,
col.points = "blue",
cex = 1.5)
dev.off()
## -------------------------------------------------------------------------------------------
beta_results <- betaRegression(formula = ~ Condition1,
link = "probit",
object = predicted_meth,
type = "BR",
mc.cores = 80)
saveRDS(beta_results, file.path(out_dir,"beta_results.RDS"))
## -------------------------------------------------------------------------------------------
is_control <- which(predicted_meth$Condition1=="Control")[c(-23)]
is_covid <- which(predicted_meth$Condition1=="COVID+")[c(-23, -24, -25)]
predictedMethNull <- predicted_meth[,c(is_control,is_covid)]
colData(predictedMethNull)$group.null <- rep(c(1,2), 22)
#To shorten the run time, please set mc.cores, if possible!
betaResultsNull <- betaRegression(formula = ~group.null,
link = "probit",
object = predictedMethNull,
type="BR", mc.cores = 80)
saveRDS(betaResultsNull, file=file.path(out_dir, "beta_results_null.RDS"))
## -------------------------------------------------------------------------------------------
# make variogram
vario <- makeVariogram(betaResultsNull)
pdf(file.path(out_dir, "variogram.pdf"))
plot(vario$variogram$v)
dev.off()
# smooth variogram using sill
vario_sm <- smoothVariogram(vario, sill = 0.99)
pdf(file.path(out_dir,"variogram_combined.pdf"))
plot(vario$variogram$v)
lines(vario_sm$variogram[,c("h", "v.sm")],
col = "red", lwd = 1.5)
grid()
dev.off()
## -------------------------------------------------------------------------------------------
vario_aux <- makeVariogram(beta_results, make.variogram=FALSE)
vario_sm$pValsList <- vario_aux$pValsList
locCor <- estLocCor(vario_sm)
clusters_rej <- testClusters(locCor, FDR.cluster = 0.1)
clusters_trimmed <- trimClusters(clusters_rej, FDR.loc = 0.05)
saveRDS(clusters_trimmed, file = file.path(out_dir,"clusters_trimmed.RDS"))
## -------------------------------------------------------------------------------------------
DMRs <- findDMRs(clusters_trimmed,
max.dist = 100,
diff.dir = TRUE)
saveRDS(DMRs, file = file.path(out_dir,"DMRs.RDS"))
> tail(clusters_trimmed,10)
chr pos p.val meth.group1 meth.group2 meth.diff estimate std.error pseudo.R.sqrt cluster.id z.score pos.new p.li
7_313.27.53815 7 1668600 4.988307e-01 0.37060148 0.353865637 0.01673585 -0.04464407 0.06600923 0.009689202 7_313 0.002931076 3904 -2.8483686879
7_313.28.53815 7 1668652 2.002920e-01 0.32829334 0.299843621 0.02844971 -0.08021965 0.06263636 0.033975194 7_313 0.840578653 3956 -2.9172344947
7_313.29.53815 7 1668670 2.284003e-01 0.34206546 0.312274171 0.02979129 -0.08258177 0.06856162 0.029649915 7_313 0.744125318 3974 -2.8126457785
7_313.31.53815 7 1668685 2.715850e-01 0.31819458 0.291411358 0.02678322 -0.07651293 0.06959392 0.024190005 7_313 0.608026364 3989 -2.6679710048
7_313.32.53815 7 1668765 3.232215e-02 0.24241993 0.203622844 0.03879708 -0.13021083 0.06083471 0.096301926 7_313 1.847710023 4069 -0.6660171024
7_313.36.53815 7 1668799 8.771825e-02 0.18883453 0.163117196 0.02571733 -0.09952767 0.05828653 0.060647147 7_313 1.354940543 4103 -1.2672760201
9_120.15.59685 9 18315698 3.424942e-07 0.90168580 0.964242419 -0.06255662 0.51097516 0.10022219 0.289505153 9_120 4.965572121 129 0.0002328785
9_120.16.59685 9 18315701 3.064997e-07 0.89448831 0.962151028 -0.06766272 0.52545587 0.10264001 0.294130231 9_120 4.987076041 132 0.0001826686
9_120.17.59685 9 18315702 2.944730e-07 0.89227377 0.961521061 -0.06924729 0.52990538 0.10335684 0.295489987 9_120 4.994807126 133 0.0001680384
9_847.20.60926 9 121651359 1.411898e-05 0.02192601 0.008299454 0.01362656 -0.37997114 0.08751057 0.157339873 9_847 4.187229948 20 0.0009093949
## -------------------------------------------------------------------------------------------
**sessionInfo( )**
R version 4.0.3 (2020-10-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /services/tools/intel/perflibs/2020_update4/compilers_and_libraries_2020.4.304/linux/mkl/lib/intel64_lin/libmkl_intel_lp64.so
locale:
[1] LC_CTYPE=en_US.utf-8 LC_NUMERIC=C
[3] LC_TIME=en_US.utf-8 LC_COLLATE=en_US.utf-8
[5] LC_MONETARY=en_US.utf-8 LC_MESSAGES=en_US.utf-8
[7] LC_PAPER=en_US.utf-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.utf-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] betareg_3.1-3 stringr_1.4.0
[3] BiSeq_1.30.0 Formula_1.2-4
[5] SummarizedExperiment_1.20.0 Biobase_2.50.0
[7] MatrixGenerics_1.2.1 matrixStats_0.61.0
[9] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
[11] IRanges_2.24.1 S4Vectors_0.28.1
[13] BiocGenerics_0.36.1
loaded via a namespace (and not attached):
[1] zoo_1.8-9 modeltools_0.2-23 splines_4.0.3
[4] lattice_0.20-41 vctrs_0.3.8 rtracklayer_1.50.0
[7] blob_1.2.2 XML_4.0-0 survival_3.2-7
[10] rlang_1.0.0 DBI_1.1.2 BiocParallel_1.24.1
[13] bit64_4.0.5 GenomeInfoDbData_1.2.4 zlibbioc_1.36.0
[16] Biostrings_2.58.0 globaltest_5.44.0 memoise_2.0.1
[19] fastmap_1.1.0 lmtest_0.9-39 flexmix_2.3-17
[22] AnnotationDbi_1.52.0 Rcpp_1.0.8 xtable_1.8-4
[25] cachem_1.0.6 DelayedArray_0.16.3 annotate_1.68.0
[28] XVector_0.30.0 bit_4.0.4 Rsamtools_2.6.0
[31] stringi_1.7.6 grid_4.0.3 cli_3.1.1
[34] tools_4.0.3 bitops_1.0-7 magrittr_2.0.2
[37] sandwich_3.0-1 RCurl_1.98-1.5 RSQLite_2.2.9
[40] lokern_1.1-9 crayon_1.4.2 Matrix_1.3-4
[43] httr_1.4.2 R6_2.5.1 GenomicAlignments_1.26.0
[46] sfsmisc_1.1-7 nnet_7.3-14 compiler_4.0.3
Here's an image of the created variogram, showing the actual sill is around 1.3 (and not 1 as indicated by the red line).