I am obtaining unexpected results with the MEDIPS
package to identify DMRs with the MEDIPS.meth
function (which calls edgeR
under the hood) that could potentially be a bug. In particular, when the ordering of the samples within one sample group is altered, the results are different. I would expect that no matter the sample order within group, the results would be identical. Below is a reprex that uses the MEDIPSData example data and illustrates the behaviour I am referring to.
library(MEDIPSData)
library(MEDIPS)
#> Loading required package: BSgenome
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#> dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#> grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#> union, unique, unsplit, which, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:base':
#>
#> expand.grid
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: GenomicRanges
#> Loading required package: Biostrings
#> Loading required package: XVector
#>
#> Attaching package: 'Biostrings'
#> The following object is masked from 'package:base':
#>
#> strsplit
#> Loading required package: rtracklayer
#> Loading required package: Rsamtools
library(BSgenome.Hsapiens.UCSC.hg19)
data(hESCs_MeDIP)
data(DE_MeDIP)
CS = MEDIPS.couplingVector(pattern = "CG", refObj = hESCs_MeDIP[[1]])
#> Get genomic sequence pattern positions...
#> Searching in chr22 ...[ 578097 ] found.
#> Number of identified CG pattern: 578097
#> Calculating genomic coordinates...
#> Creating Granges object for genome wide windows...
#> Counting the number of CG's in each window...
mr.edgeR = MEDIPS.meth(MSet1 = DE_MeDIP, MSet2 = hESCs_MeDIP,
CSet = CS, p.adj = "BH",
diff.method = "edgeR", MeDIP = F, CNV = F,
minRowSum = 10)
#> Calculating genomic coordinates...
#> Creating Granges object for genome wide windows...
#> Preprocessing MEDIPS SET 1 in MSet1...
#> Preprocessing MEDIPS SET 2 in MSet1...
#> Preprocessing MEDIPS SET 3 in MSet1...
#> Preprocessing MEDIPS SET 1 in MSet2...
#> Preprocessing MEDIPS SET 2 in MSet2...
#> Preprocessing MEDIPS SET 3 in MSet2...
#> Differential coverage analysis...
#> Extracting count windows with at least 10 reads...
#> Execute edgeR for count data of 168437 windows...
#> (Neglecting parameter 'type')
#> Creating a DGEList object...
#> Apply trimmed mean of M-values (TMM) for library sizes normalization...
#> Estimating common dispersion...
#> Estimating tagwise dispersion...
#> Calculating differential coverage...
#> Adjusting p.values for multiple testing...
#> Please note, log2 ratios are reported as log2(MSet1/MSet2).
#> Creating results table...
#> Adding differential coverage results...
mr.edgeR_alt = MEDIPS.meth(MSet1 = rev(DE_MeDIP), MSet2 = rev(hESCs_MeDIP),
CSet = CS, p.adj = "BH",
diff.method = "edgeR", MeDIP = F, CNV = F,
minRowSum = 10)
#> Calculating genomic coordinates...
#> Creating Granges object for genome wide windows...
#> Preprocessing MEDIPS SET 1 in MSet1...
#> Preprocessing MEDIPS SET 2 in MSet1...
#> Preprocessing MEDIPS SET 3 in MSet1...
#> Preprocessing MEDIPS SET 1 in MSet2...
#> Preprocessing MEDIPS SET 2 in MSet2...
#> Preprocessing MEDIPS SET 3 in MSet2...
#> Differential coverage analysis...
#> Extracting count windows with at least 10 reads...
#> Execute edgeR for count data of 168437 windows...
#> (Neglecting parameter 'type')
#> Creating a DGEList object...
#> Apply trimmed mean of M-values (TMM) for library sizes normalization...
#> Estimating common dispersion...
#> Estimating tagwise dispersion...
#> Calculating differential coverage...
#> Adjusting p.values for multiple testing...
#> Please note, log2 ratios are reported as log2(MSet1/MSet2).
#> Creating results table...
#> Adding differential coverage results...
colnames(mr.edgeR)
#> [1] "chr" "start"
#> [3] "stop" "DE.MeDIP.Rep1.chr22.bam.counts"
#> [5] "DE.MeDIP.Rep2.chr22.bam.counts" "DE.MeDIP.Rep3.chr22.bam.counts"
#> [7] "hESC.MeDIP.Rep1.chr22.bam.counts" "hESC.MeDIP.Rep2.chr22.bam.counts"
#> [9] "hESC.MeDIP.Rep3.chr22.bam.counts" "DE.MeDIP.Rep1.chr22.bam.rpkm"
#> [11] "DE.MeDIP.Rep2.chr22.bam.rpkm" "DE.MeDIP.Rep3.chr22.bam.rpkm"
#> [13] "hESC.MeDIP.Rep1.chr22.bam.rpkm" "hESC.MeDIP.Rep2.chr22.bam.rpkm"
#> [15] "hESC.MeDIP.Rep3.chr22.bam.rpkm" "MSets1.counts.mean"
#> [17] "MSets1.rpkm.mean" "MSets2.counts.mean"
#> [19] "MSets2.rpkm.mean" "edgeR.logFC"
#> [21] "edgeR.logCPM" "edgeR.p.value"
#> [23] "edgeR.adj.p.value"
# reorder columns to match original ordering
mr.edgeR_alt2 <- mr.edgeR_alt[,c(1:3, 6,5,4, 9,8,7, 12,11,10, 15,14,13, 16:23)]
# check that reordering worked
identical(colnames(mr.edgeR), colnames(mr.edgeR_alt2))
#> [1] TRUE
# Results are not identical
identical(mr.edgeR, mr.edgeR_alt2)
#> [1] FALSE
# data columns identical; edgeR results columns not
for (j in 1:ncol(mr.edgeR)){
print(paste0(colnames(mr.edgeR)[j], ": ",
identical(mr.edgeR[,j], mr.edgeR_alt2[,j])))
}
#> [1] "chr: TRUE"
#> [1] "start: TRUE"
#> [1] "stop: TRUE"
#> [1] "DE.MeDIP.Rep1.chr22.bam.counts: TRUE"
#> [1] "DE.MeDIP.Rep2.chr22.bam.counts: TRUE"
#> [1] "DE.MeDIP.Rep3.chr22.bam.counts: TRUE"
#> [1] "hESC.MeDIP.Rep1.chr22.bam.counts: TRUE"
#> [1] "hESC.MeDIP.Rep2.chr22.bam.counts: TRUE"
#> [1] "hESC.MeDIP.Rep3.chr22.bam.counts: TRUE"
#> [1] "DE.MeDIP.Rep1.chr22.bam.rpkm: TRUE"
#> [1] "DE.MeDIP.Rep2.chr22.bam.rpkm: TRUE"
#> [1] "DE.MeDIP.Rep3.chr22.bam.rpkm: TRUE"
#> [1] "hESC.MeDIP.Rep1.chr22.bam.rpkm: TRUE"
#> [1] "hESC.MeDIP.Rep2.chr22.bam.rpkm: TRUE"
#> [1] "hESC.MeDIP.Rep3.chr22.bam.rpkm: TRUE"
#> [1] "MSets1.counts.mean: TRUE"
#> [1] "MSets1.rpkm.mean: FALSE"
#> [1] "MSets2.counts.mean: TRUE"
#> [1] "MSets2.rpkm.mean: FALSE"
#> [1] "edgeR.logFC: FALSE"
#> [1] "edgeR.logCPM: FALSE"
#> [1] "edgeR.p.value: FALSE"
#> [1] "edgeR.adj.p.value: FALSE"
Created on 2020-01-24 by the reprex package (v0.3.0)
Brilliant! Thank you for clearing this up.