I have 2 scRNA seq datasets that I'm trying to merge and correct for batch effect by mnnCorrect. I'm generally using Seurat for my analysis but I've used scran/scater for the normalization step. My original data sets do not have the same number of genes but I've subsetted the data based on highly variable genes shared between the two dataset ( 698 genes). I still get the error indicating that the number of rows is not the same across batches. Any idea why?
Here's my code...
Cont_MF <- Read10X(data.dir = "/Users/snejat/Documents/Sarah_scData/Epelman_Sarah_cont/mm10_TDTomato")
MI_MF <- Read10X(data.dir = "/Users/snejat/Documents/Sarah_scData/Epelman_Sarah_M1_MFS/mm10_TDTomato")
dim(MI_MF) # 28001 4697
dim(Cont_MF) #28001 1806
Cont_MFs <- CreateSeuratObject(raw.data = Cont_MF, min.cells = 3, min.genes = 200,
project = "Cont_MFs")
MI_MFs <- CreateSeuratObject(raw.data = MI_MF, min.cells = 3, min.genes = 200,
project = "MI_MFs")
mito.genes <- grep(pattern = "^mt-", x = rownames(x = Cont_MFs@data), value = TRUE)
percent.mito <- Matrix::colSums(Cont_MFs@raw.data[mito.genes, ])/Matrix::colSums(Cont_MFs@raw.data)
mito.genes <- grep(pattern = "^mt-", x = rownames(x = MI_MFs@data), value = TRUE)
percent.mito <- Matrix::colSums(MI_MFs@raw.data[mito.genes, ])/Matrix::colSums(MI_MFs@raw.data)
length(mito.genes) #13
Cont_MFs <- AddMetaData(object = Cont_MFs, metadata = percent.mito, col.name = "percent.mito")
MI_MFs <- AddMetaData(object = MI_MFs, metadata = percent.mito, col.name = "percent.mito")
Cont_MFs2 <- FilterCells(object = Cont_MFs, subset.names = c("nGene", "percent.mito"),
low.thresholds = c(1100, -Inf), high.thresholds = c(5500, 0.1))
MI_MFs2 <- FilterCells(object = MI_MFs, subset.names = c("nGene", "percent.mito"),
low.thresholds = c(1000, -Inf), high.thresholds = c(4800, 0.1))
dim(Cont_MFs2@data) # 14263 1730
dim(MI_MFs2@data) #14784 4626
Cont_MFs2_meta <-Cont_MFs2@meta.data
MI_MFs2_meta <-MI_MFs2@meta.data
Cont_MFs2 <- as.matrix(Cont_MFs2@data)
MI_MFs2 <- as.matrix(MI_MFs2@data)
Cont_sce <- SingleCellExperiment(assays = list(counts = Cont_MFs2))
MI_sce <- SingleCellExperiment(assays = list(counts = MI_MFs2))
#compute sum factors for normalization
Cont_sce <- scran::computeSumFactors(Cont_sce)
MI_sce <- scran::computeSumFactors(MI_sce)
#normalize data - uses the above sum factors
Cont_sce <- normalize(Cont_sce)
MI_sce <- normalize(MI_sce)
Cont_sce_norm <- as.matrix(exprs(Cont_sce))
MI_sce_norm <- as.matrix(exprs(MI_sce))
#reverse the log2 + 1
Cont_norm_nolog <- (2^(Cont_sce_norm)) -1
MI_norm_nolog <- (2^(MI_sce_norm)) -1
#natural log + 1
Cont_norm_ln <- log(Cont_norm_nolog + 1)
MI_norm_ln <- log(MI_norm_nolog + 1)
#load your original data into seurat
Cont_MFs2 <- CreateSeuratObject(raw.data = Cont_MFs2, min.cells = 3, min.genes = 300, project = "Cont_MI", meta.data = Cont_MFs2_meta)
MI_MFs2 <- CreateSeuratObject(raw.data = MI_MFs2, min.cells = 3, min.genes = 300, project = "MI_MI", meta.data = MI_MFs2_meta)
#load your newly normalized (natural log + 1) into the seurat.raw@data slot
Cont_MFs2@data <- Cont_norm_ln
MI_MFs2@data <- MI_norm_ln
Cont_MFs2 <- FindVariableGenes(object = Cont_MFs2, mean.function = ExpMean, dispersion.function = LogVMR, do.plot = TRUE, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
length( Cont_MFs2@var.genes)
[1] 1165
MI_MFs2 <- FindVariableGenes(object = MI_MFs2, mean.function = ExpMean, dispersion.function = LogVMR, do.plot = TRUE, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
length(MI_MFs2@var.genes)
[1] 1020
common_hvgs <- intersect(Cont_MFs2@var.genes,MI_MFs2@var.genes)
length(common_hvgs) # 698
Cont_MFs2_matrix <- as.matrix(Cont_MFs2@data)
MI_MFs2_matrix <- as.matrix(MI_MFs2@data)
Cont_MFs2_matrix_sample <- as.matrix(Cont_MFs2_matrix [common_hvgs, ])
MI_MFs2_matrix_sample <- as.matrix(MI_MFs2_matrix [common_hvgs, ])
dim(Cont_MFs2_matrix_sample)
[1] 698 1730
dim(MI_MFs2_matrix_sample)
[1] 698 4626
mnn_correction <- mnnCorrect(Cont_MFs2_matrix_sample , MI_MFs2_matrix_sample, k=20, sigma=1, cos.norm.in=FALSE, cos.norm.out=FALSE, subset.row=common_hvgs, var.adj = TRUE )
Error in mnnCorrect(Cont_MFs2_matrix_sample, MI_MFs2_matrix_sample, k = 20, :
number of rows is not the same across batches
Here's the session info
> sessionInfo()
R version 3.4.4 (2018-03-15)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] BiocInstaller_1.28.0 Rtsne_0.13
[3] scales_0.5.0 limma_3.34.9
[5] scater_1.6.3 scran_1.6.9
[7] SingleCellExperiment_1.0.0 SummarizedExperiment_1.8.1
[9] DelayedArray_0.4.1 matrixStats_0.53.1
[11] Biobase_2.38.0 GenomicRanges_1.30.3
[13] GenomeInfoDb_1.14.0 IRanges_2.12.0
[15] S4Vectors_0.16.0 BiocGenerics_0.24.0
[17] BiocParallel_1.12.0 dplyr_0.7.6
[19] Seurat_2.3.3 Matrix_1.2-14
[21] cowplot_0.9.2 ggplot2_3.0.0
loaded via a namespace (and not attached):
[1] shinydashboard_0.7.0 R.utils_2.6.0 tidyselect_0.2.4
[4] RSQLite_2.1.1 AnnotationDbi_1.40.0 htmlwidgets_1.2
[7] grid_3.4.4 trimcluster_0.1-2 ranger_0.10.1
[10] munsell_0.5.0 codetools_0.2-15 ica_1.0-2
[13] DT_0.4 statmod_1.4.30 withr_2.1.2
[16] colorspace_1.3-2 knitr_1.20 rstudioapi_0.7
[19] geometry_0.3-6 ROCR_1.0-7 robustbase_0.93-1
[22] dtw_1.20-1 dimRed_0.1.0 lars_1.2
[25] tximport_1.6.0 GenomeInfoDbData_1.0.0 mnormt_1.5-5
[28] bit64_0.9-7 rhdf5_2.22.0 ipred_0.9-6
[31] diptest_0.75-7 R6_2.2.2 ggbeeswarm_0.6.0
[34] VGAM_1.0-5 locfit_1.5-9.1 flexmix_2.3-14
[37] DRR_0.0.3 bitops_1.0-6 assertthat_0.2.0
[40] SDMTools_1.1-221 nnet_7.3-12 beeswarm_0.2.3
[43] gtable_0.2.0 ddalpha_1.3.4 timeDate_3043.102
[46] rlang_0.2.1 CVST_0.2-2 scatterplot3d_0.3-41
[49] RcppRoll_0.3.0 splines_3.4.4 lazyeval_0.2.1
[52] ModelMetrics_1.1.0 acepack_1.4.1 broom_0.4.5
[55] checkmate_1.8.5 reshape2_1.4.3 abind_1.4-5
[58] backports_1.1.2 httpuv_1.4.4.2 Hmisc_4.1-1
[61] caret_6.0-80 tools_3.4.4 lava_1.6.2
[64] psych_1.8.4 gplots_3.0.1 RColorBrewer_1.1-2
[67] proxy_0.4-22 dynamicTreeCut_1.63-1 ggridges_0.5.0
[70] Rcpp_0.12.17 plyr_1.8.4 base64enc_0.1-3
[73] progress_1.2.0 zlibbioc_1.24.0 purrr_0.2.5
[76] RCurl_1.95-4.10 prettyunits_1.0.2 rpart_4.1-13
[79] pbapply_1.3-4 viridis_0.5.1 zoo_1.8-2
[82] sfsmisc_1.1-2 cluster_2.0.7-1 magrittr_1.5
[85] data.table_1.11.4 lmtest_0.9-36 RANN_2.5.1
[88] mvtnorm_1.0-8 fitdistrplus_1.0-9 mime_0.5
[91] xtable_1.8-2 XML_3.98-1.11 mclust_5.4.1
[94] gridExtra_2.3 compiler_3.4.4 biomaRt_2.34.2
[97] tibble_1.4.2 KernSmooth_2.23-15 R.oo_1.22.0
[100] htmltools_0.3.6 segmented_0.5-3.0 Formula_1.2-3
[103] snow_0.4-2 tidyr_0.8.1 tclust_1.4-1
[106] lubridate_1.7.4 DBI_1.0.0 diffusionMap_1.1-0
[109] magic_1.5-8 MASS_7.3-50 fpc_2.1-11
[112] R.methodsS3_1.7.1 gdata_2.18.0 metap_0.9
[115] bindr_0.1.1 gower_0.1.2 igraph_1.2.1
[118] pkgconfig_2.0.1 sn_1.5-2 numDeriv_2016.8-1
[121] foreign_0.8-70 recipes_0.1.3 foreach_1.4.4
[124] vipor_0.4.5 XVector_0.18.0 prodlim_2018.04.18
[127] stringr_1.3.1 digest_0.6.15 tsne_0.1-3
[130] htmlTable_1.12 edgeR_3.20.9 kernlab_0.9-26
[133] shiny_1.1.0 gtools_3.8.1 modeltools_0.2-21
[136] rjson_0.2.20 nlme_3.1-137 bindrcpp_0.2.2
[139] viridisLite_0.3.0 pillar_1.2.3 lattice_0.20-35
[142] DEoptimR_1.0-8 survival_2.42-4 glue_1.2.0
[145] FNN_1.1 png_0.1-7 prabclus_2.2-6
[148] iterators_1.0.9 bit_1.1-14 class_7.3-14
[151] stringi_1.2.3 mixtools_1.1.0 blob_1.1.1
[154] doSNOW_1.0.16 latticeExtra_0.6-28 caTools_1.17.1
[157] memoise_1.1.0 irlba_2.3.2 ape_5.1
You're lucky that the support site is my homepage, otherwise I would have never seen this post. For future reference, please tag the post with the package name, otherwise the package maintainers don't get notified.
As for your actual problem, please read the posting guide, and provide some code and your session information.
P.S. When adding new information, edit your existing post. Do not "add an answer". Unless you solved your own problem.