runPCA in a SingleCellExperiment results ina colMeans error
1
0
Entering edit mode
@lluis-revilla-sancho
Last seen 4 weeks ago
European Union

I'm testing analyzing an example dataset from the ParseBioscience, they have a tutorial with Seurat but I tried to use Bioconductor instead.

I'm having troubles running a PCA because there is some problem with the data from a dgTMatrix (from the Matrix package).
The data files read initially are a PBMC dataset provided by them:

library("Matrix")
library("SingleCellExperiment")
library("scuttle")
library("BiocSingular")
mat <- readMM("data/PBMCS/DGE2.mtx")
metadata <- read.csv("data/PBMCS/cell_metadata2.csv")
genes <- read.csv("data/PBMCS/all_genes2.csv")
matt <- t(mat) # To convert to genes x samples
sce <- SingleCellExperiment(assays = SimpleList(counts = matt), rowData = genes, colData = metadata)
scee <- sce |> 
  logNormCounts() |> 
  runPCA(rank = 50)
## dimnames(.) <- NULL translated to
## dimnames(.) <- list(NULL,NULL)
## dimnames(.) <- NULL translated to
## dimnames(.) <- list(NULL,NULL)
## dimnames(.) <- NULL translated to
## dimnames(.) <- list(NULL,NULL)
## Error in base::colMeans(x, na.rm = na.rm, dims = dims, ...) : 
##   'x' must be an array of at least two dimensions

I'm using BiocSingular 1.16.0 and I don't see anything weird in the data, the process or the traceback:

traceback()
16: stop("'x' must be an array of at least two dimensions")
15: base::colMeans(x, na.rm = na.rm, dims = dims, ...)
14: colMeans(x)
13: colMeans(x)
12: .compute_center_and_scale(x, center, scale)
11: standardize_matrix(x, center = center, scale = scale, deferred = deferred, 
        BPPARAM = BPPARAM)
10: (function (x, k = min(dim(x)), nu = k, nv = k, center = FALSE, 
        scale = FALSE, deferred = FALSE, fold = Inf, BPPARAM = SerialParam()) 
    {
        checked <- check_numbers(x, k = k, nu = nu, nv = nv)
        k <- checked$k
        nv <- checked$nv
        nu <- checked$nu
        old <- getAutoBPPARAM()
        setAutoBPPARAM(BPPARAM)
        on.exit(setAutoBPPARAM(old))
        if (!.bpisup2(BPPARAM)) {
            bpstart(BPPARAM)
            on.exit(bpstop(BPPARAM), add = TRUE)
        }
        x <- standardize_matrix(x, center = center, scale = scale, 
            deferred = deferred, BPPARAM = BPPARAM)
        if (use_crossprod(x, fold)) {
            res <- svd_via_crossprod(x, k = k, nu = nu, nv = nv, 
                FUN = safe_svd)
        }
     ...
9: do.call(FUN, c(list(x = x, k = k, nu = nu, nv = nv, center = center, 
       scale = scale, BPPARAM = BPPARAM, ...), ARGS(BSPARAM)))
8: (new("standardGeneric", .Data = function (x, k, nu = k, nv = k, 
       center = FALSE, scale = FALSE, BPPARAM = SerialParam(), ..., 
       BSPARAM = ExactParam()) 
   standardGeneric("runSVD"), generic = structure("runSVD", package = "BiocSingular"), 
       package = "BiocSingular", group = list(), valueClass = character(0), 
       signature = "BSPARAM", default = NULL, skeleton = (function (x, 
           k, nu = k, nv = k, center = FALSE, scale = FALSE, BPPARAM = SerialParam(), 
           ..., BSPARAM = ExactParam()) 
       stop(gettextf("invalid call in method dispatch to '%s' (no default method)", 
           "runSVD"), domain = NA))(x, k, nu, nv, center, scale, 
           BPPARAM, ..., BSPARAM = BSPARAM)))(x = new("SingleCellExperiment", 
       int_elementMetadata = new("DFrame", rownames = NULL, nrows = 62703L, 
           elementType = "ANY", elementMetadata = NULL, metadata = list(), 
           listData = list(rowPairs = new("DFrame", rownames = NULL, 
               nrows = 62703L, elementType = "ANY", elementMetadata = NULL, 
               metadata = list(), listData = structure(list(), names = character(0))))), 
       int_colData = new("DFrame", rownames = NULL, nrows = 5017L, 
           elementType = "ANY", elementMetadata = NULL, metadata = list(), 
           listData = list(reducedDims = new("DFrame", rownames = NULL, 
               nrows = 5017L, elementType = "ANY", elementMetadata = NULL, 
    ...
7: (new("standardGeneric", .Data = function (x, k, nu = k, nv = k, 
       center = FALSE, scale = FALSE, BPPARAM = SerialParam(), ..., 
       BSPARAM = ExactParam()) 
   standardGeneric("runSVD"), generic = structure("runSVD", package = "BiocSingular"), 
       package = "BiocSingular", group = list(), valueClass = character(0), 
       signature = "BSPARAM", default = NULL, skeleton = (function (x, 
           k, nu = k, nv = k, center = FALSE, scale = FALSE, BPPARAM = SerialParam(), 
           ..., BSPARAM = ExactParam()) 
       stop(gettextf("invalid call in method dispatch to '%s' (no default method)", 
           "runSVD"), domain = NA))(x, k, nu, nv, center, scale, 
           BPPARAM, ..., BSPARAM = BSPARAM)))(x = new("SingleCellExperiment", 
       int_elementMetadata = new("DFrame", rownames = NULL, nrows = 62703L, 
           elementType = "ANY", elementMetadata = NULL, metadata = list(), 
           listData = list(rowPairs = new("DFrame", rownames = NULL, 
               nrows = 62703L, elementType = "ANY", elementMetadata = NULL, 
               metadata = list(), listData = structure(list(), names = character(0))))), 
       int_colData = new("DFrame", rownames = NULL, nrows = 5017L, 
           elementType = "ANY", elementMetadata = NULL, metadata = list(), 
           listData = list(reducedDims = new("DFrame", rownames = NULL, 
               nrows = 5017L, elementType = "ANY", elementMetadata = NULL, 
    ...
6: do.call(FUN, c(list(x = x, k = k, nu = nu, nv = nv, center = center, 
       scale = scale, BPPARAM = BPPARAM, ...), ARGS(BSPARAM)))
5: runSVD(x, k = rank, nu = ifelse(get.pcs, rank, 0), nv = ifelse(get.rotation, 
       rank, 0), center = center, scale = scale, ...)
4: runSVD(x, k = rank, nu = ifelse(get.pcs, rank, 0), nv = ifelse(get.rotation, 
       rank, 0), center = center, scale = scale, ...)
3: .local(x, ...)
2: runPCA(logNormCounts(sce), rank = 50)
1: runPCA(logNormCounts(sce), rank = 50)
BiocManager::version()
## [1] '3.17'
is(matt)
##  [1] "dgTMatrix"     "TsparseMatrix" "dsparseMatrix" "generalMatrix" "dMatrix"       "sparseMatrix"  "compMatrix"   
##  [8] "Matrix"        "xMatrix"       "mMatrix"       "replValueSp" 
 matt[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgTMatrix"
##               
## [1,] . . . . .
## [2,] . . . . .
## [3,] . 1 . . 1
## [4,] 2 1 . 1 1
## [5,] . . . . .
head(genes)
##           gene_id gene_name genome
## 1 ENSG00000000003    TSPAN6   hg38
## 2 ENSG00000000005      TNMD   hg38
## 3 ENSG00000000419      DPM1   hg38
## 4 ENSG00000000457     SCYL3   hg38
## 5 ENSG00000000460  C1orf112   hg38
## 6 ENSG00000000938       FGR   hg38
head(metadata)
##   bc_wells   sample species gene_count tscp_count mread_count bc1_well bc2_well bc3_well bc1_wind bc2_wind
## 1 01_01_14 Sample_1    hg38       3077       7477       29532       A1       A1       B2        1        1
## 2 01_01_31 Sample_1    hg38       2619       5776       21785       A1       A1       C7        1        1
## 3 01_01_66 Sample_1    hg38       2648       5906       22928       A1       A1       F6        1        1
## 4 01_01_70 Sample_1    hg38       2339       4698       18252       A1       A1      F10        1        1
## 5 01_02_03 Sample_1    hg38       2009       4001       15789       A1       A2       A3        1        2
## 6 01_02_62 Sample_1    hg38       2481       5093       19933       A1       A2       F2        1        2
##   bc3_wind gene_count2
## 1       14        3077
## 2       31        2619
## 3       66        2648
## 4       70        2339
## 5        3        2009
## 6       62        2481
scran SingleCellExperiment • 1.3k views
ADD COMMENT
0
Entering edit mode
ATpoint ★ 4.6k
@atpoint-13662
Last seen 9 hours ago
Germany

Not sure why, but when you specify rank, then the runPCA from BiocSingular needs a matrix, else it can accept a SCE. Anyway, just use scater, this works with SCE well.

library(SingleCellExperiment)
library(scuttle)
library(BiocSingular)
library(scater)

sce <- mockSCE()
sce <- logNormCounts(sce)

# ok
BiocSingular::runPCA(sce)

# error
BiocSingular::runPCA(sce, rank=50)

# ok and should be identical to above
scater::runPCA(sce, ncomponents=50)

I would also use dgCMatrix, in my hands that has a smaller memory footprint and faster processing times for many functions. dgC rather than dgT is also used for example by DropletUtils: https://github.com/MarioniLab/DropletUtils/blob/devel/R/read10xCounts.R#L275

ADD COMMENT
0
Entering edit mode

Thanks for the tip to convert it to dgCMatrix I had a 25% memory decrease. It is the first time using this big datasets and I am not familiar with all the practical memory reduction tricks. Precisely what I want to know with this approach is if the droplet based methods work with the combinatorics approach used here.

ADD REPLY

Login before adding your answer.

Traffic: 543 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6