I'm testing analyzing an example dataset from the ParseBioscience, they have a tutorial with Seurat but I tried to use Bioconductor instead.
I'm having troubles running a PCA because there is some problem with the data from a dgTMatrix (from the Matrix package).
The data files read initially are a PBMC dataset provided by them:
library("Matrix")
library("SingleCellExperiment")
library("scuttle")
library("BiocSingular")
mat <- readMM("data/PBMCS/DGE2.mtx")
metadata <- read.csv("data/PBMCS/cell_metadata2.csv")
genes <- read.csv("data/PBMCS/all_genes2.csv")
matt <- t(mat) # To convert to genes x samples
sce <- SingleCellExperiment(assays = SimpleList(counts = matt), rowData = genes, colData = metadata)
scee <- sce |>
logNormCounts() |>
runPCA(rank = 50)
## dimnames(.) <- NULL translated to
## dimnames(.) <- list(NULL,NULL)
## dimnames(.) <- NULL translated to
## dimnames(.) <- list(NULL,NULL)
## dimnames(.) <- NULL translated to
## dimnames(.) <- list(NULL,NULL)
## Error in base::colMeans(x, na.rm = na.rm, dims = dims, ...) :
## 'x' must be an array of at least two dimensions
I'm using BiocSingular 1.16.0 and I don't see anything weird in the data, the process or the traceback:
traceback()
16: stop("'x' must be an array of at least two dimensions")
15: base::colMeans(x, na.rm = na.rm, dims = dims, ...)
14: colMeans(x)
13: colMeans(x)
12: .compute_center_and_scale(x, center, scale)
11: standardize_matrix(x, center = center, scale = scale, deferred = deferred,
BPPARAM = BPPARAM)
10: (function (x, k = min(dim(x)), nu = k, nv = k, center = FALSE,
scale = FALSE, deferred = FALSE, fold = Inf, BPPARAM = SerialParam())
{
checked <- check_numbers(x, k = k, nu = nu, nv = nv)
k <- checked$k
nv <- checked$nv
nu <- checked$nu
old <- getAutoBPPARAM()
setAutoBPPARAM(BPPARAM)
on.exit(setAutoBPPARAM(old))
if (!.bpisup2(BPPARAM)) {
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM), add = TRUE)
}
x <- standardize_matrix(x, center = center, scale = scale,
deferred = deferred, BPPARAM = BPPARAM)
if (use_crossprod(x, fold)) {
res <- svd_via_crossprod(x, k = k, nu = nu, nv = nv,
FUN = safe_svd)
}
...
9: do.call(FUN, c(list(x = x, k = k, nu = nu, nv = nv, center = center,
scale = scale, BPPARAM = BPPARAM, ...), ARGS(BSPARAM)))
8: (new("standardGeneric", .Data = function (x, k, nu = k, nv = k,
center = FALSE, scale = FALSE, BPPARAM = SerialParam(), ...,
BSPARAM = ExactParam())
standardGeneric("runSVD"), generic = structure("runSVD", package = "BiocSingular"),
package = "BiocSingular", group = list(), valueClass = character(0),
signature = "BSPARAM", default = NULL, skeleton = (function (x,
k, nu = k, nv = k, center = FALSE, scale = FALSE, BPPARAM = SerialParam(),
..., BSPARAM = ExactParam())
stop(gettextf("invalid call in method dispatch to '%s' (no default method)",
"runSVD"), domain = NA))(x, k, nu, nv, center, scale,
BPPARAM, ..., BSPARAM = BSPARAM)))(x = new("SingleCellExperiment",
int_elementMetadata = new("DFrame", rownames = NULL, nrows = 62703L,
elementType = "ANY", elementMetadata = NULL, metadata = list(),
listData = list(rowPairs = new("DFrame", rownames = NULL,
nrows = 62703L, elementType = "ANY", elementMetadata = NULL,
metadata = list(), listData = structure(list(), names = character(0))))),
int_colData = new("DFrame", rownames = NULL, nrows = 5017L,
elementType = "ANY", elementMetadata = NULL, metadata = list(),
listData = list(reducedDims = new("DFrame", rownames = NULL,
nrows = 5017L, elementType = "ANY", elementMetadata = NULL,
...
7: (new("standardGeneric", .Data = function (x, k, nu = k, nv = k,
center = FALSE, scale = FALSE, BPPARAM = SerialParam(), ...,
BSPARAM = ExactParam())
standardGeneric("runSVD"), generic = structure("runSVD", package = "BiocSingular"),
package = "BiocSingular", group = list(), valueClass = character(0),
signature = "BSPARAM", default = NULL, skeleton = (function (x,
k, nu = k, nv = k, center = FALSE, scale = FALSE, BPPARAM = SerialParam(),
..., BSPARAM = ExactParam())
stop(gettextf("invalid call in method dispatch to '%s' (no default method)",
"runSVD"), domain = NA))(x, k, nu, nv, center, scale,
BPPARAM, ..., BSPARAM = BSPARAM)))(x = new("SingleCellExperiment",
int_elementMetadata = new("DFrame", rownames = NULL, nrows = 62703L,
elementType = "ANY", elementMetadata = NULL, metadata = list(),
listData = list(rowPairs = new("DFrame", rownames = NULL,
nrows = 62703L, elementType = "ANY", elementMetadata = NULL,
metadata = list(), listData = structure(list(), names = character(0))))),
int_colData = new("DFrame", rownames = NULL, nrows = 5017L,
elementType = "ANY", elementMetadata = NULL, metadata = list(),
listData = list(reducedDims = new("DFrame", rownames = NULL,
nrows = 5017L, elementType = "ANY", elementMetadata = NULL,
...
6: do.call(FUN, c(list(x = x, k = k, nu = nu, nv = nv, center = center,
scale = scale, BPPARAM = BPPARAM, ...), ARGS(BSPARAM)))
5: runSVD(x, k = rank, nu = ifelse(get.pcs, rank, 0), nv = ifelse(get.rotation,
rank, 0), center = center, scale = scale, ...)
4: runSVD(x, k = rank, nu = ifelse(get.pcs, rank, 0), nv = ifelse(get.rotation,
rank, 0), center = center, scale = scale, ...)
3: .local(x, ...)
2: runPCA(logNormCounts(sce), rank = 50)
1: runPCA(logNormCounts(sce), rank = 50)
BiocManager::version()
## [1] '3.17'
is(matt)
## [1] "dgTMatrix" "TsparseMatrix" "dsparseMatrix" "generalMatrix" "dMatrix" "sparseMatrix" "compMatrix"
## [8] "Matrix" "xMatrix" "mMatrix" "replValueSp"
matt[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgTMatrix"
##
## [1,] . . . . .
## [2,] . . . . .
## [3,] . 1 . . 1
## [4,] 2 1 . 1 1
## [5,] . . . . .
head(genes)
## gene_id gene_name genome
## 1 ENSG00000000003 TSPAN6 hg38
## 2 ENSG00000000005 TNMD hg38
## 3 ENSG00000000419 DPM1 hg38
## 4 ENSG00000000457 SCYL3 hg38
## 5 ENSG00000000460 C1orf112 hg38
## 6 ENSG00000000938 FGR hg38
head(metadata)
## bc_wells sample species gene_count tscp_count mread_count bc1_well bc2_well bc3_well bc1_wind bc2_wind
## 1 01_01_14 Sample_1 hg38 3077 7477 29532 A1 A1 B2 1 1
## 2 01_01_31 Sample_1 hg38 2619 5776 21785 A1 A1 C7 1 1
## 3 01_01_66 Sample_1 hg38 2648 5906 22928 A1 A1 F6 1 1
## 4 01_01_70 Sample_1 hg38 2339 4698 18252 A1 A1 F10 1 1
## 5 01_02_03 Sample_1 hg38 2009 4001 15789 A1 A2 A3 1 2
## 6 01_02_62 Sample_1 hg38 2481 5093 19933 A1 A2 F2 1 2
## bc3_wind gene_count2
## 1 14 3077
## 2 31 2619
## 3 66 2648
## 4 70 2339
## 5 3 2009
## 6 62 2481
Thanks for the tip to convert it to dgCMatrix I had a 25% memory decrease. It is the first time using this big datasets and I am not familiar with all the practical memory reduction tricks. Precisely what I want to know with this approach is if the droplet based methods work with the combinatorics approach used here.