Problem opening hdf5 files previously created with HDF5Array functions
1
1
Entering edit mode
eric.kern13 ▴ 20
@erickern13-15260
Last seen 2.4 years ago
United States

Continuing a thread from this github issue, there's a couple of CSV's containing SHARE-seq RNA and ATAC data, and they don't fit into RAM, so I want to use HDF5Array. Some test hdf5 files are successfully made using suggestions from that thread, and I'll post both them and the code to make them below. The problem is when I try to read them.

> rhdf5::h5closeAll()
> withr::with_dir( data_dir, {
>   HDF5Array::HDF5Array("rna.test.h5", "RNA")[1:4, 1:4]
> })
> withr::with_dir( data_dir, {
>   HDF5Array::HDF5Array("atac.test.h5", "ATAC")[1:4, 1:4]
> })

Both reads yield Error in H5Dopen(gid, name) : HDF5. Dataset. Can't open object.

Occasionally, these read commands work, but most of the time they fail. The exact cause is not obvious but the typical pattern is "work fail fail..." or "fail fail fail ...", never "fail work fail ..." (once you fail, always fail thereafter). Restarting R and deleting and remaking the files does not help. I don't know how to post the files yet but I am working on that. In the mean time, here is the code and session info.

library(Matrix)
library(DelayedArray)
source("~/Desktop/jhu/research/datalake/convert_to_hdf5.R")
data_dir = "~/Desktop/jhu/research/datalake/share_seq/skin"

withr::with_dir( new = data_dir, code = { 
  csvToTENxMatrix(
    input_block_nrow = 5,
    group = "RNA",
    verbose = T,
    input_file  = "test_file.tsv.gz",
    output_file = "rna.test.h5",
    input_dim = c(9, 42948),
    force = T,
    sep = "\t",
    row.names = 1
  )
  csvToTENxMatrix(
    input_block_nrow = 5,
    input_file  = "test_sparse.txt",
    output_file = "atac.test.h5",
    input_dim = c(344592, 10),
    force = T,
    sep = " ",
    group = "ATAC",
    is_sparse_input = T
  )
})
rhdf5::h5closeAll()
withr::with_dir( data_dir, {
  HDF5Array::HDF5Array("rna.test.h5", "RNA")[1:4, 1:4]
})
withr::with_dir( data_dir, {
  HDF5Array::HDF5Array("atac.test.h5", "ATAC")[1:4, 1:4]
})

And here's the the file that gets sourced above (convert_to_hdf5.R).

library(DelayedArray)
library(HDF5Array)

#' Convert a csv or matrix market file to an HDF5Array.
#'
#' @param input_dim c(nrows, ncols) if dense. 
#' same order as 2nd line if matrix market sparse.
#' @param input_file
#' @param output_file 
#' @param group Name of the group for the hdf5 output.
#' @param input_block_nrow How many rows to ingest at a time.
#' @param force Overwrite output file if needed
#' @param verbose 
#' @param is_sparse_input If T, assumes matrix market coordinate integer general.
#'
#' As suggested by Herve Pages 
#' https://github.com/Bioconductor/HDF5Array/issues/47
#' Also expanded to cover matrix market coordinate integer general format
#' (no handling of matrix market special structure e.g. symmetric)
csvToTENxMatrix <- function(
  input_file, 
  input_dim, 
  output_file,
  group, 
  input_block_nrow,
  force=FALSE, 
  verbose=FALSE, 
  is_sparse_input = F,
  ...)
{
  # Prepare output file
  if (file.exists(output_file)) {
    if (!force){
      stop("File exists. Pass force=TRUE to overwrite.")
    }
    if (unlink(output_file) != 0L){
      stop("failed to delete file \"", output_file, "\"")
    }
  }
  input_connection <- gzfile(input_file, "r")
  output_dim <- as.integer(input_dim)
  # Sparse input is handled one column at a time.
  # Dense input is handled one row at a time.
  if(!is_sparse_input){
    output_dim = rev(output_dim)
  }
  output_sink <- HDF5Array::TENxRealizationSink(output_dim, filepath=output_file, group=group)
  sink_grid <- DelayedArray::colGrid(output_sink, ncol=input_block_nrow)


  # Prepare input reader
  if(!is_sparse_input){
    my_reader = function(input_connection, range_to_read, block_index){
      input_block = read.csv(input_connection, nrows=width(range_to_read), header=(block_index==1), ...)
      if (ncol(input_block) <= 1L)
        stop("No delimiters found. Wrong delimiter?")
      input_block
    }
  } else {
    # With sparse-format input, I still can't fit it in my puny RAM, so I need hdf5 and random access.
    # But I don't know how many nonzeroes are in the block I want to write next.
    # This takes care of that. 
    my_reader = function(input_connection, range_to_read, block_index){
      block = matrix(0, ncol = width(range_to_read), nrow = input_dim[[1]]) 
      if (ncol(block) <= 1L)
        stop("No delimiters found. Wrong delimiter?")
      is_top = T
      stop_loop = F
      while(!stop_loop){
        lines = readLines(input_connection, n = 1000)
        if(length(lines)<1){
          break
        }
        X = read.csv(text = lines, header = is_top & (block_index==1), comment.char = "%", ...)
        is_top = F
        X[["has_overshot"]] = X[[2]] > end( range_to_read )
        if( any(X[["has_overshot"]])  ){
          pushBack(data = paste0(lines[X[["has_overshot"]]], collapse = "\n"),
                   connection = input_connection)
          X = X[!X[["has_overshot"]],]
          stop_loop = T
        }
        X[[2]] = X[[2]] - (block_index-1)*input_block_nrow 
        stopifnot(all(X[[2]]>0))
        block[as.matrix(X[1:2])] = X[[3]]
      }
      t(block)
    }
  }

  ## Walk on the grid, and, for each viewport, read a block from the CSV file and
  ## write it to the h5 file.
  for (bid in seq_along(sink_grid)) {
    viewport <- sink_grid[[bid]]
    if (verbose)
      cat("Reading row(s) ", as.character(ranges(viewport)[2]), " from ", input_file, " ... ", sep="")
    input_block <- my_reader(input_connection,
                             range_to_read = ranges(viewport)[2], 
                             block_index = bid)
    if (nrow(input_block) < width(viewport)[2] )
      stop("Too few rows in input.")
    if (verbose)
      cat("OK\n")
    output_block <- t(as.matrix(input_block))
    if (verbose)
      cat("Writing column(s) ", as.character(ranges(viewport)[2]), " to ", output_file, " ... ", sep="")
    write_block(output_sink, viewport, output_block)
    if (verbose)
      cat("OK\n")
  }
  close(output_sink)
  close(input_connection)
  as(output_sink, "DelayedArray")
}

Sessioninfo:

> sessionInfo()
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: elementary OS 5.1.7 Hera

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8   
 [6] LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] HDF5Array_1.16.1    rhdf5_2.32.4        DelayedArray_0.14.1 IRanges_2.22.2      S4Vectors_0.26.1    BiocGenerics_0.34.0 matrixStats_0.61.0 
[8] Matrix_1.3-4       

loaded via a namespace (and not attached):
[1] compiler_4.0.0  tools_4.0.0     withr_2.4.2     grid_4.0.0      lattice_0.20-45 Rhdf5lib_1.10.1
HDF5Array • 1.3k views
ADD COMMENT
0
Entering edit mode

I can't see a built-in way to post files here, but they are public on my DropBox.

ADD REPLY
1
Entering edit mode

The "dense input" link doesn't work at the moment.

However I can confirm I get the same error just by downloading your output .h5 file and running the HDF5Array() code.

ADD REPLY
2
Entering edit mode
Mike Smith ★ 6.6k
@mike-smith
Last seen 17 hours ago
EMBL Heidelberg

You're create a sparse matrix, but using the constructor for an HDF5Array based on a dense matrix. Maybe HDF5Array::H5SparseMatrix() is what you're looking for:

> HDF5Array::H5SparseMatrix(filepath = "~/Downloads/rna.test.h5", group = "RNA")
<42948 x 9> sparse matrix of class H5SparseMatrix and type "double":
         [,1] [,2] [,3] ... [,8] [,9]
    [1,]    0    0    0   .    0    0
    [2,]    1    0    0   .    0    0
    [3,]    0    0    0   .    0    0
    [4,]    0    0    0   .    0    0
    [5,]    0    0    0   .    0    0
     ...    .    .    .   .    .    .
[42944,]    0    0    0   .    0    0
[42945,]    1    0    0   .    0    0
[42946,]    0    0    0   .    0    0
[42947,]    0    0    0   .    0    0
[42948,]    0    0    0   .    0    0

Possibly you'd be better with HDF5Array::TENxMatrix(), I'm not sure what the practical differences are between the H5SparseMatrix-class and the TENxMatrix-class which are returned respectively.

ADD COMMENT

Login before adding your answer.

Traffic: 468 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6