Question

Difficulty exporting consensus sequence from bam file

0

Entering edit mode

mike_flower • 0

@fa745d19

Last seen 3.7 years ago

United Kingdom

I've just generated a bam file like this:

library(Rsubread)
buildindex(basename="ref.ch5", reference="chr5.fa") #downloaded from UCSC, as suggested here: https://bioinformatics-core-shared-training.github.io/RNAseq-R/align-and-count.nb.html
reads1.ch5 <- file.path(dirPath, "MSH3repeat_S1_L001_R1_001.fastq.gz")
reads2.ch5 <- file.path(dirPath, "MSH3repeat_S1_L001_R2_001.fastq.gz")
align.stat <- align(index="ref.ch5", readfile1=reads1.ch5, readfile2=reads2.ch5, output_file="alignResults.ch5.BAM", sortReadsByCoordinates = TRUE)

I've found a few links for advice, but can't get any of them to work:

http://bioconductor.org/packages/devel/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesHOWTOs.pdf

This one errors on the last command (cm_by_chrom) – R encountered a fatal error, The session was terminated ...

param <- ScanBamParam(what=c("seq", "qual"))
gal <- readGAlignments("alignResults.ch5.BAM", use.names=TRUE, param=param)
qseq <- setNames(mcols(gal)$seq, names(gal))
qual <- setNames(mcols(gal)$qual, names(gal))
qseq_on_ref <- sequenceLayer(qseq, cigar(gal), from="query", to="reference")
qual_on_ref <- sequenceLayer(qual, cigar(gal), from="query", to="reference")
qseq_on_ref_by_chrom <- splitAsList(qseq_on_ref, seqnames(gal))
qual_on_ref_by_chrom <- splitAsList(qual_on_ref, seqnames(gal))
pos_by_chrom <- splitAsList(start(gal), seqnames(gal))
gr_by_chrom <- lapply(seqlevels(gal),
                      function(seqname)
                      {
                        qseq_on_ref2 <- qseq_on_ref_by_chrom[[seqname]]
                        qual_on_ref2 <- qual_on_ref_by_chrom[[seqname]]
                        pos2 <- pos_by_chrom[[seqname]]
                        qseq_on_ref_per_pos <- split(qseq_on_ref2, pos2)
                        qual_on_ref_per_pos <- split(qual_on_ref2, pos2)
                        nread <- elementNROWS(qseq_on_ref_per_pos)
                        gr_mcols <- DataFrame(nread=unname(nread),
                                              qseq_on_ref=unname(qseq_on_ref_per_pos),
                                              qual_on_ref=unname(qual_on_ref_per_pos))
                        gr <- GRanges(Rle(seqname, nrow(gr_mcols)),
                                      IRanges(as.integer(names(nread)), width=1))
                        mcols(gr) <- gr_mcols
                        seqlevels(gr) <- seqlevels(gal)
                        gr
                      })
gr <- do.call(c, gr_by_chrom)
seqinfo(gr) <- seqinfo(gal)
gr[1:6]
qseq_on_ref
qual_on_ref
mcols(gr)$qseq_on_ref[[6]]
mcols(gr)$qual_on_ref[[6]]
qseq_on_ref <- mcols(gr)$qseq_on_ref
tmp <- unlist(qseq_on_ref, use.names=FALSE)
qseq_on_ref_id <- relist(match(tmp, tmp), qseq_on_ref)
qseq_on_ref_id
qseq_on_ref_id2 <- endoapply(qseq_on_ref_id,
                             function(ids) ids[countMatches(ids, ids) >= 0.2 * length(ids)])
tmp <- unlist(qseq_on_ref_id2, use.names=FALSE)
qseq_on_ref2 <- relist(unlist(qseq_on_ref, use.names=FALSE)[tmp],
                       qseq_on_ref_id2)
split_factor <- rep.int(seqnames(gr), elementNROWS(qseq_on_ref2))
qseq_on_ref2 <- unlist(qseq_on_ref2, use.names=FALSE)
qseq_on_ref2_by_chrom <- splitAsList(qseq_on_ref2, split_factor)
qseq_pos_by_chrom <- splitAsList(start(gr), split_factor)
cm_by_chrom <- lapply(names(qseq_pos_by_chrom),
                      function(seqname)
                        consensusMatrix(qseq_on_ref2_by_chrom[[seqname]],
                                        as.prob=TRUE,
                                        shift=qseq_pos_by_chrom[[seqname]]-1,
                                        width=seqlengths(gr)[[seqname]]))

I then tried this method: Create consensus fasta file from indexed Bam file This one also errored at the "cm_by_chrom" stage.

param <- ScanBamParam(what="seq", isPaired=TRUE)
bam <- BamFile("alignResults.ch5.BAM", yieldSize=1000)
gal <- readGAlignments(bam, param=param)
qseq <- mcols(gal)$seq  # the query sequences
qseq_on_ref <- sequenceLayer(qseq, cigar(gal),
                             from="query", to="reference")
qseq_on_ref_by_chrom <- splitAsList(qseq_on_ref, seqnames(gal))
qseq_pos_by_chrom <- splitAsList(start(gal), seqnames(gal))
cm_by_chrom <- lapply(names(qseq_pos_by_chrom),
                      function(seqname)
                        consensusMatrix(qseq_on_ref_by_chrom[[seqname]],
                                        as.prob=TRUE,
                                        shift=qseq_pos_by_chrom[[seqname]]-1,
                                        width=seqlengths(gal)[[seqname]]))
names(cm_by_chrom) <- names(qseq_pos_by_chrom)

I'm now very stuck!

Rsamtools sequenceLayer • 859 views

ADD COMMENT • link 3.7 years ago mike_flower • 0