Difficulty exporting consensus sequence from bam file
0
0
Entering edit mode
@fa745d19
Last seen 3.4 years ago
United Kingdom

I've just generated a bam file like this:

library(Rsubread)
buildindex(basename="ref.ch5", reference="chr5.fa") #downloaded from UCSC, as suggested here: https://bioinformatics-core-shared-training.github.io/RNAseq-R/align-and-count.nb.html
reads1.ch5 <- file.path(dirPath, "MSH3repeat_S1_L001_R1_001.fastq.gz")
reads2.ch5 <- file.path(dirPath, "MSH3repeat_S1_L001_R2_001.fastq.gz")
align.stat <- align(index="ref.ch5", readfile1=reads1.ch5, readfile2=reads2.ch5, output_file="alignResults.ch5.BAM", sortReadsByCoordinates = TRUE)

I've found a few links for advice, but can't get any of them to work:

http://bioconductor.org/packages/devel/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesHOWTOs.pdf

This one errors on the last command (cm_by_chrom) – R encountered a fatal error, The session was terminated ...

param <- ScanBamParam(what=c("seq", "qual"))
gal <- readGAlignments("alignResults.ch5.BAM", use.names=TRUE, param=param)
qseq <- setNames(mcols(gal)$seq, names(gal))
qual <- setNames(mcols(gal)$qual, names(gal))
qseq_on_ref <- sequenceLayer(qseq, cigar(gal), from="query", to="reference")
qual_on_ref <- sequenceLayer(qual, cigar(gal), from="query", to="reference")
qseq_on_ref_by_chrom <- splitAsList(qseq_on_ref, seqnames(gal))
qual_on_ref_by_chrom <- splitAsList(qual_on_ref, seqnames(gal))
pos_by_chrom <- splitAsList(start(gal), seqnames(gal))
gr_by_chrom <- lapply(seqlevels(gal),
                      function(seqname)
                      {
                        qseq_on_ref2 <- qseq_on_ref_by_chrom[[seqname]]
                        qual_on_ref2 <- qual_on_ref_by_chrom[[seqname]]
                        pos2 <- pos_by_chrom[[seqname]]
                        qseq_on_ref_per_pos <- split(qseq_on_ref2, pos2)
                        qual_on_ref_per_pos <- split(qual_on_ref2, pos2)
                        nread <- elementNROWS(qseq_on_ref_per_pos)
                        gr_mcols <- DataFrame(nread=unname(nread),
                                              qseq_on_ref=unname(qseq_on_ref_per_pos),
                                              qual_on_ref=unname(qual_on_ref_per_pos))
                        gr <- GRanges(Rle(seqname, nrow(gr_mcols)),
                                      IRanges(as.integer(names(nread)), width=1))
                        mcols(gr) <- gr_mcols
                        seqlevels(gr) <- seqlevels(gal)
                        gr
                      })
gr <- do.call(c, gr_by_chrom)
seqinfo(gr) <- seqinfo(gal)
gr[1:6]
qseq_on_ref
qual_on_ref
mcols(gr)$qseq_on_ref[[6]]
mcols(gr)$qual_on_ref[[6]]
qseq_on_ref <- mcols(gr)$qseq_on_ref
tmp <- unlist(qseq_on_ref, use.names=FALSE)
qseq_on_ref_id <- relist(match(tmp, tmp), qseq_on_ref)
qseq_on_ref_id
qseq_on_ref_id2 <- endoapply(qseq_on_ref_id,
                             function(ids) ids[countMatches(ids, ids) >= 0.2 * length(ids)])
tmp <- unlist(qseq_on_ref_id2, use.names=FALSE)
qseq_on_ref2 <- relist(unlist(qseq_on_ref, use.names=FALSE)[tmp],
                       qseq_on_ref_id2)
split_factor <- rep.int(seqnames(gr), elementNROWS(qseq_on_ref2))
qseq_on_ref2 <- unlist(qseq_on_ref2, use.names=FALSE)
qseq_on_ref2_by_chrom <- splitAsList(qseq_on_ref2, split_factor)
qseq_pos_by_chrom <- splitAsList(start(gr), split_factor)
cm_by_chrom <- lapply(names(qseq_pos_by_chrom),
                      function(seqname)
                        consensusMatrix(qseq_on_ref2_by_chrom[[seqname]],
                                        as.prob=TRUE,
                                        shift=qseq_pos_by_chrom[[seqname]]-1,
                                        width=seqlengths(gr)[[seqname]]))

I then tried this method: Create consensus fasta file from indexed Bam file This one also errored at the "cm_by_chrom" stage.

param <- ScanBamParam(what="seq", isPaired=TRUE)
bam <- BamFile("alignResults.ch5.BAM", yieldSize=1000)
gal <- readGAlignments(bam, param=param)
qseq <- mcols(gal)$seq  # the query sequences
qseq_on_ref <- sequenceLayer(qseq, cigar(gal),
                             from="query", to="reference")
qseq_on_ref_by_chrom <- splitAsList(qseq_on_ref, seqnames(gal))
qseq_pos_by_chrom <- splitAsList(start(gal), seqnames(gal))
cm_by_chrom <- lapply(names(qseq_pos_by_chrom),
                      function(seqname)
                        consensusMatrix(qseq_on_ref_by_chrom[[seqname]],
                                        as.prob=TRUE,
                                        shift=qseq_pos_by_chrom[[seqname]]-1,
                                        width=seqlengths(gal)[[seqname]]))
names(cm_by_chrom) <- names(qseq_pos_by_chrom)

I'm now very stuck!

Rsamtools sequenceLayer • 795 views
ADD COMMENT

Login before adding your answer.

Traffic: 613 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6