Entering edit mode
I've just generated a bam file like this:
library(Rsubread)
buildindex(basename="ref.ch5", reference="chr5.fa") #downloaded from UCSC, as suggested here: https://bioinformatics-core-shared-training.github.io/RNAseq-R/align-and-count.nb.html
reads1.ch5 <- file.path(dirPath, "MSH3repeat_S1_L001_R1_001.fastq.gz")
reads2.ch5 <- file.path(dirPath, "MSH3repeat_S1_L001_R2_001.fastq.gz")
align.stat <- align(index="ref.ch5", readfile1=reads1.ch5, readfile2=reads2.ch5, output_file="alignResults.ch5.BAM", sortReadsByCoordinates = TRUE)
I've found a few links for advice, but can't get any of them to work:
http://bioconductor.org/packages/devel/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesHOWTOs.pdf
This one errors on the last command (cm_by_chrom) – R encountered a fatal error, The session was terminated ...
param <- ScanBamParam(what=c("seq", "qual"))
gal <- readGAlignments("alignResults.ch5.BAM", use.names=TRUE, param=param)
qseq <- setNames(mcols(gal)$seq, names(gal))
qual <- setNames(mcols(gal)$qual, names(gal))
qseq_on_ref <- sequenceLayer(qseq, cigar(gal), from="query", to="reference")
qual_on_ref <- sequenceLayer(qual, cigar(gal), from="query", to="reference")
qseq_on_ref_by_chrom <- splitAsList(qseq_on_ref, seqnames(gal))
qual_on_ref_by_chrom <- splitAsList(qual_on_ref, seqnames(gal))
pos_by_chrom <- splitAsList(start(gal), seqnames(gal))
gr_by_chrom <- lapply(seqlevels(gal),
function(seqname)
{
qseq_on_ref2 <- qseq_on_ref_by_chrom[[seqname]]
qual_on_ref2 <- qual_on_ref_by_chrom[[seqname]]
pos2 <- pos_by_chrom[[seqname]]
qseq_on_ref_per_pos <- split(qseq_on_ref2, pos2)
qual_on_ref_per_pos <- split(qual_on_ref2, pos2)
nread <- elementNROWS(qseq_on_ref_per_pos)
gr_mcols <- DataFrame(nread=unname(nread),
qseq_on_ref=unname(qseq_on_ref_per_pos),
qual_on_ref=unname(qual_on_ref_per_pos))
gr <- GRanges(Rle(seqname, nrow(gr_mcols)),
IRanges(as.integer(names(nread)), width=1))
mcols(gr) <- gr_mcols
seqlevels(gr) <- seqlevels(gal)
gr
})
gr <- do.call(c, gr_by_chrom)
seqinfo(gr) <- seqinfo(gal)
gr[1:6]
qseq_on_ref
qual_on_ref
mcols(gr)$qseq_on_ref[[6]]
mcols(gr)$qual_on_ref[[6]]
qseq_on_ref <- mcols(gr)$qseq_on_ref
tmp <- unlist(qseq_on_ref, use.names=FALSE)
qseq_on_ref_id <- relist(match(tmp, tmp), qseq_on_ref)
qseq_on_ref_id
qseq_on_ref_id2 <- endoapply(qseq_on_ref_id,
function(ids) ids[countMatches(ids, ids) >= 0.2 * length(ids)])
tmp <- unlist(qseq_on_ref_id2, use.names=FALSE)
qseq_on_ref2 <- relist(unlist(qseq_on_ref, use.names=FALSE)[tmp],
qseq_on_ref_id2)
split_factor <- rep.int(seqnames(gr), elementNROWS(qseq_on_ref2))
qseq_on_ref2 <- unlist(qseq_on_ref2, use.names=FALSE)
qseq_on_ref2_by_chrom <- splitAsList(qseq_on_ref2, split_factor)
qseq_pos_by_chrom <- splitAsList(start(gr), split_factor)
cm_by_chrom <- lapply(names(qseq_pos_by_chrom),
function(seqname)
consensusMatrix(qseq_on_ref2_by_chrom[[seqname]],
as.prob=TRUE,
shift=qseq_pos_by_chrom[[seqname]]-1,
width=seqlengths(gr)[[seqname]]))
I then tried this method: Create consensus fasta file from indexed Bam file This one also errored at the "cm_by_chrom" stage.
param <- ScanBamParam(what="seq", isPaired=TRUE)
bam <- BamFile("alignResults.ch5.BAM", yieldSize=1000)
gal <- readGAlignments(bam, param=param)
qseq <- mcols(gal)$seq # the query sequences
qseq_on_ref <- sequenceLayer(qseq, cigar(gal),
from="query", to="reference")
qseq_on_ref_by_chrom <- splitAsList(qseq_on_ref, seqnames(gal))
qseq_pos_by_chrom <- splitAsList(start(gal), seqnames(gal))
cm_by_chrom <- lapply(names(qseq_pos_by_chrom),
function(seqname)
consensusMatrix(qseq_on_ref_by_chrom[[seqname]],
as.prob=TRUE,
shift=qseq_pos_by_chrom[[seqname]]-1,
width=seqlengths(gal)[[seqname]]))
names(cm_by_chrom) <- names(qseq_pos_by_chrom)
I'm now very stuck!