Hi Thomas,
Yes the sequenceLayer()
function can be used to overlay the forward and reverse read of a pair and return a consensus. The layQuerySequencesOnRef()
function below does that.
library(GenomicAlignments)
## A parallel string consensus function.
## Stronger letters win over weaker letters. Letters strength from
## stronger to weaker: nucleotide > D.letter (deletion) >
## N.letter (skipped region).
.pconsensus <- function(x, y, D.letter="-", N.letter=".",
discord.letter="N")
{
stopifnot(identical(width(x), width(y)))
unlisted_x <- unlist(x, use.names=FALSE)
unlisted_y <- unlist(y, use.names=FALSE)
D_byte <- as.raw(DNAString(D.letter))
N_byte <- as.raw(DNAString(N.letter))
y_bytes <- as.raw(unlisted_y)
y_is_nucleotide <- !(y_bytes %in% c(N_byte, D_byte))
x_bytes <- as.raw(unlisted_x)
replace_idx <- which(x_bytes == N_byte |
x_bytes == D_byte & y_is_nucleotide)
unlisted_x[replace_idx] <- unlisted_y[replace_idx]
x_bytes <- as.raw(unlisted_x)
is_discordant <- x_bytes != y_bytes & y_is_nucleotide
unlisted_x[is_discordant] <- discord.letter
ans <- relist(unlisted_x, x)
discord_at <- which(relist(is_discordant, x))
mcols(ans) <- DataFrame(discord_at=discord_at)
ans
}
## Lay each pair of query sequences along the reference and merge
## the 2 sequences in each pair by filling the gap between them
## with "+". In case of overlay between the 2 mates, discordant
## nucleotides are replaced with an N. The returned DNAStringSet
## object has the same shape (i.e. same length and width) as
## 'granges(galp)'.
layQuerySequencesOnRef <- function(galp, mate.gap.letter="+",
D.letter="-",
N.letter=".",
discord.letter="N")
{
stopifnot(is(galp, "GAlignmentPairs"))
gal1 <- first(galp, real.strand=TRUE)
gal2 <- last(galp, real.strand=TRUE)
qseq1 <- mcols(gal1)$seq
qseq2 <- mcols(gal2)$seq
if (is.null(qseq1) || is.null(qseq2))
stop(wmsg(
"'galp' doesn't contain the query sequences. ",
"Make sure you load it with ",
"readGAlignmentPairs(..., ",
"param=ScanBamParam(what=\"seq\")"
))
mate.gap.letter <- Biostrings:::.normarg_padding.letter(
mate.gap.letter,
seqtype(qseq1))
qseq1_on_ref <- sequenceLayer(qseq1, cigar(gal1),
D.letter=D.letter,
N.letter=N.letter)
qseq2_on_ref <- sequenceLayer(qseq2, cigar(gal2),
D.letter=D.letter,
N.letter=N.letter)
gr1 <- granges(gal1)
stopifnot(identical(width(gr1), width(qseq1_on_ref)))
gr2 <- granges(gal2)
stopifnot(identical(width(gr2), width(qseq2_on_ref)))
gr <- punion(gr1, gr2, fill.gap=TRUE) # same as granges(galp)
big_gap <- paste0(rep.int(mate.gap.letter, max(width(gr))),
collapse="")
ans <- extractAt(DNAString(big_gap),
IRanges(1L, width=width(gr)))
start1 <- start(gr1) - start(gr) + 1L
subseq(ans, start=start1, width=width(gr1)) <- qseq1_on_ref
start2 <- start(gr2) - start(gr) + 1L
subseq(ans, start=start2, width=width(gr2)) <- qseq2_on_ref
## Resolve mate sequence discordance by replacing discordant
## nucleotides with an N.
overlay_gr <- pintersect(gr1, gr2)
start1 <- pmax(start(overlay_gr) - start(gr1) + 1L, 1L)
start1 <- pmin(start1, width(gr1))
overlay1 <- subseq(qseq1_on_ref, start=start1,
width=width(overlay_gr))
start2 <- pmax(start(overlay_gr) - start(gr2) + 1L, 1L)
start2 <- pmin(start2, width(gr2))
overlay2 <- subseq(qseq2_on_ref, start=start2,
width=width(overlay_gr))
overlay <- .pconsensus(overlay1, overlay2,
D.letter=D.letter, N.letter=N.letter,
discord.letter=discord.letter)
overlay_offset <- start(overlay_gr) - start(gr)
subseq(ans, start=overlay_offset + 1L,
width=width(overlay_gr)) <- overlay
mate_discord_at <- mcols(overlay)$discord_at + overlay_offset
mcols(ans) <- DataFrame(mate_discord_at=mate_discord_at)
ans
}
Then:
library(RNAseqData.HNRNPC.bam.chr14)
param <- ScanBamParam(what="seq")
bamfile <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
galp <- readGAlignmentPairs(bamfile, param=param)
qseq_on_ref <- layQuerySequencesOnRef(galp)
gr <- granges(galp)
stopifnot(identical(width(qseq_on_ref), width(gr))) # sanity check
mcols(gr)$qseq_on_ref <- qseq_on_ref
mcols(gr)$mate_discord_at <- mcols(qseq_on_ref)$mate_discord_at
## Note that there is a display issue for objects with long DNA sequences in
## their metadata columns. The issue is addressed in Biostrings >= 2.39.3.
gr[973:976]
# GRanges object with 4 ranges and 2 metadata columns:
# seqnames ranges strand | qseq_on_ref
# <Rle> <IRanges> <Rle> | <DNAStringSet>
# [1] chr14 [19684694, 19684812] - | AGCCATGCCA...TTGCTTCGTA
# [2] chr14 [19684743, 19684917] + | CATCACCTGC...GAGCTCGGGG
# [3] chr14 [19684728, 19684820] - | CGTCACTGGC...TAGTGCCACA
# [4] chr14 [19684716, 19684820] - | TGCCAACCCA...TAGTGCCACA
# mate_discord_at
# <IntegerList>
# [1] 54,63,67
# [2]
# [3]
# [4] 52
# -------
# seqinfo: 93 sequences from an unspecified genome
The mate_discord_at
metadata column indicates the location in qseq_on_ref
where the Ns were injected. There doesn't seem to be too much discord:
> table(elementLengths(mcols(gr)$mate_discord_at))
0 1 2 3 4
389479 7889 2294 330 62
Then it's easy for example to put the reference sequences on gr
, side by side with the query sequences:
library(BSgenome.Hsapiens.UCSC.hg19)
genome <- BSgenome.Hsapiens.UCSC.hg19
mcols(gr)$rseq <- getSeq(genome, unstrand(gr))
gr[973:976]
# GRanges object with 4 ranges and 3 metadata columns:
# seqnames ranges strand | qseq_on_ref
# <Rle> <IRanges> <Rle> | <DNAStringSet>
# [1] chr14 [19684694, 19684812] - | AGCCATGCCA...TTGCTTCGTA
# [2] chr14 [19684743, 19684917] + | CATCACCTGC...GAGCTCGGGG
# [3] chr14 [19684728, 19684820] - | CGTCACTGGC...TAGTGCCACA
# [4] chr14 [19684716, 19684820] - | TGCCAACCCA...TAGTGCCACA
# mate_discord_at rseq
# <IntegerList> <DNAStringSet>
# [1] 54,63,67 AGCCATGCCA...TTGCTTCGTA
# [2] CATCACCTGC...GAGCTCGGGG
# [3] CGTCACTGGC...TAGTGCCACA
# [4] 52 TGCCAACCCA...TAGTGCCACA
# -------
# seqinfo: 93 sequences from an unspecified genome
Is this what you were looking after? I'll add this to the GenomicAlignments package today.
Cheers,
H.
I've edited my answer above to address an issue with mate sequence discordance resolution.
H.