I needed to find the (single) occurrences of primers in a genome in a GenomicRanges or data frame format. This is similar to the example given in BSgenome's GenomeSearching vignette, section on "Finding an arbitrary nucleotide pattern in an entire genome", but I wanted a flexible function with a nice output format, and I'm working with a genome I can load into memory. I just wrote this function below which seems to work - any suggestions? If it's helpful to others could it be incorporated into the package?
Edward
# matchInGenome
# Use Bioconductor to find all matches of a DNA string in a genome
# Edward Wallace, 20 August 2016
library(Biostrings)
library(GenomicRanges)
library(BSgenome.Scerevisiae.UCSC.sacCer3)
matchInGenome <- function(pattern,genome,...) {
# find all matches of pattern in genome on both strands
# output as GRanges
# coerce pattern to DNAString
pattern <- DNAString(pattern)
if(class(genome) != "BSgenome") stop("genome must be a BSgenome")
# forward matches to genome
fwdmatch <- GenomicRangesList(lapply(seqnames(genome), function(chr) {
irmatch <- matchPattern(pattern,genome[[chr]],...)@ranges
if(length(irmatch) > 0) {
return(GRanges(seqnames=chr,ranges=irmatch,strand="+"))
} else {
return(GRanges())
}
}))
# reverse complement matches to genome
rcpattern <- reverseComplement(pattern)
revmatch <- GenomicRangesList(lapply(seqnames(genome), function(chr) {
irmatch <- matchPattern(rcpattern,genome[[chr]],...)@ranges
if(length(irmatch) > 0) {
return(GRanges(seqnames=chr,ranges=irmatch,strand="-"))
} else {
return(GRanges())
}
}))
# collapse fwd/rev GRangesLists into single GRanges
allmatches <- unlist(c(fwdmatch,revmatch))
# set seqinfo as in genome
seqlevels(allmatches) <- seqlevels(genome)
seqinfo(allmatches) <- seqinfo(genome)
return(allmatches)
}
# test with single match (from YAL003W gene on +strand/chrI)
testpattern1 <- "GACAAGTCATACATTGAAGG"
matchInGenome(testpattern1,BSgenome.Scerevisiae.UCSC.sacCer3)
# test with single match NOT from chrI (from YFL039C gene on -strand/chrVI)
testpattern2 <- "ATTTACTGAATTAACAATGGATTCTG"
matchInGenome(testpattern2,BSgenome.Scerevisiae.UCSC.sacCer3)
# test with many matches (Puf3 consensus binding)
testpatternmany <- "TGTAAATA"
matchInGenome(testpatternmany,BSgenome.Scerevisiae.UCSC.sacCer3)