Entering edit mode
Amos Folarin
▴
80
@amos-folarin-4200
Last seen 10.2 years ago
Hi,
I was wondering I'm going about this in the correct way. I need to
test if there are coding sequences or exons in hg19 which match a
string of 100bp "D" i.e. [A,G or T]. However I'm getting a strange
result.
I get a hit on chr7, using the 100bp search however when I search with
60bp sequence of "D" I don't get any hits.
library("BSgenome")
library("Biostrings")
library("BSgenome.Hsapiens.UCSC.hg19")
library("biomaRt")
library("GenomicFeatures")
#extract the alignments which match real genes
#(add this onto stefans script ⦠req to buid the gr object using the
regions from the BSgenome alignment of the 100bp seqs.)
txdb <- makeTranscriptDbFromUCSC(genome = "hg19", tablename =
"knownGene") ## do once locally & save
#query.plus <- DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD") # 100bp C
free sequence
query.plus <- DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
DDDDDDDDDDDDDDD") # 60bp C free sequence
query.minus <- reverseComplement(query.plus)
chrList <- c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7",
"chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15",
"chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX",
"chrY")
#access/group the by subset of annotation
annotGr <- exons(txdb)
#annotGr <- cds(txdb)
wholeGenomeMatch <- function()
{
for(i in chrList)
{
#matches on plus strand
matches.plus.strand <- matchPattern(query.plus, Hsapiens[[i]],
fixed=FALSE, max.mismatch=5)
mp <- as.matrix(matches.plus.strand)
#matches on minus strand
matches.minus.strand <- matchPattern(query.minus, Hsapiens[[i]],
fixed=FALSE, max.mismatch=5)
mm <- as.matrix(matches.minus.strand)
OL.p <- NULL
if(nrow(mp) > 0)
{
##### MATCH THE POSITIVE STRAND ######
#need to get start and end positions (end = start + (length-1))
gr <- GRanges(
seqnames = rep(i,nrow(mp)),
ranges = IRanges(start = mp[[1]],
end = mp[[1]]+(mp[,2]-1)),
strand = rep("+", nrow(mp)))
#OL <- findOverlaps(query=gr, subject=annotGr)
OL.p <- annotGr[(!is.na(match(annotGr, gr)))]
#as.data.frame(OL) ## view the results
#tdata.p <- annotGr[unique(queryHits(OL)),]
#tdata.p <- as.data.frame(tdata.p, row.names = NULL, optional =
FALSE)
#write.table(tdata.p, paste(i,"plus.txt"))
cat( paste(i,"plus.txt\n"))
}
OL.m <- NULL
if(nrow(mm) > 0)
{
##### MATCH THE MINUS STRAND ########
#need to get start and end positions (end = start + (length-1))
gr <- GRanges(
seqnames = rep(i,nrow(mm)),
ranges = IRanges(start = mm[[1]],
end = mm[[1]]+(mm[,2]-1)),
strand = rep("-", nrow(mm)))
#OL <- findOverlaps(query=gr, subject=annotGr)
OL.m <- annotGr[(!is.na(match(annotGr, gr)))]
#tdata.m <- annotGr[unique(queryHits(OL)),]
#tdata.m <- as.data.frame(tdata.m, row.names = NULL, optional =
FALSE)
#write.table(tdata.m, paste(i,"minus.txt"))
cat( paste(i,"minus.txt\n"))
#write.table(rbind(tdata.p, tdata.p), paste(i, ".txt"))
}
##write all matching results into one file
write.table(rbind(as.data.frame(OL.p), as.data.frame(OL.m)), file
="60bp_v_exons_max-mismatch=5.txt", append=TRUE)
}
}
wholeGenomeMatch()
#in the first instance with the 100bp search I get 1 hit in the output
file:
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"1" "chr7" 420815 422845 2031 "+" 97277
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
#with the 60 bp query string my output file reads no hits
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
Your help is much appreciated,
Amos
Sent from my iPhone
[[alternative HTML version deleted]]