Whole genome searching of 100bp "D" sequence
0
0
Entering edit mode
Amos Folarin ▴ 80
@amos-folarin-4200
Last seen 10.2 years ago
Hi, I was wondering I'm going about this in the correct way. I need to test if there are coding sequences or exons in hg19 which match a string of 100bp "D" i.e. [A,G or T]. However I'm getting a strange result. I get a hit on chr7, using the 100bp search however when I search with 60bp sequence of "D" I don't get any hits. library("BSgenome") library("Biostrings") library("BSgenome.Hsapiens.UCSC.hg19") library("biomaRt") library("GenomicFeatures") #extract the alignments which match real genes #(add this onto stefans script … req to buid the gr object using the regions from the BSgenome alignment of the 100bp seqs.) txdb <- makeTranscriptDbFromUCSC(genome = "hg19", tablename = "knownGene") ## do once locally & save #query.plus <- DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD") # 100bp C free sequence query.plus <- DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD DDDDDDDDDDDDDDD") # 60bp C free sequence query.minus <- reverseComplement(query.plus) chrList <- c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY") #access/group the by subset of annotation annotGr <- exons(txdb) #annotGr <- cds(txdb) wholeGenomeMatch <- function() { for(i in chrList) { #matches on plus strand matches.plus.strand <- matchPattern(query.plus, Hsapiens[[i]], fixed=FALSE, max.mismatch=5) mp <- as.matrix(matches.plus.strand) #matches on minus strand matches.minus.strand <- matchPattern(query.minus, Hsapiens[[i]], fixed=FALSE, max.mismatch=5) mm <- as.matrix(matches.minus.strand) OL.p <- NULL if(nrow(mp) > 0) { ##### MATCH THE POSITIVE STRAND ###### #need to get start and end positions (end = start + (length-1)) gr <- GRanges( seqnames = rep(i,nrow(mp)), ranges = IRanges(start = mp[[1]], end = mp[[1]]+(mp[,2]-1)), strand = rep("+", nrow(mp))) #OL <- findOverlaps(query=gr, subject=annotGr) OL.p <- annotGr[(!is.na(match(annotGr, gr)))] #as.data.frame(OL) ## view the results #tdata.p <- annotGr[unique(queryHits(OL)),] #tdata.p <- as.data.frame(tdata.p, row.names = NULL, optional = FALSE) #write.table(tdata.p, paste(i,"plus.txt")) cat( paste(i,"plus.txt\n")) } OL.m <- NULL if(nrow(mm) > 0) { ##### MATCH THE MINUS STRAND ######## #need to get start and end positions (end = start + (length-1)) gr <- GRanges( seqnames = rep(i,nrow(mm)), ranges = IRanges(start = mm[[1]], end = mm[[1]]+(mm[,2]-1)), strand = rep("-", nrow(mm))) #OL <- findOverlaps(query=gr, subject=annotGr) OL.m <- annotGr[(!is.na(match(annotGr, gr)))] #tdata.m <- annotGr[unique(queryHits(OL)),] #tdata.m <- as.data.frame(tdata.m, row.names = NULL, optional = FALSE) #write.table(tdata.m, paste(i,"minus.txt")) cat( paste(i,"minus.txt\n")) #write.table(rbind(tdata.p, tdata.p), paste(i, ".txt")) } ##write all matching results into one file write.table(rbind(as.data.frame(OL.p), as.data.frame(OL.m)), file ="60bp_v_exons_max-mismatch=5.txt", append=TRUE) } } wholeGenomeMatch() #in the first instance with the 100bp search I get 1 hit in the output file: "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "1" "chr7" 420815 422845 2031 "+" 97277 "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" #with the 60 bp query string my output file reads no hits "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" Your help is much appreciated, Amos Sent from my iPhone [[alternative HTML version deleted]]
Alignment BSgenome BSgenome Alignment BSgenome BSgenome • 1.1k views
ADD COMMENT

Login before adding your answer.

Traffic: 478 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6