Hi Jake,
The bad news is that the "exonFrames" column is ignored by makeTranscriptDbFromUCSC()
. The good news is that the frame of each CDS in a given transcript can be inferred from its offset within the transcript, that is, from the cumulated widths of the CDS'es preceding it in the transcript. Here is a function that does it. Like cdsBy(..., by="tx")
it returns the CDS'es grouped by transcript but with the additional "frame"
inner metadata column:
cdsByTx <- function(txdb, use.names=FALSE)
{
cds_by_tx <- cdsBy(txdb, by="tx", use.names=use.names)
offset <- cumsum(width(cds_by_tx))
offset <- pc(rep.int(0L, length(offset)), phead(offset, n=-1))
frame <- offset %% 3L
frame <- as.integer(unlist(frame, use.names=FALSE))
unlisted_ans <- unlist(cds_by_tx, use.names=FALSE)
mcols(unlisted_ans)$frame <- frame
relist(unlisted_ans, cds_by_tx)
}
Then:
library(GenomicFeatures)
txdb <- makeTranscriptDbFromUCSC(genome="mm10", tablename="ccdsGene")
## Note that for the ccdsGene table the reported transcripts are actually CDS
## regions. This means that exons and CDS'es are the same and that there are
## no UTRs:
all(cds(txdb) == exons(txdb)) # TRUE
length(fiveUTRsByTranscript(txdb)) # 0
length(threeUTRsByTranscript(txdb)) # 0
mm10_cds_by_tx <- cdsByTx(txdb, use.names=TRUE)
mm10_cds_by_tx[c(5, 10, 710)]
# GRangesList object of length 3:
# $CCDS14809.1
# GRanges object with 3 ranges and 4 metadata columns:
# seqnames ranges strand | cds_id cds_name exon_rank
# <Rle> <IRanges> <Rle> | <integer> <character> <integer>
# [1] chr1 [5589049, 5589305] + | 34 <NA> 1
# [2] chr1 [5598590, 5598942] + | 35 <NA> 2
# [3] chr1 [5602252, 5602784] + | 36 <NA> 3
# frame
# <integer>
# [1] 0
# [2] 2
# [3] 1
#
# $CCDS14811.1
# GRanges object with 1 range and 4 metadata columns:
# seqnames ranges strand | cds_id cds_name exon_rank frame
# [1] chr1 [9545524, 9546621] + | 88 <NA> 1 0
#
# $CCDS48226.1
# GRanges object with 2 ranges and 4 metadata columns:
# seqnames ranges strand | cds_id cds_name exon_rank frame
# [1] chr1 [18265026, 18265080] - | 6360 <NA> 1 0
# [2] chr1 [18251200, 18251333] - | 6358 <NA> 2 1
#
# -------
# seqinfo: 66 sequences (1 circular) from mm10 genome
As a sanity check, now we're going to check that our inferred "frame"
is the same as the "exonFrames" column found in the ccdsGene table. For this, we're first going to fetch the full table from UCSC using getTable()
from the rtracklayer package:
library(rtracklayer)
session <- browserSession()
genome(session) <- "mm10"
query <- ucscTableQuery(session, "CCDS", table="ccdsGene")
mm10_ccdsGene <- getTable(query)
mm10_ccdsGene[c(1,7,70), ]
# bin name chrom strand txStart txEnd cdsStart cdsEnd
# 1 0 CCDS15305.1 chr1 - 134202950 134234355 134202950 134234355
# 7 2 CCDS48344.1 chr1 + 125677336 125872884 125677336 125872884
# 70 76 CCDS14803.1 chr1 - 3216021 3671348 3216021 3671348
# exonCount exonStarts exonEnds score name2
# 1 2 134202950,134234014, 134203590,134234355, 0 NA
# 7 2 125677336,125872369, 125678192,125872884, 0 NA
# 70 3 3216021,3421701,3670551, 3216968,3421901,3671348, 0 NA
# cdsStartStat cdsEndStat exonFrames
# 1 cmpl cmpl 2,0,
# 7 cmpl cmpl 0,1,
# 70 cmpl cmpl 1,2,0,
Just a confirmation that for this table the reported transcripts are actually CDS regions:
identical(mm10_ccdsGene[,"txStart"], mm10_ccdsGene[,"cdsStart"]) # TRUE
identical(mm10_ccdsGene[,"txEnd"], mm10_ccdsGene[,"cdsEnd"]) # TRUE
Note that you can use makeGRangesFromDataFrame()
to turn mm10_ccdsGene
into a GRanges object:
gr <- makeGRangesFromDataFrame(mm10_ccdsGene,
start.field="txStart", end.field="txEnd",
starts.in.df.are.0based=TRUE,
keep.extra.columns=TRUE)
Now let's compare our inferred "frame"
with mm10_ccdsGene$exonFrames
:
cds_frames <- relist(mcols(unlist(mm10_cds_by_tx, use.names=FALSE))$frame,
mm10_cds_by_tx)
cds_frames <- cds_frames[mm10_ccdsGene$name]
cds_frames <- revElements(cds_frames, mm10_ccdsGene$strand == "-")
exonFrames <- IntegerList(strsplit(as.character(mm10_ccdsGene$exonFrames),
",", fixed=TRUE))
identical(unname(cds_frames), exonFrames) # TRUE
Cheers,
H.