Hi Nathaniel,
Nathaniel Street wrote:
> Hi
>
> I am trying to use biomaRt to automate the retrieval of information
for
> Arabidopsis thaliana (my ultimate aim is actually to annotate poplar
> gene models based on arabidopsis best-BLAST results). I want to be
able
> to extract GO information and to then construct an annotation
package to
> enable me to use GOstats and other Bioconductor packages.
>
> Is AnnBuilder still the best option for constructing annotation
> packages? Has anyone come across worked example of using biomaRt to
> retrieve data and then using this data to make an annotation
package?
Seems like a difficult way to go about things -- biomaRt is intended
for
more or less interactive annotation of things rather than simply
getting
all annotations. Wouldn't it be easier to just download a database
dump
from TAIR (or wherever one would get Arabidopsis info)?
>
> Here's the script I am running
>
> library(biomaRt)
> gramene<-useMart('ENSEMBL_MART_ENSEMBL')
> athmart<-useDataset("athaliana_gene_ensembl", mart = gramene)
> baseUrlAT<-"
ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR_sequenced
_genes"
> baseAT<-read.table(baseUrlAT, sep = "\t", as.is = TRUE, fill=TRUE,
quote
> = "", comment.char = "", header=T)
> ath<-baseAT[,2]
> ath<-unique(ath)
> go<-getBM(attributes=c("tair_locus_model",
"ptrichocarpa_ensembl_gene",
> "go"), values=ath, filters="tair_locus_model", mart=athmart)
>
> #1st attribute there because it's not returned by default
>
> Running this, I get the error message
>
> Error: ncol(result) == length(attributes) is not TRUE
Two things:
Using read.table is probably not the way you want to go in this
instance. Something like
baseAT <- scan(baseUrlAT, what=list("","",0,""), sep="\t", skip=1)
would be much more efficient.
Ideally you would want to use the mysql interface for downloading lots
of information, but it appears this mart doesn't support mysql
(dangit!). Anyway, if you output as a list things seem to work. I
don't
know if this is expected or a bug (Steffen Durinck will likely chime
in
if it is unexpected). Additionally, when you output in a list you
don't
need to output the filter as an attribute as well because there is no
problem of multiple lines per attribute.
> mart <- useMart("ENSEMBL_MART_ENSEMBL", "athaliana_gene_ensembl",
mysql=T)
Loading required package: RMySQL
Loading required package: DBI
Error in useMart("ENSEMBL_MART_ENSEMBL", "athaliana_gene_ensembl",
mysql
= T) :
Requested BioMart database is not available please use the function
listMarts(mysql=TRUE) to see the valid biomart names you can query
using
mysql access
> mart <- useMart("ENSEMBL_MART_ENSEMBL", "athaliana_gene_ensembl")
Checking attributes and filters ... ok
> getBM(c("tair_locus_model","ptrichocarpa_ensembl_gene","go"),
"tair_locus_model", bst[[2]], mart, output="list")
$tair_locus_model
$tair_locus_model$AT1G01010.1
[1] "AT1G01010.1"
$tair_locus_model$AT1G01020.1
[1] "AT1G01020.1"
$tair_locus_model$AT1G01020.2
[1] "AT1G01020.2"
$tair_locus_model$AT1G01030.1
[1] "AT1G01030.1"
$tair_locus_model$AT1G01040.1
[1] "AT1G01040.1"
$tair_locus_model$DCL1
[1] NA
$tair_locus_model$AT1G01050.1
[1] "AT1G01050.1"
$tair_locus_model$AT1G01060.1
[1] "AT1G01060.1"
$tair_locus_model$AT1G01060.2
[1] "AT1G01060.2"
$tair_locus_model$AT1G01060.3
[1] "AT1G01060.3"
$ptrichocarpa_ensembl_gene
$ptrichocarpa_ensembl_gene$AT1G01010.1
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01020.1
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01020.2
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01030.1
[1] "gw1.XIV.1973.1"
$ptrichocarpa_ensembl_gene$AT1G01040.1
[1] "eugene3.00021687"
$ptrichocarpa_ensembl_gene$DCL1
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01050.1
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01060.1
[1] "estExt_Genewise1_v1.C_LG_XIV1950"
$ptrichocarpa_ensembl_gene$AT1G01060.2
[1] "estExt_Genewise1_v1.C_LG_XIV1950"
$ptrichocarpa_ensembl_gene$AT1G01060.3
[1] "estExt_Genewise1_v1.C_LG_XIV1950"
$go
$go$AT1G01010.1
[1] "GO:0007275" "GO:0003700" "GO:0005575"
$go$AT1G01020.1
[1] "GO:0008150" "GO:0003674" "GO:0016020"
$go$AT1G01020.2
[1] "GO:0008150" "GO:0003674" "GO:0016020"
$go$AT1G01030.1
[1] "GO:0003700" "GO:0005575" "GO:0009908"
[4] "GO:0045449" "GO:0048366"
$go$AT1G01040.1
[1] NA
$go$DCL1
[1] NA
$go$AT1G01050.1
[1] "GO:0008152" "GO:0016462" "GO:0004427"
[4] "GO:0005634" "GO:0016020" "GO:0005737"
$go$AT1G01060.1
[1] "GO:0003700"
$go$AT1G01060.2
[1] "GO:0003700"
$go$AT1G01060.3
[1] "GO:0003700"
> saveHistory()
Error: could not find function "saveHistory"
> apropos("save")
[1] ".__M__saveHTML:annaffy"
[2] ".__M__saveText:annaffy"
[3] ".__T__saveHTML:annaffy"
[4] ".__T__saveText:annaffy"
[5] ".saveRDS"
[6] "save"
[7] "save.image"
[8] "savehistory"
[9] "saveHTML"
[10] "saveNamespaceImage"
[11] "savePlot"
[12] "saveText"
[13] "sys.save.image"
> savehistory()
> mart <- useMart("ENSEMBL_MART_ENSEMBL", "athaliana_gene_ensembl")
Checking attributes and filters ... ok
> bs <-
"
ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR_sequenced_genes"
> bst <- scan(bs, sep="\t", what=list("","",0,""), skip=1, nlines=10)
Read 10 records
> getBM(c("ptrichocarpa_ensembl_gene","go"), "tair_locus_model",
bst[[2]], mart, output="list")
$ptrichocarpa_ensembl_gene
$ptrichocarpa_ensembl_gene$AT1G01010.1
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01020.1
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01020.2
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01030.1
[1] "gw1.XIV.1973.1"
$ptrichocarpa_ensembl_gene$AT1G01040.1
[1] "eugene3.00021687"
$ptrichocarpa_ensembl_gene$DCL1
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01050.1
[1] NA
$ptrichocarpa_ensembl_gene$AT1G01060.1
[1] "estExt_Genewise1_v1.C_LG_XIV1950"
$ptrichocarpa_ensembl_gene$AT1G01060.2
[1] "estExt_Genewise1_v1.C_LG_XIV1950"
$ptrichocarpa_ensembl_gene$AT1G01060.3
[1] "estExt_Genewise1_v1.C_LG_XIV1950"
$go
$go$AT1G01010.1
[1] "GO:0007275" "GO:0003700" "GO:0005575"
$go$AT1G01020.1
[1] "GO:0008150" "GO:0003674" "GO:0016020"
$go$AT1G01020.2
[1] "GO:0008150" "GO:0003674" "GO:0016020"
$go$AT1G01030.1
[1] "GO:0003700" "GO:0005575" "GO:0009908"
[4] "GO:0045449" "GO:0048366"
$go$AT1G01040.1
[1] NA
$go$DCL1
[1] NA
$go$AT1G01050.1
[1] "GO:0008152" "GO:0016462" "GO:0004427"
[4] "GO:0005634" "GO:0016020" "GO:0005737"
$go$AT1G01060.1
[1] "GO:0003700"
$go$AT1G01060.2
[1] "GO:0003700"
$go$AT1G01060.3
[1] "GO:0003700"
Best,
Jim
>
> If I run the getBM function for individual instances in ath and only
> retrieve the attribute "tair_locus_model" this always works (I have
> tried a large number of AGI codes from inside ath randomly ) but
even
> running getBM to only retrieve "tair_locus_model" for all instances
of
> ath fails (it returns only 2 results even though there are >40,000
> entries in ath) and running getBM on individual instances of ath but
for
> all attributes I want to return also fails with the same error
message
> as above.
>
> I'm not sure if this is a problem with my code, a biomaRt issue or
an
> issue specific to the use of Gramene.
>
> Any help much appreciated.
>
> Thanks
>
> Nathaniel Street
>
> SessionInfo
>
> R version 2.6.0 (2007-10-03)
> i386-pc-mingw32
>
> locale:
> LC_COLLATE=English_United Kingdom.1252;LC_CTYPE=English_United
> Kingdom.1252;LC_MONETARY=English_United
> Kingdom.1252;LC_NUMERIC=C;LC_TIME=English_United Kingdom.1252
>
> attached base packages:
> [1] tools stats graphics grDevices utils datasets
methods
> base
>
> other attached packages:
> [1] AnnBuilder_1.16.0 annotate_1.16.1 xtable_1.5-2
> AnnotationDbi_1.0.6 RSQLite_0.6-4 DBI_0.2-4
XML_1.93-2.1
>
> [8] Biobase_1.16.1 biomaRt_1.12.2 RCurl_0.8-1
>
>
--
James W. MacDonald, M.S.
Biostatistician
Affymetrix and cDNA Microarray Core
University of Michigan Cancer Center
1500 E. Medical Center Drive
7410 CCGC
Ann Arbor MI 48109
734-647-5623