Entering edit mode
Hi,
Ive had similar challenges as Francesco, and have unsuccessfully
tried to use the data structures and functions provided by
VariantAnnotation. My experience is that I need the expand'
functionality more often with respect to the samples rather than the
annotation tags. And as my experience with R is somewhat limited, I
tend to like to work with simple data.frames when I want to summarise
and characterise the data.
Here is how I generated a simple data frame with all sample variants
(variants and samples are dummy encoded):
## read VCF (100 samples)
> all_vcf <- readVcf(cancer.exome.project.vcf.gz','hg19')
> seqinfo(all_vcf) <- seqinfo(BSgenome.Hsapiens.UCSC.hg19)
> rowData(all_vcf)
GRanges with 50383 ranges and 5 metadata columns:
## get variants with PASS filter
> all_vcf_PASS <-
all_vcf[str_detect(elementMetadata(all_vcf)$FILTER,"PASS"),]
> rowData(all_vcf_PASS)
GRanges with 4252 ranges and 5 metadata columns:
...
## get genotype information for all samples that have called genotypes
(GT != .)
## From my VCF:
## FORMAT=<id=adt,number=.,type=integer,description="allelic depths="" for="" the="" ref="" and="" alt="" alleles="" in="" the="" order="" listed="" (tumor)"="">
## FORMAT=<id=gt,number=1,type=string,description="genotype">
## FORMAT=<id=adc,number=.,type=integer,description="allelic depths="" for="" the="" ref="" and="" alt="" alleles="" in="" the="" order="" listed="" (control)"="">
##
> tumor_ref_support <-
t(as.data.frame(geno(all_vcf_PASS)$ADT[which(geno(all_vcf_PASS)$GT !=
'.', arr.ind=T)],row.names=NULL)[1,])
> normal_ref_support <-
t(as.data.frame(geno(all_vcf_PASS)$ADC[which(geno(all_vcf_PASS)$GT !=
'.', arr.ind=T)],row.names=NULL)[1,])
> tumor_alt_support <-
t(as.data.frame(geno(all_vcf_PASS)$ADT[which(geno(all_vcf_PASS)$GT !=
'.', arr.ind=T)],row.names=NULL)[2,])
> normal_alt_support <-
t(as.data.frame(geno(all_vcf_PASS)$ADC[which(geno(all_vcf_PASS)$GT !=
'.', arr.ind=T)],row.names=NULL)[2,])
## get sample ids and variant ids for the expanded set of variants'
> b <- as.data.frame(which(geno(all_vcf_PASS)$GT != '.',
arr.ind=T))$col
> c <- as.data.frame(which(geno(all_vcf_PASS)$GT != '.',
arr.ind=T))$row
> sample_ids <- as.data.frame(rownames(colData(all_vcf_PASS))[b])
> variant_ids <- as.data.frame(rownames(geno(all_vcf_PASS)$GT)[c])
## make simple data.frame of all samples with genotype information
> row.names(tumor_ref_support) <- NULL
> row.names(normal_ref_support) <- NULL
> row.names(tumor_alt_support) <- NULL
> row.names(external_passed) <- NULL
> row.names(normal_alt_support) <- NULL
> tmp <- as.data.frame(cbind(variant_ids,sample_ids,tumor_ref_support,
tumor_alt_support,normal_ref_support,normal_alt_support))
> colnames(tmp) <- c('variant_id','sample_id','tumor_ref_support','tum
or_alt_support','normal_ref_support','normal_alt_support')
> tmp$allele_frac_tumor <- rep(0,nrow(tmp))
> tmp$tumor_depth <- tmp$tumor_ref_support + tmp$tumor_alt_support
> tmp$normal_depth <- tmp$normal_ref_support + tmp$normal_alt_support
> tmp[tmp$tumor_depth > 0,]$allele_frac_tumor <-
round(tmp[tmp$tumor_depth > 0,]$tumor_alt_support /
tmp[tmp$tumor_depth > 0,]$tumor_depth,2)
##
> head(tmp)
variant_id sample_id tumor_ref_support tumor_alt_support
normal_ref_support normal_alt_support allele_frac_tumor tumor_depth
normal_depth
1 chr5_10000000_T_C 001B_001T 17 7
21 0 0.29 24 21
2 chr11_10000000_C_T 001B_001T 31 4
33 0 0.11 35 33
3 chr18_10000000_C_T 001B_001T 21 7
37 1 0.25 28 38
4 chrY_1000000_A_G 001B_001T 10 2
11 0 0.17 12 11
5 chr1_1000000_G_C 011B_011T 17 3
48 0 0.15 20 48
6 chr1_1000000_A_G 011B_011T 77 13
114 0 0.14 90 114
> str(tmp)
'data.frame': 4418 obs. of 9 variables:
$ variant_id : Factor w/ 4252 levels "chr1_1000000_G_T",..:
3254 620 1644 4251 359 381 391 401 246 1875 ...
$ sample_id : Factor w/ 99 levels "001B_001T","011B_011T",..:
1 1 1 1 2 2 2 2 2 2 ...
$ tumor_ref_support : int 17 31 21 10 17 77 12 40 75 53 ...
$ tumor_alt_support : int 7 4 7 2 3 13 3 6 8 9 ...
$ normal_ref_support: int 21 33 37 11 48 114 19 52 88 89 ...
$ normal_alt_support: int 0 0 1 0 0 0 0 0 0 0 ...
$ allele_frac_tumor : num 0.29 0.11 0.25 0.17 0.15 0.14 0.2 0.13 0.1
0.15 ...
$ tumor_depth : int 24 35 28 12 20 90 15 46 83 62 ...
$ normal_depth : int 21 33 38 11 48 114 19 52 88 89 ...
Next, I plan to do a merge with my functional annotations (info) using
the variant_id as the key, which I think would be straightforward.
If there is a more convenient way to get here using the
VariantAnnotation package, I would be grateful to hear about this.
---
Sigve Nakken, PhD
Postdoctoral Fellow, Dept. of Tumor Biology
Institute for Cancer Research
Oslo University Hospital, Norway
phone: +4795753022
email: sigven@ifi.uio.no
[[alternative HTML version deleted]]