Cornish, Joseph NIH/NIAID [F]
I am attempting to write a script that will allow me to do bulk
benchmarking of various methods for normalization and background
correction. I am concerned that I am not processing the data correctly
and that this may be leading to the abnormal heatmaps I am seeing. For
a test case, I am using an experiment with two different conditions,
two different states, and two replicates per state (a rough sketch of
my targets layout is below). The heatmaps concern me because of their
uniformity: the values are all completely green or completely red,
although there are differences between genes within a state according
to the heatmap.
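For reference, the targets file for this test case is laid out roughly
like this (the sample labels and file names here are placeholders
rather than the real ones):

SampleName,FileName,State
s1_rep1,array_1.txt,1
s1_rep2,array_2.txt,1
s2_rep1,array_3.txt,2
s2_rep2,array_4.txt,2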
In the examples I have seen, people have suggested that I use the
coefficients from the eBayes fitting of the data; however, in my case
the MArrayLM object returned by eBayes only has one column of
coefficients instead of two (not sure if this is a sign of errors on
my part). I have instead used the lmFit coefficients, since that
coefficient matrix has a column for each state.
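To illustrate the column issue, this is the kind of dimension check I
am using (the object names are the ones from the simplified script
below):

ncol(fit_lm$coefficients) #2 columns, one per state (s1, s2)
ncol(fit_be$coefficients) #1 column, the single s2 - s1 contrast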
To further confuse someone who is new to R, the simplified example and
the full code generate different heatmaps for the same data and the
same methods (normexp for background correction, cyclic loess for
normalization). The differences are that in the simplified version the
gene IDs show up on the right side, while in the full version only row
numbers appear. The color for some genes changes between states as
well.
My questions are:
Why does the eBayes fit only have one coefficient column?
Am I approaching the processing correctly?
Why are there differences between the resulting heatmaps of the two
versions?
I know that the idea is to keep the amount of posted code to a minimum
but I don't see any other way here.
Here is the simplified example I have made:
library(limma)
library(vsn)
library(gplots)
set.seed(0xabcd) #from vsn manual
#constants for reading agi outs from our imager
COL__ <- list(G="Raw intensity (tr.mean) {A}", Gb="Background (tr.mean) {A}")
ANO__ <- c("Position", "Name", "ID", "Description", "GeneName")
#parse data
targets <- readTargets("test.csv", sep = ",")
gal <- readGAL("026806_D_20110524.gal")
layout <- getLayout(gal)
agi <- read.maimages(targets, columns = COL__,
annotation = ANO__,
green.only = TRUE)
design <- model.matrix(~0+factor(c(1,1,2,2)))
colnames(design) <- c("s1", "s2")
#process
bg <- backgroundCorrect(agi, method = "normexp")
norm <- normalizeBetweenArrays(bg, method = "cyclicloess")
avg <- avereps(norm, ID=norm$genes$ProbeName)
#fit
contmat <- makeContrasts(s2-s1, levels = design)
fit_lm <- lmFit(avg$E, design)
fit_be <- eBayes(contrasts.fit(fit_lm, contmat))
#results
difexp <- topTable(fit_be, coef = 1, adjust = "BH")
res <- decideTests(fit_be)
comp <- vennDiagram(res)
#plot
pdf("example-heatmap.pdf")
genes <- as.numeric(rownames(topTable(fit_be, n = 100)))
heatmap(fit_lm$coefficients[genes,], col=redgreen(75), key = TRUE)
dev.off()
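As a side note, a variant of the last plot I may try is
gplots::heatmap.2, since as far as I can tell the key argument belongs
to heatmap.2 rather than to base heatmap (everything else is unchanged
from the call above):

heatmap.2(fit_lm$coefficients[genes,], col = redgreen(75), key = TRUE, trace = "none")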
The full version:
library(vsn)
library(limma)
library(gplots)
set.seed(0xabcd) #from the vsn manual
#constants
A_SUB__ <- 1
A_NXP__ <- 2
A_MIN__ <- 3
A_VSN__ <- 1
A_QNT__ <- 2
A_LWS__ <- 3
M_SUB__ <- "subtract"
M_NXP__ <- "normexp"
M_MIN__ <- "minimum"
M_VSN__ <- "vsn"
M_LWS__ <- "cyclicloess"
M_QNT__ <- "quantile"
COL__ <- list(G="Raw intensity (tr.mean) {A}", Gb="Background (tr.mean) {A}")
ANO__ <- c("Position", "Name", "ID", "Description", "GeneName")
N_BGM__ <- 3 #number of background correction methods
N_NRM__ <- 3 #number of normalization methods
A_BGM__ <- c(A_SUB__, A_NXP__, A_MIN__)
A_NRM__ <- c(A_VSN__, A_QNT__, A_LWS__)
#utility methods
#applies subtraction background correction with limma
bg_subtract <- function(x){
return(list(norm = x$norm,
bg = M_SUB__,
agi = backgroundCorrect(x$agi, method = M_SUB__)))
}
#applies normexp background correction with limma
bg_normexp <- function(x){
return(list(norm = x$norm,
bg = M_NXP__,
agi = backgroundCorrect(x$agi, method = M_NXP__)))
}
#applies minimum background correction with limma
bg_minimum <- function(x){
return(list(norm = x$norm,
bg = M_MIN__,
agi = backgroundCorrect(x$agi, method = M_MIN__)))
}
#applies VSN normalization with VSN package
nb_vsn <- function(x){
return(list(norm = M_VSN__,
bg = x$bg,
agi = normalizeVSN(x$agi)))
}
#applies cyclic lowess normalization with limma
nb_lowess <- function(x){
return(list(norm = M_LWS__,
bg = x$bg,
agi = normalizeBetweenArrays(x$agi, method = M_LWS__)))
}
#applies quantile normalization with limma
nb_quantile <- function(x){
return(list(norm = M_QNT__,
bg = x$bg,
agi = normalizeBetweenArrays(x$agi, method = M_QNT__)))
}
#generates a plot of MA values with limma
pl_ma <- function(x){
name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-ma.pdf"), collapse="")
pdf(name)
plotMA(x[[1]]$agi$E)
dev.off()
}
#generates a plot of mean versus stdev with VSN
pl_meansd <- function(x){
name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-meansd.pdf"),
collapse="")
pdf(name)
meanSdPlot(x[[1]]$agi$E)
dev.off()
}
#generate volcanoplot of log-fold versus log-odds
pl_volcano <- function(x){
name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-volcano.pdf"),
collapse="")
pdf(name)
volcanoplot(x[[1]]$bayes)
dev.off()
}
#generate plot of expression data over time
pl_lines <- function(x){
name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-lines.pdf"),
collapse="")
pdf(name)
plotlines(x[[1]]$agi$E)
dev.off()
}
#generate heatmap of final results
pl_heatmap <- function(x, ntop){
name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-heatmap.pdf"),
collapse="")
pdf(name)
genes <- as.numeric(rownames(topTable(x[[1]]$bayes, n = ntop)))
heatmap(x[[1]]$lm$coefficients[genes,],
col = redgreen(75),
key = TRUE)
dev.off()
}
#averages replicates of probes
m_repavg <- function(x){
return(list(agi = x[[1]]$agi,
avg = avereps(x[[1]]$agi,
ID=x[[1]]$agi$genes$ProbeName),
bg = x[[1]]$bg,
norm = x[[1]]$norm))
}
#fit linear model and contrasts, test for differential expression
m_fit <- function(x, design){
contmat <- makeContrasts(s2-s1,
levels = design)
fit1 <- lmFit(x[[1]]$agi$E, design)
fit2 <- eBayes(contrasts.fit(fit1, contmat))
diff <- topTable(fit2, coef = 1, adjust = "BH")
res <- decideTests(fit2)
comp <- vennDiagram(res)
return(list(agi = x[[1]]$agi,
bg = x[[1]]$bg,
norm = x[[1]]$norm,
ave = x[[1]]$ave,
cont = contmat,
lm = fit1,
bayes = fit2,
diff = diff,
res = res,
comp = comp))
}
######################################################################
# Function to parse array data into limma objects
# Args:
# tfile: target file (see limma documentation)
# tsep: limma defaults to TSV, have to set manually for CSV and so on
# galfile: GAL file for printer/layout
# Returns a matrix of data objects (one cell per background/
# normalization combination), each with targets, genes and printer layout
######################################################################
parse <- function(tfile, tsep, galfile){
targets <- readTargets(tfile, sep=tsep)
gal <- readGAL(galfile)
layout <- getLayout(gal)
agi <- read.maimages(targets, columns = COL__,
annotation = ANO__,
green.only = TRUE)
bg <- "none"
norm <- "none"
desg <- model.matrix(~0+factor(c(1,1,2,2)))
colnames(desg) <- c("s1","s2")
d <- list(agi = agi,
gal = gal,
layout = layout,
targets = targets,
bg = bg,
desg = desg,
norm = norm)
processed <- matrix(NA, nrow = N_BGM__, ncol = N_NRM__)
processed <- apply(processed, 1:2, function(x){return(d)})
return(processed)
}
#parse command args
args <- commandArgs(trailingOnly = TRUE)
#load files into matrix
a <- parse(args[1], args[2], args[3])
#keep a copy of a raw dataset
raw <- a[1,1]
print(raw[[1]]$desg)
#perform background corrections
a[,A_SUB__] <- lapply(a[,A_SUB__], bg_subtract)
a[,A_NXP__] <- lapply(a[,A_NXP__], bg_normexp)
a[,A_MIN__] <- lapply(a[,A_MIN__], bg_minimum)
#perform normalization
a[A_VSN__,] <- lapply(a[A_VSN__,], nb_vsn)
a[A_LWS__,] <- lapply(a[A_LWS__,], nb_lowess)
a[A_QNT__,] <- lapply(a[A_QNT__,], nb_quantile)
#fit model and test for differential expression
a <- apply(a, 1:2, m_repavg)
a <- apply(a, 1:2, m_fit, design = raw[[1]]$desg)
#generate plots
apply(a, 1:2, pl_ma)
apply(a, 1:2, pl_meansd)
apply(a, 1:2, pl_volcano)
apply(a, 1:2, pl_lines)
apply(a, 1:2, pl_heatmap, ntop = as.numeric(args[4])) #args are character strings
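In case it matters, this is roughly how I invoke the script (the
script file name here is a placeholder; the last argument is the
number of top genes passed to pl_heatmap):

Rscript benchmark.R test.csv "," 026806_D_20110524.gal 100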