Heatmaps for Agilent Single Color 4x44k using limma and vsn.
1
0
Entering edit mode
@cornish-joseph-nihniaid-f-5864
Last seen 10.3 years ago
I am attempting to write a script that will allow me to do bulk benchmarking of various methods for normalization and background correction. I am concerned that I am not processing the data correctly and this may be leading to the abnormal heatmaps that I am seeing. For a test case, I am using an experiment where I have two different conditions with two different states with two replicates per state. The heatmaps concern me because of the uniformity between them, the values are all completely green or completely red, there are differences between genes within a state according to the heatmap. In the examples I have seen people have suggested that I use the coefficients from the eBayes fitting of the data, however in my example the eBayes LMarray object does not have two columns in the results, only on (not sure if this is a sign of errors on my part). I have instead used the lmFit coefficients since there are column for each matrix. To further confuse someone who is new to R, the simplified example and full code generate different heatmaps for the same data and same methods (normexp for background, cyclic loess for norm). The differences being in the simplified version the gene IDs show up on the right side but in the full version, only row number appears. The color for some genes changes between states as well. My questions are: Why does the bayes model only have one column? Am I approaching the processing correctly? Why are there differences between the resulting heatmaps of each version? I know that the idea is to keep the amount of posted code to a minimum but I don't see any other way here. Here is the simplified example I have made: library(limma) library(vsn) library(gplots) set.seed(0xabcd) #from vsn manual #constants for reading agi outs from our imager COL__ <- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) {A}") ANO__ <- c("Position", "Name", "ID", "Description", "GeneName") #parse data targets <- readTargets("test.csv", sep = ",") gal <- readGAL("026806_D_20110524.gal") layout <- getLayout(gal) agi <- read.maimages(targets, columns = COL__, annotation = ANO__, green.only = TRUE) design <- model.matrix(~0+factor(c(1,1,2,2))) colnames(design) <- c("s1", "s2") #process bg <- backgroundCorrect(agi, method = "normexp") norm <- normalizeBetweenArrays(bg, method = "cyclicloess") avg <- avereps(norm, ID=norm$genes$ProbeName) #fit contmat <- makeContrasts(s2-s1, levels = design) fit_lm <- lmFit(avg$E, design) fit_be <- eBayes(contrasts.fit(fit_lm, contmat)) #results difexp <- topTable(fit_be, coef = 1, adjust = "BH") res <- decideTests(fit_be) comp <- vennDiagram(res) #plot pdf("example-heatmap.pdf") genes <- as.numeric(rownames(topTable(fit_be, n = 100))) heatmap(fit_lm$coefficients[genes,], col=redgreen(75), key = TRUE) dev.off() The full version: library(vsn) library(limma) library(gplots) set.seed(0xabcd) #from the vsn manual #constants A_SUB__ <- 1 A_NXP__ <- 2 A_MIN__ <- 3 A_VSN__ <- 1 A_QNT__ <- 2 A_LWS__ <- 3 M_SUB__ <- "subtract" M_NXP__ <- "normexp" M_MIN__ <- "minimum" M_VSN__ <- "vsn" M_LWS__ <- "cyclicloess" M_QNT__ <- "quantile" COL__ <- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) {A}") ANO__ <- c("Position", "Name", "ID", "Description", "GeneName") N_BGM__ <- 3 #number of background correction methods N_NRM__ <- 3 #number of normalization methods A_BGM__ <- c(A_SUB__, A_NXP__, A_MIN__) A_NRM__ <- c(A_VSN__, A_QNT__, A_LWS__) #utility methods #applies subtraction background correction with limma bg_subtract <- function(x){ return(list(norm = x$norm, bg = M_SUB__, agi = backgroundCorrect(x$agi, method = M_SUB__))) } #applies normexp backgroud correction with limma bg_normexp <- function(x){ return(list(norm = x$norm, bg = M_NXP__, agi = backgroundCorrect(x$agi, method = M_NXP__))) } #applies minimum background correction with limma bg_minimum <- function(x){ return(list(norm = x$norm, bg = M_MIN__, agi = backgroundCorrect(x$agi, method = M_MIN__))) } #applies VSN normalization with VSN package nb_vsn <- function(x){ return(list(norm = M_VSN__, bg = x$bg, agi = normalizeVSN(x$agi))) } #applies VSN normalization with VSN package nb_vsn <- function(x){ return(list(norm = M_VSN__, bg = x$bg, agi = normalizeVSN(x$agi))) } #applies cyclic lowess normalization with limma nb_lowess <- function(x){ return(list(norm = M_LWS__, bg = x$bg, agi = normalizeBetweenArrays(x$agi, method = M_LWS__))) } #applies quantile normalization with limma nb_quantile <- function(x){ return(list(norm = M_QNT__, bg = x$bg, agi = normalizeBetweenArrays(x$agi, method = M_QNT__))) } #generates a plot of MA values with limma pl_ma <- function(x){ name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-ma.pdf"), collapse="") pdf(name) plotMA(x[[1]]$agi$E) dev.off() } #generates a plot of mean versus stdev with VS pl_meansd <- function(x){ name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-meansd.pdf"), collapse="") pdf(name) meanSdPlot(x[[1]]$agi$E) dev.off() } #generate volcanoplot of log-fold versus log-odds pl_volcano <- function(x){ name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-volcano.pdf"), collapse="") pdf(name) volcanoplot(x[[1]]$bayes) dev.off() } #generate plot of expression data over time pl_lines <- function(x){ name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-lines.pdf"), collapse="") pdf(name) plotlines(x[[1]]$agi$E) dev.off() } #generate heatmat of final results pl_heatmap <- function(x, ntop){ name <- paste(c(x[[1]]$bg, x[[1]]$norm, "-heatmap.pdf"), collapse="") pdf(name) genes <- as.numeric(rownames(topTable(x[[1]]$bayes, n = ntop))) heatmap(x[[1]]$lm$coefficients[genes,], col = redgreen(75), key = TRUE) dev.off() } #averages replicates of probes m_repavg <- function(x){ return(list(agi = x[[1]]$agi, avg = avereps(x[[1]]$agi, ID=x[[1]]$agi$genes$ProbeName), bg = x[[1]]$bg, norm = x[[1]]$norm)) } #generate contrasts m_fit <- function(x, design){ contmat <- makeContrasts(s2-s1, levels = design) fit1 <- lmFit(x[[1]]$agi$E, design) fit2 <- eBayes(contrasts.fit(fit1, contmat)) diff <- topTable(fit2, coef = 1, adjust = "BH") res <- decideTests(fit2) comp <- vennDiagram(res) return(list(agi = x[[1]]$agi, bg = x[[1]]$bg, norm = x[[1]]$norm, ave = x[[1]]$ave, cont = contmat, lm = fit1, bayes = fit2, diff = diff, res = res, comp = comp)) } ###################################################################### ######### # Function to parse array data into limma objects # Args: # tfile: target file (see limma documentation) # tsep: limma defaults TSV, have to set manually for CSV and so on # galfile: GAL file for printer/layout # Returns data object with targets, genes and printer layout ###################################################################### ######### parse <- function(tfile, tsep, galfile){ targets <- readTargets(tfile, sep=tsep) gal <- readGAL(galfile) layout <- getLayout(gal) agi <- read.maimages(targets, columns = COL__, annotation = ANO__, green.only = TRUE) bg <- "none" norm <- "none" desg <- model.matrix(~0+factor(c(1,1,2,2))) colnames(desg) <- c("s1","s2") d <- list(agi = agi, gal = gal, layout = layout, targets = targets, bg = bg, desg = desg, norm = norm) processed <- matrix(NA, nrow = N_BGM__, ncol = N_NRM__) processed <- apply(processed, 1:2, function(x){return(d)}) return(processed) } #parse command args args <- commandArgs(trailingOnly = TRUE) #load files into matrix a <- parse(args[1], args[2], args[3]) #keepy a copy of a raw dataset raw <- a[1,1] print(raw[[1]]$desg) #perform backgroung corrections a[,A_SUB__] <- lapply(a[,A_SUB__], bg_subtract) a[,A_NXP__] <- lapply(a[,A_NXP__], bg_normexp) a[,A_MIN__] <- lapply(a[,A_MIN__], bg_minimum) #perform normalization a[A_VSN__,] <- lapply(a[A_VSN__,], nb_vsn) a[A_LWS__,] <- lapply(a[A_LWS__,], nb_lowess) a[A_QNT__,] <- lapply(a[A_QNT__,], nb_quantile) #fit model and differentiate expression a <- apply(a, 1:2, m_repavg) a <- apply(a, 1:2, m_fit, design = raw[[1]]$desg) #generate plots apply(a, 1:2, pl_ma) apply(a, 1:2, pl_meansd) apply(a, 1:2, pl_volcano) apply(a, 1:2, pl_lines) apply(a, 1:2, pl_heatmap, args[4]) [[alternative HTML version deleted]]
Annotation Normalization vsn limma Annotation Normalization vsn limma • 1.9k views
ADD COMMENT
0
Entering edit mode
@james-w-macdonald-5106
Last seen 2 days ago
United States
Hi Joseph, On 3/28/2013 9:27 AM, Cornish, Joseph (NIH/NIAID) [F] wrote: > I am attempting to write a script that will allow me to do bulk benchmarking of various methods for normalization and background correction. I am concerned that I am not processing the data correctly and this may be leading to the abnormal heatmaps that I am seeing. For a test case, I am using an experiment where I have two different conditions with two different states with two replicates per state. > > The heatmaps concern me because of the uniformity between them, the values are all completely green or completely red, there are differences between genes within a state according to the heatmap. > > In the examples I have seen people have suggested that I use the coefficients from the eBayes fitting of the data, however in my example the eBayes LMarray object does not have two columns in the results, only on (not sure if this is a sign of errors on my part). I have instead used the lmFit coefficients since there are column for each matrix. > > To further confuse someone who is new to R, the simplified example and full code generate different heatmaps for the same data and same methods (normexp for background, cyclic loess for norm). The differences being in the simplified version the gene IDs show up on the right side but in the full version, only row number appears. The color for some genes changes between states as well. > > My questions are: > Why does the bayes model only have one column? Because you are computing one contrast (which is all you can do). One contrast means one test which means one column of results. In other words, the coefficient you are estimating in the contrasts.fit() step is the difference in means between the two groups, which is one value per gene. > Am I approaching the processing correctly? I tried to read through that business below, but got some weird affliction due to an apparent overdose of underscores and had to stop. > Why are there differences between the resulting heatmaps of each version? I don't know, as I worried my brain might explode if I read any more. But you should note that it is fairly unusual to plot coefficients in a heatmap, as what you are then presenting are log ratios, and most people can't understand that. Instead you might consider getting the list of genes, and then creating a heatmap using the normalized expression values for each sample instead. You then would want to scale in some way. We often sweep the mean of the controls out of the matrix (by row), so the colors can then be interpreted as differences between the treated samples and controls. Best, Jim > > I know that the idea is to keep the amount of posted code to a minimum but I don't see any other way here. > > Here is the simplified example I have made: > library(limma) > library(vsn) > library(gplots) > > set.seed(0xabcd) #from vsn manual > > #constants for reading agi outs from our imager > COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) {A}") > ANO__<- c("Position", "Name", "ID", "Description", "GeneName") > > #parse data > targets<- readTargets("test.csv", sep = ",") > gal<- readGAL("026806_D_20110524.gal") > layout<- getLayout(gal) > agi<- read.maimages(targets, columns = COL__, > annotation = ANO__, > green.only = TRUE) > design<- model.matrix(~0+factor(c(1,1,2,2))) > colnames(design)<- c("s1", "s2") > > #process > bg<- backgroundCorrect(agi, method = "normexp") > norm<- normalizeBetweenArrays(bg, method = "cyclicloess") > avg<- avereps(norm, ID=norm$genes$ProbeName) > > #fit > contmat<- makeContrasts(s2-s1, levels = design) > fit_lm<- lmFit(avg$E, design) > fit_be<- eBayes(contrasts.fit(fit_lm, contmat)) > > #results > difexp<- topTable(fit_be, coef = 1, adjust = "BH") > res<- decideTests(fit_be) > comp<- vennDiagram(res) > > #plot > pdf("example-heatmap.pdf") > genes<- as.numeric(rownames(topTable(fit_be, n = 100))) > heatmap(fit_lm$coefficients[genes,], col=redgreen(75), key = TRUE) > dev.off() > > > The full version: > library(vsn) > library(limma) > library(gplots) > set.seed(0xabcd) #from the vsn manual > > #constants > A_SUB__<- 1 > A_NXP__<- 2 > A_MIN__<- 3 > A_VSN__<- 1 > A_QNT__<- 2 > A_LWS__<- 3 > M_SUB__<- "subtract" > M_NXP__<- "normexp" > M_MIN__<- "minimum" > M_VSN__<- "vsn" > M_LWS__<- "cyclicloess" > M_QNT__<- "quantile" > COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) {A}") > ANO__<- c("Position", "Name", "ID", "Description", "GeneName") > N_BGM__<- 3 #number of background correction methods > N_NRM__<- 3 #number of normalization methods > A_BGM__<- c(A_SUB__, A_NXP__, A_MIN__) > A_NRM__<- c(A_VSN__, A_QNT__, A_LWS__) > > > #utility methods > #applies subtraction background correction with limma > bg_subtract<- function(x){ > return(list(norm = x$norm, > bg = M_SUB__, > agi = backgroundCorrect(x$agi, method = M_SUB__))) > } > > #applies normexp backgroud correction with limma > bg_normexp<- function(x){ > return(list(norm = x$norm, > bg = M_NXP__, > agi = backgroundCorrect(x$agi, method = M_NXP__))) > } > > #applies minimum background correction with limma > bg_minimum<- function(x){ > return(list(norm = x$norm, > bg = M_MIN__, > agi = backgroundCorrect(x$agi, method = M_MIN__))) > } > > #applies VSN normalization with VSN package > nb_vsn<- function(x){ > return(list(norm = M_VSN__, > bg = x$bg, > agi = normalizeVSN(x$agi))) > } > > #applies VSN normalization with VSN package > nb_vsn<- function(x){ > return(list(norm = M_VSN__, > bg = x$bg, > agi = normalizeVSN(x$agi))) > } > > #applies cyclic lowess normalization with limma > nb_lowess<- function(x){ > return(list(norm = M_LWS__, > bg = x$bg, > agi = normalizeBetweenArrays(x$agi, method = M_LWS__))) > } > > #applies quantile normalization with limma > nb_quantile<- function(x){ > return(list(norm = M_QNT__, > bg = x$bg, > agi = normalizeBetweenArrays(x$agi, method = M_QNT__))) > } > > #generates a plot of MA values with limma > pl_ma<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-ma.pdf"), collapse="") > pdf(name) > plotMA(x[[1]]$agi$E) > dev.off() > } > > #generates a plot of mean versus stdev with VS > pl_meansd<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-meansd.pdf"), collapse="") > pdf(name) > meanSdPlot(x[[1]]$agi$E) > dev.off() > } > > #generate volcanoplot of log-fold versus log-odds > pl_volcano<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-volcano.pdf"), collapse="") > pdf(name) > volcanoplot(x[[1]]$bayes) > dev.off() > } > > #generate plot of expression data over time > pl_lines<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-lines.pdf"), collapse="") > pdf(name) > plotlines(x[[1]]$agi$E) > dev.off() > } > > #generate heatmat of final results > pl_heatmap<- function(x, ntop){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-heatmap.pdf"), collapse="") > pdf(name) > genes<- as.numeric(rownames(topTable(x[[1]]$bayes, n = ntop))) > heatmap(x[[1]]$lm$coefficients[genes,], > col = redgreen(75), > key = TRUE) > dev.off() > } > > #averages replicates of probes > m_repavg<- function(x){ > return(list(agi = x[[1]]$agi, > avg = avereps(x[[1]]$agi, ID=x[[1]]$agi$genes$ProbeName), > bg = x[[1]]$bg, > norm = x[[1]]$norm)) > } > > #generate contrasts > m_fit<- function(x, design){ > contmat<- makeContrasts(s2-s1, > levels = design) > fit1<- lmFit(x[[1]]$agi$E, design) > fit2<- eBayes(contrasts.fit(fit1, contmat)) > diff<- topTable(fit2, coef = 1, adjust = "BH") > res<- decideTests(fit2) > comp<- vennDiagram(res) > return(list(agi = x[[1]]$agi, > bg = x[[1]]$bg, > norm = x[[1]]$norm, > ave = x[[1]]$ave, > cont = contmat, > lm = fit1, > bayes = fit2, > diff = diff, > res = res, > comp = comp)) > } > > #################################################################### ########### > # Function to parse array data into limma objects > # Args: > # tfile: target file (see limma documentation) > # tsep: limma defaults TSV, have to set manually for CSV and so on > # galfile: GAL file for printer/layout > # Returns data object with targets, genes and printer layout > #################################################################### ########### > parse<- function(tfile, tsep, galfile){ > targets<- readTargets(tfile, sep=tsep) > gal<- readGAL(galfile) > layout<- getLayout(gal) > agi<- read.maimages(targets, columns = COL__, > annotation = ANO__, > green.only = TRUE) > bg<- "none" > norm<- "none" > desg<- model.matrix(~0+factor(c(1,1,2,2))) > colnames(desg)<- c("s1","s2") > d<- list(agi = agi, > gal = gal, > layout = layout, > targets = targets, > bg = bg, > desg = desg, > norm = norm) > > processed<- matrix(NA, nrow = N_BGM__, ncol = N_NRM__) > processed<- apply(processed, 1:2, function(x){return(d)}) > return(processed) > } > > #parse command args > args<- commandArgs(trailingOnly = TRUE) > > #load files into matrix > a<- parse(args[1], args[2], args[3]) > > #keepy a copy of a raw dataset > raw<- a[1,1] > print(raw[[1]]$desg) > > #perform backgroung corrections > a[,A_SUB__]<- lapply(a[,A_SUB__], bg_subtract) > a[,A_NXP__]<- lapply(a[,A_NXP__], bg_normexp) > a[,A_MIN__]<- lapply(a[,A_MIN__], bg_minimum) > > #perform normalization > a[A_VSN__,]<- lapply(a[A_VSN__,], nb_vsn) > a[A_LWS__,]<- lapply(a[A_LWS__,], nb_lowess) > a[A_QNT__,]<- lapply(a[A_QNT__,], nb_quantile) > > #fit model and differentiate expression > a<- apply(a, 1:2, m_repavg) > a<- apply(a, 1:2, m_fit, design = raw[[1]]$desg) > > #generate plots > apply(a, 1:2, pl_ma) > apply(a, 1:2, pl_meansd) > apply(a, 1:2, pl_volcano) > apply(a, 1:2, pl_lines) > apply(a, 1:2, pl_heatmap, args[4]) > > > [[alternative HTML version deleted]] > > _______________________________________________ > Bioconductor mailing list > Bioconductor at r-project.org > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor -- James W. MacDonald, M.S. Biostatistician University of Washington Environmental and Occupational Health Sciences 4225 Roosevelt Way NE, # 100 Seattle WA 98105-6099
ADD COMMENT
0
Entering edit mode
Hello James, Are you referring to function names or constants? The constants are an old habit I picked up long ago in my CS classes, there are a ton but it makes them stand out for me in vim more readily. I've come to avoid camel case since switching to linux, it makes searching for things a nightmare, it can also help if I group functions with a prefix (eg plot functions start with "pl_"). If you want I can find+replace all of the constants with their values... The differences came from using the unaveraged replicates in the larger script versus the average in the example. I'm not sure what you mean by "mean of the controls", would this simply be filtering out the control spots? I'll generate the heatmaps using the normalized expression values. I suppose in that case it would be worthwhile to filter ahead of time by p-value and fold change (the coef in topTable). _________thanks_____________ _______Joe______ -----Original Message----- From: James W. MacDonald [mailto:jmacdon@uw.edu] Sent: Thursday, March 28, 2013 10:08 AM To: Cornish, Joseph (NIH/NIAID) [F] Cc: bioconductor at r-project.org Subject: Re: [BioC] Heatmaps for Agilent Single Color 4x44k using limma and vsn. Hi Joseph, On 3/28/2013 9:27 AM, Cornish, Joseph (NIH/NIAID) [F] wrote: > I am attempting to write a script that will allow me to do bulk benchmarking of various methods for normalization and background correction. I am concerned that I am not processing the data correctly and this may be leading to the abnormal heatmaps that I am seeing. For a test case, I am using an experiment where I have two different conditions with two different states with two replicates per state. > > The heatmaps concern me because of the uniformity between them, the values are all completely green or completely red, there are differences between genes within a state according to the heatmap. > > In the examples I have seen people have suggested that I use the coefficients from the eBayes fitting of the data, however in my example the eBayes LMarray object does not have two columns in the results, only on (not sure if this is a sign of errors on my part). I have instead used the lmFit coefficients since there are column for each matrix. > > To further confuse someone who is new to R, the simplified example and full code generate different heatmaps for the same data and same methods (normexp for background, cyclic loess for norm). The differences being in the simplified version the gene IDs show up on the right side but in the full version, only row number appears. The color for some genes changes between states as well. > > My questions are: > Why does the bayes model only have one column? Because you are computing one contrast (which is all you can do). One contrast means one test which means one column of results. In other words, the coefficient you are estimating in the contrasts.fit() step is the difference in means between the two groups, which is one value per gene. > Am I approaching the processing correctly? I tried to read through that business below, but got some weird affliction due to an apparent overdose of underscores and had to stop. > Why are there differences between the resulting heatmaps of each version? I don't know, as I worried my brain might explode if I read any more. But you should note that it is fairly unusual to plot coefficients in a heatmap, as what you are then presenting are log ratios, and most people can't understand that. Instead you might consider getting the list of genes, and then creating a heatmap using the normalized expression values for each sample instead. You then would want to scale in some way. We often sweep the mean of the controls out of the matrix (by row), so the colors can then be interpreted as differences between the treated samples and controls. Best, Jim > > I know that the idea is to keep the amount of posted code to a minimum but I don't see any other way here. > > Here is the simplified example I have made: > library(limma) > library(vsn) > library(gplots) > > set.seed(0xabcd) #from vsn manual > > #constants for reading agi outs from our imager > COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) > {A}") > ANO__<- c("Position", "Name", "ID", "Description", "GeneName") > > #parse data > targets<- readTargets("test.csv", sep = ",") > gal<- readGAL("026806_D_20110524.gal") > layout<- getLayout(gal) > agi<- read.maimages(targets, columns = COL__, > annotation = ANO__, > green.only = TRUE) > design<- model.matrix(~0+factor(c(1,1,2,2))) > colnames(design)<- c("s1", "s2") > > #process > bg<- backgroundCorrect(agi, method = "normexp") > norm<- normalizeBetweenArrays(bg, method = "cyclicloess") > avg<- avereps(norm, ID=norm$genes$ProbeName) > > #fit > contmat<- makeContrasts(s2-s1, levels = design) > fit_lm<- lmFit(avg$E, design) > fit_be<- eBayes(contrasts.fit(fit_lm, contmat)) > > #results > difexp<- topTable(fit_be, coef = 1, adjust = "BH") > res<- decideTests(fit_be) > comp<- vennDiagram(res) > > #plot > pdf("example-heatmap.pdf") > genes<- as.numeric(rownames(topTable(fit_be, n = 100))) > heatmap(fit_lm$coefficients[genes,], col=redgreen(75), key = TRUE) > dev.off() > > > The full version: > library(vsn) > library(limma) > library(gplots) > set.seed(0xabcd) #from the vsn manual > > #constants > A_SUB__<- 1 > A_NXP__<- 2 > A_MIN__<- 3 > A_VSN__<- 1 > A_QNT__<- 2 > A_LWS__<- 3 > M_SUB__<- "subtract" > M_NXP__<- "normexp" > M_MIN__<- "minimum" > M_VSN__<- "vsn" > M_LWS__<- "cyclicloess" > M_QNT__<- "quantile" > COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) > {A}") > ANO__<- c("Position", "Name", "ID", "Description", "GeneName") > N_BGM__<- 3 #number of background correction methods > N_NRM__<- 3 #number of normalization methods > A_BGM__<- c(A_SUB__, A_NXP__, A_MIN__) > A_NRM__<- c(A_VSN__, A_QNT__, A_LWS__) > > > #utility methods > #applies subtraction background correction with limma > bg_subtract<- function(x){ > return(list(norm = x$norm, > bg = M_SUB__, > agi = backgroundCorrect(x$agi, method = M_SUB__))) } > > #applies normexp backgroud correction with limma > bg_normexp<- function(x){ > return(list(norm = x$norm, > bg = M_NXP__, > agi = backgroundCorrect(x$agi, method = M_NXP__))) } > > #applies minimum background correction with limma > bg_minimum<- function(x){ > return(list(norm = x$norm, > bg = M_MIN__, > agi = backgroundCorrect(x$agi, method = M_MIN__))) } > > #applies VSN normalization with VSN package > nb_vsn<- function(x){ > return(list(norm = M_VSN__, > bg = x$bg, > agi = normalizeVSN(x$agi))) } > > #applies VSN normalization with VSN package > nb_vsn<- function(x){ > return(list(norm = M_VSN__, > bg = x$bg, > agi = normalizeVSN(x$agi))) } > > #applies cyclic lowess normalization with limma > nb_lowess<- function(x){ > return(list(norm = M_LWS__, > bg = x$bg, > agi = normalizeBetweenArrays(x$agi, method = > M_LWS__))) } > > #applies quantile normalization with limma > nb_quantile<- function(x){ > return(list(norm = M_QNT__, > bg = x$bg, > agi = normalizeBetweenArrays(x$agi, method = > M_QNT__))) } > > #generates a plot of MA values with limma > pl_ma<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-ma.pdf"), collapse="") > pdf(name) > plotMA(x[[1]]$agi$E) > dev.off() > } > > #generates a plot of mean versus stdev with VS > pl_meansd<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-meansd.pdf"), collapse="") > pdf(name) > meanSdPlot(x[[1]]$agi$E) > dev.off() > } > > #generate volcanoplot of log-fold versus log-odds > pl_volcano<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-volcano.pdf"), collapse="") > pdf(name) > volcanoplot(x[[1]]$bayes) > dev.off() > } > > #generate plot of expression data over time > pl_lines<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-lines.pdf"), collapse="") > pdf(name) > plotlines(x[[1]]$agi$E) > dev.off() > } > > #generate heatmat of final results > pl_heatmap<- function(x, ntop){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-heatmap.pdf"), collapse="") > pdf(name) > genes<- as.numeric(rownames(topTable(x[[1]]$bayes, n = ntop))) > heatmap(x[[1]]$lm$coefficients[genes,], > col = redgreen(75), > key = TRUE) > dev.off() > } > > #averages replicates of probes > m_repavg<- function(x){ > return(list(agi = x[[1]]$agi, > avg = avereps(x[[1]]$agi, ID=x[[1]]$agi$genes$ProbeName), > bg = x[[1]]$bg, > norm = x[[1]]$norm)) > } > > #generate contrasts > m_fit<- function(x, design){ > contmat<- makeContrasts(s2-s1, > levels = design) > fit1<- lmFit(x[[1]]$agi$E, design) > fit2<- eBayes(contrasts.fit(fit1, contmat)) > diff<- topTable(fit2, coef = 1, adjust = "BH") > res<- decideTests(fit2) > comp<- vennDiagram(res) > return(list(agi = x[[1]]$agi, > bg = x[[1]]$bg, > norm = x[[1]]$norm, > ave = x[[1]]$ave, > cont = contmat, > lm = fit1, > bayes = fit2, > diff = diff, > res = res, > comp = comp)) > } > > ###################################################################### > ######### # Function to parse array data into limma objects # Args: > # tfile: target file (see limma documentation) > # tsep: limma defaults TSV, have to set manually for CSV and so on > # galfile: GAL file for printer/layout > # Returns data object with targets, genes and printer layout > ###################################################################### > ######### > parse<- function(tfile, tsep, galfile){ > targets<- readTargets(tfile, sep=tsep) > gal<- readGAL(galfile) > layout<- getLayout(gal) > agi<- read.maimages(targets, columns = COL__, > annotation = ANO__, > green.only = TRUE) > bg<- "none" > norm<- "none" > desg<- model.matrix(~0+factor(c(1,1,2,2))) > colnames(desg)<- c("s1","s2") > d<- list(agi = agi, > gal = gal, > layout = layout, > targets = targets, > bg = bg, > desg = desg, > norm = norm) > > processed<- matrix(NA, nrow = N_BGM__, ncol = N_NRM__) > processed<- apply(processed, 1:2, function(x){return(d)}) > return(processed) > } > > #parse command args > args<- commandArgs(trailingOnly = TRUE) > > #load files into matrix > a<- parse(args[1], args[2], args[3]) > > #keepy a copy of a raw dataset > raw<- a[1,1] > print(raw[[1]]$desg) > > #perform backgroung corrections > a[,A_SUB__]<- lapply(a[,A_SUB__], bg_subtract) > a[,A_NXP__]<- lapply(a[,A_NXP__], bg_normexp) > a[,A_MIN__]<- lapply(a[,A_MIN__], bg_minimum) > > #perform normalization > a[A_VSN__,]<- lapply(a[A_VSN__,], nb_vsn) > a[A_LWS__,]<- lapply(a[A_LWS__,], nb_lowess) > a[A_QNT__,]<- lapply(a[A_QNT__,], nb_quantile) > > #fit model and differentiate expression > a<- apply(a, 1:2, m_repavg) > a<- apply(a, 1:2, m_fit, design = raw[[1]]$desg) > > #generate plots > apply(a, 1:2, pl_ma) > apply(a, 1:2, pl_meansd) > apply(a, 1:2, pl_volcano) > apply(a, 1:2, pl_lines) > apply(a, 1:2, pl_heatmap, args[4]) > > > [[alternative HTML version deleted]] > > _______________________________________________ > Bioconductor mailing list > Bioconductor at r-project.org > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: > http://news.gmane.org/gmane.science.biology.informatics.conductor -- James W. MacDonald, M.S. Biostatistician University of Washington Environmental and Occupational Health Sciences 4225 Roosevelt Way NE, # 100 Seattle WA 98105-6099
0
Entering edit mode
On Thu, Mar 28, 2013 at 10:22 AM, Cornish, Joseph (NIH/NIAID) [F] < joseph.cornish@nih.gov> wrote: > Hello James, > > Are you referring to function names or constants? The constants are an old > habit I picked up long ago in my CS classes, there are a ton but it makes > them stand out for me in vim more readily. I've come to avoid camel case > since switching to linux, it makes searching for things a nightmare, it can > also help if I group functions with a prefix (eg plot functions start with > "pl_"). If you want I can find+replace all of the constants with their > values... > > Hi, Joe. I think I could help to translate part of what James said. If you could boil your example down to a few lines of reproducible code that show the problem, that would be simpler to interpret and easier to comment on directly. Sometimes in the process of producing such a simplified example, I have found my own misunderstandings. It also sometimes helps to get someone local to give a second pair of eyes. > The differences came from using the unaveraged replicates in the larger > script versus the average in the example. > > I'm not sure what you mean by "mean of the controls", would this simply be > filtering out the control spots? > > No. I think he means for you to subtract the mean of the control samples (or one group of samples) for each gene from all the normalized expression values. > I'll generate the heatmaps using the normalized expression values. I > suppose in that case it would be worthwhile to filter ahead of time by > p-value and fold change (the coef in topTable). > > Yes. Sean > > _________thanks_____________ > _______Joe______ > > -----Original Message----- > From: James W. MacDonald [mailto:jmacdon@uw.edu] > Sent: Thursday, March 28, 2013 10:08 AM > To: Cornish, Joseph (NIH/NIAID) [F] > Cc: bioconductor@r-project.org > Subject: Re: [BioC] Heatmaps for Agilent Single Color 4x44k using limma > and vsn. > > Hi Joseph, > > > On 3/28/2013 9:27 AM, Cornish, Joseph (NIH/NIAID) [F] wrote: > > I am attempting to write a script that will allow me to do bulk > benchmarking of various methods for normalization and background > correction. I am concerned that I am not processing the data correctly and > this may be leading to the abnormal heatmaps that I am seeing. For a test > case, I am using an experiment where I have two different conditions with > two different states with two replicates per state. > > > > The heatmaps concern me because of the uniformity between them, the > values are all completely green or completely red, there are differences > between genes within a state according to the heatmap. > > > > In the examples I have seen people have suggested that I use the > coefficients from the eBayes fitting of the data, however in my example the > eBayes LMarray object does not have two columns in the results, only on > (not sure if this is a sign of errors on my part). I have instead used the > lmFit coefficients since there are column for each matrix. > > > > To further confuse someone who is new to R, the simplified example and > full code generate different heatmaps for the same data and same methods > (normexp for background, cyclic loess for norm). The differences being in > the simplified version the gene IDs show up on the right side but in the > full version, only row number appears. The color for some genes changes > between states as well. > > > > My questions are: > > Why does the bayes model only have one column? > > Because you are computing one contrast (which is all you can do). One > contrast means one test which means one column of results. In other words, > the coefficient you are estimating in the contrasts.fit() step is the > difference in means between the two groups, which is one value per gene. > > > Am I approaching the processing correctly? > > I tried to read through that business below, but got some weird affliction > due to an apparent overdose of underscores and had to stop. > > > Why are there differences between the resulting heatmaps of each version? > > I don't know, as I worried my brain might explode if I read any more. > But you should note that it is fairly unusual to plot coefficients in a > heatmap, as what you are then presenting are log ratios, and most people > can't understand that. > > Instead you might consider getting the list of genes, and then creating a > heatmap using the normalized expression values for each sample instead. You > then would want to scale in some way. We often sweep the mean of the > controls out of the matrix (by row), so the colors can then be interpreted > as differences between the treated samples and controls. > > Best, > > Jim > > > > > > I know that the idea is to keep the amount of posted code to a minimum > but I don't see any other way here. > > > > Here is the simplified example I have made: > > library(limma) > > library(vsn) > > library(gplots) > > > > set.seed(0xabcd) #from vsn manual > > > > #constants for reading agi outs from our imager > > COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) > > {A}") > > ANO__<- c("Position", "Name", "ID", "Description", "GeneName") > > > > #parse data > > targets<- readTargets("test.csv", sep = ",") > > gal<- readGAL("026806_D_20110524.gal") > > layout<- getLayout(gal) > > agi<- read.maimages(targets, columns = COL__, > > annotation = ANO__, > > green.only = TRUE) > > design<- model.matrix(~0+factor(c(1,1,2,2))) > > colnames(design)<- c("s1", "s2") > > > > #process > > bg<- backgroundCorrect(agi, method = "normexp") > > norm<- normalizeBetweenArrays(bg, method = "cyclicloess") > > avg<- avereps(norm, ID=norm$genes$ProbeName) > > > > #fit > > contmat<- makeContrasts(s2-s1, levels = design) > > fit_lm<- lmFit(avg$E, design) > > fit_be<- eBayes(contrasts.fit(fit_lm, contmat)) > > > > #results > > difexp<- topTable(fit_be, coef = 1, adjust = "BH") > > res<- decideTests(fit_be) > > comp<- vennDiagram(res) > > > > #plot > > pdf("example-heatmap.pdf") > > genes<- as.numeric(rownames(topTable(fit_be, n = 100))) > > heatmap(fit_lm$coefficients[genes,], col=redgreen(75), key = TRUE) > > dev.off() > > > > > > The full version: > > library(vsn) > > library(limma) > > library(gplots) > > set.seed(0xabcd) #from the vsn manual > > > > #constants > > A_SUB__<- 1 > > A_NXP__<- 2 > > A_MIN__<- 3 > > A_VSN__<- 1 > > A_QNT__<- 2 > > A_LWS__<- 3 > > M_SUB__<- "subtract" > > M_NXP__<- "normexp" > > M_MIN__<- "minimum" > > M_VSN__<- "vsn" > > M_LWS__<- "cyclicloess" > > M_QNT__<- "quantile" > > COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) > > {A}") > > ANO__<- c("Position", "Name", "ID", "Description", "GeneName") > > N_BGM__<- 3 #number of background correction methods > > N_NRM__<- 3 #number of normalization methods > > A_BGM__<- c(A_SUB__, A_NXP__, A_MIN__) > > A_NRM__<- c(A_VSN__, A_QNT__, A_LWS__) > > > > > > #utility methods > > #applies subtraction background correction with limma > > bg_subtract<- function(x){ > > return(list(norm = x$norm, > > bg = M_SUB__, > > agi = backgroundCorrect(x$agi, method = M_SUB__))) } > > > > #applies normexp backgroud correction with limma > > bg_normexp<- function(x){ > > return(list(norm = x$norm, > > bg = M_NXP__, > > agi = backgroundCorrect(x$agi, method = M_NXP__))) } > > > > #applies minimum background correction with limma > > bg_minimum<- function(x){ > > return(list(norm = x$norm, > > bg = M_MIN__, > > agi = backgroundCorrect(x$agi, method = M_MIN__))) } > > > > #applies VSN normalization with VSN package > > nb_vsn<- function(x){ > > return(list(norm = M_VSN__, > > bg = x$bg, > > agi = normalizeVSN(x$agi))) } > > > > #applies VSN normalization with VSN package > > nb_vsn<- function(x){ > > return(list(norm = M_VSN__, > > bg = x$bg, > > agi = normalizeVSN(x$agi))) } > > > > #applies cyclic lowess normalization with limma > > nb_lowess<- function(x){ > > return(list(norm = M_LWS__, > > bg = x$bg, > > agi = normalizeBetweenArrays(x$agi, method = > > M_LWS__))) } > > > > #applies quantile normalization with limma > > nb_quantile<- function(x){ > > return(list(norm = M_QNT__, > > bg = x$bg, > > agi = normalizeBetweenArrays(x$agi, method = > > M_QNT__))) } > > > > #generates a plot of MA values with limma > > pl_ma<- function(x){ > > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-ma.pdf"), collapse="") > > pdf(name) > > plotMA(x[[1]]$agi$E) > > dev.off() > > } > > > > #generates a plot of mean versus stdev with VS > > pl_meansd<- function(x){ > > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-meansd.pdf"), collapse="") > > pdf(name) > > meanSdPlot(x[[1]]$agi$E) > > dev.off() > > } > > > > #generate volcanoplot of log-fold versus log-odds > > pl_volcano<- function(x){ > > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-volcano.pdf"), collapse="") > > pdf(name) > > volcanoplot(x[[1]]$bayes) > > dev.off() > > } > > > > #generate plot of expression data over time > > pl_lines<- function(x){ > > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-lines.pdf"), collapse="") > > pdf(name) > > plotlines(x[[1]]$agi$E) > > dev.off() > > } > > > > #generate heatmat of final results > > pl_heatmap<- function(x, ntop){ > > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-heatmap.pdf"), collapse="") > > pdf(name) > > genes<- as.numeric(rownames(topTable(x[[1]]$bayes, n = ntop))) > > heatmap(x[[1]]$lm$coefficients[genes,], > > col = redgreen(75), > > key = TRUE) > > dev.off() > > } > > > > #averages replicates of probes > > m_repavg<- function(x){ > > return(list(agi = x[[1]]$agi, > > avg = avereps(x[[1]]$agi, > ID=x[[1]]$agi$genes$ProbeName), > > bg = x[[1]]$bg, > > norm = x[[1]]$norm)) > > } > > > > #generate contrasts > > m_fit<- function(x, design){ > > contmat<- makeContrasts(s2-s1, > > levels = design) > > fit1<- lmFit(x[[1]]$agi$E, design) > > fit2<- eBayes(contrasts.fit(fit1, contmat)) > > diff<- topTable(fit2, coef = 1, adjust = "BH") > > res<- decideTests(fit2) > > comp<- vennDiagram(res) > > return(list(agi = x[[1]]$agi, > > bg = x[[1]]$bg, > > norm = x[[1]]$norm, > > ave = x[[1]]$ave, > > cont = contmat, > > lm = fit1, > > bayes = fit2, > > diff = diff, > > res = res, > > comp = comp)) > > } > > > > ###################################################################### > > ######### # Function to parse array data into limma objects # Args: > > # tfile: target file (see limma documentation) > > # tsep: limma defaults TSV, have to set manually for CSV and so on > > # galfile: GAL file for printer/layout > > # Returns data object with targets, genes and printer layout > > ###################################################################### > > ######### > > parse<- function(tfile, tsep, galfile){ > > targets<- readTargets(tfile, sep=tsep) > > gal<- readGAL(galfile) > > layout<- getLayout(gal) > > agi<- read.maimages(targets, columns = COL__, > > annotation = ANO__, > > green.only = TRUE) > > bg<- "none" > > norm<- "none" > > desg<- model.matrix(~0+factor(c(1,1,2,2))) > > colnames(desg)<- c("s1","s2") > > d<- list(agi = agi, > > gal = gal, > > layout = layout, > > targets = targets, > > bg = bg, > > desg = desg, > > norm = norm) > > > > processed<- matrix(NA, nrow = N_BGM__, ncol = N_NRM__) > > processed<- apply(processed, 1:2, function(x){return(d)}) > > return(processed) > > } > > > > #parse command args > > args<- commandArgs(trailingOnly = TRUE) > > > > #load files into matrix > > a<- parse(args[1], args[2], args[3]) > > > > #keepy a copy of a raw dataset > > raw<- a[1,1] > > print(raw[[1]]$desg) > > > > #perform backgroung corrections > > a[,A_SUB__]<- lapply(a[,A_SUB__], bg_subtract) > > a[,A_NXP__]<- lapply(a[,A_NXP__], bg_normexp) > > a[,A_MIN__]<- lapply(a[,A_MIN__], bg_minimum) > > > > #perform normalization > > a[A_VSN__,]<- lapply(a[A_VSN__,], nb_vsn) > > a[A_LWS__,]<- lapply(a[A_LWS__,], nb_lowess) > > a[A_QNT__,]<- lapply(a[A_QNT__,], nb_quantile) > > > > #fit model and differentiate expression > > a<- apply(a, 1:2, m_repavg) > > a<- apply(a, 1:2, m_fit, design = raw[[1]]$desg) > > > > #generate plots > > apply(a, 1:2, pl_ma) > > apply(a, 1:2, pl_meansd) > > apply(a, 1:2, pl_volcano) > > apply(a, 1:2, pl_lines) > > apply(a, 1:2, pl_heatmap, args[4]) > > > > > > [[alternative HTML version deleted]] > > > > _______________________________________________ > > Bioconductor mailing list > > Bioconductor@r-project.org > > https://stat.ethz.ch/mailman/listinfo/bioconductor > > Search the archives: > > http://news.gmane.org/gmane.science.biology.informatics.conductor > > -- > James W. MacDonald, M.S. > Biostatistician > University of Washington > Environmental and Occupational Health Sciences > 4225 Roosevelt Way NE, # 100 > Seattle WA 98105-6099 > > _______________________________________________ > Bioconductor mailing list > Bioconductor@r-project.org > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: > http://news.gmane.org/gmane.science.biology.informatics.conductor > [[alternative HTML version deleted]]
ADD REPLY
0
Entering edit mode
Hello Davis, I posted a simple example, which after fixing a problem in the longer example no produces the same results. I only posted the longer one because I wasn’t able to reproduce the results of the longer version initially. I’ve been sort of banging my head on this for a while so I missed the difference between “agi” and “avg”. I’m thankful anyone bothered to look after posting the code, so thanks. If I sounded offended by the underscores, I was only admitting that I know they’re terrible. c: Thanks for clarifying the subtraction of the mean aspect. That has been one aspect that has been troubling when it comes to learning this analysis, there seem to be plenty of ways to do things but, without experience, no way to tell what is better. I will give the mean subtraction and the z-score scaling a try. Am I reading too much into the usefulness of these heatmaps? In most papers I’ve seen they seem to be there because the look nice or the attached clustering might be of some use. Thanks again! From: Davis, Sean (NCI) On Behalf Of Davis, Sean (NIH/NCI) [E] Sent: Thursday, March 28, 2013 10:34 AM To: Cornish, Joseph (NIH/NIAID) [F] Cc: James W. MacDonald; bioconductor@r-project.org Subject: Re: [BioC] Heatmaps for Agilent Single Color 4x44k using limma and vsn. On Thu, Mar 28, 2013 at 10:22 AM, Cornish, Joseph (NIH/NIAID) [F] <joseph.cornish@nih.gov<mailto:joseph.cornish@nih.gov>> wrote: Hello James, Are you referring to function names or constants? The constants are an old habit I picked up long ago in my CS classes, there are a ton but it makes them stand out for me in vim more readily. I've come to avoid camel case since switching to linux, it makes searching for things a nightmare, it can also help if I group functions with a prefix (eg plot functions start with "pl_"). If you want I can find+replace all of the constants with their values... Hi, Joe. I think I could help to translate part of what James said. If you could boil your example down to a few lines of reproducible code that show the problem, that would be simpler to interpret and easier to comment on directly. Sometimes in the process of producing such a simplified example, I have found my own misunderstandings. It also sometimes helps to get someone local to give a second pair of eyes. The differences came from using the unaveraged replicates in the larger script versus the average in the example. I'm not sure what you mean by "mean of the controls", would this simply be filtering out the control spots? No. I think he means for you to subtract the mean of the control samples (or one group of samples) for each gene from all the normalized expression values. I'll generate the heatmaps using the normalized expression values. I suppose in that case it would be worthwhile to filter ahead of time by p-value and fold change (the coef in topTable). Yes. Sean _________thanks_____________ _______Joe______ -----Original Message----- From: James W. MacDonald [mailto:jmacdon@uw.edu<mailto:jmacdon@uw.edu>] Sent: Thursday, March 28, 2013 10:08 AM To: Cornish, Joseph (NIH/NIAID) [F] Cc: bioconductor@r-project.org<mailto:bioconductor@r-project.org> Subject: Re: [BioC] Heatmaps for Agilent Single Color 4x44k using limma and vsn. Hi Joseph, On 3/28/2013 9:27 AM, Cornish, Joseph (NIH/NIAID) [F] wrote: > I am attempting to write a script that will allow me to do bulk benchmarking of various methods for normalization and background correction. I am concerned that I am not processing the data correctly and this may be leading to the abnormal heatmaps that I am seeing. For a test case, I am using an experiment where I have two different conditions with two different states with two replicates per state. > > The heatmaps concern me because of the uniformity between them, the values are all completely green or completely red, there are differences between genes within a state according to the heatmap. > > In the examples I have seen people have suggested that I use the coefficients from the eBayes fitting of the data, however in my example the eBayes LMarray object does not have two columns in the results, only on (not sure if this is a sign of errors on my part). I have instead used the lmFit coefficients since there are column for each matrix. > > To further confuse someone who is new to R, the simplified example and full code generate different heatmaps for the same data and same methods (normexp for background, cyclic loess for norm). The differences being in the simplified version the gene IDs show up on the right side but in the full version, only row number appears. The color for some genes changes between states as well. > > My questions are: > Why does the bayes model only have one column? Because you are computing one contrast (which is all you can do). One contrast means one test which means one column of results. In other words, the coefficient you are estimating in the contrasts.fit() step is the difference in means between the two groups, which is one value per gene. > Am I approaching the processing correctly? I tried to read through that business below, but got some weird affliction due to an apparent overdose of underscores and had to stop. > Why are there differences between the resulting heatmaps of each version? I don't know, as I worried my brain might explode if I read any more. But you should note that it is fairly unusual to plot coefficients in a heatmap, as what you are then presenting are log ratios, and most people can't understand that. Instead you might consider getting the list of genes, and then creating a heatmap using the normalized expression values for each sample instead. You then would want to scale in some way. We often sweep the mean of the controls out of the matrix (by row), so the colors can then be interpreted as differences between the treated samples and controls. Best, Jim > > I know that the idea is to keep the amount of posted code to a minimum but I don't see any other way here. > > Here is the simplified example I have made: > library(limma) > library(vsn) > library(gplots) > > set.seed(0xabcd) #from vsn manual > > #constants for reading agi outs from our imager > COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) > {A}") > ANO__<- c("Position", "Name", "ID", "Description", "GeneName") > > #parse data > targets<- readTargets("test.csv", sep = ",") > gal<- readGAL("026806_D_20110524.gal") > layout<- getLayout(gal) > agi<- read.maimages(targets, columns = COL__, > annotation = ANO__, > green.only = TRUE) > design<- model.matrix(~0+factor(c(1,1,2,2))) > colnames(design)<- c("s1", "s2") > > #process > bg<- backgroundCorrect(agi, method = "normexp") > norm<- normalizeBetweenArrays(bg, method = "cyclicloess") > avg<- avereps(norm, ID=norm$genes$ProbeName) > > #fit > contmat<- makeContrasts(s2-s1, levels = design) > fit_lm<- lmFit(avg$E, design) > fit_be<- eBayes(contrasts.fit(fit_lm, contmat)) > > #results > difexp<- topTable(fit_be, coef = 1, adjust = "BH") > res<- decideTests(fit_be) > comp<- vennDiagram(res) > > #plot > pdf("example-heatmap.pdf") > genes<- as.numeric(rownames(topTable(fit_be, n = 100))) > heatmap(fit_lm$coefficients[genes,], col=redgreen(75), key = TRUE) > dev.off() > > > The full version: > library(vsn) > library(limma) > library(gplots) > set.seed(0xabcd) #from the vsn manual > > #constants > A_SUB__<- 1 > A_NXP__<- 2 > A_MIN__<- 3 > A_VSN__<- 1 > A_QNT__<- 2 > A_LWS__<- 3 > M_SUB__<- "subtract" > M_NXP__<- "normexp" > M_MIN__<- "minimum" > M_VSN__<- "vsn" > M_LWS__<- "cyclicloess" > M_QNT__<- "quantile" > COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) > {A}") > ANO__<- c("Position", "Name", "ID", "Description", "GeneName") > N_BGM__<- 3 #number of background correction methods > N_NRM__<- 3 #number of normalization methods > A_BGM__<- c(A_SUB__, A_NXP__, A_MIN__) > A_NRM__<- c(A_VSN__, A_QNT__, A_LWS__) > > > #utility methods > #applies subtraction background correction with limma > bg_subtract<- function(x){ > return(list(norm = x$norm, > bg = M_SUB__, > agi = backgroundCorrect(x$agi, method = M_SUB__))) } > > #applies normexp backgroud correction with limma > bg_normexp<- function(x){ > return(list(norm = x$norm, > bg = M_NXP__, > agi = backgroundCorrect(x$agi, method = M_NXP__))) } > > #applies minimum background correction with limma > bg_minimum<- function(x){ > return(list(norm = x$norm, > bg = M_MIN__, > agi = backgroundCorrect(x$agi, method = M_MIN__))) } > > #applies VSN normalization with VSN package > nb_vsn<- function(x){ > return(list(norm = M_VSN__, > bg = x$bg, > agi = normalizeVSN(x$agi))) } > > #applies VSN normalization with VSN package > nb_vsn<- function(x){ > return(list(norm = M_VSN__, > bg = x$bg, > agi = normalizeVSN(x$agi))) } > > #applies cyclic lowess normalization with limma > nb_lowess<- function(x){ > return(list(norm = M_LWS__, > bg = x$bg, > agi = normalizeBetweenArrays(x$agi, method = > M_LWS__))) } > > #applies quantile normalization with limma > nb_quantile<- function(x){ > return(list(norm = M_QNT__, > bg = x$bg, > agi = normalizeBetweenArrays(x$agi, method = > M_QNT__))) } > > #generates a plot of MA values with limma > pl_ma<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-ma.pdf"), collapse="") > pdf(name) > plotMA(x[[1]]$agi$E) > dev.off() > } > > #generates a plot of mean versus stdev with VS > pl_meansd<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-meansd.pdf"), collapse="") > pdf(name) > meanSdPlot(x[[1]]$agi$E) > dev.off() > } > > #generate volcanoplot of log-fold versus log-odds > pl_volcano<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-volcano.pdf"), collapse="") > pdf(name) > volcanoplot(x[[1]]$bayes) > dev.off() > } > > #generate plot of expression data over time > pl_lines<- function(x){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-lines.pdf"), collapse="") > pdf(name) > plotlines(x[[1]]$agi$E) > dev.off() > } > > #generate heatmat of final results > pl_heatmap<- function(x, ntop){ > name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-heatmap.pdf"), collapse="") > pdf(name) > genes<- as.numeric(rownames(topTable(x[[1]]$bayes, n = ntop))) > heatmap(x[[1]]$lm$coefficients[genes,], > col = redgreen(75), > key = TRUE) > dev.off() > } > > #averages replicates of probes > m_repavg<- function(x){ > return(list(agi = x[[1]]$agi, > avg = avereps(x[[1]]$agi, ID=x[[1]]$agi$genes$ProbeName), > bg = x[[1]]$bg, > norm = x[[1]]$norm)) > } > > #generate contrasts > m_fit<- function(x, design){ > contmat<- makeContrasts(s2-s1, > levels = design) > fit1<- lmFit(x[[1]]$agi$E, design) > fit2<- eBayes(contrasts.fit(fit1, contmat)) > diff<- topTable(fit2, coef = 1, adjust = "BH") > res<- decideTests(fit2) > comp<- vennDiagram(res) > return(list(agi = x[[1]]$agi, > bg = x[[1]]$bg, > norm = x[[1]]$norm, > ave = x[[1]]$ave, > cont = contmat, > lm = fit1, > bayes = fit2, > diff = diff, > res = res, > comp = comp)) > } > > ###################################################################### > ######### # Function to parse array data into limma objects # Args: > # tfile: target file (see limma documentation) > # tsep: limma defaults TSV, have to set manually for CSV and so on > # galfile: GAL file for printer/layout > # Returns data object with targets, genes and printer layout > ###################################################################### > ######### > parse<- function(tfile, tsep, galfile){ > targets<- readTargets(tfile, sep=tsep) > gal<- readGAL(galfile) > layout<- getLayout(gal) > agi<- read.maimages(targets, columns = COL__, > annotation = ANO__, > green.only = TRUE) > bg<- "none" > norm<- "none" > desg<- model.matrix(~0+factor(c(1,1,2,2))) > colnames(desg)<- c("s1","s2") > d<- list(agi = agi, > gal = gal, > layout = layout, > targets = targets, > bg = bg, > desg = desg, > norm = norm) > > processed<- matrix(NA, nrow = N_BGM__, ncol = N_NRM__) > processed<- apply(processed, 1:2, function(x){return(d)}) > return(processed) > } > > #parse command args > args<- commandArgs(trailingOnly = TRUE) > > #load files into matrix > a<- parse(args[1], args[2], args[3]) > > #keepy a copy of a raw dataset > raw<- a[1,1] > print(raw[[1]]$desg) > > #perform backgroung corrections > a[,A_SUB__]<- lapply(a[,A_SUB__], bg_subtract) > a[,A_NXP__]<- lapply(a[,A_NXP__], bg_normexp) > a[,A_MIN__]<- lapply(a[,A_MIN__], bg_minimum) > > #perform normalization > a[A_VSN__,]<- lapply(a[A_VSN__,], nb_vsn) > a[A_LWS__,]<- lapply(a[A_LWS__,], nb_lowess) > a[A_QNT__,]<- lapply(a[A_QNT__,], nb_quantile) > > #fit model and differentiate expression > a<- apply(a, 1:2, m_repavg) > a<- apply(a, 1:2, m_fit, design = raw[[1]]$desg) > > #generate plots > apply(a, 1:2, pl_ma) > apply(a, 1:2, pl_meansd) > apply(a, 1:2, pl_volcano) > apply(a, 1:2, pl_lines) > apply(a, 1:2, pl_heatmap, args[4]) > > > [[alternative HTML version deleted]] > > _______________________________________________ > Bioconductor mailing list > Bioconductor@r-project.org<mailto:bioconductor@r-project.org> > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: > http://news.gmane.org/gmane.science.biology.informatics.conductor -- James W. MacDonald, M.S. Biostatistician University of Washington Environmental and Occupational Health Sciences 4225 Roosevelt Way NE, # 100 Seattle WA 98105-6099 _______________________________________________ Bioconductor mailing list Bioconductor@r-project.org<mailto:bioconductor@r-project.org> https://stat.ethz.ch/mailman/listinfo/bioconductor Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor [[alternative HTML version deleted]]
0
Entering edit mode
On Thu, Mar 28, 2013 at 11:00 AM, Cornish, Joseph (NIH/NIAID) [F] <joseph.cornish at="" nih.gov=""> wrote: > Hello Davis, > > I posted a simple example, which after fixing a problem in the longer example no produces the same results. I only posted the longer one because I wasn?t able to reproduce the results of the longer version initially. I?ve been sort of banging my head on this for a while so I missed the difference between ?agi? and ?avg?. I?m thankful anyone bothered to look after posting the code, so thanks. If I sounded offended by the underscores, I was only admitting that I know they?re terrible. c: > > Thanks for clarifying the subtraction of the mean aspect. That has been one aspect that has been troubling when it comes to learning this analysis, there seem to be plenty of ways to do things but, without experience, no way to tell what is better. > There is not a "right" way to do this since the only metric I have seen for the evaluation of the color scale in a heatmap is subjective and qualitative. > I will give the mean subtraction and the z-score scaling a try. Am I reading too much into the usefulness of these heatmaps? In most papers I?ve seen they seem to be there because the look nice or the attached clustering might be of some use. > Heatmaps are very good visualization tools. I would not try to attach more utility to them than that. Sean > Thanks again! > > From: Davis, Sean (NCI) On Behalf Of Davis, Sean (NIH/NCI) [E] > Sent: Thursday, March 28, 2013 10:34 AM > To: Cornish, Joseph (NIH/NIAID) [F] > Cc: James W. MacDonald; bioconductor at r-project.org > Subject: Re: [BioC] Heatmaps for Agilent Single Color 4x44k using limma and vsn. > > > > On Thu, Mar 28, 2013 at 10:22 AM, Cornish, Joseph (NIH/NIAID) [F] <joseph.cornish at="" nih.gov<mailto:joseph.cornish="" at="" nih.gov="">> wrote: > Hello James, > > Are you referring to function names or constants? The constants are an old habit I picked up long ago in my CS classes, there are a ton but it makes them stand out for me in vim more readily. I've come to avoid camel case since switching to linux, it makes searching for things a nightmare, it can also help if I group functions with a prefix (eg plot functions start with "pl_"). If you want I can find+replace all of the constants with their values... > > Hi, Joe. > > I think I could help to translate part of what James said. If you could boil your example down to a few lines of reproducible code that show the problem, that would be simpler to interpret and easier to comment on directly. Sometimes in the process of producing such a simplified example, I have found my own misunderstandings. It also sometimes helps to get someone local to give a second pair of eyes. > > The differences came from using the unaveraged replicates in the larger script versus the average in the example. > > I'm not sure what you mean by "mean of the controls", would this simply be filtering out the control spots? > > No. I think he means for you to subtract the mean of the control samples (or one group of samples) for each gene from all the normalized expression values. > > I'll generate the heatmaps using the normalized expression values. I suppose in that case it would be worthwhile to filter ahead of time by p-value and fold change (the coef in topTable). > > Yes. > > Sean > > > _________thanks_____________ > _______Joe______ > > -----Original Message----- > From: James W. MacDonald [mailto:jmacdon at uw.edu<mailto:jmacdon at="" uw.edu="">] > Sent: Thursday, March 28, 2013 10:08 AM > To: Cornish, Joseph (NIH/NIAID) [F] > Cc: bioconductor at r-project.org<mailto:bioconductor at="" r-project.org=""> > Subject: Re: [BioC] Heatmaps for Agilent Single Color 4x44k using limma and vsn. > > Hi Joseph, > > > On 3/28/2013 9:27 AM, Cornish, Joseph (NIH/NIAID) [F] wrote: >> I am attempting to write a script that will allow me to do bulk benchmarking of various methods for normalization and background correction. I am concerned that I am not processing the data correctly and this may be leading to the abnormal heatmaps that I am seeing. For a test case, I am using an experiment where I have two different conditions with two different states with two replicates per state. >> >> The heatmaps concern me because of the uniformity between them, the values are all completely green or completely red, there are differences between genes within a state according to the heatmap. >> >> In the examples I have seen people have suggested that I use the coefficients from the eBayes fitting of the data, however in my example the eBayes LMarray object does not have two columns in the results, only on (not sure if this is a sign of errors on my part). I have instead used the lmFit coefficients since there are column for each matrix. >> >> To further confuse someone who is new to R, the simplified example and full code generate different heatmaps for the same data and same methods (normexp for background, cyclic loess for norm). The differences being in the simplified version the gene IDs show up on the right side but in the full version, only row number appears. The color for some genes changes between states as well. >> >> My questions are: >> Why does the bayes model only have one column? > > Because you are computing one contrast (which is all you can do). One contrast means one test which means one column of results. In other words, the coefficient you are estimating in the contrasts.fit() step is the difference in means between the two groups, which is one value per gene. > >> Am I approaching the processing correctly? > > I tried to read through that business below, but got some weird affliction due to an apparent overdose of underscores and had to stop. > >> Why are there differences between the resulting heatmaps of each version? > > I don't know, as I worried my brain might explode if I read any more. > But you should note that it is fairly unusual to plot coefficients in a heatmap, as what you are then presenting are log ratios, and most people can't understand that. > > Instead you might consider getting the list of genes, and then creating a heatmap using the normalized expression values for each sample instead. You then would want to scale in some way. We often sweep the mean of the controls out of the matrix (by row), so the colors can then be interpreted as differences between the treated samples and controls. > > Best, > > Jim > > >> >> I know that the idea is to keep the amount of posted code to a minimum but I don't see any other way here. >> >> Here is the simplified example I have made: >> library(limma) >> library(vsn) >> library(gplots) >> >> set.seed(0xabcd) #from vsn manual >> >> #constants for reading agi outs from our imager >> COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) >> {A}") >> ANO__<- c("Position", "Name", "ID", "Description", "GeneName") >> >> #parse data >> targets<- readTargets("test.csv", sep = ",") >> gal<- readGAL("026806_D_20110524.gal") >> layout<- getLayout(gal) >> agi<- read.maimages(targets, columns = COL__, >> annotation = ANO__, >> green.only = TRUE) >> design<- model.matrix(~0+factor(c(1,1,2,2))) >> colnames(design)<- c("s1", "s2") >> >> #process >> bg<- backgroundCorrect(agi, method = "normexp") >> norm<- normalizeBetweenArrays(bg, method = "cyclicloess") >> avg<- avereps(norm, ID=norm$genes$ProbeName) >> >> #fit >> contmat<- makeContrasts(s2-s1, levels = design) >> fit_lm<- lmFit(avg$E, design) >> fit_be<- eBayes(contrasts.fit(fit_lm, contmat)) >> >> #results >> difexp<- topTable(fit_be, coef = 1, adjust = "BH") >> res<- decideTests(fit_be) >> comp<- vennDiagram(res) >> >> #plot >> pdf("example-heatmap.pdf") >> genes<- as.numeric(rownames(topTable(fit_be, n = 100))) >> heatmap(fit_lm$coefficients[genes,], col=redgreen(75), key = TRUE) >> dev.off() >> >> >> The full version: >> library(vsn) >> library(limma) >> library(gplots) >> set.seed(0xabcd) #from the vsn manual >> >> #constants >> A_SUB__<- 1 >> A_NXP__<- 2 >> A_MIN__<- 3 >> A_VSN__<- 1 >> A_QNT__<- 2 >> A_LWS__<- 3 >> M_SUB__<- "subtract" >> M_NXP__<- "normexp" >> M_MIN__<- "minimum" >> M_VSN__<- "vsn" >> M_LWS__<- "cyclicloess" >> M_QNT__<- "quantile" >> COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) >> {A}") >> ANO__<- c("Position", "Name", "ID", "Description", "GeneName") >> N_BGM__<- 3 #number of background correction methods >> N_NRM__<- 3 #number of normalization methods >> A_BGM__<- c(A_SUB__, A_NXP__, A_MIN__) >> A_NRM__<- c(A_VSN__, A_QNT__, A_LWS__) >> >> >> #utility methods >> #applies subtraction background correction with limma >> bg_subtract<- function(x){ >> return(list(norm = x$norm, >> bg = M_SUB__, >> agi = backgroundCorrect(x$agi, method = M_SUB__))) } >> >> #applies normexp backgroud correction with limma >> bg_normexp<- function(x){ >> return(list(norm = x$norm, >> bg = M_NXP__, >> agi = backgroundCorrect(x$agi, method = M_NXP__))) } >> >> #applies minimum background correction with limma >> bg_minimum<- function(x){ >> return(list(norm = x$norm, >> bg = M_MIN__, >> agi = backgroundCorrect(x$agi, method = M_MIN__))) } >> >> #applies VSN normalization with VSN package >> nb_vsn<- function(x){ >> return(list(norm = M_VSN__, >> bg = x$bg, >> agi = normalizeVSN(x$agi))) } >> >> #applies VSN normalization with VSN package >> nb_vsn<- function(x){ >> return(list(norm = M_VSN__, >> bg = x$bg, >> agi = normalizeVSN(x$agi))) } >> >> #applies cyclic lowess normalization with limma >> nb_lowess<- function(x){ >> return(list(norm = M_LWS__, >> bg = x$bg, >> agi = normalizeBetweenArrays(x$agi, method = >> M_LWS__))) } >> >> #applies quantile normalization with limma >> nb_quantile<- function(x){ >> return(list(norm = M_QNT__, >> bg = x$bg, >> agi = normalizeBetweenArrays(x$agi, method = >> M_QNT__))) } >> >> #generates a plot of MA values with limma >> pl_ma<- function(x){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-ma.pdf"), collapse="") >> pdf(name) >> plotMA(x[[1]]$agi$E) >> dev.off() >> } >> >> #generates a plot of mean versus stdev with VS >> pl_meansd<- function(x){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-meansd.pdf"), collapse="") >> pdf(name) >> meanSdPlot(x[[1]]$agi$E) >> dev.off() >> } >> >> #generate volcanoplot of log-fold versus log-odds >> pl_volcano<- function(x){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-volcano.pdf"), collapse="") >> pdf(name) >> volcanoplot(x[[1]]$bayes) >> dev.off() >> } >> >> #generate plot of expression data over time >> pl_lines<- function(x){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-lines.pdf"), collapse="") >> pdf(name) >> plotlines(x[[1]]$agi$E) >> dev.off() >> } >> >> #generate heatmat of final results >> pl_heatmap<- function(x, ntop){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-heatmap.pdf"), collapse="") >> pdf(name) >> genes<- as.numeric(rownames(topTable(x[[1]]$bayes, n = ntop))) >> heatmap(x[[1]]$lm$coefficients[genes,], >> col = redgreen(75), >> key = TRUE) >> dev.off() >> } >> >> #averages replicates of probes >> m_repavg<- function(x){ >> return(list(agi = x[[1]]$agi, >> avg = avereps(x[[1]]$agi, ID=x[[1]]$agi$genes$ProbeName), >> bg = x[[1]]$bg, >> norm = x[[1]]$norm)) >> } >> >> #generate contrasts >> m_fit<- function(x, design){ >> contmat<- makeContrasts(s2-s1, >> levels = design) >> fit1<- lmFit(x[[1]]$agi$E, design) >> fit2<- eBayes(contrasts.fit(fit1, contmat)) >> diff<- topTable(fit2, coef = 1, adjust = "BH") >> res<- decideTests(fit2) >> comp<- vennDiagram(res) >> return(list(agi = x[[1]]$agi, >> bg = x[[1]]$bg, >> norm = x[[1]]$norm, >> ave = x[[1]]$ave, >> cont = contmat, >> lm = fit1, >> bayes = fit2, >> diff = diff, >> res = res, >> comp = comp)) >> } >> >> ###################################################################### >> ######### # Function to parse array data into limma objects # Args: >> # tfile: target file (see limma documentation) >> # tsep: limma defaults TSV, have to set manually for CSV and so on >> # galfile: GAL file for printer/layout >> # Returns data object with targets, genes and printer layout >> ###################################################################### >> ######### >> parse<- function(tfile, tsep, galfile){ >> targets<- readTargets(tfile, sep=tsep) >> gal<- readGAL(galfile) >> layout<- getLayout(gal) >> agi<- read.maimages(targets, columns = COL__, >> annotation = ANO__, >> green.only = TRUE) >> bg<- "none" >> norm<- "none" >> desg<- model.matrix(~0+factor(c(1,1,2,2))) >> colnames(desg)<- c("s1","s2") >> d<- list(agi = agi, >> gal = gal, >> layout = layout, >> targets = targets, >> bg = bg, >> desg = desg, >> norm = norm) >> >> processed<- matrix(NA, nrow = N_BGM__, ncol = N_NRM__) >> processed<- apply(processed, 1:2, function(x){return(d)}) >> return(processed) >> } >> >> #parse command args >> args<- commandArgs(trailingOnly = TRUE) >> >> #load files into matrix >> a<- parse(args[1], args[2], args[3]) >> >> #keepy a copy of a raw dataset >> raw<- a[1,1] >> print(raw[[1]]$desg) >> >> #perform backgroung corrections >> a[,A_SUB__]<- lapply(a[,A_SUB__], bg_subtract) >> a[,A_NXP__]<- lapply(a[,A_NXP__], bg_normexp) >> a[,A_MIN__]<- lapply(a[,A_MIN__], bg_minimum) >> >> #perform normalization >> a[A_VSN__,]<- lapply(a[A_VSN__,], nb_vsn) >> a[A_LWS__,]<- lapply(a[A_LWS__,], nb_lowess) >> a[A_QNT__,]<- lapply(a[A_QNT__,], nb_quantile) >> >> #fit model and differentiate expression >> a<- apply(a, 1:2, m_repavg) >> a<- apply(a, 1:2, m_fit, design = raw[[1]]$desg) >> >> #generate plots >> apply(a, 1:2, pl_ma) >> apply(a, 1:2, pl_meansd) >> apply(a, 1:2, pl_volcano) >> apply(a, 1:2, pl_lines) >> apply(a, 1:2, pl_heatmap, args[4]) >> >> >> [[alternative HTML version deleted]] >> >> _______________________________________________ >> Bioconductor mailing list >> Bioconductor at r-project.org<mailto:bioconductor at="" r-project.org=""> >> https://stat.ethz.ch/mailman/listinfo/bioconductor >> Search the archives: >> http://news.gmane.org/gmane.science.biology.informatics.conductor > > -- > James W. MacDonald, M.S. > Biostatistician > University of Washington > Environmental and Occupational Health Sciences > 4225 Roosevelt Way NE, # 100 > Seattle WA 98105-6099 > > _______________________________________________ > Bioconductor mailing list > Bioconductor at r-project.org<mailto:bioconductor at="" r-project.org=""> > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor > > > [[alternative HTML version deleted]] > > > _______________________________________________ > Bioconductor mailing list > Bioconductor at r-project.org > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor
ADD REPLY
0
Entering edit mode
Hi Joe, I don't mean to be critical. If you like to use gobtons of underscores, then rock on. I would just find it very distracting. Anyway, by 'mean of the controls', I mean the average value of the control samples. But this could be generalized in any way. For example, your contrast was s2 - s1, so you could do mat <- avg$E[<index of="" interesting="" genes="">,] mn <- fit_lm[<index of="" interesting="" genes="">,"s1"] swept <- sweep(mat, 1, mn, "-") and then do the heatmap. In this case the colors would be interpreted as the log fold difference between any sample and the mean of the s1 samples. But the general take home message here is that you can't just use the raw expression values, as you will then have a solid color to your heatmap. Other possibilities are to use the scale argument to heatmap.2() to scale by row (convert to z-scores), in which case the interpretation of the colors is less obvious. Best, Jim On 3/28/2013 10:22 AM, Cornish, Joseph (NIH/NIAID) [F] wrote: > Hello James, > > Are you referring to function names or constants? The constants are an old habit I picked up long ago in my CS classes, there are a ton but it makes them stand out for me in vim more readily. I've come to avoid camel case since switching to linux, it makes searching for things a nightmare, it can also help if I group functions with a prefix (eg plot functions start with "pl_"). If you want I can find+replace all of the constants with their values... > > The differences came from using the unaveraged replicates in the larger script versus the average in the example. > > I'm not sure what you mean by "mean of the controls", would this simply be filtering out the control spots? > > I'll generate the heatmaps using the normalized expression values. I suppose in that case it would be worthwhile to filter ahead of time by p-value and fold change (the coef in topTable). > > > _________thanks_____________ > _______Joe______ > > -----Original Message----- > From: James W. MacDonald [mailto:jmacdon at uw.edu] > Sent: Thursday, March 28, 2013 10:08 AM > To: Cornish, Joseph (NIH/NIAID) [F] > Cc: bioconductor at r-project.org > Subject: Re: [BioC] Heatmaps for Agilent Single Color 4x44k using limma and vsn. > > Hi Joseph, > > > On 3/28/2013 9:27 AM, Cornish, Joseph (NIH/NIAID) [F] wrote: >> I am attempting to write a script that will allow me to do bulk benchmarking of various methods for normalization and background correction. I am concerned that I am not processing the data correctly and this may be leading to the abnormal heatmaps that I am seeing. For a test case, I am using an experiment where I have two different conditions with two different states with two replicates per state. >> >> The heatmaps concern me because of the uniformity between them, the values are all completely green or completely red, there are differences between genes within a state according to the heatmap. >> >> In the examples I have seen people have suggested that I use the coefficients from the eBayes fitting of the data, however in my example the eBayes LMarray object does not have two columns in the results, only on (not sure if this is a sign of errors on my part). I have instead used the lmFit coefficients since there are column for each matrix. >> >> To further confuse someone who is new to R, the simplified example and full code generate different heatmaps for the same data and same methods (normexp for background, cyclic loess for norm). The differences being in the simplified version the gene IDs show up on the right side but in the full version, only row number appears. The color for some genes changes between states as well. >> >> My questions are: >> Why does the bayes model only have one column? > Because you are computing one contrast (which is all you can do). One contrast means one test which means one column of results. In other words, the coefficient you are estimating in the contrasts.fit() step is the difference in means between the two groups, which is one value per gene. > >> Am I approaching the processing correctly? > I tried to read through that business below, but got some weird affliction due to an apparent overdose of underscores and had to stop. > >> Why are there differences between the resulting heatmaps of each version? > I don't know, as I worried my brain might explode if I read any more. > But you should note that it is fairly unusual to plot coefficients in a heatmap, as what you are then presenting are log ratios, and most people can't understand that. > > Instead you might consider getting the list of genes, and then creating a heatmap using the normalized expression values for each sample instead. You then would want to scale in some way. We often sweep the mean of the controls out of the matrix (by row), so the colors can then be interpreted as differences between the treated samples and controls. > > Best, > > Jim > > >> I know that the idea is to keep the amount of posted code to a minimum but I don't see any other way here. >> >> Here is the simplified example I have made: >> library(limma) >> library(vsn) >> library(gplots) >> >> set.seed(0xabcd) #from vsn manual >> >> #constants for reading agi outs from our imager >> COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) >> {A}") >> ANO__<- c("Position", "Name", "ID", "Description", "GeneName") >> >> #parse data >> targets<- readTargets("test.csv", sep = ",") >> gal<- readGAL("026806_D_20110524.gal") >> layout<- getLayout(gal) >> agi<- read.maimages(targets, columns = COL__, >> annotation = ANO__, >> green.only = TRUE) >> design<- model.matrix(~0+factor(c(1,1,2,2))) >> colnames(design)<- c("s1", "s2") >> >> #process >> bg<- backgroundCorrect(agi, method = "normexp") >> norm<- normalizeBetweenArrays(bg, method = "cyclicloess") >> avg<- avereps(norm, ID=norm$genes$ProbeName) >> >> #fit >> contmat<- makeContrasts(s2-s1, levels = design) >> fit_lm<- lmFit(avg$E, design) >> fit_be<- eBayes(contrasts.fit(fit_lm, contmat)) >> >> #results >> difexp<- topTable(fit_be, coef = 1, adjust = "BH") >> res<- decideTests(fit_be) >> comp<- vennDiagram(res) >> >> #plot >> pdf("example-heatmap.pdf") >> genes<- as.numeric(rownames(topTable(fit_be, n = 100))) >> heatmap(fit_lm$coefficients[genes,], col=redgreen(75), key = TRUE) >> dev.off() >> >> >> The full version: >> library(vsn) >> library(limma) >> library(gplots) >> set.seed(0xabcd) #from the vsn manual >> >> #constants >> A_SUB__<- 1 >> A_NXP__<- 2 >> A_MIN__<- 3 >> A_VSN__<- 1 >> A_QNT__<- 2 >> A_LWS__<- 3 >> M_SUB__<- "subtract" >> M_NXP__<- "normexp" >> M_MIN__<- "minimum" >> M_VSN__<- "vsn" >> M_LWS__<- "cyclicloess" >> M_QNT__<- "quantile" >> COL__<- list(G="Raw intensity (tr.mean) {A}",Gb="Background (tr.mean) >> {A}") >> ANO__<- c("Position", "Name", "ID", "Description", "GeneName") >> N_BGM__<- 3 #number of background correction methods >> N_NRM__<- 3 #number of normalization methods >> A_BGM__<- c(A_SUB__, A_NXP__, A_MIN__) >> A_NRM__<- c(A_VSN__, A_QNT__, A_LWS__) >> >> >> #utility methods >> #applies subtraction background correction with limma >> bg_subtract<- function(x){ >> return(list(norm = x$norm, >> bg = M_SUB__, >> agi = backgroundCorrect(x$agi, method = M_SUB__))) } >> >> #applies normexp backgroud correction with limma >> bg_normexp<- function(x){ >> return(list(norm = x$norm, >> bg = M_NXP__, >> agi = backgroundCorrect(x$agi, method = M_NXP__))) } >> >> #applies minimum background correction with limma >> bg_minimum<- function(x){ >> return(list(norm = x$norm, >> bg = M_MIN__, >> agi = backgroundCorrect(x$agi, method = M_MIN__))) } >> >> #applies VSN normalization with VSN package >> nb_vsn<- function(x){ >> return(list(norm = M_VSN__, >> bg = x$bg, >> agi = normalizeVSN(x$agi))) } >> >> #applies VSN normalization with VSN package >> nb_vsn<- function(x){ >> return(list(norm = M_VSN__, >> bg = x$bg, >> agi = normalizeVSN(x$agi))) } >> >> #applies cyclic lowess normalization with limma >> nb_lowess<- function(x){ >> return(list(norm = M_LWS__, >> bg = x$bg, >> agi = normalizeBetweenArrays(x$agi, method = >> M_LWS__))) } >> >> #applies quantile normalization with limma >> nb_quantile<- function(x){ >> return(list(norm = M_QNT__, >> bg = x$bg, >> agi = normalizeBetweenArrays(x$agi, method = >> M_QNT__))) } >> >> #generates a plot of MA values with limma >> pl_ma<- function(x){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-ma.pdf"), collapse="") >> pdf(name) >> plotMA(x[[1]]$agi$E) >> dev.off() >> } >> >> #generates a plot of mean versus stdev with VS >> pl_meansd<- function(x){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-meansd.pdf"), collapse="") >> pdf(name) >> meanSdPlot(x[[1]]$agi$E) >> dev.off() >> } >> >> #generate volcanoplot of log-fold versus log-odds >> pl_volcano<- function(x){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-volcano.pdf"), collapse="") >> pdf(name) >> volcanoplot(x[[1]]$bayes) >> dev.off() >> } >> >> #generate plot of expression data over time >> pl_lines<- function(x){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-lines.pdf"), collapse="") >> pdf(name) >> plotlines(x[[1]]$agi$E) >> dev.off() >> } >> >> #generate heatmat of final results >> pl_heatmap<- function(x, ntop){ >> name<- paste(c(x[[1]]$bg, x[[1]]$norm, "-heatmap.pdf"), collapse="") >> pdf(name) >> genes<- as.numeric(rownames(topTable(x[[1]]$bayes, n = ntop))) >> heatmap(x[[1]]$lm$coefficients[genes,], >> col = redgreen(75), >> key = TRUE) >> dev.off() >> } >> >> #averages replicates of probes >> m_repavg<- function(x){ >> return(list(agi = x[[1]]$agi, >> avg = avereps(x[[1]]$agi, ID=x[[1]]$agi$genes$ProbeName), >> bg = x[[1]]$bg, >> norm = x[[1]]$norm)) >> } >> >> #generate contrasts >> m_fit<- function(x, design){ >> contmat<- makeContrasts(s2-s1, >> levels = design) >> fit1<- lmFit(x[[1]]$agi$E, design) >> fit2<- eBayes(contrasts.fit(fit1, contmat)) >> diff<- topTable(fit2, coef = 1, adjust = "BH") >> res<- decideTests(fit2) >> comp<- vennDiagram(res) >> return(list(agi = x[[1]]$agi, >> bg = x[[1]]$bg, >> norm = x[[1]]$norm, >> ave = x[[1]]$ave, >> cont = contmat, >> lm = fit1, >> bayes = fit2, >> diff = diff, >> res = res, >> comp = comp)) >> } >> >> ###################################################################### >> ######### # Function to parse array data into limma objects # Args: >> # tfile: target file (see limma documentation) >> # tsep: limma defaults TSV, have to set manually for CSV and so on >> # galfile: GAL file for printer/layout >> # Returns data object with targets, genes and printer layout >> ###################################################################### >> ######### >> parse<- function(tfile, tsep, galfile){ >> targets<- readTargets(tfile, sep=tsep) >> gal<- readGAL(galfile) >> layout<- getLayout(gal) >> agi<- read.maimages(targets, columns = COL__, >> annotation = ANO__, >> green.only = TRUE) >> bg<- "none" >> norm<- "none" >> desg<- model.matrix(~0+factor(c(1,1,2,2))) >> colnames(desg)<- c("s1","s2") >> d<- list(agi = agi, >> gal = gal, >> layout = layout, >> targets = targets, >> bg = bg, >> desg = desg, >> norm = norm) >> >> processed<- matrix(NA, nrow = N_BGM__, ncol = N_NRM__) >> processed<- apply(processed, 1:2, function(x){return(d)}) >> return(processed) >> } >> >> #parse command args >> args<- commandArgs(trailingOnly = TRUE) >> >> #load files into matrix >> a<- parse(args[1], args[2], args[3]) >> >> #keepy a copy of a raw dataset >> raw<- a[1,1] >> print(raw[[1]]$desg) >> >> #perform backgroung corrections >> a[,A_SUB__]<- lapply(a[,A_SUB__], bg_subtract) >> a[,A_NXP__]<- lapply(a[,A_NXP__], bg_normexp) >> a[,A_MIN__]<- lapply(a[,A_MIN__], bg_minimum) >> >> #perform normalization >> a[A_VSN__,]<- lapply(a[A_VSN__,], nb_vsn) >> a[A_LWS__,]<- lapply(a[A_LWS__,], nb_lowess) >> a[A_QNT__,]<- lapply(a[A_QNT__,], nb_quantile) >> >> #fit model and differentiate expression >> a<- apply(a, 1:2, m_repavg) >> a<- apply(a, 1:2, m_fit, design = raw[[1]]$desg) >> >> #generate plots >> apply(a, 1:2, pl_ma) >> apply(a, 1:2, pl_meansd) >> apply(a, 1:2, pl_volcano) >> apply(a, 1:2, pl_lines) >> apply(a, 1:2, pl_heatmap, args[4]) >> >> >> [[alternative HTML version deleted]] >> >> _______________________________________________ >> Bioconductor mailing list >> Bioconductor at r-project.org >> https://stat.ethz.ch/mailman/listinfo/bioconductor >> Search the archives: >> http://news.gmane.org/gmane.science.biology.informatics.conductor > -- > James W. MacDonald, M.S. > Biostatistician > University of Washington > Environmental and Occupational Health Sciences > 4225 Roosevelt Way NE, # 100 > Seattle WA 98105-6099 > -- James W. MacDonald, M.S. Biostatistician University of Washington Environmental and Occupational Health Sciences 4225 Roosevelt Way NE, # 100 Seattle WA 98105-6099
ADD REPLY

Login before adding your answer.

Traffic: 529 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6