Entering edit mode
Hello how can I generate a list of my most differentially expressed genes from my RNA seq data on DESeq2 using the raw pvalue (ie. unadjusted) cutoff of <0.05 instead of using DESeq2's Benjamini & Hochberg corrected pvalue cutoff of <0.05
setwd("Your_working_directory")
# Import count table#
countdata <- read.table("family_13_01_revised_RNA-seq.counts_fixed.txt", header=TRUE, row.names=1)
# Remove .bam or .sam from filenames#
colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata))
# Convert to matrix#
countdata <- as.matrix(countdata)
head(countdata)
# Assign condition (affected versus unaffected)#
condition <- factor(c("affected","affected","affected","unaffected","unaffected","unaffected"),levels=c("affected","unaffected"))
condition <- relevel(condition, ref = "unaffected")
#load DESeq2#
library(DESeq2)
# Create a coldata frame and instantiate the DESeqDataSet#
coldata <- data.frame(row.names=colnames(countdata),condition)
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design = ~ condition)
dds
#pre-filtering to keep only rows that have at least 1 reads total#
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]
# Run the DESeq#
dds <- DESeq(dds)
# Regularized log transformation for clustering/heatmaps
rld <- rlogTransformation(dds)
head(assay(rld))
hist(assay(rld))
# Colors for plots below#
library(RColorBrewer)
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))])
# Sample distance heatmap
sampleDists <- as.matrix(dist(t(assay(rld))))
library(gplots)
#png("qc-heatmap_baker.png", w=1000, h=1000, pointsize=20)
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[condition], RowSideColors=mycols[condition],
margin=c(10, 10), main="Sample Distance Matrix")
#dev.off()
# Principal components analysis#
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) {
require(genefilter)
require(calibrate)
require(RColorBrewer)
rv = rowVars(assay(rld))
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select, ]))
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : "))
if (is.null(colors)) {
if (nlevels(fac) >= 3) {
colors = brewer.pal(nlevels(fac), "Paired")
} else {
colors = c("black", "red")
}
}
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1)
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1)
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)")
pc2lab <- paste0("PC2 (",as.character(pc2var),"%)")
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...)
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx))
legend(legendpos, legend=levels(fac), col=colors, pch=20)
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld),
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours),
# terldt = list(levels(fac)), rep = FALSE)))
}
#png("qc-pca.png", 1000, 1000, pointsize=20)
rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-30, 30))
#dev.off()
# Get differential expression results#
res <- results(dds)
table(res$padj<0.05)
## Order by adjusted p-value #
res <- res[order(res$padj), ]
## Merge with normalized count data#
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
#get significant results (FDR<0.05)
## Write results#
write.csv(resdata, file="sig_diffexpr-results.csv")
Hello Michael Love I'm aware the number of false positives can go up by using the raw p-value. I already did a primary analysis using the the adjusted p-value as set by Deseq2 and found a candidate gene of interest . I now want to do a secondary analysis using the raw p-value instead of the adjusted p-value to increase the number of genes in my top differentially expressed genes list for the purpose of finding other genes that can be in the same pathway of the candidate gene were interested in. We plan to do further functional testing to validate these secondary genes, to eliminate the possibility it could be a false positive. Please note the samples we submitted for RNAseq are not in the same tissue as the disease we're looking at, this is why we want to take this broader approach.
Still, if you want to allow a higher false discovery rate, you should still use adjusted p-values, but for example at FDR = 0.5. I can't think of a good reason to use a raw p-value in this context.
Hello Michael Love, thank you for the caution I still want to use raw p-value, how can I alter my script? I tried altering this part of my script to get my differentially gene list by raw p-value, but it didnt work:
I'd recommend to work with a bioinformatician who is familiar with R. It's not really appropriate to request developer support for doing basic R manipulations of data frames.