Hello I'm new to coding on Rstudio. I'm doing a RNA seq analysis to test for differential gene expression using deseq2, by using unpaired samples. I just wanted to know if I altered the following script template correctly to indicate the following : 1) the analysis will be using unpaired samples 2) the order of variable comparison will be affected versus unaffected
please see below the R studio script I used: # Import count table countdata <- read.table("family1301revisedRNA-seq.counts_fixed.txt", header=TRUE, row.names=1)
Remove .bam or .sam from filenames
colnames(countdata) <- gsub("\.[sb]am$", "", colnames(countdata))
Convert to matrix
countdata <- as.matrix(countdata) head(countdata)
Assign condition (affected versus unaffected)
condition <- factor(c("affected","affected","affected","unaffected","unaffected","unaffected"),levels=c("affected","unaffected")) condition <- relevel(condition, ref = "unaffected")
load DESeq2
library(DESeq2)
Create a coldata frame and instantiate the DESeqDataSet
coldata <- data.frame(row.names=colnames(countdata),condition)
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design = ~ condition) dds
pre-filtering to keep only rows that have at least 1 reads total
keep <- rowSums(counts(dds)) > 1 dds <- dds[keep,]
Run the DESeq
dds <- DESeq(dds)
Regularized log transformation for clustering/heatmaps
rld <- rlogTransformation(dds) head(assay(rld)) hist(assay(rld))
Colors for plots below
library(RColorBrewer) (mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))])
Sample distance heatmap
sampleDists <- as.matrix(dist(t(assay(rld)))) library(gplots) png("qc-heatmap_baker.png", w=1000, h=1000, pointsize=20) heatmap.2(as.matrix(sampleDists), key=F, trace="none", col=colorpanel(100, "black", "white"), ColSideColors=mycols[condition], RowSideColors=mycols[condition], margin=c(10, 10), main="Sample Distance Matrix") dev.off()
Principal components analysis
rldpca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) { require(genefilter) require(calibrate) require(RColorBrewer) rv = rowVars(assay(rld)) select = order(rv, decreasing = TRUE)[seqlen(min(ntop, length(rv)))] pca = prcomp(t(assay(rld)[select, ])) fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : ")) if (is.null(colors)) { if (nlevels(fac) >= 3) { colors = brewer.pal(nlevels(fac), "Paired") } else { colors = c("black", "red") } } pc1var <- round(summary(pca)$importance[2,1]100, digits=1) pc2var <- round(summary(pca)$importance[2,2]100, digits=1) pc1lab <- paste0("PC1 (",as.character(pc1var),"%)") pc2lab <- paste0("PC2 (",as.character(pc2var),"%)") plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...) with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx)) legend(legendpos, legend=levels(fac), col=colors, pch=20) # rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld), # pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours), # terldt = list(levels(fac)), rep = FALSE))) } png("qc-pca.png", 1000, 1000, pointsize=20) rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-30, 30)) dev.off()
Get differential expression results
res <- results(dds) table(res$padj<0.05)
Order by adjusted p-value
res <- res[order(res$padj), ]
Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) names(resdata)[1] <- "Gene" head(resdata)
get significant results (FDR<0.05)
Write results
write.csv(resdata, file="sig_diffexpr-results.csv")
MA plot
DESeq2::plotMA(dds, ylim=c(-1,1), cex=1)
Volcano plot with significant DE genes
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) { with(res, plot(log2FoldChange, -log10(padj), pch=20, main=main, ...)) with(subset(res, padj<sigthresh ),="" points(log2foldchange,="" -log10(padj),="" pch="20," col="red" ,="" ...))="" with(subset(res,="" abs(log2foldchange)>lfcthresh),="" points(log2foldchange,="" -log10(padj),="" pch="20," col="orange" ,="" ...))="" with(subset(res,="" padj<sigthresh="" &="" abs(log2foldchange)>lfcthresh),="" points(log2foldchange,="" -log10(padj),="" pch="20," col="green" ,="" ...))="" if="" (labelsig)="" {="" require(calibrate)="" with(subset(res,="" padj<sigthresh="" &="" abs(log2foldchange)>lfcthresh),="" textxy(log2foldchange,="" -log10(padj),="" labs="Gene," cex="textcx," ...))="" }="" legend(legendpos,="" xjust="1," yjust="1," legend="c(paste("FDR<",sigthresh,sep="")," paste("|logfc|>",lfcthresh,sep="" ),="" "both"),="" pch="20," col="c("red","orange","green"))" }="" png("diffexpr-volcanoplot2.png",="" 1200,="" 1000,="" pointsize="20)" volcanoplot(resdata,="" lfcthresh="1," sigthresh="0.05," textcx=".5," xlim="c(-10.5," 14))="" dev.off()<="" p="">
Cross: https://www.biostars.org/p/397399/