Question

struggling to create a heatmap

0

Entering edit mode

m-bihie ▴ 10

@4f002f4c

Last seen 2.0 years ago

Canada

I can't create a heatmap for the top 10 differentially expressed genes. I tried following other examples online (like the one in the DESeq2 workflow), but non seem to work with my data. Does anyone have advise on how to plot a heatmap?

Here is the error message:

> DEgenes <- mat[idx,] #subset mtx by only the set of DE gns abv
Error in mat[idx, ] : subscript out of bounds

Here is the code causing the error:

#BiocManager::install("DESeq2")
library(DESeq2)
#BiocManager::install("EnsDb.Hsapiens.v86")
library(EnsDb.Hsapiens.v86)
#BiocManager::install("EnhancedVolcano")
library(EnhancedVolcano)
#BiocManager::install("pheatmap")
library(pheatmap)
#BiocManager::install("org.Hs.eg.db")
library(org.Hs.eg.db)
#install.packages("tidyverse")
library(tidyverse)
#install.packages("recount")
library(recount)
#install.packages("RColorBrewer")
library(RColorBrewer)
#install.packages("DT")
library(DT)


#list of articles to extract data from
project_info <- abstract_search(query = "Ewing sarcoma")

# View the available projects and select one
selected_study <- "SRP028344" #"Gene regulation by EWS/ERG in Ewing sarcoma."


# Download the study data
download_study(selected_study) #creates an object with a class: RangedSummarizedExperiment

#load the data by specifying sub-folder
load("SRP028344/rse_gene.Rdata")

# Examine the read count matrix
cts <- assay(rse_gene) #gn counts within each smpl
#View(cts)


#editing condition column/colData
rse_gene$condition <- c(rep("untrtd", 3), rep("trtd",4))


#sample metadata (colData) - shows condition coloumn that was added above
cd <- as.data.frame(colData(rse_gene))
View(cd)

#confirm that row names in the colData (cd) matches the column names in cts (the assay of the data)
all(colnames(cts) %in% rownames(cd)) #TRUE

#confirm they have the same order
all(colnames(cts) == rownames(cd)) #TRUE

#!!!!!!!!!!!!!!!!!!!!!!!
#CREATE THE DESEQ OBJECT
#!!!!!!!!!!!!!!!!!!!!!!!

#make the dds
dds <- DESeqDataSet(rse_gene, design = ~condition)

#set the factor level
  #compare the other levels to the untreated sample
dds$condition <- relevel(dds$condition, ref = "untrtd")

#creat DESe2 object
dds <- DESeq(dds)

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#QUALITY CONTROL: RMV LOW GN CNTS
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#remove rows with low gene counts
keep <- as.data.frame(counts(dds)) %>%
  rowSums(counts(dds, normalized=TRUE)) >= 10

dds <- dds[keep,]

#!!!!!!!!!!!!!!!!!!!!!!!!
#QUALITY CONTROL: NRMLZTN
#!!!!!!!!!!!!!!!!!!!!!!!!

#calculate normalized counts
ddsNorm <- estimateSizeFactors(dds)

#view the size factors used for normalization
sizeFactors(ddsNorm)

#extract normalized counts
normalizedCounts <- counts(ddsNorm, normalized = TRUE)
    #view(normalizedCounts)

#variance stabilizing transformation
ddsNormVst <- vst(ddsNorm, blind = TRUE)

#extract VST-transformed nrmlzd cnts as a mtx from the vsd bjct [use assay()]
mtx_vst_ddsNorm<- assay(ddsNormVst)

#compute the prws crltn vlus btwn each pair of samples using the cor() fx
cor_mtx_vst_ddsNorm <- cor(mtx_vst_ddsNorm)


#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#QUALITY CONTROL: PCA PLOT, CLSTRNG HTMP
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#create rlog for PCA
rld <- rlog(dds)

#plot the PCA
plotPCA(rld)


#!!!!!!!!!!!!!!!
#summary/results
#!!!!!!!!!!!!!!!

#extract the results
res <- results(dds,
               contrast = c("condition", #condition factor
                            "trtd",      #lvl/fctr to cmpr
                            "untrtd"),   #base level
               alpha = 0.05)

#confirm the name and number of the coefficients
resultsNames(dds) # lists the coefficients

summary(res)

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#VISUALIZING RESULTS: VOLCANO PLOTS
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


#shrink data
resNorm <- lfcShrink(dds = dds,
                     res = res,
                     type = "normal",
                     coef = 2)

#make a dataframe of the results to view them
resdf <- as.data.frame(resNorm)

#extract gene symbol from EnsDb.Hsapiens.v86
ens2sym <- AnnotationDbi::select(EnsDb.Hsapiens.v86,
                                 keys = keys(EnsDb.Hsapiens.v86), #this code shows that ur selecting from this database (EnsD, using the key(EnsDb) to attach the symbol)
                                 columns = c("SYMBOL"))

#join ens2sym to resdf by shared column (GENEID)
#want to add the symbol coloum in ens2sym to resdf
resdfsym <- resdf %>%
  rownames_to_column() %>%
  mutate(GENEID = gsub(rowname, pattern = "\\..+", replacement = "")) %>% #rmvng the version at the end of the rowname
  inner_join(y = ens2sym, by = "GENEID") %>% #only keeps geneID/symbol found in both dfs
  dplyr::select(-rowname)

#create the volcano plot
EnhancedVolcano(resdfsym, lab = resdfsym$SYMBOL, pCutoff = 1e-100,
                FCcutoff = 3,
                x = "log2FoldChange",
                y = "padj")

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#TOP-10 OVER EXPRESSED DEGS HEATMAP
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#first, #subset nrmlzd counts to sgnfcnt genes
  #800 less genes than resdfsym
sigres <- resdfsym[ #subset nrmlzd counts to sgnfcnt genes
  which(resdfsym$padj < 0.01 & abs(resdfsym$log2FoldChange) >= 1 &
          resdfsym$baseMean >= 20), ]

sigres <- resdfsym %>%
  dplyr::select(resdfsym$padj < 0.01)
sigres

#get normalized counts from dds object
  #only want counts with matching geneIDs to our significant data frame
counts(dds, normalized = TRUE)[]

#!!!!!!!!

slct <- order(rowMeans(
  counts(dds,
         normalized = TRUE)),
  decreasing = TRUE) [1:20]

df <- as.data.frame(colData(dds)[,c("condition","type")])
df <- as.data.frame(colData(dds)[,c("condition"), "type"])

rownames(df) <- colnames(rld)

pheatmap(assay(rld)[slct,], cluster_rows=FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=df)

mat <- assay(rld) #extract mtx of nrmlzd counts
idx <- sigres$ENSEMBL #extract geneID of DE gns
DEgenes <- mat[idx,] #subset mtx by only the set of DE gns abv

rld <- na.omit(rld)
annotation <- as.data.frame(colData(rld)[, c("run","condition")]) #xtrct some dat from rld cndtns
pheatmap(DEgenes, scale = "row", show_rownames = FALSE, clustering_distance_rows = "correlation", annotation_col = annotation, main="Differentially Expressed genes")


#!!!!!!!!!!!!!!!!!

## add gene annotation to results table

# subset genes 


# plot heatmap of top 20 DE genes
orderedSig <- resdfsym[order(resdfsym$padj), ] #top 20 gns r the gns w lowest adjusted p value -> most sgnfcntly dfly xprsd
    #can also extract top 20 gns w lrgst or smllst log2fc
id1 <- orderedSig$GENEID #can add names since it's just top 20
id2 <- orderedSig$SYMBOL #this is the name
topDE <- mat[id1,]
rownames(topDE) <- id2 #rmv ensemble iDs from row name
top20DE <- head(topDE, n=20) #take top 20 gns 

#plot data now
pheatmap(top20DE, scale = "row", clustering_distance_rows = "correlation", annotation_col = annotation, main="Top 20 Differentially Expressed genes")

DESeq2 heatmaps • 1.1k views

ADD COMMENT • link updated 2.2 years ago by ATpoint ★ 4.8k • written 2.2 years ago by m-bihie ▴ 10

score 0 · Answer 1 · 2023-02-04

I am reluctant to read through that wall of code. Here is what would work. Note that prefiltering makes sense before, not after running DESeq(). Also, since you get excessively many DEGs here I would test against a certain logFC threshold, see below in the results() function. See the manual for other best practices for analysis.


library(ComplexHeatmap)
library(DESeq2)
library(edgeR)
library(recount)

# Get data
project_info <- abstract_search(query = "Ewing sarcoma")
selected_study <- "SRP028344" #"Gene regulation by EWS/ERG in Ewing sarcoma."
download_study(selected_study) #creates an object with a class: RangedSummarizedExperiment
load("SRP028344/rse_gene.Rdata")
rse_gene$condition <- c(rep("untrtd", 3), rep("trtd",4))

# DESeq2 part
dds <- DESeqDataSet(rse_gene, design = ~condition)
dds$condition <- relevel(dds$condition, ref = "untrtd")

# Prefilter, before running DESeq(), otherwise it makes little sense
keep <- edgeR::filterByExpr(counts(dds), dds$condition)
dds <- dds[keep,]
dds <- DESeq(dds)

# PCA
rld <- rlog(dds)
plotPCA(rld, "condition")

# Test against a threshold to avoid excessive DEGs and prioritize major ones
res <- results(dds, alpha = 0.05, lfcThreshold=log2(1.5))

# Get DEGs
DEgenes <- rownames(res[res$padj<0.05 & !is.na(res$padj),])

# Heatmap based on scaled vsd
v <- assay(rld)[DEgenes,]
vz <- t(scale(t(v)))

# Heatmap, not very informative with only two conditions
Heatmap(vz, clustering_method_rows="ward.D2", clustering_method_columns="ward.D2", 
        show_row_names=FALSE, top_annotation=HeatmapAnnotation(condition=dds$condition))