Question

Classic or GLM approach in edgeR, Why do I got less DEG in GLM approach?

0

Entering edit mode

RR • 0

@7e31e763

Last seen 2.9 years ago

Thailand

Hello all, I am doing gene expression profiling of 3 structures(BB, 79A, 28z). I have 9 samples from 3 donors. Each donor has 3 samples, representing 3 structures. For example Donor 1 has structure BB, 79A, 28z , Donor 2 has structure BB, 79A, 28z Donor 3 has structure BB, 79A, 28z. I would like to see the differentially expressed gene of structure 79A. I compare structure 79A vs 28z, 79A vs BB and 28z vs BB. I have tried both classic and glm approach. I got 24 DEGs in classic but only 6 DEGs in GLMs approach. Thank you in advance!!

The following are my script in classic approach (comparing 79A vs 28z)

counts <- read.table('NewCounting_file.txt')
    head(counts)
    str(counts)
    #create gene name and gene id variable
    gene_name <- counts[,2]
    gene_id <- counts[,1]
    class(counts)
    head(gene_id)
    head(gene_name)
    #Delete gene id and gene name column on counts 
    counts[1:2] <- list(NULL)
    head(counts)
    ##Delete BB group V3 &V6
    counts[1] <- list(NULL)
    counts[3] <- list(NULL)
    head(counts)
    #create gene count numeric matrix 
    counts_mat <- apply(as.matrix.noquote(counts),2,as.numeric)
    head(counts_mat)
    class(counts_mat) 
    row.names(counts_mat) <- gene_name
    head(counts_mat)
    newsample_name <- c("79A","28z","79A","28z","79A","28z")
    colnames(counts_mat) <- newsample_name
    head(counts_mat)
    library(edgeR)
    #create group variables
    groups79A_28z <- c("79A","28z","79A","28z","79A","28z")
    geneexp79A_28z <- DGEList(counts=counts_mat, group=groups79A_28z)
    head(geneexp79A_28z)
    geneexp79A_28z <- calcNormFactors(geneexp79A_28z)
    head(geneexp79A_28z)
    geneexp79A_28z<- estimateCommonDisp(geneexp79A_28z)
    head(geneexp79A_28z)
    geneexp79A_28z <- estimateTagwiseDisp(geneexp79A_28z)
    head(geneexp79A_28z)
    filterByExpr(geneexp79A_28z)
    head(geneexp79A_28z)
    genediff79A_28z <- exactTest(geneexp79A_28z)
    genediff79A_28z 
    topTags(genediff79A_28z)
    str(genediff79A_28z)
    genediff79A_28z$table <- cbind(genediff79A_28z$table, FDR=p.adjust(genediff79A_28z$table$PValue, method ='fdr'))
    head(genediff79A_28z)
    str(genediff79A_28z)
    gsign79A_28z <- genediff79A_28z$table[genediff79A_28z$table$FDR<0.05,]
    gsign79A_28z <- gsign79A_28z[order(gsign79A_28z$FDR),]
    dim(gsign79A_28z)
    head(gsign79A_28z)
---------

Below is my glm approach

counts <- read.table('NewCounting_file.txt')
head(counts)
str(counts)
#create gene name and gene id variable
gene_name <- counts[,2]
gene_id <- counts[,1]
class(counts)
head(gene_id)
head(gene_name)
#Delete gene id and gene name column on counts 
counts[1:2] <- list(NULL)
head(counts)
##Delete BB group V3 &V6
counts[1] <- list(NULL)
counts[3] <- list(NULL)
head(counts)
#create gene count numeric matrix 
counts_mat <- apply(as.matrix.noquote(counts),2,as.numeric)
head(counts_mat)
class(counts_mat) 
row.names(counts_mat) <- gene_name
head(counts_mat)
newsample_name <- c("79A","28z","79A","28z","79A","28z")
colnames(counts_mat) <- newsample_name
head(counts_mat)
library(edgeR)
#create group variables
group  <- c("79A","28z","79A","28z","79A","28z")
dglist  <- DGEList(counts=counts_mat, group=group)
keep <-filterByExpr(dglist)
summary(keep)
dglist <- dglist[keep, ,keep.lib.sizes=FALSE]
dim(dglist)
dglist <- calcNormFactors(dglist, method="TMM")
plotMDS(dglist)
dglist$samples
#Design matrix
group <- factor(dglist$samples, levels=c("79A","28z"))
design <- model.matrix(~0 +group)
design
dglist<- estimateDisp(dglist)
dglist$common.dispersion
plotBCV(dglist)
fit <- glmQLFit(dglist)
head(fit$coefficients)
plotQLDisp(fit)

edgeR DEGs • 1.2k views

ADD COMMENT • link updated 3.1 years ago by Gordon Smyth 52k • written 3.1 years ago by RR • 0

Gordon Smyth · Answer 1 · 2022-03-25

2

Entering edit mode

Gordon Smyth 52k

@gordon-smyth

Last seen 3 hours ago

WEHI, Melbourne, Australia

See Section 3.4.2 of the edgeR User's Guide for how to analyse paired samples from three Donors. The Donor structure is a type of blocking.

You absolutely have to use glms but neither of your scripts are correct at the moment. You need to analyse all 9 samples together and you need to take account of which samples are from which Donor.

ADD COMMENT • link 3.1 years ago Gordon Smyth 52k

0

Entering edit mode

Could you please help me explain the coef value? how to calculate its value ? Currently I got only 8 samples as another 1 is still in the process of sequencing but I want to try analyze these 8 samples first. Does the order of treatment ( A, B, C, B,A,C , C,B,A) correlate with coef value? Thank you so much for your comment, It means a lots to me, I try so hard to understand and writing the new script.

ADD REPLY • link 3.1 years ago RR • 0

0

Entering edit mode

Excuse me Gordon Smyth, Could you help me review my new script following your guidance last 2 days please, I have tried to write again. I have attached the pheno column in the picture. Currently I have only 8 samples to analyze. Thank you in advance!

    #Transform HTseq count output into dataframe
    counts <- read.table('NewCounting_file.txt')
    head(counts)
    #checking the structure of data.frame
    str(counts)
    #create gene name and gene id variable
    gene_name <- counts[,2]
    gene_id <- counts[,1]
    class(counts)
    head(gene_id)
    head(gene_name)
    #Delete gene id and gene name column on counts 
    counts[1:2] <- list(NULL)
    head(counts)
    #create gene count numeric matrix 
    counts_mat <- apply(as.matrix.noquote(counts),2,as.numeric)
    head(counts_mat)
    class(counts_mat)
    #Assign gene name as row name 
    row.names(counts_mat) <- gene_name
    head(counts_mat)
    str(counts_mat)
    counts_mat
    #Assign group as column name 
    sample_name <- c("MD7_BB","MD7_79A","MD7_28","BD7_BB","BD7_79A","BD7_28","JD7_79A","JD7_28")
    colnames(counts_mat) <- sample_name
    head(counts_mat)
    ## read phenotype data in ============
    group <- c("MD7_BB","MD7_79A","MD7_28","BD7_BB","BD7_79A","BD7_28","JD7_79A","JD7_28")
    Pheno <- read.csv("mypheno.csv", stringsAsFactors = TRUE)
    dglist <- DGEList(counts=counts_mat, samples = Pheno, group = group)
    str(dglist)
    dglist
    dim(dglist)
    keep <-filterByExpr(dglist)
    summary(keep)
    dglist <- dglist[keep, ,keep.lib.sizes=FALSE]
    dim(dglist)
    #confirm that the num of gene in keep is equal to the number of gene/rows in dglist. if the same R will give ans TRUE
    length(keep[keep == TRUE]) == dim(dglist) [1]
    dglist <- calcNormFactors(dglist, method="TMM")
    plotMDS(dglist)
    dglist$samples
    dglist
    #Design matrix
    donors <- factor(dglist$sample$Donors)
    groups <- factor(dglist$sample$Structures)
    design <- model.matrix(~0 + donors + groups)
    design

    #By estimating gene dispersion, we are estimating the relative variability of true expression levle btw replicates
    #To estimate common ,trended and tagwise dispersion in one run :
    dglist<- estimateDisp(dglist, design, robust=TRUE)
    dglist$common.dispersion
    plotBCV(dglist)
    #Fitting
    fit <- glmQLFit(dglist,design, robust=TRUE)
    head(fit$coefficients)
    colnames(fit)
    plotQLDisp(fit)

    ##############DGE
    ##To detect genes that are differentially expressed btw any of three ttt
    qlf <- glmQLFTest(fit, coef=4:5)
    topTags(qlf)
    FDR <- p.adjust(qlf$table$PValue, method='fdr')
    sum(FDR < 0.05)
    summary(decideTests(qlf))
    #closer look at individual counts per million for the top gens
    top <- rownames(topTags(qlf))
    cpm(dglist)[top,]
    #total num of genes significantly up-regulated or down regulated at 5%FDR 
    summary(decideTests(qlf))
    #plot all the logFCs against average count size, highlighting the DE genes
    plotMD(qlf)
    # the blue line indicate 2 fold up or down
    abline(h=c(-1,1), col="blue")
    ###############################

    #detect gene that are differential expressed in 79A vs BBz
    qlf_79A_BBz <- glmQLFTest(fit, coef=4)
    # Show top 10 differentially expressed genes (DEG)
    topTags(qlf_79A_BBz)
    FDR <- p.adjust(qlf_79A_BBz$table$PValue, method='fdr')
    sum(FDR < 0.05)
    summary(decideTests(qlf_79A_BBz))
    #closer look at individual counts per million for the top gens
    top <- rownames(topTags(qlf_79A_BBz))
    cpm(dglist)[top,]
    #total num of genes significantly up-regulated or down regulated at 5%FDR 
    summary(decideTests(qlf_79A_BBz))
    tab79A_BBz <- as.data.frame(topTags(qlf_79A_BBz, n=30))
    #plot all the logFCs against average count size, highlighting the DE genes
    plotMD(qlf_79A_BBz)
    # the blue line indicate 2 fold up or down
    abline(h=c(-1,1), col="blue")
    result_79A_BBz <- topTags(qlf_79A_BBz, n=Inf)
    write.csv(result_79A_BBz, file="result_79A_BBz.csv")


    ###############################
    #detect gene that are differential expressed in BB vs 28z
    qlf_BB_28z <- glmQLFTest(fit, coef=5)
    topTags(qlf_BB_28z)
    FDR <- p.adjust(qlf_BB_28z$table$PValue, method='fdr')
    sum(FDR < 0.05)
    #closer look at individual counts per million for the top gens
    top <- rownames(topTags(qlf_BB_28z))
    topFDR <- top <- rownames(topTags(qlf_BB_28z))
    cpm(dglist)[top,]
    #total num of genes significantly up-regulated or down regulated at 5%FDR 
    summary(decideTests(qlf_BB_28z))
    tabBB_28z <- as.data.frame(topTags(qlf_BB_28z, n= 30))
    #plot all the logFCs against average count size, highlighting the DE genes
    plotMD(qlf_BB_28z)
    # the blue line indicate 2 fold up or down
    abline(h=c(-1,1), col="blue")

    ######################################
    #detect gene that are differential expressed in 28z vs 79A
    qlf28z_79A <- glmQLFTest(fit, contrast=c(0,0,0,-1,1))
    topTags(qlf28z_79A)
    FDR <- p.adjust(qlf28z_79A$table$PValue, method='fdr')
    sum(FDR < 0.05)
    summary(decideTests(qlf28z_79A))
    #closer look at individual counts per million for the top gens
    top <- rownames(topTags(qlf28z_79A))
    cpm(dglist)[top,]
    #total num of genes significantly up-regulated or down regulated at 5%FDR 
    summary(decideTests(qlf28z_79A))
    tab28z_79A <- as.data.frame(topTags(qlf28z_79A, n=30))
    #plot all the logFCs against average count size, highlighting the DE genes
    plotMD(qlf28z_79A)
    # the blue line indicate 2 fold up or down
    abline(h=c(-1,1), col="blue")!

enter image description here

ADD REPLY • link updated 3.1 years ago by Gordon Smyth 52k • written 3.1 years ago by RR • 0