I am writing to inquire about a problem I keep running into while running DESeq2. After running DESeq2 comparing two samples, I encounter an issue where I get >10000 differentially expressed genes. Furthermore, I have been getting a couple thousand genes that has a padj of 0 or negligibly close to zero (e.g. 2E-320). I've pasted below a PCA plot of the samples. The samples I directly compared were the MII oocyte and secondary follicle oocytes. Comparing them either in vitro or in vivo also resulted in the same problem. I've also attached a volcano plot for reference as well. Lastly, I've pasted my R Session below as well:
library(DESeq2)
> setwd("/Users/anjalimohan/Desktop/Research/Hikabe/")
> list.files()
[1] "All_TPM_Maths.csv" "All_TPMs.csv" "All_TPMs.xlsx" "counts.txt"
[5] "counts.txt.summary" "countsonly_2ndfol.csv" "countsonly_copy.csv" "countsonly_MII.csv"
[9] "countsonly_PGCs.csv" "countsonly_vivo.csv" "countsonly.csv" "GTFs"
[13] "Metadata.csv" "Metadata1.csv" "Mouse_Gene_Reference_GRCm38.csv" "Results_Vitro"
[17] "results_vitro_2ndfol_MII_2.csv" "Results_Vivo" "SraRunTable.txt" "TPMs_Calc"
> somecounts <- read.delim("counts.txt", row.names=NULL, comment.char="#")
> countsonly <- cbind(somecounts[,1], somecounts[,7:29])
> write.csv(countsonly, "countsonly.csv", row.names= F)
>
> # Change naming scheme in notepad++
>
> countData <- read.delim("countsonly.csv", row.names = 1, header = TRUE, sep = ",")
> head(countData)
SRR3313577.bam SRR3313578.bam SRR3313579.bam SRR3313580.bam SRR3313581.bam SRR3313582.bam SRR3313583.bam SRR3313584.bam SRR3313585.bam
ENSMUSG00000102693 0 0 0 0 0 0 0 0 0
ENSMUSG00000064842 0 0 0 0 0 0 0 0 0
ENSMUSG00000051951 0 0 1 14 0 30 34 22 45
ENSMUSG00000102851 0 0 0 0 0 0 0 0 0
ENSMUSG00000103377 0 0 0 0 0 0 0 0 0
ENSMUSG00000104017 0 0 0 0 0 0 0 0 0
SRR3313586.bam SRR3313587.bam SRR3313588.bam SRR3313589.bam SRR3313590.bam SRR3313591.bam SRR3313592.bam SRR3313593.bam SRR3313594.bam
ENSMUSG00000102693 0 0 0 0 0 0 0 0 0
ENSMUSG00000064842 0 0 0 0 0 0 0 0 0
ENSMUSG00000051951 59 48 132 62 93 2 15 7 47
ENSMUSG00000102851 0 0 0 0 0 0 0 0 0
ENSMUSG00000103377 0 0 0 0 0 0 0 0 0
ENSMUSG00000104017 0 0 0 0 0 0 0 0 0
SRR3313595.bam SRR3313596.bam SRR3313597.bam SRR3313598.bam SRR3313599.bam
ENSMUSG00000102693 0 0 0 0 0
ENSMUSG00000064842 0 0 0 0 0
ENSMUSG00000051951 40 38 47 35 51
ENSMUSG00000102851 0 0 0 0 0
ENSMUSG00000103377 0 0 0 0 0
ENSMUSG00000104017 0 0 0 0 0
> groups <-read.csv("Metadata1.csv", row.names = NULL, header = T, sep = ",")
> head(groups)
Name Sample Type Group
1 SRR3313577.bam ESC vivo A
2 SRR3313578.bam ESC vivo A
3 SRR3313579.bam ESC vivo A
4 SRR3313580.bam PGCLC vitro B
5 SRR3313581.bam PGCLC vitro B
6 SRR3313582.bam PGCLC vitro C
> # Look at characteristics of data:
> str(groups)
'data.frame': 23 obs. of 4 variables:
$ Name : chr "SRR3313577.bam" "SRR3313578.bam" "SRR3313579.bam" "SRR3313580.bam" ...
$ Sample: chr "ESC" "ESC" "ESC" "PGCLC" ...
$ Type : chr "vivo" "vivo" "vivo" "vitro" ...
$ Group : chr "A" "A" "A" "B" ...
> #Filtering out genes w/ below 10 counts (average):
> keep <- countData[rowMeans(countData) >= 10,]
> #Change characteristics of group to factors instead of numbers:
> groups$Group <- as.factor(groups$Group)
> dds <- DESeqDataSetFromMatrix(countData , colData = groups , design = ~ Group)
> dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> res <- results(dds, contrast=c("Group","G","H"))
> #Make the res into a data frame:
> resdata<- as.data.frame(res)
> #Convert row names into first column:
> resdata <- cbind(rownames(resdata), data.frame(resdata, row.names=NULL))
> ref <- read.csv("/Users/anjalimohan/Desktop/Research/Hikabe/Mouse_Gene_Reference_GRCm38.csv", row.names=NULL)
> ref$Gene_ID <- sub("\\..*", "", ref$Gene_ID)
> head(ref)
Chromosome Database ID_Type Start End Strand Evidence_Level Gene_ID Gene_Type Gene_Name MGI_ID
1 chr1 HAVANA gene 3073253 3074322 + level 2 ENSMUSG00000102693 TEC 4933401J01Rik MGI:1918292
2 chr1 ENSEMBL gene 3102016 3102125 + level 3 ENSMUSG00000064842 snRNA Gm26206 MGI:5455983
3 chr1 HAVANA gene 3205901 3671498 - level 2 ENSMUSG00000051951 protein_coding Xkr4 MGI:3528744
4 chr1 HAVANA gene 3252757 3253236 + level 1 ENSMUSG00000102851 processed_pseudogene Gm18956 MGI:5011141
5 chr1 HAVANA gene 3365731 3368549 - level 2 ENSMUSG00000103377 TEC Gm37180 MGI:5610408
6 chr1 HAVANA gene 3375556 3377788 - level 2 ENSMUSG00000104017 TEC Gm37363 MGI:5610591
> #Merging resdata and ref:
> merged <- as.data.frame(merge(resdata, ref, by.x = "rownames(resdata)",by.y = "Gene_ID", all.x = T))
> # For mouse:
> merged2 <- merged[, c(8:17, 1:7)]
> colnames(merged2) <- c("Chromosome", "Database", "ID_Type", "Start", "End","Strand","Evidence_Level", "Gene_Type", "Gene_Name", "MGI_ID", "Gene_ID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")
> head(merged2)
Chromosome Database ID_Type Start End Strand Evidence_Level Gene_Type Gene_Name MGI_ID Gene_ID baseMean
1 chr3 HAVANA gene 108107280 108146146 - level 2 protein_coding Gnai3 MGI:95773 ENSMUSG00000000001 10619.044861
2 chrX HAVANA gene 77837901 77853623 - level 2 protein_coding Pbsn MGI:1860484 ENSMUSG00000000003 0.000000
3 chr16 HAVANA gene 18780447 18811987 - level 2 protein_coding Cdc45 MGI:1338073 ENSMUSG00000000028 3470.071707
4 chr7 HAVANA gene 142575529 142578143 - level 2 lncRNA H19 MGI:95891 ENSMUSG00000000031 1937.134359
5 chrX HAVANA gene 161082525 161258213 + level 1 protein_coding Scml2 MGI:1340042 ENSMUSG00000000037 8086.321943
6 chr11 HAVANA gene 108343354 108414396 + level 2 protein_coding Apoh MGI:88058 ENSMUSG00000000049 1.532495
log2FoldChange lfcSE stat pvalue padj
1 -3.21292749 0.09630001 -33.36373050 4.605950e-244 3.360769e-242
2 NA NA NA NA NA
3 -2.89216761 0.19056813 -15.17655427 5.056560e-52 3.811170e-51
4 0.04406129 1.21546762 0.03625048 9.710826e-01 1.000000e+00
5 -4.75648056 0.20690752 -22.98843781 6.083945e-117 1.193078e-115
6 2.71359509 3.12823959 0.86745117 3.856949e-01 NA
> write.csv(merged2,row.names= FALSE, file = "results_vitro_2ndfol_MII_2.csv")
> library(ggplot2)
> library(dplyr)
>
> # order results table by the smallest adjusted p value:
> ordered <- merged2[order(merged2$padj),]
>
> results = as.data.frame(dplyr::mutate(as.data.frame(ordered), Significant=ifelse(ordered$padj<0.05, "FDR<0.05", "Non-sig")), row.names=rownames(ordered))
> head(results)
Chromosome Database ID_Type Start End Strand Evidence_Level Gene_Type Gene_Name MGI_ID Gene_ID baseMean log2FoldChange
55 chr8 HAVANA gene 106603351 106670246 + level 2 protein_coding Cdh1 MGI:88354 ENSMUSG00000000303 19329.150 -4.831978
209 chr6 HAVANA gene 86691768 86733383 - level 2 protein_coding Gmcl1 MGI:1345156 ENSMUSG00000001157 6095.072 -4.412727
421 chr17 HAVANA gene 5841329 5931954 + level 2 protein_coding Snx9 MGI:1913866 ENSMUSG00000002365 18311.996 -4.681864
736 chr5 HAVANA gene 33634952 33652574 - level 2 protein_coding Slbp MGI:108402 ENSMUSG00000004642 47557.392 -3.760671
750 chr1 HAVANA gene 33719887 33742564 + level 1 protein_coding Rab23 MGI:99833 ENSMUSG00000004768 3498.283 -3.850737
767 chr1 HAVANA gene 181815335 181843046 - level 2 protein_coding Lbr MGI:2138281 ENSMUSG00000004880 23091.828 -4.011269
lfcSE stat pvalue padj Significant
55 0.11711838 -41.25722 0 0 FDR<0.05
209 0.09899603 -44.57478 0 0 FDR<0.05
421 0.10533009 -44.44945 0 0 FDR<0.05
736 0.07913969 -47.51941 0 0 FDR<0.05
750 0.09672259 -39.81218 0 0 FDR<0.05
767 0.10478831 -38.27974 0 0 FDR<0.05
>
> DEgenes_DESeq <- results[which(abs(results$log2FoldChange) > log2(0.5) & results$padj < 0.05),]
>
> p = ggplot2::ggplot(results, ggplot2::aes(log2FoldChange, -log10(pvalue))) +
+ ggplot2::geom_point(ggplot2::aes(col = Significant)) +
+ theme(legend.text=element_text(size=18)) +
+ ggplot2::scale_color_manual(values = c("red", "black")) +
+ ggplot2::xlim(-7.5,7.5) +
+ ggplot2::ggtitle("Volcano Plot")
>
> p + ggrepel::geom_text_repel(data=results[1:20, ], ggplot2::aes(label=results[1:20,9]))
Warning message:
Removed 30360 rows containing missing values (geom_point).
OP did not add their images properly. Here are the PCA (scroll to view full width) and Volcano plots:
PCA plot:
PCA full width:
Volcano Plot: