Hi all,
I am using DEseq2 to find the differentially expressed genes in different libraries of mRNA seq. However, I am finding some difficulties to explict the correct design formula to pass to the DESeqDataSetFromMatrix function.
We treated some insects using three different approaches: ChemicalA, ChemicalB and ChemicalC. We sampled each stressed condition at 2 time points and, before the expriment took place we sampled one control group. Therefore the metadata looks like:
metadata <- data.frame(
SampleID = c("s24_1", "s24_2", "s24_3", "s24_4", "s24_5", "t8_1", "t8_2", "t8_3", "t8_4", "t8_5",
"ct_1", "ct_2", "ct_3", "ct_4", "ct_5", "l24_1", "l24_2", "l24_3", "l24_4", "l24_5",
"l8_1", "l8_2", "l8_3", "l8_4", "l8_5", "m36_1", "m36_2", "m36_3", "m36_4", "m36_5",
"m96_1", "m96_2", "m96_3", "m96_4", "m96_5"),
Treatment = c("ChemicalA", "ChemicalA", "ChemicalA", "ChemicalA", "ChemicalA", "ChemicalA", "ChemicalA", "ChemicalA", "ChemicalA", "ChemicalA",
"control", "control", "control", "control", "control", "ChemicalB", "ChemicalB", "ChemicalB", "ChemicalB", "ChemicalB",
"ChemicalB", "ChemicalB", "ChemicalB", "ChemicalB", "ChemicalB", "ChemicalC", "ChemicalC", "ChemicalC", "ChemicalC", "ChemicalC",
"ChemicalC", "ChemicalC", "ChemicalC", "ChemicalC", "ChemicalC"),
Sampling_time = c("t2", "t2", "t2", "t2", "t2", "t1", "t1", "t1", "t1", "t1",
"t0", "t0", "t0", "t0", "t0", "t2", "t2", "t2", "t2", "t2",
"t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1",
"t2", "t2", "t2", "t2", "t2"),
Batch = c("ChemicalA_t2", "ChemicalA_t2", "ChemicalA_t2", "ChemicalA_t2", "ChemicalA_t2", "ChemicalA_t1", "ChemicalA_t1", "ChemicalA_t1", "ChemicalA_t1", "ChemicalA_t1",
"control_t0", "control_t0", "control_t0", "control_t0", "control_t0", "ChemicalB_t2", "ChemicalB_t2", "ChemicalB_t2", "ChemicalB_t2", "ChemicalB_t2",
"ChemicalB_t1", "ChemicalB_t1", "ChemicalB_t1", "ChemicalB_t1", "ChemicalB_t1", "ChemicalC_t1", "ChemicalC_t1", "ChemicalC_t1", "ChemicalC_t1", "ChemicalC_t1",
"ChemicalC_t2", "ChemicalC_t2", "ChemicalC_t2", "ChemicalC_t2", "ChemicalC_t2")
)
print(metadata)
SampleID Treatment Sampling_time Batch
1 s24_1 ChemicalA t2 ChemicalA_t2
2 s24_2 ChemicalA t2 ChemicalA_t2
3 s24_3 ChemicalA t2 ChemicalA_t2
4 s24_4 ChemicalA t2 ChemicalA_t2
5 s24_5 ChemicalA t2 ChemicalA_t2
6 t8_1 ChemicalA t1 ChemicalA_t1
7 t8_2 ChemicalA t1 ChemicalA_t1
8 t8_3 ChemicalA t1 ChemicalA_t1
9 t8_4 ChemicalA t1 ChemicalA_t1
10 t8_5 ChemicalA t1 ChemicalA_t1
11 ct_1 control t0 control_t0
12 ct_2 control t0 control_t0
13 ct_3 control t0 control_t0
14 ct_4 control t0 control_t0
15 ct_5 control t0 control_t0
16 l24_1 ChemicalB t2 ChemicalB_t2
17 l24_2 ChemicalB t2 ChemicalB_t2
18 l24_3 ChemicalB t2 ChemicalB_t2
19 l24_4 ChemicalB t2 ChemicalB_t2
20 l24_5 ChemicalB t2 ChemicalB_t2
21 l8_1 ChemicalB t1 ChemicalB_t1
22 l8_2 ChemicalB t1 ChemicalB_t1
23 l8_3 ChemicalB t1 ChemicalB_t1
24 l8_4 ChemicalB t1 ChemicalB_t1
25 l8_5 ChemicalB t1 ChemicalB_t1
26 m36_1 ChemicalC t1 ChemicalC_t1
27 m36_2 ChemicalC t1 ChemicalC_t1
28 m36_3 ChemicalC t1 ChemicalC_t1
29 m36_4 ChemicalC t1 ChemicalC_t1
30 m36_5 ChemicalC t1 ChemicalC_t1
31 m96_1 ChemicalC t2 ChemicalC_t2
32 m96_2 ChemicalC t2 ChemicalC_t2
33 m96_3 ChemicalC t2 ChemicalC_t2
34 m96_4 ChemicalC t2 ChemicalC_t2
35 m96_5 ChemicalC t2 ChemicalC_t2
I here generate a random example of transcript quantifications (in case you want to reproduce it)
gene_ids <- c("gene1", "gene2",
"gene3", "gene4",
"gene5", "gene6")
# Define the column names (sample IDs)
sample_ids <- c("s24_1", "s24_2", "s24_3", "s24_4", "s24_5", "t8_1",
"t8_2", "t8_3", "t8_4", "t8_5", "ct_1", "ct_2",
"ct_3", "ct_4", "ct_5", "l24_1", "l24_2", "l24_3",
"l24_4", "l24_5", "l8_1", "l8_2", "l8_3",
"l8_4", "l8_5", "m36_1", "m36_2", "m36_3",
"m36_4", "m36_5", "m96_1", "m96_2", "m96_3",
"m96_4", "m96_5")
# Define the count data
count_data <- matrix(c(
2740, 3719, 291, 6835, 1034, 4715, 3458, 184, 5145, 3430, 3891,
3692, 3588, 3702, 3332, 732, 4425, 3472, 3400, 631, 3912, 3453, 0, 4086, 3496, 8007, 5292, 2529, 119, 2315, 136, 4335, 3547, 399, 253,
4007, 5023, 5287, 12819, 13091, 5632, 4418, 5985, 6209, 5283, 5091, 4670, 4807, 4519, 3999, 5270, 4731, 4960, 3367, 3864, 4916, 5865, 5199, 5395, 4669, 10639, 7296, 4979, 4140, 4178, 4486, 4622, 3470, 5771, 4234,
1834, 598, 1857, 3594, 6729, 88, 43, 33, 177, 53, 164, 74, 56, 23, 53, 48, 192, 100, 541, 6, 87, 136, 33, 68, 136, 1344, 1723, 991, 2025, 583, 5581, 986, 1461, 4816, 3542,
0, 179, 2407, 379, 2069, 225, 22, 4771, 46, 154, 242, 168, 64, 20, 155, 3628, 206, 68, 48, 3252, 18, 28, 4528, 32, 73, 244, 18, 149, 409, 0, 463, 204, 149, 744, 1247,
1075, 1554, 124, 2416, 1289, 1648, 1090, 435, 2545, 1355, 2323, 1668, 2538, 1203, 1766, 177, 2617, 1769, 1495, 113, 2206, 2419, 20, 1833, 2041, 4002, 3219, 703, 181, 1074, 246, 1711, 1249, 0, 100,
724, 1899, 29, 1002, 0, 2458, 1561, 40, 2435, 1721, 2054, 1938, 1907, 1936, 1951, 147, 2839, 1814, 1600, 70, 1444, 1621, 106, 2018, 1836, 2801, 1595, 719, 0, 889, 394, 1282, 1306, 0, 0
), nrow = 6, byrow = TRUE)
# Create the dataframe
quant <- data.frame(count_data, row.names = gene_ids)
colnames(quant) <- sample_ids
The main goal is to assess which condition mostly affect the insect and assess if any variations is recorded between different sampling times. Firstly I run:
dds <- DESeqDataSetFromMatrix(colData = metadata, countData = round(quant), design = ~ Batch )
dds <- DESeq(dds)
resultsNames(dds)
[1] "Intercept" "Batch_ChemicalA_t2_vs_ChemicalA_t1" "Batch_ChemicalB_t1_vs_ChemicalA_t1"
[4] "Batch_ChemicalB_t2_vs_ChemicalA_t1" "Batch_ChemicalC_t1_vs_ChemicalA_t1" "Batch_ChemicalC_t2_vs_ChemicalA_t1"
[7] "Batch_control_t0_vs_ChemicalA_t1"
but why only a small fraction of results are displayed? I would expect also other results (e.g. Batch_control_t0_vs_Chemical_B_t1)
Then I tried with:
dds <- DESeqDataSetFromMatrix(colData = metadata, countData = round(quant), design = ~ Batch + Sampling_time )
converting counts to integer mode
Errore in checkFullRank(modelMatrix) :
the model matrix is not full rank, so the model cannot be fit as specified.
One or more variables or interaction terms in the design formula are linear
combinations of the others and must be removed.
Please read the vignette section 'Model matrix not full rank':
vignette('DESeq2')
In aggiunta: Messaggio di avvertimento:
In DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
And, as far I can understand, DESeq raises and error because of the presence of a biased experimental design (controls are not enough represented with repect to the other treatments).
For these reasons, I would like to ask you how to handle these data. How would you set the design formula? Of course controls are insects not exposed to chemical treatments.
Thanks a lot, any information will be appreciated!
Claudio
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=it_IT.UTF-8 LC_NUMERIC=C LC_TIME=it_IT.UTF-8 LC_COLLATE=it_IT.UTF-8
[5] LC_MONETARY=it_IT.UTF-8 LC_MESSAGES=it_IT.UTF-8 LC_PAPER=it_IT.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=it_IT.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Rome
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] PoiClaClu_1.0.2.1 EnhancedVolcano_1.22.0 ggrepel_0.9.5
[4] ggVennDiagram_1.5.2 RColorBrewer_1.1-3 pheatmap_1.0.12
[7] reshape2_1.4.4 ggplot2_3.5.1 DESeq2_1.44.0
[10] SummarizedExperiment_1.34.0 Biobase_2.64.0 MatrixGenerics_1.16.0
[13] matrixStats_1.3.0 GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
[16] IRanges_2.38.0 S4Vectors_0.42.0 BiocGenerics_0.50.0
loaded via a namespace (and not attached):
[1] gtable_0.3.5 lattice_0.22-6 vctrs_0.6.5 tools_4.4.0
[5] generics_0.1.3 parallel_4.4.0 tibble_3.2.1 fansi_1.0.6
[9] pkgconfig_2.0.3 Matrix_1.6-5 SQUAREM_2021.1 lifecycle_1.0.4
[13] GenomeInfoDbData_1.2.12 truncnorm_1.0-9 farver_2.1.2 compiler_4.4.0
[17] stringr_1.5.1 munsell_0.5.1 codetools_0.2-20 pillar_1.9.0
[21] crayon_1.5.2 BiocParallel_1.38.0 DelayedArray_0.30.1 abind_1.4-5
[25] tidyselect_1.2.1 locfit_1.5-9.9 stringi_1.8.4 dplyr_1.1.4
[29] ashr_2.2-63 labeling_0.4.3 grid_4.4.0 colorspace_2.1-0
[33] cli_3.6.2 invgamma_1.1 SparseArray_1.4.5 magrittr_2.0.3
[37] S4Arrays_1.4.1 utf8_1.2.4 withr_3.0.0 scales_1.3.0
[41] UCSC.utils_1.0.0 XVector_0.44.0 httr_1.4.7 irlba_2.3.5.1
[45] rlang_1.1.3 mixsqp_0.3-54 Rcpp_1.0.12 glue_1.7.0
[49] rstudioapi_0.16.0 jsonlite_1.8.8 R6_2.5.1 plyr_1.8.9
[53] zlibbioc_1.50.0