Hi, I cannot make my dds structure ready for DESeq2 analysis. I have used quantMode GeneCounts from STAR mapping to create the *ReadsPerGene.out.tab files read into R at the start.
# List of file names and their corresponding shortened names
file_names <- c(
"star_3-IgM19neg-3c_S399_L002_R1_001AlignmentJob_ReadsPerGene.out.tab",
"star_4-IgM19pos-4b_S397_L002_R1_001AlignmentJob_ReadsPerGene.out.tab",
"star_1-unstain-1a_S385_L002_R1_001AlignmentJob_ReadsPerGene.out.tab")
shortened_names <- c("IgM19neg", "IgM19pos", "unstained_control")
# Create an empty list to store sample data frames
sample_data_list <- list()
# Loop through file names and read the count data
for (i in 1:length(file_names)) {
file_name <- file_names[i]
shortened_name <- shortened_names[i]
# Read the count data for the current sample with header
count_data <- read.table(file_name, header = TRUE, sep = "\t")
# Rename columns as needed
colnames(count_data) <- c("gene ID", "unstranded_RNA_seq", "R1", "R2")
# Add the sample name as a column (using the shortened name)
count_data$Sample <- shortened_name
# Append the sample data to the list
sample_data_list[[shortened_name]] <- count_data
}
# Access the data for each sample by name (e.g., sample_data_list[["IgM19neg"]])
# Combine the sample data frames into a single data frame
combined_data <- do.call(rbind, sample_data_list)
head(combined_data)
# Create a function to remove the top N rows from each sample
remove_top_n_rows <- function(dataframe, n) {
# Split the dataframe by "Sample" column
sample_splits <- split(dataframe, dataframe$Sample)
# Remove the top N rows from each sample
for (sample_name in names(sample_splits)) {
sample_df <- sample_splits[[sample_name]]
sample_splits[[sample_name]] <- sample_df[-(1:n), ]
}
# Combine the modified samples back into a single dataframe
modified_data <- do.call(rbind, sample_splits)
return(modified_data)
}
# Remove the top 3 rows from each sample
combined_data <- remove_top_n_rows(combined_data, 3)
# Change row names to sequential integers
rownames(combined_data) <- seq(nrow(combined_data))
head(combined_data)
gene ID unstranded_RNA_seq R1 R2 Sample
1 LOC115544811 10 7 3 IgM19neg
2 LOC115544869 0 0 0 IgM19neg
3 LOC115536824 2 1 1 IgM19neg
4 LOC115536896 0 0 0 IgM19neg
5 LOC115539754 0 0 0 IgM19neg
6 LOC115539762 0 0 0 IgM19neg
# Transpose the data and use 'unstranded_RNA_seq' as gene counts
transposed_data <- data.frame(gene_id = combined_data$`gene ID`,
IgM19neg = combined_data$unstranded_RNA_seq[combined_data$Sample == "IgM19neg"],
IgM19pos = combined_data$unstranded_RNA_seq[combined_data$Sample == "IgM19pos"],
unstained_control = combined_data$unstranded_RNA_seq[combined_data$Sample == "unstained_control"])
head(transposed_data)
gene_id IgM19neg IgM19pos unstained_control
1 LOC115544811 10 0 0
2 LOC115544869 0 0 0
3 LOC115536824 2 1 0
4 LOC115536896 0 0 0
5 LOC115539754 0 0 0
6 LOC115539762 0 0 0
# Create a DESeqDataSet object from your transposed_data
dds <- DESeqDataSetFromMatrix(
countData = transposed_data[, -1], # Remove the first column (gene_id)
colData = transposed_data[, 1, drop = FALSE], # Keep all columns for colData
design = ~Sample # Design formula specifying your samples
)
Error in DESeqDataSetFromMatrix(countData = transposed_data[, -1], colData = transposed_data[, : ncol(countData) == nrow(colData) is not TRUE
sessionInfo( ) R version 4.1.1 (2021-08-10) Platform: x86_64-apple-darwin17.0 (64-bit) Running under: macOS 13.5.1
Matrix products: default LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages: [1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] edgeR_3.34.1 limma_3.48.3 DESeq2_1.32.0 SummarizedExperiment_1.22.0 Biobase_2.52.0 MatrixGenerics_1.4.3
[7] matrixStats_0.61.0 GenomicRanges_1.44.0 GenomeInfoDb_1.28.4 IRanges_2.26.0 S4Vectors_0.30.2 BiocGenerics_0.38.0
[13] RColorBrewer_1.1-2 umap_0.2.7.0 forcats_0.5.1 stringr_1.4.0 purrr_0.3.4 readr_2.1.0
[19] tidyr_1.1.4 tibble_3.1.6 tidyverse_1.3.1 splitstackshape_1.4.8 plyr_1.8.6 cowplot_1.1.1
[25] ggplot2_3.3.5 magrittr_2.0.1 Matrix_1.3-4 SeuratObject_4.0.2 Seurat_4.0.5 dplyr_1.0.7
loaded via a namespace (and not attached):
[1] readxl_1.3.1 backports_1.4.0 igraph_1.2.8 lazyeval_0.2.2 splines_4.1.1 BiocParallel_1.26.2 listenv_0.8.0
[8] scattermore_0.7 digest_0.6.28 htmltools_0.5.2 fansi_0.5.0 memoise_2.0.0 tensor_1.5 cluster_2.1.2
[15] ROCR_1.0-11 tzdb_0.2.0 Biostrings_2.60.2 annotate_1.70.0 globals_0.14.0 modelr_0.1.8 askpass_1.1
[22] spatstat.sparse_2.0-0 colorspace_2.0-2 blob_1.2.2 rvest_1.0.2 ggrepel_0.9.1 haven_2.4.3 xfun_0.28
[29] crayon_1.4.2 RCurl_1.98-1.5 jsonlite_1.7.2 genefilter_1.74.1 spatstat.data_2.1-0 survival_3.2-13 zoo_1.8-9
[36] glue_1.5.0 polyclip_1.10-0 gtable_0.3.0 zlibbioc_1.38.0 XVector_0.32.0 leiden_0.3.9 DelayedArray_0.18.0
[43] future.apply_1.8.1 abind_1.4-5 scales_1.1.1 DBI_1.1.1 miniUI_0.1.1.1 Rcpp_1.0.7 viridisLite_0.4.0
[50] xtable_1.8-4 reticulate_1.22 spatstat.core_2.3-1 bit_4.0.4 htmlwidgets_1.5.4 httr_1.4.2 ellipsis_0.3.2
[57] ica_1.0-2 XML_3.99-0.8 pkgconfig_2.0.3 uwot_0.1.10 dbplyr_2.1.1 deldir_1.0-6 locfit_1.5-9.4
[64] utf8_1.2.2 AnnotationDbi_1.54.1 tidyselect_1.1.1 rlang_0.4.12 reshape2_1.4.4 later_1.3.0 cachem_1.0.6
[71] munsell_0.5.0 cellranger_1.1.0 tools_4.1.1 cli_3.1.0 RSQLite_2.2.8 generics_0.1.1 broom_0.7.10
[78] ggridges_0.5.3 evaluate_0.14 fastmap_1.1.0 yaml_2.2.1 goftest_1.2-3 bit64_4.0.5 knitr_1.36
[85] fs_1.5.0 fitdistrplus_1.1-6 RANN_2.6.1 KEGGREST_1.32.0 pbapply_1.5-0 future_1.23.0 nlme_3.1-153
[92] mime_0.12 xml2_1.3.2 compiler_4.1.1 rstudioapi_0.13 plotly_4.10.0 png_0.1-7 spatstat.utils_2.2-0
[99] reprex_2.0.1 geneplotter_1.70.0 stringi_1.7.5 RSpectra_0.16-0 lattice_0.20-45 vctrs_0.3.8 pillar_1.6.4
[106] lifecycle_1.0.1 spatstat.geom_2.3-0 lmtest_0.9-39 RcppAnnoy_0.0.19 data.table_1.14.2 bitops_1.0-7 irlba_2.3.3
[113] httpuv_1.6.3 patchwork_1.1.1 R6_2.5.1 promises_1.2.0.1 KernSmooth_2.23-20 gridExtra_2.3 parallelly_1.29.0
[120] codetools_0.2-18 MASS_7.3-54 assertthat_0.2.1 openssl_1.4.5 withr_2.4.2 sctransform_0.3.2 GenomeInfoDbData_1.2.6
[127] mgcv_1.8-38 hms_1.1.1 grid_4.1.1 rpart_4.1-15 rmarkdown_2.11 Rtsne_0.15 shiny_1.7.1
[134] lubridate_1.8.0
dim(transposed_data)
[1] 97344 4