I'm trying to follow a slight variant of the tutorial to get started from ~ page 9: https://www.bioconductor.org/packages/devel/bioc/vignettes/metagenomeSeq/inst/doc/metagenomeSeq.pdf
Essentially, I need to figure out how to go from: (A) my raw Otu counts; and (B) my "class" assignment (e.g. diseased, healthy) to get (C) differentially abundant Otus.
I'm starting with the built in dataset lungData
but I'm getting an error when I'm calculating the normalization factors.
I've made sure there are no empty features (Otus) or empty samples. I've documented my code heavily to serve as a tutorial in case anyone needs to learn how to use the pipeline.
Please help me figure out how to calculate the normalization factors
# Load packages
library(metagenomeSeq)
# Paths
path_datadirectory = system.file("extdata", package = "metagenomeSeq")
# Load in counts data
count_data = loadMeta(file.path(path_datadirectory, "CHK_NAME.otus.count.csv"))
dim(count_data$counts) # [1] 1000 78
names(count_data) # [1] "counts" "taxa"
class(count_data$counts) # [1] "data.frame"
# Load taxonomy data
df_taxonomy = read.delim(file.path(path_datadirectory, "CHK_otus.taxonomy.csv"), stringsAsFactors = FALSE)
dim(df_taxonomy) # [1] 1000 10
colnames(df_taxonomy)
# [1] "OTU" "Taxonomy" "superkingdom" "phylum" "class" "order" "family" "genus"
# [9] "species" "strain"
# Loading metadata
df_metadata = loadPhenoData(file.path(path_datadirectory, "CHK_clinical.csv"), tran = TRUE)
idx_samples_from_counts = colnames(count_data$counts)
idx_samples_from_metadata = rownames(df_metadata)
idx_samples = intersect(idx_samples_from_counts, idx_samples_from_metadata)
# Reorder
count_data$counts = count_data$counts[,idx_samples]
df_metadata = df_metadata[idx_samples,]
print(all.equal(rownames(df_metadata), colnames(count_data$counts))) # [1] TRUE
# Annotated DataFrames
Adf_metadata = AnnotatedDataFrame(df_metadata)
# An object of class 'AnnotatedDataFrame'
# rowNames: CHK_6467_E3B11_BRONCH2_PREWASH_V1V2 CHK_6467_E3B11_OW_V1V2 ... CHK_6467_E3B09_BAL_A_V1V2 (78 total)
# varLabels: SampleType SiteSampled SmokingStatus
# varMetadata: labelDescription
Adf_taxonomy = AnnotatedDataFrame(df_taxonomy)
# An object of class 'AnnotatedDataFrame'
# rowNames: 1 2 ... 1000 (1000 total)
# varLabels: OTU Taxonomy ... strain (10 total)
# varMetadata: labelDescription
obj = newMRexperiment(
count_data$counts,
phenoData=Adf_metadata,
featureData=Adf_taxonomy,
)
# MRexperiment (storageMode: environment)
# assayData: 1000 features, 78 samples
# element names: counts
# protocolData: none
# phenoData
# sampleNames: CHK_6467_E3B11_BRONCH2_PREWASH_V1V2 CHK_6467_E3B11_OW_V1V2 ... CHK_6467_E3B09_BAL_A_V1V2 (78 total)
# varLabels: SampleType SiteSampled SmokingStatus
# varMetadata: labelDescription
# featureData
# featureNames: 1 2 ... 1000 (1000 total)
# fvarLabels: OTU Taxonomy ... strain (10 total)
# fvarMetadata: labelDescription
# experimentData: use 'experimentData(object)'
# Annotation:
# Filter
idx_features_f = which(rowSums(obj) >= 10)
idx_samples_f = which(colSums(obj) > 0)
obj_f = obj[idx_features_f, idx_samples_f]
# MRexperiment (storageMode: environment)
# assayData: 12 features, 57 samples
# element names: counts
# protocolData: none
# phenoData
# sampleNames: CHK_6467_E3B11_OW_V1V2 CHK_6467_E3B08_OW_V1V2 ... CHK_6467_E3B09_BAL_A_V1V2 (57 total)
# varLabels: SampleType SiteSampled SmokingStatus
# varMetadata: labelDescription
# featureData
# featureNames: 77 79 ... 978 (12 total)
# fvarLabels: OTU Taxonomy ... strain (10 total)
# fvarMetadata: labelDescription
# experimentData: use 'experimentData(object)'
# Annotation:
# Dimensionality
n_samples = ncol(obj_f)
n_features = nrow(obj_f)
library_size = libSize(obj_f) # `libSize(obj)`` is the same as `colSums(count_data$counts)``
# Normalization
p = cumNormStatFast(obj_f)
# Error
Error in cumNormStatFast(obj_f) :
Warning sample with one or zero features
It's breaking here: `if (any(sapply(smat, length) == 1))
stop("Warning sample with one or zero features")`