Entering edit mode
I am trying to run MAST on a subset of single cell types, but I get the following error: Error: grouping factors must have > 1 sampled level
Grouping factors are:
group ngeneson replicate
neuroblastoma.Bridge -0.791254569558456 jansky
neuroblastoma.Bridge -2.1203581896722 jansky
neuroblastoma.Bridge -0.865093659564775 jansky
neuroblastoma.Bridge -0.748655094554811 jansky
neuroblastoma.Bridge -2.01953943216357 jansky
neuroblastoma.Bridge -1.2825685146005 jansky
neuroblastoma.Bridge -1.00141197957644 jansky
neuroblastoma.Bridge -1.70998324713708 jansky
neuroblastoma.Bridge -1.71850314213781 jansky
neuroblastoma.Bridge -0.769954832056634 jansky
neuroblastoma.Bridge -2.02521936216406 jansky
neuroblastoma.Bridge -0.911953082068786 jansky
neuroblastoma.Bridge -0.781314692057606 jansky
neuroblastoma.Bridge -0.792674552058578 jansky
neuroblastoma.Bridge -0.805454394559671 jansky
neuroblastoma.Bridge 0.309231868035723 jansky
neuroblastoma.Bridge -2.18141743717743 jansky
neuroblastoma.Bridge 0.381650975541921 jansky
neuroblastoma.Bridge -1.16612994959054 jansky
neuroblastoma.Bridge -1.04827140208045 jansky
neuroblastoma.Bridge 3.0455381457699 jansky
neuroblastoma.Bridge -1.50408578461946 jansky
neuroblastoma.Bridge -0.585357107040836 jansky
neuroblastoma.Bridge -0.588197072041079 jansky
et. cetera, with more than 1 level for each.
find_de_MAST_RE <- function(adata_){
# create a MAST object
sca <- SceToSingleCellAssay(adata_, class = "SingleCellAssay")
print("Dimensions before subsetting:")
print(dim(sca))
print("")
# keep genes that are expressed in more than 10% of all cells
sca <- sca[freq(sca)>0.1,]
print("Dimensions after subsetting:")
print(dim(sca))
print("")
# add a column to the data which contains scaled number of genes that are expressed in each cell
cdr2 <- colSums(assay(sca)>0)
colData(sca)$ngeneson <- scale(cdr2)
# store the columns that we are interested in as factors
label <- factor(colData(sca)$condition)
# set the reference level
label <- relevel(label,"control")
colData(sca)$label <- label
#celltype <- factor(colData(sca)$auto_manual_merged_annotation)
#colData(sca)$celltype <- celltype
# same for donors (which we need to model random effects)
replicate <- factor(colData(sca)$source)
colData(sca)$replicate <- replicate
# create a group per condition-celltype combination
colData(sca)$group <- paste0(colData(adata_)$condition, ".", colData(adata_)$auto_manual_merged_annotation)
colData(sca)$group <- factor(colData(sca)$group)
#selected_cols <- c("group", "ngeneson", "replicate")
#selected_data <- colData(sca)[, selected_cols]
#print(selected_data)
#write.csv(selected_data, file = "/project/data/nb24/Control_Dataset_Analysis/test.csv", row.names = FALSE)
print(levels(colData(sca)$label))
print(levels(colData(sca)$replicate))
print(levels(colData(sca)$ngeneson))
print(levels(colData(sca)$group))
#print(table(colData(sca)$group, colData(sca)$ngeneson))
# define and fit the model
zlmCond <- zlm(formula = ~ngeneson + group + (1 | replicate),
sca=sca,
method='glmer',
ebayes=F,
strictConvergence=F,
fitArgsD=list(nAGQ = 0)) # to speed up calculations
print("done")
# perform likelihood-ratio test for the condition that we are interested in
summaryCond <- summary(zlmCond, doLRT='neuroblastoma.Bridge')
# get the table with log-fold changes and p-values
summaryDt <- summaryCond$datatable
result <- merge(summaryDt[contrast=='neuroblastoma.Bridge' & component=='H',.(primerid, `Pr(>Chisq)`)], # p-values
summaryDt[contrast=='neuroblastoma.Bridge' & component=='logFC', .(primerid, coef)],
by='primerid') # logFC coefficients
# MAST uses natural logarithm so we convert the coefficients to log2 base to be comparable to edgeR
result[,coef:=result[,coef]/log(2)]
# do multiple testing correction
result[,FDR:=p.adjust(`Pr(>Chisq)`, 'fdr')]
result = result[result$FDR<0.01,, drop=F]
result <- stats::na.omit(as.data.frame(result))
return(result)
}