Entering edit mode
Hi
While implementing the fitbiasmodel, I end up with the following error. Trying to figure out the reason by running the code step by step but still unable to come to a conclusion.
Error in alpine::fitBiasModels(genes = ebtFit, bam.file = bf, fragtypes = fragtypes, :
offset needs to be finite
Code: This R script is part of the JCC tool https://github.com/csoneson/jcc/blob/master/R/fitAlpineBiasModel.R
fitAlpineBiasModel <- function(gtf, bam, organism, genome, genomeVersion,
version, minLength = 600, maxLength = 7000,
minCount = 500, maxCount = 10000,
subsample = TRUE, nbrSubsample = 200,
seed = 1, minSize = NULL,
maxSize = NULL, verbose = FALSE) {
## Define the temporary directory where the ensDb object will be saved
tmpdir <- tempdir()
## Create TxDb
if (verbose) message("Creating TxDb object...")
txdb <-
tryCatch({
ensembldb::ensDbFromGtf(gtf, path = tmpdir, organism = organism,
genomeVersion = genomeVersion, version = version)
ensembldb::EnsDb(paste0(tmpdir, "/", gsub("gtf", "sqlite",
basename(gtf))))
}, error = function(e) {
GenomicFeatures::makeTxDbFromGFF(gtf, format = "gtf",
organism = gsub("_", " ", organism))
})
## Get list of transcripts
if (verbose) message("Getting list of transcripts...")
if (class(txdb) == "EnsDb") {
## data frame format
txdf <- GenomicFeatures::transcripts(txdb, return.type = "DataFrame")
## GRanges format
txps <- GenomicFeatures::transcripts(txdb)
} else {
txps <- GenomicFeatures::transcripts(txdb,
columns = c("tx_name", "gene_id"),
use.names = TRUE)
txps$gene_id <- unlist(txps$gene_id)
txdf <- S4Vectors::DataFrame(tx_id = as.character(txps$tx_name),
gene_id = as.character(unlist(txps$gene_id)),
tx_seq_start = as.integer(start(txps)),
tx_seq_end = as.integer(end(txps)),
tx_name = as.character(txps$tx_name))
}
## Get list of exons by transcript
if (verbose) message("Generating list of exons by transcript...")
if (class(txdb) == "EnsDb") {
ebt0 <- ensembldb::exonsBy(txdb, by = "tx")
} else {
ebt0 <- GenomicFeatures::exonsBy(txdb, by = "tx", use.names = TRUE)
}
## Select genes with a single isoform
if (verbose) message("Selecting genes with a single isoform...")
tab <- table(txdf$gene_id)
oneIsoGenes <- names(tab)[tab == 1]
if (verbose) message(paste0(length(oneIsoGenes),
" single-isoform genes found."))
## Get transcript names for genes with a single isoform
oneIsoTxs <- txdf$tx_id[txdf$gene_id %in% oneIsoGenes]
## Extract transcripts from one-isoform genes for use in fitting bias model
ebtFit <- ebt0[oneIsoTxs]
ebtFit <- GenomeInfoDb::keepStandardChromosomes(ebtFit,
pruning.mode = "coarse")
## Filter short genes and long genes
if (verbose) message("Filtering out too short and too long transcripts...")
minBp <- minLength
maxBp <- maxLength
geneLengths <- sum(width(ebtFit))
ebtFit <- ebtFit[geneLengths > minBp & geneLengths < maxBp]
if (verbose) message(paste0(length(ebtFit), " transcripts remaining."))
## Read bam file
if (verbose) message("Reading bam file...")
bamFiles <- bam
names(bamFiles) <- "1"
## Subset to medium-to-high expressed genes
if (verbose) message("Subsetting to medium-to-highly expressed genes...")
txpsFit <- sort(txps[names(ebtFit)])
cts <- Rsamtools::countBam(Rsamtools::BamFile(bamFiles),
param = Rsamtools::ScanBamParam(which = txpsFit))
S4Vectors::mcols(txpsFit)$cts <- cts$records
txpsFit <- txpsFit[names(ebtFit)]
ebtFit <- ebtFit[txpsFit$cts > minCount & txpsFit$cts < maxCount]
if (verbose) message(paste0(length(ebtFit), " genes retained."))
## Subsample genes to use for the fitting of the bias model
if (subsample) {
if (verbose) message("Subsampling genes for fitting bias model...")
set.seed(seed)
ebtFit <- ebtFit[sample(length(ebtFit), min(nbrSubsample, length(ebtFit)))]
if (verbose) message(paste0(length(ebtFit), " genes retained."))
}
## Get fragment width and read length
message("Getting fragment widths and read lengths...")
m <- sort(which(vapply(ebtFit, length, numeric(1)) > 1))
m0 <- 0
w <- NULL
while(is.null(w)) {
w <- tryCatch({
m0 <- m0 + 1
alpine::getFragmentWidths(bamFiles, ebtFit[[m[m0]]])
}, error = function(e) {
NULL
})
}
quantiles <- stats::quantile(w, c(.025, .975))
if (is.null(minSize)) minSize <- round(quantiles[1])
if (is.null(maxSize)) maxSize <- round(quantiles[2])
readLength <- stats::median(alpine::getReadLength(bamFiles))
## Names of genes to retain
geneNames <-
names(geneNames) <- geneNames
## Build fragment types for selected genes
if (verbose) message("Building fragment types...")
fragtypes <- lapply(geneNames, function(geneName) {
alpine::buildFragtypes(exons = ebtFit[[geneName]],
genome = Oindica,
readlength = readLength,
minsize = minSize,
maxsize = 220,
gc.str = FALSE)
})
## Define models to fit
if (verbose) message("Fitting bias model...")
models <- list(
"all" = list(
formula = "count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) +
ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) + gene",
offset = c("fraglen", "vlmm")
)
)
## Fit bias model
fitpar <- lapply(bamFiles, function(bf) {
alpine::fitBiasModels(genes = ebtFit,
bam.file = bf,
fragtypes = fragtypes,
genome = Oindica,
models = models,
readlength = readLength,
minsize = minSize,
maxsize = 220)
})
list(biasModel = fitpar, exonsByTx = ebt0, transcripts = txps)
}
Thanks Nagesh
Hi Michael, Please check the code which I have included with in the question.
So the offsets are VLMM and fragment length, you can inspect fragtypes to see if there are values that would cause a problem in the relevant columns of that data.frame.