Hi,
I am using SGSeq and have a problem with the analyzefeatures function. Below, I have provided my code and my problems. Could you please kindly guide me?
library(SGSeq)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(GenomicRanges)
# Define the path to your BAM files
bam_path <- "the path of my bam files" (ps: I have 41 bam files)
# List the BAM files in the directory
bam_files <- list.files(path = bam_path, pattern = "\\.bam$", full.names = TRUE)
# Create a sample_info data frame with sample names and corresponding BAM file paths
si <- data.frame(sample_name = basename(bam_files), file_bam = bam_files)
# Get the BAM file information
si_complete <- getBamInfo(si)
Then, I used the following code:
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
seqlevelsStyle(txdb) <- "UCSC"
txf_ucsc <- convertToTxFeatures(txdb)
Warning messages:
1: In .set_group_names(grl, use.names, txdb, by) :
some group names are NAs or duplicated
2: In .set_group_names(grl, use.names, txdb, by) :
some group names are NAs or duplicated
3: In convertToTxFeatures(txdb) :
Removed 348 transcripts with duplicate names
And then, When I looked at (ps: I have 41 bam files)
seqlevels(BamFile(si_complete$file_bam[1]))
seqlevels(txf_ucsc)
I noticed they are not exactly the same. Which causes a problem when I want to use analyseFeatures
sgfc_ucsc <- analyzeFeatures(si_complete, features = txf_ucsc)
Process features...
Obtain counts...
Error in .io_bam(.scan_bamfile, file, reverseComplement, yieldSize(file), :
seqlevels(param) not in BAM header:
#(I couldn't type all of it with its warnings here)
So, from the other posts on the bioconductor I found to use the seqlevels and keepSeqlevels as follow:
seqlevels_in_bam <- seqlevels(BamFile(si_complete$file_bam[1]))
txf_ucsc2 <- keepSeqlevels(txf_ucsc, seqlevels_in_bam)
Error in keepSeqlevels(txf_ucsc, seqlevels_in_bam) :
invalid seqlevels: chrEBV
How to solve this issue?
I also used intersect function:
seqlevels_in_bam <- seqlevels(BamFile(si_complete$file_bam[1]))
seqlevels_in_bam <- intersect(seqlevels_in_bam, seqlevels(txf_ucsc))
But then when I am running analyzeFeatures function I have the error:
sgfc_ucsc <- analyzeFeatures(si_complete, features = seqlevels_in_bam)
sgfc_ucsc <- analyzeFeatures(si_complete, features = seqlevels_in_bam)
Error in analyzeFeatures(si_complete, features = seqlevels_in_bam) :
features must be a TxFeatures or SGFeatures object
if I go step back in this chunck:
seqlevels_in_bam <- seqlevels(BamFile(si_complete$file_bam[1]))
txf_ucsc2 <- keepSeqlevels(txf_ucsc, seqlevels_in_bam)
Error in keepSeqlevels(txf_ucsc, seqlevels_in_bam) :
invalid seqlevels: chrEBV
and remove chrEBV:
seqlevels_in_bam <- seqlevels(BamFile(si_complete$file_bam[1]))
seqlevels_in_bam <- seqlevels_in_bam[seqlevels_in_bam != "chrEBV"]
common_seqlevels <- intersect(seqlevels_in_bam, seqlevels(txf_ucsc))
txf_ucsc_filtered <- keepSeqlevels(txf_ucsc, common_seqlevels)
Error in GenomeInfoDb:::getDanglingSeqlevels(x, new2old = new2old, pruning.mode = pruning.mode, :
The following seqlevels are to be dropped but are currently in use (i.e. have ranges on them): chr1_GL383518v1_alt,
chr1_GL383519v1_alt, chr1_GL383520v2_alt, chr1_KI270759v1_alt, chr1_KI270760v1_alt, chr1_KI270761v1_alt, chr1_KI270762v1_alt,
chr1_KI270763v1_alt, chr1_KI270765v1_alt, chr1_KI270766v1_alt, chr1_KI270892v1_alt, chr2_GL383521v1_alt, chr2_GL383522v1_alt,
chr2_GL582966v2_alt, chr2_KI270767v1_alt, chr2_KI270768v1_alt, chr2_KI270769v1_alt, chr2_KI270770v1_alt, chr2_KI270772v1_alt,
chr2_KI270773v1_alt, chr2_KI270774v1_alt, chr2_KI270776v1_alt, chr2_KI270893v1_alt, chr2_KI270894v1_alt, chr3_GL383526v1_alt,
chr3_JH636055v2_alt, chr3_KI270777v1_alt, chr3_KI270779v1_alt, chr3_KI270780v1_alt, chr3_KI270781v1_alt, chr3_KI270782v1_alt,
chr3_KI270783v1_alt, chr3_KI270784v1_alt, chr3_KI270895v1_alt, chr3_KI270924v1_alt, chr3_KI270934v1_alt, chr3_KI270935v1_alt,
chr3_KI270936v1_alt, chr3_KI270937v1_alt, chr4_GL000257v2_alt, chr4_GL383527v1_alt, chr4_GL383528v1_alt,
May you please help me out? Many thanks in advance