Hello,
I was using BSgenome.Mfascicularis.NCBI.6.0 genome for the Signac multiomics tutorial. It was working fine until the I was trying to change the name of the chromosomes to match the GTF I have (macFas6-hg38-gencode.v47.basic.sortedgtf).
This code worked before
mf_genome <- BSgenome.Mfascicularis.NCBI.6.0
# see 1st chromosomes
head(seqlengths(mf_genome))
# see how many base pairs are in chromosome 1
mf_genome[["1"]]
# chromosome 1 had 223606306 base pairs.
# if you look at the chromsosomes table on bottom of page, chromsome 1 is 'CM021939.1' and it has 223606306 base pairs (no longer working???)
# also lists the gc content % for each chromsome
But then when I tried to change the chromosome names...
# # Store mapping in R as vector
# chr_map <- c(
# "1" = "CM021939.1", "2" = "CM021940.1", "3" = "CM021941.1",
# "4" = "CM021942.1", "5" = "CM021943.1", "6" = "CM021944.1",
# "7" = "CM021945.1", "8" = "CM021946.1", "9" = "CM021947.1",
# "10" = "CM021948.1", "11" = "CM021949.1", "12" = "CM021950.1",
# "13" = "CM021951.1", "14" = "CM021952.1", "15" = "CM021953.1",
# "16" = "CM021954.1", "17" = "CM021955.1", "18" = "CM021956.1",
# "19" = "CM021957.1", "20" = "CM021958.1", "X" = "CM021959.1"
# )
#
#
# # Get the original chromosome names
# original_chr_names <- seqnames(BSgenome.Mfascicularis.NCBI.6.0)
# original_chr_names
# # [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "X"
#
#
#
# # Check which names need to be changed (need to remove NA's from here --- rest of the scaffolds that aren't chromosome 1-20 or X)
# new_chr_names <- chr_map[original_chr_names]
# new_chr_names
#
# # 1 2 3 4 5 6 7 8 9 10 11 12 13
# # "CM021939.1" "CM021940.1" "CM021941.1" "CM021942.1" "CM021943.1" "CM021944.1" "CM021945.1" "CM021946.1" "CM021947.1" "CM021948.1" "CM021949.1" "CM021950.1" "CM021951.1"
# # 14 15 16 17 18 19 20 X
# # "CM021952.1" "CM021953.1" "CM021954.1" "CM021955.1" "CM021956.1" "CM021957.1" "CM021958.1" "CM021959.1"
#
#
#
#
# # Keep only non-NA values ( chrom 1-20 & X)
# # new_chr_names <- new_chr_names[!is.na(new_chr_names)]
#
#
# # # Remove NA values to ensure lengths match
# # valid_indices <- !is.na(new_chr_names)
# # original_chr_names <- original_chr_names[valid_indices]
# # new_chr_names <- new_chr_names[valid_indices]
# #
# # # Confirm lengths match
# # length(original_chr_names) == length(new_chr_names)
#
#
# # Assign the new chromosome names
# names(BSgenome.Mfascicularis.NCBI.6.0@seqinfo) <- new_chr_names
#
# # Error in `seqnames<-`(x, value) :
# # length of supplied 'seqnames' vector must equal the number of sequences
#
#
#
#
# # Verify the update
# seqnames(BSgenome.Mfascicularis.NCBI.6.0)
1 2 3 4
NA NA NA NA
5 6 7 8
NA NA NA NA
9 10 11 12
NA NA NA NA
13 14 15 16
NA NA NA NA
17 18 19 20
NA NA NA NA
X Super-Scaffold_100058 Super-Scaffold_100060 Super-Scaffold_100064
NA NA NA NA
Super-Scaffold_100065 Super-Scaffold_100066 Super-Scaffold_100068 Super-Scaffold_100069
NA NA NA NA
Super-Scaffold_100072 Super-Scaffold_100073 Super-Scaffold_100078 Super-Scaffold_100080
NA NA NA NA
It removed all the other metadata. I've tried uninstalling and reinstalling BSgenome.Mfascicularis.NCBI.6.0 & BSgenome but it doesn't recover the seqlengths or the other metadata that was originally there.
remove.packages("BSgenome.Mfascicularis.NCBI.6.0")
remove.packages("BSgenome")
BiocManager::install("BSgenome")
BiocManager::install("BSgenome.Mfascicularis.NCBI.6.0")
print(names(BSgenome.Mfascicularis.NCBI.6.0@seqinfo))
[1] "CM021939.1" "CM021940.1" "CM021941.1" "CM021942.1" "CM021943.1" "CM021944.1" "CM021945.1" "CM021946.1" "CM021947.1" "CM021948.1" "CM021949.1" "CM021950.1" "CM021951.1"
[14] "CM021952.1" "CM021953.1" "CM021954.1" "CM021955.1" "CM021956.1" "CM021957.1" "CM021958.1" "CM021959.1"
I tried this too linked here. The metadata that was originally in BSgenome.Mfascicularis.NCBI.6.0 is no longer there and I can't restore it? I succesfully changed the chromosome names I needed and created a subsetted BS genome object called new_genome
but again I no longer have seqlength info, etc
# hack from https://support.bioconductor.org/p/83588/ to keep only some chromosomes within a BSgenome object.
keepBSgenomeSequences <- function(genome, seqnames)
{
stopifnot(all(seqnames %in% seqnames(genome)))
genome@user_seqnames <- setNames(seqnames, seqnames)
genome@seqinfo <- genome@seqinfo[seqnames]
genome
}
# load macfas6 NCBI BSgenome object
library(BSgenome.Mfascicularis.NCBI.6.0)
# assign to R object called genome
genome <- BSgenome.Mfascicularis.NCBI.6.0
# check if seqlengths info is in original genome
head(seqlengths(genome))
# 1 2 3 4 5 6
# NA NA NA NA NA NA
# see how many base pairs are in chromosome 1
genome[["1"]]
# Error in if (length(ans) != seqlengths(x)[[user_seqname]]) { :
# missing value where TRUE/FALSE needed
# # Store mapping in R as vector
chr_map <- c(
"1" = "CM021939.1", "2" = "CM021940.1", "3" = "CM021941.1",
"4" = "CM021942.1", "5" = "CM021943.1", "6" = "CM021944.1",
"7" = "CM021945.1", "8" = "CM021946.1", "9" = "CM021947.1",
"10" = "CM021948.1", "11" = "CM021949.1", "12" = "CM021950.1",
"13" = "CM021951.1", "14" = "CM021952.1", "15" = "CM021953.1",
"16" = "CM021954.1", "17" = "CM021955.1", "18" = "CM021956.1",
"19" = "CM021957.1", "20" = "CM021958.1", "X" = "CM021959.1"
)
# create new R object called gew_genome with only chromosomes 1-20 & X
# this is the new BSgenome object
new_genome <- keepBSgenomeSequences(genome, names(chr_map))
seqnames(new_genome) <- chr_map
# print out
genome
# | BSgenome object for Crab-eating macaque
# | - organism: Macaca fascicularis
# | - provider: NCBI
# | - genome: Macaca_fascicularis_6.0
# | - release date: 2020/03/10
# | - 936 sequence(s):
# | 1 2 3 4
# | 5 6 7 8
# | 9 10 11 12
# | 13 14 15 16
# | 17 18 19 20
# | ... ... ... ...
# | tig00001423_obj tig00001424_obj tig00001425_obj tig00001428_obj
# | tig00001430_obj tig00001431_obj tig00001432_obj tig00001433_obj
# | tig00001434_obj tig00001435_obj tig00001436_obj tig00001437_obj
# | tig00001438_obj tig00001440_obj tig00001441_obj tig00001442_obj
# | tig00001443_obj tig00001444_obj tig00001445_obj tig00001446_obj
# |
# | Tips: call 'seqnames()' on the object to get all the sequence names, call 'seqinfo()' to get the full sequence info, use the '$' or '[[' operator to access a given sequence, see
# | '?BSgenome' for more information.
new_genome
# | BSgenome object for Crab-eating macaque
# | - organism: Macaca fascicularis
# | - provider: NCBI
# | - genome: Macaca_fascicularis_6.0
# | - release date: 2020/03/10
# | - 21 sequence(s):
# | CM021939.1 CM021940.1 CM021941.1 CM021942.1 CM021943.1 CM021944.1 CM021945.1 CM021946.1 CM021947.1 CM021948.1 CM021949.1 CM021950.1 CM021951.1 CM021952.1 CM021953.1 CM021954.1
# | CM021955.1 CM021956.1 CM021957.1 CM021958.1 CM021959.1
# |
# | Tips: call 'seqnames()' on the object to get all the sequence names, call 'seqinfo()' to get the full sequence info, use the '$' or '[[' operator to access a given sequence, see
# | '?BSgenome' for more information.
chr_map
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14
# "CM021939.1" "CM021940.1" "CM021941.1" "CM021942.1" "CM021943.1" "CM021944.1" "CM021945.1" "CM021946.1" "CM021947.1" "CM021948.1" "CM021949.1" "CM021950.1" "CM021951.1" "CM021952.1"
# 15 16 17 18 19 20 X
# "CM021953.1" "CM021954.1" "CM021955.1" "CM021956.1" "CM021957.1" "CM021958.1" "CM021959.1"
seqinfo(new_genome)
# Seqinfo object with 21 sequences from an unspecified genome; no seqlengths:
# seqnames seqlengths isCircular genome
# CM021939.1 <NA> <NA> <NA>
# CM021940.1 <NA> <NA> <NA>
# CM021941.1 <NA> <NA> <NA>
# CM021942.1 <NA> <NA> <NA>
# CM021943.1 <NA> <NA> <NA>
# ... ... ... ...
# CM021955.1 <NA> <NA> <NA>
# CM021956.1 <NA> <NA> <NA>
# CM021957.1 <NA> <NA> <NA>
# CM021958.1 <NA> <NA> <NA>
# CM021959.1 <NA> <NA> <NA>
# see 1st chromosomes
head(seqlengths(new_genome))
# CM021939.1 CM021940.1 CM021941.1 CM021942.1 CM021943.1 CM021944.1
# NA NA NA NA NA NA
# see how many base pairs are in chromosome 1
new_genome[["CM021939.1"]]
# Error in if (length(ans) != seqlengths(x)[[user_seqname]]) { :
# missing value where TRUE/FALSE needed
I tried manually adding the sequence length info in the seqlengths column of the new_genome
BS genome object using chromosomes info I retrieved from NCBI for Macaca_fascicularis_6.0 here, but I still wan't able to add tha to the BS genome object.
# Read the TSV file (assuming tab-separated and columns: seqname, length, gc_content or gc_count)
seq_info <- read.table("~/macaque_multiomics/macfas6_ncbi_sequence_report_chromosomes.tsv", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Check column names to ensure correct assignment
head(seq_info)
# Print column names
print(colnames(seq_info))
[1] "Assembly.Accession" "Assembly.unit.accession" "Chromosome.name" "GC.Count" "GC.Percent" "GenBank.seq.accession"
[7] "Molecule.type" "Ordering" "RefSeq.seq.accession" "Role" "Seq.length" "UCSC.style.name"
[13] "Unlocalized.Count" "Sequence.name"
# Extract chromosome names from BSgenome object
bs_chroms <- seqnames(new_genome)
# Subset seq_info to only include relevant rows
seq_info_subset <- seq_info[seq_info$GenBank.seq.accession %in% bs_chroms, ]
# Check structure
str(seq_info_subset)
head(seq_info_subset[, c("GenBank.seq.accession", "Seq.length")])
#tried assigning sequence lenghts to new_genome
# Convert column types
seq_info_subset$GenBank.seq.accession <- as.character(seq_info_subset$GenBank.seq.accession)
seq_info_subset$Seq.length <- as.numeric(seq_info_subset$Seq.length)
# Create a named vector for sequence lengths
seq_lengths <- setNames(seq_info_subset$Seq.length, seq_info_subset$GenBank.seq.accession)
# Assign sequence lengths to BSgenome object
seqlengths(new_genome) <- seq_lengths
# Error in .check_new2old_and_new_seqinfo(new2old, value, seqinfo(x), context) :
# seqlengths() and isCircular() of the supplied 'seqinfo' must be identical to seqlengths() and isCircular() of the current 'seqinfo' when replacing the 'seqinfo' of
# a BSgenome object
I kept getting the same error when trying many different ways to add the sequence lengths to the BS genome object. Is it true that once a BSgenome object is created, you cannot directly modify the seqinfo slot, including the seqlengths? I don't know why the original BS genome object BSgenome.Mfascicularis.NCBI.6.0 doesn't have any metadata besides chromosome names even after unloading & reloading object. I've also tried uninstalling and reinstalling BSgenome.Mfascicularis.NCBI.6.0 and the BSgenome R packages but no luck.