Hello,
I've run into an issue with the check_input() function and was not sure if it was a biological issue or a mistake I was making somewhere along the process.
I am attempting to visualize synteny across five completely assembled Klebsiella pneumoniae protein fastas and gffs pulled from RefSeq. I have followed the vignette and FAQ to the best of my knowledge however, since hypothetical genes had no names I opted for their WP accession ID instead. The error that occurs is that the sequence names do in my seq input do not match my gene names in annotation. From what I understand, the protein sequence file supplied for example GCF_000814805.1_ASM81480v1 contains 4891 sequences and it's gff contains 4993 ranges fitting the requirements of check_input().
Examples for syntenet are plant genomes, so I wonder if this issue is due to my application of the tool.
Any help or advice would be appreciated, Thanks!
library(tidyverse)
library(syntenet)
# Running Syntenet on a sample of 5 RefSeq Klebsiella pneumoniae genomes
setwd("/analysis1/R_environments/BARNARDS_project")
## Path to directory containing FASTA files
fasta_dir <- "./0_input/testing_syntenet/RefSeq/test_faa/"
dir(fasta_dir) # see the contents of the directory
#> [1] "GCF_000814805.1_ASM81480v1.faa" "GCF_001482345.1_ASM148234v1.faa"
#> [3] "GCF_003812105.1_ASM381210v1.faa" "GCF_007955015.2_ASM795501v2.faa"
#> [5] "GCF_009661535.2_ASM966153v2.faa"
# Read all FASTA files in `fasta_dir`
RefSeq_aastringsetlist <- fasta2AAStringSetlist(fasta_dir)
head(names(RefSeq_aastringsetlist$GCF_000814805.1_ASM81480v1))
#> [1] "WP_000004292.1 MULTISPECIES: H-NS histone family protein [Enterobacteriaceae]"
#> [2] "WP_000005710.1 MULTISPECIES: His-Xaa-Ser system radical SAM maturase HxsC [Enterobacteriaceae]"
#> [3] "WP_000005768.1 MULTISPECIES: zinc metallochaperone GTPase ZigA [Enterobacterales]"
#> [4] "WP_000014594.1 MULTISPECIES: RNA chaperone/antiterminator CspA [Bacteria]"
#> [5] "WP_000020322.1 MULTISPECIES: GTP cyclohydrolase FolE2 [Enterobacterales]"
#> [6] "WP_000039782.1 MULTISPECIES: hypothetical protein [Enterobacterales]"
# Remove whitespace and everything after it in protein fasta list
RefSeq_aastringsetlist <- lapply(RefSeq_aastringsetlist, function(x) {
names(x) <- gsub(" .*", "", names(x))
return(x)
})
# Taking a look at the new names
head(names(RefSeq_aastringsetlist$GCF_000814805.1_ASM81480v1))
#> [1] "WP_000004292.1" "WP_000005710.1" "WP_000005768.1" "WP_000014594.1"
#> [5] "WP_000020322.1" "WP_000039782.1"
## Path to directory containing GFF files
gff_dir <- "./0_input/testing_syntenet/RefSeq/test_gff/"
dir(gff_dir) # see the contents of the directory
#> [1] "GCF_000814805.1_ASM81480v1.gff" "GCF_001482345.1_ASM148234v1.gff"
#> [3] "GCF_003812105.1_ASM381210v1.gff" "GCF_007955015.2_ASM795501v2.gff"
#> [5] "GCF_009661535.2_ASM966153v2.gff"
# Read all GFF files in `fasta_dir`
RefSeq_grangeslist <- gff2GRangesList(gff_dir)
# RefSeq uses IDs for CDS, not for genes
# Check for the gff ranges greater than matching protein fasta
# head(RefSeq_grangeslist$GCF_000814805.1_ASM81480v1[RefSeq_grangeslist$GCF_000814805.1_ASM81480v1$type == "CDS"])
# in this case the gene name is in the Name column and protein IDs need to be collapsed to gene IDs
# Create a list of data frames containing protein-to-gene ID correspondences
protein2gene <- lapply(RefSeq_grangeslist, function(x) {
# Extract only CDS ranges
cds_ranges <- x[x$type == "CDS"]
# Create the ID correspondence data frame
df <- data.frame(
protein_id = cds_ranges$protein_id,
gene_id = cds_ranges$Name
)
# Remove duplicate rows
df <- df[!duplicated(df$protein_id), ]
return(df)
})
# Taking a look at the list
head(protein2gene$GCF_000814805.1_ASM81480v1)
#> protein_id gene_id
#> 1 WP_004145090.1 WP_004145090.1
#> 2 WP_004150293.1 WP_004150293.1
#> 3 WP_004173845.1 WP_004173845.1
#> 4 WP_004151530.1 WP_004151530.1
#> 5 WP_029884213.1 WP_029884213.1
#> 6 <NA> <NA>
# Collapse protein IDs to gene IDs in list of sequences
RefSeq_aastringsetlist_collapsed <- collapse_protein_ids(RefSeq_aastringsetlist, protein2gene)
# Looking at the new sequences
RefSeq_aastringsetlist_collapsed$GCF_000814805.1_ASM81480v1
#> AAStringSet object of length 4891:
#> width seq names
#> [1] 3206 MDNTSGDFPCNKMDTRKQLPLT...QIAKRALQIAKNTARSHTSLH WP_001518711.1
#> [2] 3163 MDNLRFSSAPTADSIDASIAQH...ACAQHITRWLCATSTQPENTL WP_039109377.1
#> [3] 2166 MTIHHAALARMLPAEKKEKLLR...QRDASRSRAQQRLVRRHQRQR WP_012131830.1
#> [4] 2154 MTYSESDIAIVGMNCRYPGVHS...AARENLALRRKRAQQGEKGDE WP_023316156.1
#> [5] 2035 MISGAPSQDSLLPDNRHAADYQ...GVATVEKTKRPRPVRRRQRRI WP_039109379.1
#> ... ... ...
#> [4887] 19 MIEHELGNWKDFIEGMLRK WP_100181659.1
#> [4888] 19 MDSRSPLFKGLFFCPGVRR WP_227650304.1
#> [4889] 16 MNRVQFKHHHHHHHPD WP_004899375.1
#> [4890] 15 MKLVPFFFAFFFTFP WP_100250063.1
#> [4891] 14 MNAAIFRFFFYFST WP_001386830.1
check_input(RefSeq_aastringsetlist_collapsed, RefSeq_grangeslist, gene_field = "Name")
#> Error in check_gene_names(seq, annotation, gene_field = gene_field): Sequence names in 'seq' do not match gene names in 'annotation' for:
#> 1. GCF_000814805.1_ASM81480v1
#> 2. GCF_001482345.1_ASM148234v1
#> 3. GCF_003812105.1_ASM381210v1
#> 4. GCF_007955015.2_ASM795501v2
#> 5. GCF_009661535.2_ASM966153v2
sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#>
#> locale:
#> [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
#> [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
#> [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/Vancouver
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] syntenet_1.2.4 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.0
#> [5] dplyr_1.1.3 purrr_1.0.2 readr_2.1.4 tidyr_1.3.0
#> [9] tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.2.0 Biostrings_2.68.1
#> [3] bitops_1.0-7 fastmap_1.1.1
#> [5] RCurl_1.98-1.12 GenomicAlignments_1.36.0
#> [7] reprex_2.0.2 XML_3.99-0.14
#> [9] digest_0.6.33 timechange_0.2.0
#> [11] lifecycle_1.0.3 magrittr_2.0.3
#> [13] compiler_4.3.1 rlang_1.1.1
#> [15] tools_4.3.1 igraph_1.5.1
#> [17] utf8_1.2.3 yaml_2.3.7
#> [19] rtracklayer_1.60.1 knitr_1.44
#> [21] S4Arrays_1.0.6 htmlwidgets_1.6.2
#> [23] DelayedArray_0.26.7 RColorBrewer_1.1-3
#> [25] abind_1.4-5 BiocParallel_1.34.2
#> [27] withr_2.5.1 BiocGenerics_0.46.0
#> [29] grid_4.3.1 stats4_4.3.1
#> [31] fansi_1.0.5 colorspace_2.1-0
#> [33] scales_1.2.1 SummarizedExperiment_1.30.2
#> [35] cli_3.6.1 rmarkdown_2.25
#> [37] crayon_1.5.2 generics_0.1.3
#> [39] rstudioapi_0.15.0 tzdb_0.4.0
#> [41] rjson_0.2.21 zlibbioc_1.46.0
#> [43] network_1.18.1 parallel_4.3.1
#> [45] XVector_0.40.0 restfulr_0.0.15
#> [47] matrixStats_1.0.0 vctrs_0.6.4
#> [49] Matrix_1.6-0 IRanges_2.34.1
#> [51] hms_1.1.3 S4Vectors_0.38.2
#> [53] intergraph_2.0-3 glue_1.6.2
#> [55] statnet.common_4.9.0 codetools_0.2-19
#> [57] stringi_1.7.12 gtable_0.3.4
#> [59] GenomeInfoDb_1.36.4 GenomicRanges_1.52.1
#> [61] BiocIO_1.10.0 munsell_0.5.0
#> [63] pillar_1.9.0 ggnetwork_0.5.12
#> [65] htmltools_0.5.6.1 GenomeInfoDbData_1.2.10
#> [67] R6_2.5.1 networkD3_0.4
#> [69] evaluate_0.22 lattice_0.21-8
#> [71] Biobase_2.60.0 Rsamtools_2.16.0
#> [73] pheatmap_1.0.12 Rcpp_1.0.11
#> [75] coda_0.19-4 xfun_0.40
#> [77] fs_1.6.3 MatrixGenerics_1.12.3
#> [79] pkgconfig_2.0.3