Compress the GFF3 Exportar e compactar

Question

Reading GFF file and pre-processing, run infer_syntenet(), input and output

0

Entering edit mode

lwanderson8c • 0

@64e84694

Last seen 10 months ago

Brazil

Enter the body of text here

As input I used .gff3.gz annotation files and .fa.gz protein files, e.g. "Cillinoinensiselliott_562_v1.1.gene.gff3.gz" (Annotation, Gene, Transcript, 7.2 MB, 2 FEB 2020)
and "Cillinoinensiselliott_562_v1.1.protein_primaryTranscriptOnly.fa.gz" (Annotation, Protein, 8.3 MB, 12 FEB 2020): 6 genomes, 12 files in total. The first parameters are correct, but infer_syntenet() does not load anything: it returns NULL and the readers are not lining up, although blast_list remains OK. I also tried different patterns in infer_syntenet(). Code should be placed in three backticks as shown below


# include your problematic code here with any corresponding output 
# Read (decompress) the .fa.gz and .gff3.gz files for one species

fastsq2<-read.FASTA('extdata/sequences/Vunguiculata_469.fa.gz')

genes1<-rtracklayer::import('extdata/anotation/Vunguiculata_469.gff3.gz')


#### edit the names in the GFF3 using the names from the FASTA

fasta_names<-names(fastsq2)

gene_names <- genes1$Name

# Overwrite each entry of the GFF3 `Name` column with the FASTA name found at
# the same index.
# NOTE(review): the replacement is purely positional. It is only meaningful if
# both objects hold the same genes, in the same order, with the same length;
# where `fasta_names` is shorter, the remaining entries become NA.
# NOTE(review): per the maintainer's reply in this thread, syntenet matches on
# `gene_id` and only falls back to `Name` -- confirm which column is present.
for (i in 1:length(gene_names)) {
  gene_names[i] <- fasta_names[i]
}

genes1$Name<-gene_names


# Export the edited GFF3 and compress it

export.gff3(genes1,'extdata/sequences/Vunguiculata_469.gff3')


# NOTE(review): the compressed file is written to 'extdata/black', not to the
# 'extdata/anotation' directory the original file was imported from -- confirm
# which directory the downstream steps actually read.
gzip('extdata/sequences/Vunguiculata_469.gff3', 
     destname = 'extdata/black/Vunguiculata_469.gff3.gz')


## input of the sequences

# Directory holding the protein FASTA files
fasta_dir<-file.path("extdata", "sequences")

# Build the list of proteomes from the files in `fasta_dir`, then print it
proteomes1<-fasta2AAStringSetlist(fasta_dir)
proteomes1

# Show which files are actually present in the sequence directory
dir(fasta_dir)

## input of the annotation


# Directory holding the GFF3 annotation files
gff_dir<-file.path("extdata", "anotation")

# Build the list of annotations from the files in `gff_dir`, then print it
annotation1<-gff2GRangesList(gff_dir)
annotation1

# Verify that the two lists are consistent with each other; this printed TRUE
# in the session shown below and raises an error when names do not match.
check_input(proteomes1, annotation1)

#### data

# Pre-process the inputs for syntenet. The result is a list whose `seq` and
# `annotation` elements are used by every step below.
pdata1 <- process_input(proteomes1, annotation1)

# `attach(pdata1)` was removed: every later step accesses `pdata1$...`
# explicitly, so attaching only placed objects named `seq` and `annotation`
# on the search path without being used.

# First six processed proteomes, taken from `pdata1` instead of running
# process_input() a second time on the same inputs.
seq_2 <- pdata1$seq[1:6]

# Stop with a clear message when DIAMOND is unavailable. Previously the search
# was silently skipped and the script failed later because `blast_po_an` had
# never been created.
if (!diamond_is_installed()) {
  stop("DIAMOND is not installed; cannot run run_diamond().", call. = FALSE)
}
blast_po_an <- run_diamond(seq = pdata1$seq)


# List names (one element per "<species>_<species>" comparison)

names(blast_po_an)

head(blast_po_an$Cdentata_673_Cdentata_673)

# detect synteny and infer the synteny network

annotation_pd <- pdata1$annotation

rede <- infer_syntenet(
  blast_list = blast_po_an,
  annotation = pdata1$annotation,
  verbose = TRUE
)

# Print the inferred network (NULL in the session shown below)
rede

# please also include the results of running the following in an R session 

> check_input(proteomes1, annotation1)
[1] TRUE

> names(blast_po_an)
 [1] "Cdentata_673_Cdentata_673"                           "Cdentata_673_Cillinoinensiselliott_562"             
 [3] "Cdentata_673_Fvesca_677"                             "Cdentata_673_Plunatus_563"                          
 [5] "Cdentata_673_PvulgarisUI111_534"                     "Cdentata_673_Vunguiculata_469"                      
 [7] "Cillinoinensiselliott_562_Cdentata_673"              "Cillinoinensiselliott_562_Cillinoinensiselliott_562"
 [9] "Cillinoinensiselliott_562_Fvesca_677"                "Cillinoinensiselliott_562_Plunatus_563"             
[11] "Cillinoinensiselliott_562_PvulgarisUI111_534"        "Cillinoinensiselliott_562_Vunguiculata_469"         
[13] "Fvesca_677_Cdentata_673"                             "Fvesca_677_Cillinoinensiselliott_562"               
[15] "Fvesca_677_Fvesca_677"                               "Fvesca_677_Plunatus_563"                            
[17] "Fvesca_677_PvulgarisUI111_534"                       "Fvesca_677_Vunguiculata_469"                        
[19] "Plunatus_563_Cdentata_673"                           "Plunatus_563_Cillinoinensiselliott_562"             
[21] "Plunatus_563_Fvesca_677"                             "Plunatus_563_Plunatus_563"                          
[23] "Plunatus_563_PvulgarisUI111_534"                     "Plunatus_563_Vunguiculata_469"                      
[25] "PvulgarisUI111_534_Cdentata_673"                     "PvulgarisUI111_534_Cillinoinensiselliott_562"       
[27] "PvulgarisUI111_534_Fvesca_677"                       "PvulgarisUI111_534_Plunatus_563"                    
[29] "PvulgarisUI111_534_PvulgarisUI111_534"               "PvulgarisUI111_534_Vunguiculata_469"                
[31] "Vunguiculata_469_Cdentata_673"                       "Vunguiculata_469_Cillinoinensiselliott_562"         
[33] "Vunguiculata_469_Fvesca_677"                         "Vunguiculata_469_Plunatus_563"                      
[35] "Vunguiculata_469_PvulgarisUI111_534"                 "Vunguiculata_469_Vunguiculata_469"                  
> 
> head(blast_po_an$Cdentata_673_Cdentata_673)
                  query                      db perc_identity length mismatches gap_open qstart qend tstart tend
1 Cde_Caden.M006400.1.p   Cde_Caden.M006400.1.p         100.0    460          0        0      1  460      1  460
2 Cde_Caden.M006400.1.p Cde_Caden.01G228700.1.p         100.0    460          0        0      1  460      1  460
3 Cde_Caden.M006400.1.p Cde_Caden.01G228600.1.p         100.0    460          0        0      1  460      1  460
4 Cde_Caden.M006400.1.p Cde_Caden.01G229000.1.p          97.0    460         14        0      1  460      1  460
5 Cde_Caden.M006400.1.p Cde_Caden.01G228900.1.p          87.8    460         30        1      1  460      1  434
6 Cde_Caden.M015000.1.p   Cde_Caden.M015000.1.p         100.0    192          0        0      1  192      1  192
     evalue bitscore
1  0.00e+00      929
2  0.00e+00      929
3  0.00e+00      929
4  0.00e+00      900
5 8.12e-294      798
6 2.32e-124      348

> rede<-infer_syntenet(blast_list =  blast_po_an, annotation = pdata1$annotation, verbose = TRUE)

Reading GFF file and pre-processing
Reading BLAST file and pre-processing
Generating BLAST list
match_list.size: 0
0 matches imported (88487 discarded)
0 pairwise comparisons
0 alignments generated
Pairwise collinear blocks written to Cde.collinearity
Writing multiple syntenic blocks to HTML files
Cde_Chr01.html
Cde_scaffold_83.html
Done!
Reading GFF file and pre-processing
Reading BLAST file and pre-processing
Generating BLAST list
match_list.size: 0
0 matches imported (89241 discarded)
0 pairwise comparisons
0 alignments generated
Pairwise collinear blocks written to Cil.collinearity
Writing multiple syntenic blocks to HTML files
Cil_Chr01.html
Cil_scaffold_600.html
Done!
Reading GFF file and pre-processing
Reading BLAST file and pre-processing
Generating BLAST list
match_list.size: 0
0 matches imported (91884 discarded)
0 pairwise comparisons
0 alignments generated
Pairwise collinear blocks written to Fve.collinearity
Writing multiple syntenic blocks to HTML files
Fve_Fvb1.html
Fve_Fvb7.html
Fve_contig_1.html
Fve_contig_10.html

> rede
NULL


sessionInfo( )

R version 4.2.2 (2022-10-31 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 22621)

Matrix products: default

locale:
[1] LC_COLLATE=Portuguese_Brazil.utf8  LC_CTYPE=Portuguese_Brazil.utf8    LC_MONETARY=Portuguese_Brazil.utf8
[4] LC_NUMERIC=C                       LC_TIME=Portuguese_Brazil.utf8    

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] rdiamond_0.0.0.9000  Rgff_0.1.6           ape_5.6-2            seqinr_4.2-23        microseq_2.1.5      
 [6] rlang_1.0.6          data.table_1.14.6    rtracklayer_1.58.0   R.utils_2.12.2       R.oo_1.25.0         
[11] R.methodsS3_1.8.2    Biostrings_2.64.1    XVector_0.36.0       forcats_0.5.2        stringr_1.5.0       
[16] dplyr_1.0.10         purrr_1.0.0          readr_2.1.3          tidyr_1.2.1          tibble_3.1.8        
[21] ggplot2_3.4.0        tidyverse_1.3.2      syntenet_1.0.3       GenomicRanges_1.48.0 GenomeInfoDb_1.32.4 
[26] IRanges_2.30.1       S4Vectors_0.34.0     BiocGenerics_0.44.0 

loaded via a namespace (and not attached):
  [1] googledrive_2.0.0           colorspace_2.0-3            rjson_0.2.21               
  [4] ellipsis_0.3.2              fs_1.5.2                    rstudioapi_0.14            
  [7] remotes_2.4.2               fansi_1.0.3                 lubridate_1.9.0            
 [10] xml2_1.3.3                  codetools_0.2-18            cachem_1.0.6               
 [13] ade4_1.7-20                 pkgload_1.3.2               jsonlite_1.8.4             
 [16] Rsamtools_2.14.0            broom_1.0.2                 dbplyr_2.2.1               
 [19] pheatmap_1.0.12             intergraph_2.0-2            shiny_1.7.4                
 [22] BiocManager_1.30.19         compiler_4.2.2              httr_1.4.4                 
 [25] backports_1.4.1             assertthat_0.2.1            Matrix_1.5-3               
 [28] fastmap_1.1.0               gargle_1.2.1                cli_3.5.0                  
 [31] later_1.3.0                 htmltools_0.5.4             prettyunits_1.1.1          
 [34] tools_4.2.2                 igraph_1.3.5                coda_0.19-4                
 [37] gtable_0.3.1                glue_1.6.2                  GenomeInfoDbData_1.2.9     
 [40] Rcpp_1.0.9                  Biobase_2.58.0              cellranger_1.1.0           
 [43] statnet.common_4.7.0        vctrs_0.5.1                 nlme_3.1-161               
 [46] networkD3_0.4               ps_1.7.2                    network_1.18.0             
 [49] rvest_1.0.3                 timechange_0.1.1            mime_0.12                  
 [52] miniUI_0.1.1.1              lifecycle_1.0.3             restfulr_0.0.15            
 [55] devtools_2.4.5              XML_3.99-0.13               googlesheets4_1.0.1        
 [58] MASS_7.3-58.1               zlibbioc_1.42.0             scales_1.2.1               
 [61] hms_1.1.2                   promises_1.2.0.1            MatrixGenerics_1.10.0      
 [64] parallel_4.2.2              SummarizedExperiment_1.28.0 RColorBrewer_1.1-3         
 [67] yaml_2.3.6                  memoise_2.0.1               ggnetwork_0.5.10           
 [70] stringi_1.7.8               BiocIO_1.8.0                pkgbuild_1.4.0             
 [73] BiocParallel_1.32.4         pkgconfig_2.0.3             matrixStats_0.63.0         
 [76] bitops_1.0-7                lattice_0.20-45             GenomicAlignments_1.34.0   
 [79] htmlwidgets_1.6.0           processx_3.8.0              tidyselect_1.2.0           
 [82] magrittr_2.0.3              R6_2.5.1                    generics_0.1.3             
 [85] profvis_0.3.7               DelayedArray_0.23.2         DBI_1.1.3                  
 [88] withr_2.5.0                 pillar_1.8.1                haven_2.5.1                
 [91] RCurl_1.98-1.9              modelr_0.1.10               crayon_1.5.2               
 [94] utf8_1.2.2                  tzdb_0.3.0                  urlchecker_1.0.1           
 [97] usethis_2.1.6               grid_4.2.2                  readxl_1.4.1               
[100] callr_3.7.3                 reprex_2.0.2                digest_0.6.31              
[103] xtable_1.8-4                httpuv_1.6.7                munsell_0.5.0              
[106] sessioninfo_1.2.2

syntenet • 1.7k views

ADD COMMENT • link updated 10 months ago by Sujeevan • 0 • written 23 months ago by lwanderson8c • 0

score 0 · Answer 1 · 2023-01-02

0

Entering edit mode

Fabricio Almeida-Silva ▴ 40

@fabricio_almeidasilva-14890

Last seen 8 months ago

Ghent, Belgium

Hi,

Thank you for providing a reproducible example.

From the log messages of infer_syntenet(), it looks like the function can't detect matches between the BLAST-like list and the GRangesList objects ("match_list.size: 0"). Can you check if gene names in the annotation list match gene names in the sequence list? You can manually explore it with:

head(pdata1$seq$Cdentata_673)
head(pdata1$annotation$Cdentata_673)

You can also try:

length(intersect(
    names(pdata1$seq$Cdentata_673), pdata1$annotation$Cdentata_673$gene_id
))

The gene_id variable of pdata1$annotation$Cdentata_673 should match the names of pdata1$seq$Cdentata_673.

Best,

Fabricio

ADD COMMENT • link 23 months ago Fabricio Almeida-Silva ▴ 40

0

Entering edit mode

this 0 more

length(intersect(annotation1_0$Cdentata_673$Name, names(proteomes1$Cdentata_673)))

[1] 31254

after process_input(proteomes1_0, annotation1) the names are different

process_input(proteomes1, annotation1_0)

length(intersect(names(pdata1$seq$Cdentata_673), pdata1$annotation$Cdentata_673$gene))

[1] 0

to leave the genes and seq with the same names I used this function before creating the pdata1 below

fastsq2<-read.FASTA('extdata/sequences/Cdentata_673.fa.gz')

genes1<-rtracklayer::import('extdata/anotation/Cdentata_673.gff3.gz')

ediar nomes do gff3 do fa

fasta_names<-names(fastsq2)

gene_names <- genes1$Name

for (i in 1:length(gene_names)) { gene_names[i] <- fasta_names[i] }

genes1$Name<-gene_names

Compress the GFF3 Exportar e compactar

export.gff3(genes1,'extdata/sequences/Cdentata_673.gff3')

gzip('extdata/sequences/Cdentata_673.gff3', destname = 'extdata/black/Cdentata_673.gff3.gz')

without this function this error occurs when checking

check_input(proteomes1, annotation1)

Error in check_gene_names(seq, annotation) : Sequence names in 'seq' do not match gene names in 'annotation' for:

Cdentata_673
Cillinoinensiselliott_562
Fvesca_677
Plunatus_563
PvulgarisUI111_534
Vunguiculata_469

note - found a lot of NA in the gene file

after I did this so that the names are at least the names of at least some part of the file

names(pdata1$seq$Cdentata_673)<-names(proteomes1$Cdentata_673) pdata1$annotation$Cdentata_673$gene<-names(pdata1$seq$Cdentata_673)

length(intersect( names(pdata1$seq$Cdentata_673), pdata1$annotation$Cdentata_673$gene))

[1] 31254

but the blast_list gives the same thing as before, infer_syntenet() doesn't work, what can I do?

note

annotation1$Cdentata_673$Name [1] "Caden.01G000100" "Caden.01G000100.1" NA NA
[5] NA NA NA NA
[9] NA NA "Caden.01G000200" "Caden.01G000200.1" [13] NA NA NA NA
[17] NA NA NA NA

ADD REPLY • link 23 months ago lwanderson8c • 0

0

Entering edit mode

Hi.

You are changing gene IDs in the Name variable of your GRanges object, but syntenet uses the gene_id variable (if gene_id is not present, it uses Name). As I said in my previous response, could you check if sequence names match the IDs in the gene_id variable?

Besides, in your for loop, you are replacing gene IDs in the GRanges objects with the names in your AAStringSet objects, but did you check if gene order is the same in both objects? You might be replacing a gene ID with the ID of another gene.

ADD REPLY • link 23 months ago Fabricio Almeida-Silva ▴ 40

0

Entering edit mode

It does not respond as shown in the script. I have already downloaded all the files from Phytozome, and nothing matches.

ADD REPLY • link 23 months ago lwanderson8c • 0

0

Entering edit mode

Could you share the link to the exact files you downloaded from Phytozome so I can try here? 2-3 species from your list would be enough.

ADD REPLY • link 22 months ago Fabricio Almeida-Silva ▴ 40

0

Entering edit mode

This is the site from which the files for 3 species were downloaded; below is the download command made available by Phytozome.

file names Plunatus_563_V1.gene.gff3.gz / Plunatus_563_V1.protein_primaryTranscriptOnly.fa.gz ; PvulgarisUI111_534_v1.1.gene.gff3.gz / PvulgarisUI111_534_v1.1.protein_primaryTranscriptOnly.fa.gz ; Vunguiculata_469_v1.1.gene.gff3.gz / Vunguiculata_469_v1.1.protein_primaryTranscriptOnly.fa.gz

https://data.jgi.doe.gov/refine-download/phytozome?genome_id=469%2C534%2C563&expanded=Phytozome-563%2CPhytozome-534%2CPhytozome-469

code

curl --cookie jgi_session=/api/sessions/034a5c0dd7c17e0d23c0b8d07cdef604 --output download.20230112.105349.zip -d "{\"ids\":{\"Phytozome-563\":[\"5ee7f88c6263bf2148832fc2\",\"5ee7f88d6263bf2148832fc6\"],\"Phytozome-534\":[\"5d94dc9ec0d65a87debccfbb\",\"5d94dc9ec0d65a87debccfba\"],\"Phytozome-469\":[\"597f9cb77ded5e0452b3f36a\",\"597f9cb77ded5e0452b3f36b\"]}}" -H "Content-Type: application/json" https://files.jgi.doe.gov/filedownload/

ADD REPLY • link 22 months ago lwanderson8c • 0

2

Entering edit mode

Hi, Wanderson.

I took a look at the data from Phytozome and found out what was causing the problem. As explained in syntenet's vignette, the names of the AAStringSet objects must match the gene IDs in the fields gene_id or Name of the GRanges objects. Your sequence names (equivalent to FASTA headers) were long strings containing information on several IDs, but they must contain only the IDs specified in gene_id/Name.

Here's how I identified the problem and fixed it:

library(syntenet)
library(here)
#> here() starts at /home/faalm/Downloads/syntenet_issue

# See directory structure
fs::dir_tree()
#> .
#> ├── annotation
#> │   ├── Plunatus_563_V1.gene_exons.gff3.gz
#> │   ├── PvulgarisUI111_534_v1.1.gene.gff3.gz
#> │   └── Vunguiculata_469_v1.1.gene.gff3.gz
#> ├── proteomes
#> │   ├── Plunatus_563_V1.protein_primaryTranscriptOnly.fa.gz
#> │   ├── PvulgarisUI111_534_v1.1.protein_primaryTranscriptOnly.fa.gz
#> │   └── Vunguiculata_469_v1.1.protein_primaryTranscriptOnly.fa.gz
#> ├── syntenet_issue.R
#> ├── syntenet_issue.Rproj
#> ├── syntenet_issue_reprex.R
#> ├── syntenet_issue_reprex.spin.R
#> └── syntenet_issue_reprex.spin.Rmd

# Load data
seq <- fasta2AAStringSetlist(here("proteomes"))
annotation <- gff2GRangesList(here("annotation"))

## Take a quick look at the data
names(seq)
#> [1] "Plunatus_563_V1.protein_primaryTranscriptOnly"        
#> [2] "PvulgarisUI111_534_v1.1.protein_primaryTranscriptOnly"
#> [3] "Vunguiculata_469_v1.1.protein_primaryTranscriptOnly"
names(annotation)
#> [1] "Plunatus_563_V1.gene_exons"   "PvulgarisUI111_534_v1.1.gene"
#> [3] "Vunguiculata_469_v1.1.gene"

##> Note: list element names are different; let's make them equal
# Drop everything from the first underscore onward, so both lists are keyed by
# the leading part of the file name only, e.g.
# "Plunatus_563_V1.protein_primaryTranscriptOnly" -> "Plunatus".
# NOTE(review): two files sharing the same text before their first "_" would
# end up with duplicate list names -- check this when adding more species.
names(seq) <- gsub("_.*", "", names(seq))
names(annotation) <- gsub("_.*", "", names(annotation))

identical(names(seq), names(annotation)) # checking if new names are identical
#> [1] TRUE

## Check if gene IDs in GRanges and AAStringSet objects match
head(seq$Plunatus)
#> AAStringSet object of length 6:
#>     width seq                                               names               
#> [1]   112 MNVAPTQLHPNGWAFVRAFAILY...GFPSTGCDSIPMSSSLDFGVPW* tig000002340100.1...
#> [2]   337 MGARGVVLIHSWKCEATDVTKPC...LLSFVDFTVVWCVSMCVKSVEV* tig000002340080.1...
#> [3]   274 MGARRVVLIYSWKCEATDVTESC...FLSLCGDRRISVLIADRLFWGF* tig000002340120.1...
#> [4]    33 MGPVVLTSDMSTSTVYCRRPRSGHYTSPPGLS*                 tig000002340060.1...
#> [5]   427 MLMEVGSSQVCLLQSGETWMTPY...ETLEGGPIPRTWNATHLKFYFS* tig000002340010.1...
#> [6]   325 MDESNSPQPSTPPTTLAVPLAHY...AGSSTAVLPLLPAPSHPHVARS* tig000002340040.1...
head(annotation$Plunatus)
#> GRanges object with 6 ranges and 9 metadata columns:
#>       seqnames    ranges strand |       source           type     score
#>          <Rle> <IRanges>  <Rle> |     <factor>       <factor> <numeric>
#>   [1]     Pl01 6264-7469      + | phytozomev13 gene                  NA
#>   [2]     Pl01 6264-7469      + | phytozomev13 mRNA                  NA
#>   [3]     Pl01 6264-6477      + | phytozomev13 exon                  NA
#>   [4]     Pl01 6264-6346      + | phytozomev13 five_prime_UTR        NA
#>   [5]     Pl01 6347-6477      + | phytozomev13 CDS                   NA
#>   [6]     Pl01 7357-7469      + | phytozomev13 exon                  NA
#>           phase                     ID                 Name       pacid
#>       <integer>            <character>          <character> <character>
#>   [1]      <NA>     Pl01G0000000100.v1   Pl01G0000000100.v1        <NA>
#>   [2]      <NA>   Pl01G0000000100.1.v1 Pl01G0000000100.1.v1    44365426
#>   [3]      <NA> Pl01G0000000100.1.v1..                 <NA>    44365426
#>   [4]      <NA> Pl01G0000000100.1.v1..                 <NA>    44365426
#>   [5]         0 Pl01G0000000100.1.v1..                 <NA>    44365426
#>   [6]      <NA> Pl01G0000000100.1.v1..                 <NA>    44365426
#>           longest               Parent
#>       <character>      <CharacterList>
#>   [1]        <NA>                     
#>   [2]           1   Pl01G0000000100.v1
#>   [3]        <NA> Pl01G0000000100.1.v1
#>   [4]        <NA> Pl01G0000000100.1.v1
#>   [5]        <NA> Pl01G0000000100.1.v1
#>   [6]        <NA> Pl01G0000000100.1.v1
#>   -------
#>   seqinfo: 373 sequences from an unspecified genome; no seqlengths

##> Note: names do not match! Let's double-check with `check_input()`
check_input(seq, annotation)
#> Error in check_gene_names(seq, annotation): Sequence names in 'seq' do not match gene names in 'annotation' for:
#> 1. Plunatus
#> 2. PvulgarisUI111
#> 3. Vunguiculata

## Taking a closer look
lapply(seq, function(x) head(names(x)))
#> $Plunatus
#> [1] "tig000002340100.1.v1 pacid=44352969 transcript=tig000002340100.1.v1 locus=tig000002340100.v1 ID=tig000002340100.1.v1 annot-version=V1"
#> [2] "tig000002340080.1.v1 pacid=44352970 transcript=tig000002340080.1.v1 locus=tig000002340080.v1 ID=tig000002340080.1.v1 annot-version=V1"
#> [3] "tig000002340120.1.v1 pacid=44352971 transcript=tig000002340120.1.v1 locus=tig000002340120.v1 ID=tig000002340120.1.v1 annot-version=V1"
#> [4] "tig000002340060.1.v1 pacid=44352972 transcript=tig000002340060.1.v1 locus=tig000002340060.v1 ID=tig000002340060.1.v1 annot-version=V1"
#> [5] "tig000002340010.1.v1 pacid=44352973 transcript=tig000002340010.1.v1 locus=tig000002340010.v1 ID=tig000002340010.1.v1 annot-version=V1"
#> [6] "tig000002340040.1.v1 pacid=44352974 transcript=tig000002340040.1.v1 locus=tig000002340040.v1 ID=tig000002340040.1.v1 annot-version=V1"
#> 
#> $PvulgarisUI111
#> [1] "PvUI111.04G035500.1.p pacid=42814644 transcript=PvUI111.04G035500.1 locus=PvUI111.04G035500 ID=PvUI111.04G035500.1.v1.1 annot-version=v1.1"
#> [2] "PvUI111.04G132100.1.p pacid=42814645 transcript=PvUI111.04G132100.1 locus=PvUI111.04G132100 ID=PvUI111.04G132100.1.v1.1 annot-version=v1.1"
#> [3] "PvUI111.04G160500.1.p pacid=42814646 transcript=PvUI111.04G160500.1 locus=PvUI111.04G160500 ID=PvUI111.04G160500.1.v1.1 annot-version=v1.1"
#> [4] "PvUI111.04G077300.1.p pacid=42814647 transcript=PvUI111.04G077300.1 locus=PvUI111.04G077300 ID=PvUI111.04G077300.1.v1.1 annot-version=v1.1"
#> [5] "PvUI111.04G122300.1.p pacid=42814648 transcript=PvUI111.04G122300.1 locus=PvUI111.04G122300 ID=PvUI111.04G122300.1.v1.1 annot-version=v1.1"
#> [6] "PvUI111.04G087300.1.p pacid=42814650 transcript=PvUI111.04G087300.1 locus=PvUI111.04G087300 ID=PvUI111.04G087300.1.v1.1 annot-version=v1.1"
#> 
#> $Vunguiculata
#> [1] "VigunL081000.1.p pacid=39013057 transcript=VigunL081000.1 locus=VigunL081000 ID=VigunL081000.1.v1.1 annot-version=v1.1"
#> [2] "VigunL080300.1.p pacid=39013058 transcript=VigunL080300.1 locus=VigunL080300 ID=VigunL080300.1.v1.1 annot-version=v1.1"
#> [3] "VigunL080500.1.p pacid=39013059 transcript=VigunL080500.1 locus=VigunL080500 ID=VigunL080500.1.v1.1 annot-version=v1.1"
#> [4] "VigunL080200.1.p pacid=39013060 transcript=VigunL080200.1 locus=VigunL080200 ID=VigunL080200.1.v1.1 annot-version=v1.1"
#> [5] "VigunL080400.1.p pacid=39013061 transcript=VigunL080400.1 locus=VigunL080400 ID=VigunL080400.1.v1.1 annot-version=v1.1"
#> [6] "VigunL080600.1.p pacid=39013062 transcript=VigunL080600.1 locus=VigunL080600 ID=VigunL080600.1.v1.1 annot-version=v1.1"
lapply(annotation, function(x) head(x$Name[x$type == "gene"]))
#> $Plunatus
#> [1] "Pl01G0000000100.v1" "Pl01G0000000200.v1" "Pl01G0000000300.v1"
#> [4] "Pl01G0000000400.v1" "Pl01G0000000500.v1" "Pl01G0000000600.v1"
#> 
#> $PvulgarisUI111
#> [1] "PvUI111.01G000100" "PvUI111.01G000200" "PvUI111.01G000300"
#> [4] "PvUI111.01G000400" "PvUI111.01G000500" "PvUI111.01G000600"
#> 
#> $Vunguiculata
#> [1] "VigunL000100" "VigunL000200" "VigunL000300" "VigunL000400" "VigunL000500"
#> [6] "VigunL000600"

## Problem detected: each sequence name must be only the ID that follows "locus="
## Reduce every FASTA header to that locus ID and use it as the sequence name
seq2 <- lapply(seq, function(x) {
    # Strip everything up to and including "locus=" ...
    locus_ids <- gsub(".*locus=", "", names(x))

    # ... then cut at the first remaining space, leaving just the locus ID
    names(x) <- gsub(" .*", "", locus_ids)
    x
})

# Checking if the problem was solved
check_input(seq2, annotation)
#> [1] TRUE

Created on 2023-01-16 with reprex v2.0.2

Best,

Fabricio

ADD REPLY • link 22 months ago Fabricio Almeida-Silva ▴ 40

0

Entering edit mode

Thank you for the codes, I used ITAG4.0

$tomato [1] "Solyc00g500001" "Solyc00g500002" "Solyc00g500003" "Solyc00g500004" "Solyc00g500005" "Solyc00g500006"

lapply(seq, function(x) head(names(x))) $tomato [1] "Solyc00g500001" "Solyc00g500002" "Solyc00g500003" "Solyc00g500004" "Solyc00g500005" "Solyc00g500006"

check_input(seq, annotation4) Error in check_list_names(seq, annotation) : Names of list elements in 'seq' and 'annotation' must match.

Please help!

ADD REPLY • link 10 months ago Sujeevan • 0