#!/usr/bin/env perl

# Provenance of this listing (commit and diff header, text kept verbatim):
#   830e8a9193ae7906770c396cee7e58b2b14dfdda hiram Wed Jan 31 13:43:17 2024 -0800
#   update to publication reference no redmine
#   diff --git src/hg/utils/automation/asmHubTOGA.pl src/hg/utils/automation/asmHubTOGA.pl
#   index d4dee34..6d3bd94 100755
#   --- src/hg/utils/automation/asmHubTOGA.pl
#   +++ src/hg/utils/automation/asmHubTOGA.pl
#   @@ -1,184 +1,185 @@

# asmHubTOGA.pl - print the description text for the TOGA annotation track
# of one assembly.
#
# usage: asmHubTOGA.pl asmId asmId.names.tab .../trackData/
#
# The script locates the single trackData/TOGAv*/HLTOGAannotVs*.bb file,
# measures item count and base coverage with the external commands
# 'ave' and 'bigBedInfo', and prints the text of the here-document that
# follows.  Every failure path exits with status 255.

use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use AsmHub;
use File::Basename;
use File::Glob qw(bsd_glob);

my $argc = scalar(@ARGV);

if ($argc != 3) {
  printf STDERR "usage: asmHubTOGA.pl asmId asmId.names.tab .../trackData/\n";
  printf STDERR "where asmId is the assembly identifier,\n";
  printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n";
  exit 255;
}

# from Perl Cookbook Recipe 2.17, print out large numbers with comma
# delimiters:
#   argument: a number (or numeric string)
#   returns:  the same text with commas inserted between groups of three
#             digits of the integer part
sub commify($) {
  # work on the reversed string so digit groups are counted from the right;
  # the (?!\d*\.) look-ahead keeps commas out of the fractional part
  my $text = reverse $_[0];
  $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
  return scalar reverse $text;
}

my $asmId = shift;
my $namesFile = shift;
my $trackDataDir = shift;

# Find the TOGA bigBed file.  bsd_glob treats the whole string as one pattern
# (no splitting on white space) and returns an empty list when nothing
# matches, so no shell 'ls' is needed.
my @togaCandidates = bsd_glob("$trackDataDir/TOGAv*/HLTOGAannotVs*.bb");
if (scalar(@togaCandidates) > 1) {
  # more than one match is ambiguous: report what was found rather than
  # claiming that nothing was found
  print STDERR "ERROR: more than one trackData/TOGAv*/HLTOGAannotVs*.bb file found:\n";
  print STDERR "  $_\n" for @togaCandidates;
  exit 255;
}
my $TOGABbi = scalar(@togaCandidates) ? $togaCandidates[0] : "";
my $track = "TOGAannotation";

# the file must exist and be non-empty
if ( ! -s $TOGABbi ) {
  printf STDERR "ERROR: can not find trackData/TOGAv*/HLTOGAannotVs*.bb file\n";
  exit 255;
}

# Fixed-width slices of asmId become path components of the download URL.
# NOTE(review): presumably asmId looks like GCx_nnnnnnnnn.v_name; the format
# is not validated here - confirm against callers.
my $gcX = substr($asmId,0,3);
my $d0 = substr($asmId,4,3);
my $d1 = substr($asmId,7,3);
my $d2 = substr($asmId,10,3);
my (undef, $acc, undef) = split('_', $asmId, 3);
my $accession = "${gcX}_${acc}";

# directory of the bigBed file relative to trackData/, and its file name
my $togaDir = dirname($TOGABbi);
$togaDir =~ s#.*/trackData/##;
my $bbFile = basename($TOGABbi);

# total sequence size: the "total" line of 'ave' run on column 2 of chrom.sizes
my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`;
chomp $totalBases;
if ( $totalBases !~ m/^\d+$/ || $totalBases == 0 ) {
  # without this check the percentage below would divide by zero
  print STDERR "ERROR: can not determine total sequence size from $trackDataDir/../${asmId}.chrom.sizes\n";
  exit 255;
}

# last field of the itemCount and basesCovered lines of bigBedInfo, commas
# removed, joined on one line; 'grep -E' replaces the obsolescent 'egrep'
my $itemsBases = `bigBedInfo $TOGABbi | grep -E "itemCount|basesCovered" | awk '{print \$NF}' | sed -e 's/,//g;' | xargs echo`;
# split(' ') ignores leading white space and the trailing newline
my ($itemCount, $basesCovered) = split(' ', $itemsBases);
if ( ! defined($itemCount) || ! defined($basesCovered)
     || $itemCount !~ m/^\d+$/ || $basesCovered !~ m/^\d+$/ ) {
  print STDERR "ERROR: can not determine itemCount and basesCovered from bigBedInfo $TOGABbi\n";
  exit 255;
}

# compute the percentage before the numbers are turned into comma strings
my $percentCoverage = sprintf("%.2f", 100.0 * $basesCovered / $totalBases);
$itemCount = commify($itemCount);
$basesCovered = commify($basesCovered);
$totalBases = commify($totalBases);

# NOTE(review): $track, $togaDir, $em, $noEm, $assemblyDate, $ncbiAssemblyId
# and $organism are not referenced in the statements above; they are kept
# because the here-document that follows may interpolate them.
my $em = "";
my $noEm = "";

# columns 9, 10 and 5 of the non-comment lines of the names file
my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`;
chomp $assemblyDate;
my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`;
chomp $ncbiAssemblyId;
my $organism = `grep -v "^#" $namesFile | cut -f5`;
chomp $organism;

my $downloadUrl="https://hgdownload.soe.ucsc.edu/hubs/$gcX/$d0/$d1/$d2/$accession/bbi/$bbFile";
# there is a bug in the original files, should have been GalGal6
# the symlinks in bbi have been fixed
$downloadUrl =~ s/galGal6/GalGal6/;

# everything up to the _EOF_ line is printed with the variables interpolated
print <<_EOF_

Description

TOGA (Tool to infer Orthologs from Genome Alignments) is a homology-based method that integrates gene annotation, inferring orthologs and classifying genes as intact or lost.

This track has $itemCount items, covering $basesCovered bases in the sequence, which is $percentCoverage % of the total sequence of size $totalBases nucleotides.

Methods

As input, TOGA uses a gene annotation of a reference species (human/hg38 for mammals, chicken/galGal6 for birds) and a whole genome alignment between the reference and query genome.

TOGA implements a novel paradigm that relies on alignments of intronic and intergenic regions and uses machine learning to accurately distinguish orthologs from paralogs or processed pseudogenes.

To annotate genes, CESAR 2.0 is used to determine the positions and boundaries of coding exons of a reference transcript in the orthologous genomic locus in the query species.

Display Conventions and Configuration

Each annotated transcript is shown in a color-coded classification as

"intact": middle 80% of the CDS (coding sequence) is present and exhibits no gene-inactivating mutation. These transcripts likely encode functional proteins.
"partially intact": at least 50% of the CDS is present in the query and the middle 80% of the CDS exhibits no inactivating mutation. These transcripts may also encode functional proteins, but the evidence is weaker as parts of the CDS are missing, often due to assembly gaps.
"missing": <50% of the CDS is present in the query and the middle 80% of the CDS exhibits no inactivating mutation.
"uncertain loss": there is 1 inactivating mutation in the middle 80% of the CDS, but evidence is not strong enough to classify the transcript as lost. These transcripts may or may not encode a functional protein.
"lost": typically several inactivating mutations are present, thus there is strong evidence that the transcript is unlikely to encode a functional protein.

Clicking on a transcript provides additional information about the orthology classification, inactivating mutations, the protein sequence and protein/exon alignments.

Data Access

The data for this track is stored in the bigBed file format and can be extracted with the command line access tool bigBedToBed, which is available from the utilities download directory hgdownload.soe.ucsc.edu/admin/exe/linux_x86_64.

To extract from the bigBed file:

   bigBedToBed "$downloadUrl" togaData.bed

with the result in the togaData.bed file.

Credits

This data was prepared by the Michael Hiller Lab.

References

The TOGA software is available from github.com/hillerlab/TOGA

-Kirilenko BM, Munegowda C, Osipova E, Jebb D, Sharma V, Blumer M, Morales A,
-Ahmed AW, Kontopoulos DG, Hilgers L, Zoonomia Consortium, Hiller M.
-TOGA integrates gene annotation with orthology inference
-at scale. bioRxiv preprint September 2022
+Kirilenko BM, Munegowda C, Osipova E, Jebb D, Sharma V, Blumer M, Morales AE,
+Ahmed AW, Kontopoulos DG, Hilgers L, Lindblad-Toh K, Karlsson EK,
+Zoonomia Consortium, Hiller M.
+Integrating gene annotation with orthology inference at
+scale. Science, 380(6643), eabn3107, 2023

_EOF_
;
# The terminator above must be alone on its line, with no trailing text,
# for Perl to recognize the end of the here-document; the ';' on the next
# line ends the print statement.