830e8a9193ae7906770c396cee7e58b2b14dfdda hiram Wed Jan 31 13:43:17 2024 -0800 update to publication reference no redmine diff --git src/hg/utils/automation/asmHubTOGA.pl src/hg/utils/automation/asmHubTOGA.pl index d4dee34..6d3bd94 100755 --- src/hg/utils/automation/asmHubTOGA.pl +++ src/hg/utils/automation/asmHubTOGA.pl @@ -1,184 +1,185 @@ #!/usr/bin/env perl use strict; use warnings; use FindBin qw($Bin); use lib "$Bin"; use AsmHub; use File::Basename; my $argc = scalar(@ARGV); if ($argc != 3) { printf STDERR "usage: asmHubTOGA.pl asmId asmId.names.tab .../trackData/\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n"; exit 255; } # from Perl Cookbook Recipe 2.17, print out large numbers with comma # delimiters: sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } my $asmId = shift; my $namesFile = shift; my $trackDataDir = shift; my $TOGABbi = `ls $trackDataDir/TOGAv*/HLTOGAannotVs*.bb`; chomp $TOGABbi; my $track = "TOGAannotation"; if ( ! -s $TOGABbi ) { printf STDERR "ERROR: can not find trackData/TOGAv*/HLTOGAannotVs*.bb file\n"; exit 255; } my $gcX = substr($asmId,0,3); my $d0 = substr($asmId,4,3); my $d1 = substr($asmId,7,3); my $d2 = substr($asmId,10,3); my (undef, $acc, undef) = split('_', $asmId, 3); my $accession = "${gcX}_${acc}"; my $togaDir = dirname($TOGABbi); $togaDir =~ s#.*/trackData/##; my $bbFile = basename($TOGABbi); my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`; chomp $totalBases; my $itemsBases = `bigBedInfo $TOGABbi | egrep "itemCount|basesCovered" | awk '{print \$NF}' | sed -e 's/,//g;' | xargs echo`; my ($itemCount, $basesCovered) = split('\s+', $itemsBases); my $percentCoverage = sprintf("%.2f", 100.0 * $basesCovered / $totalBases); $itemCount = commify($itemCount); $basesCovered = commify($basesCovered); $totalBases = commify($totalBases); my $em = "<em>"; my $noEm = "</em>"; my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; chomp $assemblyDate; my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; chomp $ncbiAssemblyId; my $organism = `grep -v "^#" $namesFile | cut -f5`; chomp $organism; my $downloadUrl="https://hgdownload.soe.ucsc.edu/hubs/$gcX/$d0/$d1/$d2/$accession/bbi/$bbFile"; # there is a bug in the original files, should have been GalGal6 # the symlinks in bbi have been fixed $downloadUrl =~ s/galGal6/GalGal6/; print <<_EOF_ <h2>Description</h2> <p> <b>TOGA</b> (<b>T</b>ool to infer <b>O</b>rthologs from <b>G</b>enome <b>A</b>lignments) is a homology-based method that integrates gene annotation, inferring orthologs and classifying genes as intact or lost. </p> <p> This track has $itemCount items in the track, covering $basesCovered bases in the sequence which is % $percentCoverage of the total sequence of size $totalBases nucleotides. </p> <h2>Methods</h2> <p> As input, <b>TOGA</b> uses a gene annotation of a reference species (human/hg38 for mammals, chicken/galGal6 for birds) and a whole genome alignment between the reference and query genome. </p> <p> <b>TOGA</b> implements a novel paradigm that relies on alignments of intronic and intergenic regions and uses machine learning to accurately distinguish orthologs from paralogs or processed pseudogenes. </p> <p> To annotate genes, <a href='https://academic.oup.com/bioinformatics/article/33/24/3985/4095639' target=blank>CESAR 2.0</a> is used to determine the positions and boundaries of coding exons of a reference transcript in the orthologous genomic locus in the query species. </p> <h2>Display Conventions and Configuration</h2> <p> Each annotated transcript is shown in a color-coded classification as <ul> <li><span style='display:inline-block; width:40px; height:15px; background-color:blue;'> </span> <span style='color:blue'>"intact"</span>: middle 80% of the CDS (coding sequence) is present and exhibits no gene-inactivating mutation. These transcripts likely encode functional proteins.</li> <li><span style='display:inline-block; width:40px; height:15px; background-color:lightblue;'> </span> <span style='color:#7193a0'>"partially intact"</span>: 50% of the CDS is present in the query and the middle 80% of the CDS exhibits no inactivating mutation. These transcripts may also encode functional proteins, but the evidence is weaker as parts of the CDS are missing, often due to assembly gaps.</li> <li><span style='display:inline-block; width:40px; height:15px; background-color:grey;'> </span> <span style='color:grey'>"missing"</span>: <50% of the CDS is present in the query and the middle 80% of the CDS exhibits no inactivating mutation.</li> <li><span style='display:inline-block; width:40px; height:15px; background-color:orange;'> </span> <span style='color:orange'>"uncertain loss"</span>: there is 1 inactivating mutation in the middle 80% of the CDS, but evidence is not strong enough to classify the transcript as lost. These transcripts may or may not encode a functional protein.</li> <li><span style='display:inline-block; width:40px; height:15px; background-color:red;'> </span> <span style='color:red'>"lost"</span>: typically several inactivating mutations are present, thus there is strong evidence that the transcript is unlikely to encode a functional protein.</li> </ul> </p> <p> Clicking on a transcript provides additional information about the orthology classification, inactivating mutations, the protein sequence and protein/exon alignments. </p> <H2>Data Access</H2> <p> The data for this track is available from the <em>bigBed</em> file format with the command line access tool <em>bigBedToBed</em> available from the utilities download directory <a href='http://hgdownload.soe.ucsc.edu/admin/exe/linux_x86_64/' target=_blank>hgdownload.soe.ucsc.edu/admin/exe/linux_x86_64</a>. </p> <p> To extract from the <em>bigBed</em> file: <pre> bigBedToBed "$downloadUrl" togaData.bed </pre> with the result in the <em>togaData.bed</em> file. </p> <h2>Credits</h2> <p> This data was prepared by the <a href='https://tbg.senckenberg.de/hillerlab/' target=_blank>Michael Hiller Lab</a> </p> <h2>References</h2> <p> The <b>TOGA</b> software is available from <a href='https://github.com/hillerlab/TOGA' target=_blank>github.com/hillerlab/TOGA</a> </p> <p> -Kirilenko BM, Munegowda C, Osipova E, Jebb D, Sharma V, Blumer M, Morales A, -Ahmed AW, Kontopoulos DG, Hilgers L, Zoonomia Consortium, Hiller M. -<a href='https://www.biorxiv.org/content/10.1101/2022.09.08.507143v1' -target=_blank>TOGA integrates gene annotation with orthology inference -at scale</a>. <em>bioRxiv preprint September 2022</em> +Kirilenko BM, Munegowda C, Osipova E, Jebb D, Sharma V, Blumer M, Morales AE, +Ahmed AW, Kontopoulos DG, Hilgers L, Lindblad-Toh K, Karlsson EK, +Zoonomia Consortium, Hiller M. +<a href="https://www.science.org/doi/10.1126/science.abn3107" +target="_blank">Integrating gene annotation with orthology inference at +scale</a>. <em>Science</em>, 380(6643), eabn3107, 2023 </p> _EOF_ ;