ac38533f23605cb72cffcf3017838c9a10dab63e hiram Wed Sep 21 16:06:17 2022 -0700
now making a track for TOGA annotations refs #29982

diff --git src/hg/utils/automation/asmHubTOGA.pl src/hg/utils/automation/asmHubTOGA.pl
new file mode 100755
index 0000000..d4dee34
--- /dev/null
+++ src/hg/utils/automation/asmHubTOGA.pl
@@ -0,0 +1,184 @@
+#!/usr/bin/env perl
+
+# asmHubTOGA.pl - print to stdout the description text for the TOGA
+#   annotation track of one assembly.
+#
+# Arguments (all three required):
+#   asmId              assembly identifier
+#   asmId.names.tab    tab-separated names file for the assembly
+#   .../trackData/     path to the assembly trackData directory
+#
+# Exits 255 with a message on STDERR when the arguments are wrong, when
+# the TOGA bigBed file can not be identified, or when the sizes needed
+# for the coverage calculation can not be obtained.
+
+use strict;
+use warnings;
+use FindBin qw($Bin);
+use lib "$Bin";
+use AsmHub;
+use File::Basename;
+
+my $argc = scalar(@ARGV);
+
+if ($argc != 3) {
+  printf STDERR "usage: asmHubTOGA.pl asmId asmId.names.tab .../trackData/\n";
+  printf STDERR "where asmId is the assembly identifier,\n";
+  printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n";
+  exit 255;
+}
+
+# from Perl Cookbook Recipe 2.17, print out large numbers with comma
+# delimiters:
+sub commify($) {
+  my $text = reverse $_[0];
+  $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
+  return scalar reverse $text;
+}
+
+my $asmId = shift;
+my $namesFile = shift;
+my $trackDataDir = shift;
+# Expand the pattern inside perl instead of via `ls`: no stderr noise when
+# nothing matches, and several matches arrive as a list rather than as one
+# multi-line string that would silently fail the -s test below.
+my @togaBbFiles = glob("$trackDataDir/TOGAv*/HLTOGAannotVs*.bb");
+my $track = "TOGAannotation";
+
+if ( scalar(@togaBbFiles) > 1 ) {
+  printf STDERR "ERROR: found more than one trackData/TOGAv*/HLTOGAannotVs*.bb file:\n";
+  printf STDERR "  %s\n", $_ for @togaBbFiles;
+  exit 255;
+}
+my $TOGABbi = scalar(@togaBbFiles) ? $togaBbFiles[0] : "";
+
+if ( ! -s $TOGABbi ) {
+  printf STDERR "ERROR: can not find trackData/TOGAv*/HLTOGAannotVs*.bb file\n";
+  exit 255;
+}
+
+# Pieces of the download URL are cut out of asmId by fixed offsets:
+# characters 0-2 are the prefix, 4-6, 7-9 and 10-12 are three groups of
+# the accession, and the second '_' separated field is the accession
+# number itself.
+# NOTE(review): presumably asmId looks like GCx_nnnnnnnnn.v_name - confirm
+my $gcX = substr($asmId,0,3);
+my $d0 = substr($asmId,4,3);
+my $d1 = substr($asmId,7,3);
+my $d2 = substr($asmId,10,3);
+my (undef, $acc, undef) = split('_', $asmId, 3);
+my $accession = "${gcX}_${acc}";
+
+# path of the bigBed directory relative to trackData/
+my $togaDir = dirname($TOGABbi);
+$togaDir =~ s#.*/trackData/##;
+
+my $bbFile = basename($TOGABbi);
+
+# total assembly size: the "total" row of 'ave' run over column 2 of chrom.sizes
+my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`;
+$totalBases = "" if ( ! defined($totalBases) );
+chomp $totalBases;
+# itemCount and basesCovered from bigBedInfo, commas removed, joined on one line
+my $itemsBases = `bigBedInfo $TOGABbi | grep -E "itemCount|basesCovered" | awk '{print \$NF}' | sed -e 's/,//g;' | xargs echo`;
+$itemsBases = "" if ( ! defined($itemsBases) );
+my ($itemCount, $basesCovered) = split('\s+', $itemsBases);
+
+# Refuse to continue without usable numbers; otherwise the percentage
+# below dies with a division by zero or emits uninitialized warnings.
+if ( $totalBases !~ m/^\d+$/ || $totalBases < 1 ) {
+  printf STDERR "ERROR: can not determine total size from %s/../%s.chrom.sizes\n", $trackDataDir, $asmId;
+  exit 255;
+}
+for my $count ($itemCount, $basesCovered) {
+  if ( ! defined($count) || $count !~ m/^\d+$/ ) {
+    printf STDERR "ERROR: can not read itemCount/basesCovered from bigBedInfo %s\n", $TOGABbi;
+    exit 255;
+  }
+}
+
+my $percentCoverage = sprintf("%.2f", 100.0 * $basesCovered / $totalBases);
+$itemCount = commify($itemCount);
+$basesCovered = commify($basesCovered);
+$totalBases = commify($totalBases);
+
+# NOTE(review): the variables in this group are assigned here but are not
+# referenced in the visible part of the text printed below - confirm
+# whether they are still needed.
+my $em = "";
+my $noEm = "";
+my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`;
+chomp $assemblyDate;
+my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`;
+chomp $ncbiAssemblyId;
+my $organism = `grep -v "^#" $namesFile | cut -f5`;
+chomp $organism;
+
+my $downloadUrl="https://hgdownload.soe.ucsc.edu/hubs/$gcX/$d0/$d1/$d2/$accession/bbi/$bbFile";
+# there is a bug in the original files, should have been GalGal6
+# the symlinks in bbi have been fixed
+$downloadUrl =~ s/galGal6/GalGal6/;
+
+print <<_EOF_
+

Description

+TOGA +(Tool to infer Orthologs from Genome Alignments) +is a homology-based method that integrates gene annotation, inferring +orthologs and classifying genes as intact or lost. +

+ +

+This track has $itemCount items, covering $basesCovered bases +in the sequence, which is $percentCoverage % of the total sequence of size +$totalBases nucleotides. +

+ +

Methods

+As input, TOGA uses a gene annotation of a reference species +(human/hg38 for mammals, chicken/galGal6 for birds) and +a whole genome alignment between the reference and query genome. +

+TOGA implements a novel paradigm that relies on alignments of intronic +and intergenic regions and uses machine learning to accurately distinguish +orthologs from paralogs or processed pseudogenes. +

+To annotate genes, +CESAR 2.0 +is used to determine the positions and boundaries of coding exons of a +reference transcript in the orthologous genomic locus in the query species. +

+ +

Display Conventions and Configuration

+Each annotated transcript is shown in a color-coded classification as +

+ "intact": middle 80% of the CDS + (coding sequence) is present and exhibits no gene-inactivating mutation. + These transcripts likely encode functional proteins.
+ "partially intact": at least 50% of the CDS + is present in the query and the middle 80% of the CDS exhibits no + inactivating mutation. These transcripts may also encode functional + proteins, but the evidence is weaker as parts of the CDS are missing, + often due to assembly gaps.
+ "missing": <50% of the CDS is present + in the query and the middle 80% of the CDS exhibits no inactivating + mutation.
+ "uncertain loss": there is 1 + inactivating mutation in the middle 80% of the CDS, but evidence is not + strong enough to classify the transcript as lost. These transcripts may + or may not encode a functional protein.
+ "lost": typically several inactivating + mutations are present, thus there is strong evidence that the transcript + is unlikely to encode a functional protein.

+Clicking on a transcript provides additional information about the orthology +classification, inactivating mutations, the protein sequence and protein/exon +alignments. +

+ +

Data Access

+The data for this track is stored in bigBed format and can be extracted +with the command line access tool bigBedToBed, available from +the utilities download directory +hgdownload.soe.ucsc.edu/admin/exe/linux_x86_64. +

+ +

+To extract from the bigBed file: +

+  bigBedToBed "$downloadUrl" togaData.bed
+

+with the result in the togaData.bed file. +

+ +

Credits

+This data was prepared by the Michael Hiller Lab. +

+ +

References

+The TOGA software is available from +github.com/hillerlab/TOGA +

+ +

+Kirilenko BM, Munegowda C, Osipova E, Jebb D, Sharma V, Blumer M, Morales A, +Ahmed AW, Kontopoulos DG, Hilgers L, Zoonomia Consortium, Hiller M. +TOGA integrates gene annotation with orthology inference +at scale. bioRxiv preprint September 2022 +

+ +_EOF_ + ;