595d44078d99eb1f717f097e2e4c886c450b2fe1 hiram Tue Mar 28 12:19:29 2023 -0700 adding download links on the gene track description pages for the GTF file no redmine diff --git src/hg/utils/automation/asmHubAugustusGene.pl src/hg/utils/automation/asmHubAugustusGene.pl index fad0f2f..1f65119 100755 --- src/hg/utils/automation/asmHubAugustusGene.pl +++ src/hg/utils/automation/asmHubAugustusGene.pl @@ -6,33 +6,37 @@ use lib "$Bin"; use AsmHub; use File::Basename; my $argc = scalar(@ARGV); if ($argc != 3) { printf STDERR "usage: asmHubAugustusGene.pl asmId asmId.names.tab bbi/asmId\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "and asmId.names.tab is naming file for this assembly,\n"; printf STDERR "and bbi/asmId is the path prefix to .augustus.bb.\n"; exit 255; } my $asmId = shift; +my @parts = split('_', $asmId, 3); +my $accession = "$parts[0]_$parts[1]"; my $namesFile = shift; my $bbiPrefix = shift; my $augustusBbi = "$bbiPrefix.augustus.bb"; +my $asmIdPath = &AsmHub::asmIdToPath($asmId); +my $downloadGtf = "https://hgdownload.soe.ucsc.edu/hubs/$asmIdPath/$accession/genes/$asmId.augustus.gtf.gz"; if ( ! -s $augustusBbi ) { printf STDERR "ERROR: can not find augustus bbi file:\n\t'%s'\n", $augustusBbi; exit 255; } my $em = "<em>"; my $noEm = "</em>"; my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; chomp $assemblyDate; my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; chomp $ncbiAssemblyId; my $organism = `grep -v "^#" $namesFile | cut -f5`; chomp $organism; @@ -40,30 +44,35 @@ chomp $geneCount; print <<_EOF_ <h2>Description</h2> <p> This track shows <i>ab initio</i> predictions from the program <a href="http://bioinf.uni-greifswald.de/augustus/" target="_blank">AUGUSTUS</a> (version 3.1). for the $assemblyDate $em${organism}$noEm/$asmId genome assembly.<br> <br> The predictions are based on the genome sequence alone.<br> <br> $geneCount </p> +<h2>Data Access</h2> +<p> +Download <a href='$downloadGtf' target=_blank> $asmId.augustus.gtf.gz </a> GTF file. +</p> + <h2>Methods</h2> <p> Statistical signal models were built for splice sites, branch-point patterns, translation start sites, and the poly-A signal. Furthermore, models were built for the sequence content of protein-coding and non-coding regions as well as for the length distributions of different exon and intron types. Detailed descriptions of most of these different models can be found in Mario Stanke's <a href="http://ediss.uni-goettingen.de/handle/11858/00-1735-0000-0006-B3F8-4" target="_blank">dissertation</a>. This track shows the most likely gene structure according to a Semi-Markov Conditional Random Field model. Alternative splicing transcripts were obtained with a sampling algorithm (<tt>--alternatives-from-sampling=true --sample=100 --minexonintronprob=0.2 --minmeanexonintronprob=0.5 --maxtracks=3 --temperature=2</tt>).