13ca91a9d4a1ed3169fb78c4f39be9cbe5eb4869 hiram Tue Feb 2 14:48:17 2021 -0800 scripts used on hgdownload to construct text list of hubs and top level index.html page refs #23891 diff --git src/hg/makeDb/doc/asmHubs/hubIndex.pl src/hg/makeDb/doc/asmHubs/hubIndex.pl new file mode 100755 index 0000000..45bd9cf --- /dev/null +++ src/hg/makeDb/doc/asmHubs/hubIndex.pl @@ -0,0 +1,210 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; + +my $thisMachine = `uname -n`; +chomp $thisMachine; + +if ($thisMachine ne "hgdownload") { + printf STDERR "# NOTE: This script is only used on hgdownload\n"; + exit 255; +} + +############################################################################# +sub startHtml() { +printf ' + + + + + +

U.C. Santa Cruz Genomics Institute Assembly and Track hubs

+ +' +} + +############################################################################# +sub endHtml() { +printf ' + + + + + +' +} + +############################################################################# +startHtml; + +my %expectedList = ( + "VGP" => 1, + "birds" => 1, + "fish" => 1, + "globalReference" => 1, + "mammals" => 1, + "primates" => 1, + "vertebrate" => 1, + "legacy" => 1, + "plants" => 1, +); + +my %titles = ( + "VGP" => "Vertebrate Genomes Project collection", + "birds" => "NCBI bird genomes", + "fish" => "NCBI fish genomes", + "globalReference" => "Global Human Reference genomes, January 2020", + "mammals" => "NCBI mammal genomes", + "primates" => "NCBI primate genomes", + "vertebrate" => "NCBI other vertebrate genomes", + "legacy" => "NCBI genomes legacy/superseded by newer versions", + "plants" => "NCBI plant genomes", + "gtexAnalysis" => "Genotype-Tissue Expression (GTEx) Project analysis results track hub, V6 October 2015", + "gtex" => "Genotype-Tissue Expression (GTEx) RNA-seq signal track hub, V6 October 2015", + "mouseStrains" => "16 mouse strain assembly and track hub, May 2017", + "neuroDiffCrispr" => "Structurally conserved primate cerebral cortex lincRNAs track hub, December 2018", +); + +my %newDirectories; + +my %otherTopLevels = ( + "GCA" => 1, + "GCF" => 1, + "gtex" => 1, + "gtexAnalysis" => 1, + "mouseStrains" => 1, + "neuroDiffCrispr" => 1, + "UCSC_GI.assemblyHubList.txt" => 1, + "index.html" => 1, +); + +my @orderOutHubs = ( + "primates", + "mammals", + "birds", + "fish", + "vertebrate", + "legacy", + "plants", + "VGP", + "globalReference", + "mouseStrains", +); + +my @orderOutTracks = ( + "gtexAnalysis", + "gtex", + "neuroDiffCrispr", +); + +my %indexPage = ( + "primates" => "index.html", + "mammals" => "index.html", + "birds" => "index.html", + "fish" => "index.html", + "vertebrate" => "index.html", + "legacy" => "index.html", + "plants" => "index.html", + "VGP" => "index.html", + "mouseStrains" => "hubIndex.html", + "globalReference" => "index.html", + "gtexAnalysis" => "index.html", + "gtex" => "index.html", + "neuroDiffCrispr" => "index.html", +); + +# verify all known directories and files, alert for any new ones +open (FH, "ls -d /mirrordata/hubs/*|") or die "can not ls -d /mirrordata/hubs/*"; +while (my $dirPath = ) { + chomp $dirPath; + my $fileDirName = basename($dirPath); + if (! (defined($expectedList{$fileDirName}) || defined($otherTopLevels{$fileDirName})) ) { + printf STDERR "# something new: %s\n", $fileDirName; + $newDirectories{$fileDirName} = 1; + } +} + +close (FH); + +### Determine genome counts: +my %genomeCounts; + +my $genomeCount = `grep -h ^genome /mirrordata/hubs/VGP/*enomes.txt | wc -l`; +chomp $genomeCount; +$genomeCounts{"VGP"} = $genomeCount; + +my @checkList = ('primates', 'mammals', 'birds', 'fish', 'vertebrate', 'legacy', 'plants', 'globalReference'); + +foreach my $genome (@checkList) { + $genomeCount = `grep -h ^genome /mirrordata/hubs/$genome/genomes.txt | wc -l`; + chomp $genomeCount; + $genomeCounts{$genome} = $genomeCount; +} + +my $hubCount = 0; + +printf "

Assembly hubs

\n\n"; + +printf "\n"; +printf "\n"; +printf " \n"; +printf " \n"; +printf "\n"; + +# construct table +foreach my $orderUp (@orderOutHubs) { + printf "\n"; + ++$hubCount; + if ($orderUp eq "fish") { + printf " \n", $orderUp, $indexPage{$orderUp}; + } else { + printf " \n", $orderUp, $indexPage{$orderUp}, $orderUp; + } + if (defined($genomeCounts{$orderUp})) { + printf " \n", $titles{$orderUp}, $genomeCounts{$orderUp}; + } else { + printf " \n", $titles{$orderUp}; + } + printf "\n"; +} + +printf "
hub gatewaydescription
fishes%s%s (%d assemblies)%s
\n"; + +my $totalAsmHubs = `grep -v "^#" /mirrordata/hubs/UCSC_GI.assemblyHubList.txt | wc -l`; +chomp $totalAsmHubs; +printf "

\n"; +printf "Please note: text file listing of %d NCBI/VGP genome assembly hubs\n", $totalAsmHubs; +printf "

\n"; + +printf "\n

Track hubs

\n\n"; + +printf "\n"; +printf "\n"; +printf " \n"; +printf " \n"; +printf "\n"; + +# construct table +foreach my $orderUp (@orderOutTracks) { + printf "\n"; + ++$hubCount; + if ($orderUp eq "fish") { + printf " \n", $orderUp, $indexPage{$orderUp}; + } else { + printf " \n", $orderUp, $indexPage{$orderUp}, $orderUp; + } + if (defined($genomeCounts{$orderUp})) { + printf " \n", $titles{$orderUp}, $genomeCounts{$orderUp}; + } else { + printf " \n", $titles{$orderUp}; + } + printf "\n"; +} + +printf "
hub gatewaydescription
fishes%s%s (%d assemblies)%s
\n"; + +endHtml; + +