a3221ce060e8efe2e9ef46392625606896f64f96
hiram
  Fri May 8 23:06:59 2020 -0700
do not print links to other assembly hubs from VGP index pages refs #23734

diff --git src/hg/makeDb/doc/asmHubs/mkHubIndex.pl src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
index 54600b2..a9d44b2 100755
--- src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
+++ src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
@@ -1,344 +1,346 @@
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 # Command line: exactly three arguments are required - the display name of
 # the hub, the hub directory name, and a default assembly identifier.
 my $argc = scalar(@ARGV);
 if ($argc != 3) {
   print STDERR "usage: mkHubIndex Name asmHubName defaultAssembly\n";
   print STDERR "e.g.: mkHubIndex Primates primates GCF_000001405.39_GRCh38.p13\n";
   exit 255;
 }
 my $Name = shift;
 my $asmHubName = shift;
 # NOTE(review): $defaultAssembly is accepted but not referenced by any of
 # the code visible in this file - confirm whether the hub links should use it
 my $defaultAssembly = shift;
 
 my $home = $ENV{'HOME'};
 my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs";
 # two column tab separated file: common name, asmId
 my $commonNameOrder = "$asmHubName.commonName.asmId.orderList.tsv";
 # a hub display name containing 'vgp' (any case) selects the VGP page layout
 my $vgpIndex = 0;
 $vgpIndex = 1 if ($Name =~ m/vgp/i);
 my %vgpClass;	# key is asmId, value is taxon 'class' as set by VGP project
 if ($vgpIndex) {
   my $vgpClassFile = "$home/kent/src/hg/makeDb/doc/vgpAsmHub/vgp.taxId,asmId.class.txt";
   open (my $classFh, "<", $vgpClassFile) or die "can not read $vgpClassFile: $!";
   while (my $line = <$classFh>) {
     chomp $line;	# otherwise the class value carries the line ending
     my ($taxId, $asmId, $class) = split('\t', $line);
     # ignore lines that do not supply both an asmId and a class
     next if (! defined($asmId) || ! defined($class));
     $vgpClass{$asmId} = $class;
   }
   close ($classFh);
 }
 
 my @orderList;	# asmId of the assemblies in order from the *.list files
 # the order to read the different .list files:
 my $assemblyCount = 0;
 my %betterName;	# key is asmId, value is a common name better than found
 			# in assembly_report file
 
 ##############################################################################
 # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters:
 ##############################################################################
 sub commify($) {
     # Insert a comma after every group of three digits, counting from the
     # right-hand end of the integer part.  The digits are scanned in
     # reversed order so the groups can be matched left to right; the
     # negative lookahead keeps digits that follow a decimal point from
     # being grouped.
     my ($number) = @_;
     my $backwards = reverse $number;
     $backwards =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
     my $withCommas = reverse $backwards;
     return $withCommas;
 }
 
 ##############################################################################
 ### start the HTML output
 ##############################################################################
 sub startHtml() {
 # Print the top of the index page to STDOUT: the page title section
 # followed by the 'How to view the hub' instructions and the notes that
 # precede the table.
 # Reads the file level variables $vgpIndex, $Name and $asmHubName.
 # When $vgpIndex is set the title section is the fixed VGP heading with
 # logo, otherwise it is a generic heading built from $Name.
 # The \$ROOT in the here documents is escaped so that the text '$ROOT'
 # is printed literally for the server side include.
 # NOTE(review): the 'attach the entire set' links always carry
 # genome=GCF_000001405.39 whatever the hub is - confirm whether the
 # default assembly given on the command line was meant to be used there.
 
 # <html xmlns="http://www.w3.org/1999/xhtml">
 
 my $subSetMessage = "subset of $asmHubName only";
 if ($asmHubName eq "vertebrate") {
    $subSetMessage = "subset of other ${asmHubName}s only";
 }
 
 if ($vgpIndex) {
   print <<"END";
 <!DOCTYPE HTML 4.01 Transitional>
 <!--#set var="TITLE" value="VGP - Vertebrate Genomes Project assembly hub" -->
 <!--#set var="ROOT" value="../.." -->
 
 <!--#include virtual="\$ROOT/inc/gbPageStartHardcoded.html" -->
 
 <h1>VGP - Vertebrate Genomes Project assembly hub</h1>
 <p>
 <a href='https://vertebrategenomesproject.org/' target=_blank>
 <img src='VGPlogo.png' width=280 alt='VGP logo'></a></p>
 <p>
 This assembly hub contains assemblies released
 by the <a href='https://vertebrategenomesproject.org/' target=_blank>
 Vertebrate Genomes Project.</a>
 </p>
 
 END
 } else {
   print <<"END";
 <!DOCTYPE HTML 4.01 Transitional>
 <!--#set var="TITLE" value="$Name genomes assembly hubs" -->
 <!--#set var="ROOT" value="../.." -->
 
 <!--#include virtual="\$ROOT/inc/gbPageStartHardcoded.html" -->
 
 <h1>$Name Genomes assembly hubs</h1>
 <p>
 Assemblies from NCBI/Genbank/Refseq sources, $subSetMessage.
 </p>
 
 END
 }
 
 print <<"END";
 <h3>How to view the hub</h3>
 <p>
 Options:
 <ol>
   <li>The links to the genome browser in the table below will attach that
       one specific assembly to the genome browser.  This is most likely what
       you want.</li>
   <li>Instead, you can attach the entire set of assemblies as one group to
       the genome browser with the following links depending upon which of
       our mirror site browsers you prefer to use:
     <ul>
     <li><a href="https://genome.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&amp;genome=GCF_000001405.39"
         target="_blank">genome.ucsc.edu</a></li>
     <li><a href="https://genome-euro.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&amp;genome=GCF_000001405.39"
         target="_blank">genome-euro.ucsc.edu</a></li>
     <li><a href="https://genome-asia.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&amp;genome=GCF_000001405.39"
         target="_blank">genome-asia.ucsc.edu</a></li>
     </ul>
   </li>
   <li>To manually attach all the assemblies in this hub to genome browsers
       that are not one of the three UCSC mirror sites:
     <ol>
       <li>From the blue navigation bar, go to
     <em><strong>My Data</strong> -&gt; <strong>Track Hubs</strong></em></li>
       <li>Then select the <strong>My Hubs</strong> tab and enter this URL into
           the textbox:
     <br><code>https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt</code></li>
       <li> Once you have added the URL to the entry form,
            press the <em><strong>Add Hub</strong></em> button to add the hub.</li>
     </ol>
   </li>
 </ol>
 </p>
 
 <p>
 After adding the hub, you will be redirected to the gateway page.  The
 genome assemblies can be selected from the
 <em>${Name} Hub Assembly</em> dropdown menu.
 Instead of adding all the assemblies in one collected group, use the individual
 <em>view in browser</em> in the table below.
 </p>
 <h3>See also: <a href='asmStats.html'>assembly statistics</a>,&nbsp;<a href='trackData.html'>track statistics</a> <== additional information for these assemblies.</h3><br>
 <h3>Data resource links</h3>
 <p>
 NOTE: <em>Click on the column headers to sort the table by that column</em><br>
 The <em>common name/view in browser</em> will attach only that single assembly to
 the genome browser.<br>
 The <em>scientific name/and data download</em> link provides access to the files for that one
 assembly hub.<br>
 The <em>class/VGP link</em> provides access to the VGP GenomeArk page for that genome<br>
 The other links provide access to NCBI resources for these assemblies.
 </p>
 END
 }	#	sub startHtml()
 
 ##############################################################################
 ### start the table output
 ##############################################################################
 sub startTable() {
 # Print the opening of the sortable table and its header row to STDOUT.
 # The header row is assembled first: the seven fixed columns, then the
 # extra class/VGP column when the file level $vgpIndex flag is set, and
 # finally the tags that close the header and open the table body.
 my $headerRow = '
 <table class="sortable" border="1">
 <thead><tr><th>count</th>
   <th>common&nbsp;name&nbsp;and<br>view&nbsp;in&nbsp;browser</th>
   <th>scientific name<br>and&nbsp;data&nbsp;download</th>
   <th>NCBI&nbsp;assembly</th>
   <th>bioSample</th><th>bioProject</th>
   <th>assembly&nbsp;date,<br>source&nbsp;link</th>
 ';
 
 $headerRow .= "<th>class<br>VGP&nbsp;link</th>\n" if ($vgpIndex);
 print $headerRow, "</tr></thead><tbody>\n";
 }	#	sub startTable()
 
 ##############################################################################
 ### end the table output
 ##############################################################################
 sub endTable() {
   # Print, after one empty line, the tags that close the table body and
   # the table itself.
   print "\n", "</tbody>\n", "</table>\n";
 }	#	sub endTable()
 
 ##############################################################################
 ### end the HTML output
 ##############################################################################
 sub endHtml() {
 # Print the bottom of the index page to STDOUT.
 # First an optional three row table of links to the primates, mammals,
 # birds, fish and vertebrate hubs: one row each for their index.html,
 # asmStats.html and trackData.html pages.  Then the tags that close the
 # page: two closing divs, the footer include, the sorttable.js script
 # tag, and the closing body and html tags.
 # Reads the file level variables $vgpIndex and $asmHubName.
 
-if ($asmHubName ne "viral") {
+# do not print these links for VGP index
+
+if ((0 == $vgpIndex) && ($asmHubName ne "viral")) {
   printf "<p>\n<table border='1'><thead>\n";
   printf "<tr><th colspan=6 style='text-align:center;'>Additional hubs with collections of assemblies</th></tr>\n";
   # row 1: the index page of each of the other hubs
   printf "<tr><th>Assembly hubs index pages:&nbsp;</th>\n";
   printf "<th><a href='../primates/index.html'>Primates</a></th>\n";
   printf "<th><a href='../mammals/index.html'>Mammals</a></th>\n";
   printf "<th><a href='../birds/index.html'>Birds</a></th>\n";
   printf "<th><a href='../fish/index.html'>Fish</a></th>\n";
   printf "<th><a href='../vertebrate/index.html'>other vertebrates</a></th>\n";
 
   # row 2: the assembly statistics page of each of the other hubs
   printf "</tr><tr>\n";
   printf "<th>Hubs assembly statistics:&nbsp;</th>\n";
   printf "<th><a href='../primates/asmStats.html'>Primates</a></th>\n";
   printf "<th><a href='../mammals/asmStats.html'>Mammals</a></th>\n";
   printf "<th><a href='../birds/asmStats.html'>Birds</a></th>\n";
   printf "<th><a href='../fish/asmStats.html'>Fish</a></th>\n";
   printf "<th><a href='../vertebrate/asmStats.html'>other vertebrates</a></th>\n";
 
   # row 3: the track statistics page of each of the other hubs
   printf "</tr><tr>\n";
   printf "<th>Hubs track statistics:&nbsp;</th>\n";
   printf "<th><a href='../primates/trackData.html'>Primates</a></th>\n";
   printf "<th><a href='../mammals/trackData.html'>Mammals</a></th>\n";
   printf "<th><a href='../birds/trackData.html'>Birds</a></th>\n";
   printf "<th><a href='../fish/trackData.html'>Fish</a></th>\n";
   printf "<th><a href='../vertebrate/trackData.html'>other vertebrates</a></th>\n";
 
   printf "</tr></thead>\n</table>\n</p>\n";
 }
 
 # the \$ROOT below is escaped so the text '$ROOT' is printed literally
 print <<"END"
 </div><!-- closing gbsPage from gbPageStartHardcoded.html -->
 </div><!-- closing container-fluid from gbPageStartHardcoded.html -->
 <!--#include virtual="\$ROOT/inc/gbFooterHardcoded.html"-->
 <script type="text/javascript" src="/js/sorttable.js"></script>
 </body></html>
 END
 }	#	sub endHtml()
 
 ##############################################################################
 ### tableContents()
 ##############################################################################
 sub tableContents() {
   # Print one table row to STDOUT for each assembly in the file level
   # @orderList, walking the list in reverse order.
   # For every asmId the build directory is derived from the identifier
   # (refseqBuild by default, genbankBuild for a GCA prefix).  An assembly
   # whose trackDb file is missing or empty is skipped as not yet built.
   # The assembly date, BioSample, BioProject, organism name and Taxid are
   # taken from the first matching line of each kind in the
   # *_assembly_report.txt file; an item that never appears keeps the
   # value "notFound".
   # Reads the file level %betterName, $vgpIndex and %vgpClass.
   # Dies when the assembly report can not be opened.
   my $rowCount = 0;
   foreach my $asmId (reverse(@orderList)) {
     my ($gcPrefix, $asmAcc) = split('_', $asmId, 3);
     my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc);
     # directory levels are the three character pieces of the asmId that
     # start at offsets 0, 4, 7 and 10
     my $accessionDir = join("/", map { substr($asmId, $_, 3) } (0, 4, 7, 10));
     my $ncbiFtpLink = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$accessionDir/$asmId";
     my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId";
     if ($gcPrefix eq "GCA") {
      $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId";
     }
     my $asmReport="$buildDir/download/${asmId}_assembly_report.txt";
     my $trackDb="$buildDir/${asmId}.trackDb.txt";
     next if (! -s "$trackDb");	# assembly build not complete
     my $sciName = "notFound";
     my $commonName = "notFound";
     my $bioSample = "notFound";
     my $bioProject = "notFound";
     my $taxId = "notFound";
     my $asmDate = "notFound";
     my $itemsFound = 0;
     open (my $reportFh, "<", $asmReport) or die "can not read $asmReport: $!";
     while (my $line = <$reportFh>) {
       # five different items are collected, each only once, so nothing
       # more can change after the fifth has been found
       last if ($itemsFound >= 5);
       chomp $line;
       $line =~ s/\r//g;	# remove carriage returns
       $line =~ s/\s+$//g;
       if ($line =~ m/Date:/) {
         if ($asmDate =~ m/notFound/) {
            ++$itemsFound;
            $line =~ s/.*:\s+//;
            my @ymd = split('-', $line);
            $asmDate = sprintf("%04d-%02d-%02d", $ymd[0], $ymd[1], $ymd[2]);
         }
       } elsif ($line =~ m/BioSample:/) {
         if ($bioSample =~ m/notFound/) {
            ++$itemsFound;
            $bioSample = $line;
            $bioSample =~ s/.*:\s+//;
         }
       } elsif ($line =~ m/BioProject:/) {
         if ($bioProject =~ m/notFound/) {
            ++$itemsFound;
            $bioProject = $line;
            $bioProject =~ s/.*:\s+//;
         }
       } elsif ($line =~ m/Organism name:/) {
         if ($sciName =~ m/notFound/) {
            ++$itemsFound;
            # the common name is the text inside the parentheses, the
            # scientific name is the text before them
            $commonName = $line;
            $sciName = $line;
            $commonName =~ s/.*\(//;
            $commonName =~ s/\)//;
            $commonName = $betterName{$asmId} if (exists($betterName{$asmId}));
            $sciName =~ s/.*:\s+//;
            $sciName =~ s/\s+\(.*//;
         }
       } elsif ($line =~ m/Taxid:/) {
         if ($taxId =~ m/notFound/) {
            ++$itemsFound;
            $taxId = $line;
            $taxId =~ s/.*:\s+//;
         }
       }
     }
     close ($reportFh);
     my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId";
     printf "<tr><td align=right>%d</td>\n", ++$rowCount;
 ###    printf "<td align=center><a href='https://genome.ucsc.edu/cgi-bin/hgGateway?hubUrl=%s/hub.txt&amp;genome=%s&amp;position=lastDbPos' target=_blank>%s</a></td>\n", $hubUrl, $accessionId, $commonName;
     printf "<td align=center><a href='https://genome.ucsc.edu/h/%s' target=_blank>%s</a></td>\n", $accessionId, $commonName;
     printf "    <td align=center><a href='%s/' target=_blank>%s</a></td>\n", $hubUrl, $sciName;
     printf "    <td align=left><a href='https://www.ncbi.nlm.nih.gov/assembly/%s_%s/' target=_blank>%s</a></td>\n", $gcPrefix, $asmAcc, $asmId;
     if ( $bioSample ne "notFound" ) {
     printf "    <td align=left><a href='https://www.ncbi.nlm.nih.gov/biosample/?term=%s' target=_blank>%s</a></td>\n", $bioSample, $bioSample;
     } else {
     printf "    <td align=left>n/a</td>\n";
     }
     # same treatment as the bioSample column: no link for a missing value
     if ( $bioProject ne "notFound" ) {
     printf "    <td align=left><a href='https://www.ncbi.nlm.nih.gov/bioproject/?term=%s' target=_blank>%s</a></td>\n", $bioProject, $bioProject;
     } else {
     printf "    <td align=left>n/a</td>\n";
     }
     printf "    <td align=center><a href='%s' target=_blank>%s</a></td>\n", $ncbiFtpLink, $asmDate;
     if ($vgpIndex) {
       my $sciNameUnderscore = $sciName;
       $sciNameUnderscore =~ s/ /_/g;
       # this one species is linked under a different spelling
       $sciNameUnderscore = "Strigops_habroptilus" if ($sciName =~ m/Strigops habroptila/);
       # an assembly absent from the class table shows n/a instead of
       # printing an undefined value
       my $class = defined($vgpClass{$asmId}) ? $vgpClass{$asmId} : "n/a";
       $class =~ s/\s+$//;	# the class may still carry its line ending
 
       printf "    <td align=center><a href='https://vgp.github.io/genomeark/%s/' target=_blank>%s</a></td>\n", $sciNameUnderscore, $class;
     }
     printf "</tr>\n";
   }
 }	#	sub tableContents()
 
 ##############################################################################
 ### main()
 ##############################################################################
 
 # Read the ordered assembly list, a tab separated file of common name and
 # asmId.  Lines beginning with '#' are comments.  Each asmId is appended
 # to @orderList and its common name recorded in %betterName.
 my $orderListFile = "$toolsDir/${commonNameOrder}";
 open (my $orderFh, "<", $orderListFile) or die "can not read $orderListFile: $!";
 while (my $line = <$orderFh>) {
   next if ($line =~ m/^#/);
   chomp $line;
   my ($commonName, $asmId) = split('\t', $line);
   # a blank or single column line has no asmId to work with
   next if (! defined($asmId) || $asmId eq "");
   push @orderList, $asmId;
   $betterName{$asmId} = $commonName;
   ++$assemblyCount;
 }
 close ($orderFh);
 
 # the page is written to STDOUT top to bottom
 startHtml();
 startTable();
 tableContents();
 endTable();
 endHtml();