afee0fcf2a7b7de2eb37e1420dc126607373453e hiram Wed May 27 17:31:45 2020 -0700 rework scripts to allow subset hubs to be created from a list of assemblies refs #23734 diff --git src/hg/makeDb/doc/asmHubs/mkHubIndex.pl src/hg/makeDb/doc/asmHubs/mkHubIndex.pl index a9d44b2..c88c340 100755 --- src/hg/makeDb/doc/asmHubs/mkHubIndex.pl +++ src/hg/makeDb/doc/asmHubs/mkHubIndex.pl @@ -1,51 +1,58 @@ #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); -if ($argc != 3) { - printf STDERR "mkAsmStats Name asmName\n"; - printf STDERR "e.g.: mkHubIndex Primates primates GCF_000001405.39_GRCh38.p13\n"; +if ($argc != 4) { + printf STDERR "mkHubIndex.pl Name asmName defaultAsmId [two column name list] > index.html\n"; + printf STDERR "e.g.: mkHubIndex Primates primates GCF_000001405.39_GRCh38.p13 primates.commonName.asmId.orderList.tsv\n"; + printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n"; + printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n"; + printf STDERR "column 2: common name for species, columns separated by tab\n"; + printf STDERR "The result prints to stdout the index.html page for this set of assemblies\n"; exit 255; } + my $Name = shift; my $asmHubName = shift; my $defaultAssembly = shift; +my $commonNameOrder = shift; + +printf STDERR "# mkHubIndex %s %s %s %s\n", $Name, $asmHubName, $defaultAssembly, $commonNameOrder; my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; -my $commonNameOrder = "$asmHubName.commonName.asmId.orderList.tsv"; my $vgpIndex = 0; $vgpIndex = 1 if ($Name =~ m/vgp/i); my %vgpClass; # key is asmId, value is taxon 'class' as set by VGP project if ($vgpIndex) { my $vgpClass = "$home/kent/src/hg/makeDb/doc/vgpAsmHub/vgp.taxId,asmId.class.txt"; open (FH, "<$vgpClass") or die "can not read $vgpClass"; while (my $line = <FH>) { my ($taxId, $asmId, $class) = split('\t', $line); $vgpClass{$asmId} = $class; } close (FH); } my @orderList; # asmId of the assemblies in order from the *.list files # the order to read the different .list files: my $assemblyCount = 0; -my %betterName; # key is asmId, value is a common name better than found - # in assembly_report file +my %commonName; # key is asmId, value is a common name, perhaps more appropriate + # than found in assembly_report file ############################################################################## # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: ############################################################################## sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } ############################################################################## ### start the HTML output ############################################################################## sub startHtml() { @@ -94,35 +101,35 @@ END } print <<"END" <h3>How to view the hub</h3> <p> Options: <ol> <li>The links to the genome browser in the table below will attach that one specific assembly to the genome browser. This is most likely what you want.</li> <li>Instead, you can attach the entire set of assemblies as one group to the genome browser with the following links depending upon which of our mirror site browsers you prefer to use: <ul> - <li><a href="https://genome.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&genome=GCF_000001405.39" + <li><a href="https://genome.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&genome=$defaultAssembly" target="_blank">genome.ucsc.edu</a></li> - <li><a href="https://genome-euro.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&genome=GCF_000001405.39" + <li><a href="https://genome-euro.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&genome=$defaultAssembly" target="_blank">genome-euro.ucsc.edu</a></li> - <li><a href="https://genome-asia.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&genome=GCF_000001405.39" + <li><a href="https://genome-asia.ucsc.edu/cgi-bin/hgGateway?hubUrl=https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt&genome=$defaultAssembly" target="_blank">genome-asia.ucsc.edu</a></li> </ul> </li> <li>To manually attach all the assemblies in this hub to genome browsers that are not one of the three UCSC mirror sites: <ol> <li>From the blue navigation bar, go to <em><strong>My Data</strong> -> <strong>Track Hubs</strong></em></li> <li>Then select the <strong>My Hubs</strong> tab and enter this URL into the textbox: <br><code>https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt</code></li> <li> Once you have added the URL to the entry form, press the <em><strong>Add Hub</strong></em> button to add the hub.</li> </ol> </li> @@ -220,31 +227,31 @@ print <<"END" </div><!-- closing gbsPage from gbPageStartHardcoded.html --> </div><!-- closing container-fluid from gbPageStartHardcoded.html --> <!--#include virtual="\$ROOT/inc/gbFooterHardcoded.html"--> <script type="text/javascript" src="/js/sorttable.js"></script> </body></html> END } # sub endHtml() ############################################################################## ### tableContents() ############################################################################## sub tableContents() { my $rowCount = 0; - foreach my $asmId (reverse(@orderList)) { + foreach my $asmId (@orderList) { my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); my $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); my $ncbiFtpLink = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$accessionDir/$asmId"; my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; if ($gcPrefix eq "GCA") { $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId"; } my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; my $trackDb="$buildDir/${asmId}.trackDb.txt"; next if (! -s "$trackDb"); # assembly build not complete my $chromSizes="${buildDir}/${asmId}.chrom.sizes"; @@ -275,72 +282,80 @@ $bioSample =~ s/.*:\s+//; } } elsif ($line =~ m/BioProject:/) { if ($bioProject =~ m/notFound/) { ++$itemsFound; $bioProject = $line; $bioProject =~ s/.*:\s+//; } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*\(//; $commonName =~ s/\)//; - $commonName = $betterName{$asmId} if (exists($betterName{$asmId})); + $commonName = $commonName{$asmId} if (exists($commonName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; } } elsif ($line =~ m/Taxid:/) { if ($taxId =~ m/notFound/) { ++$itemsFound; $taxId = $line; $taxId =~ s/.*:\s+//; } } } close (FH); my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; printf "<tr><td align=right>%d</td>\n", ++$rowCount; ### printf "<td align=center><a href='https://genome.ucsc.edu/cgi-bin/hgGateway?hubUrl=%s/hub.txt&genome=%s&position=lastDbPos' target=_blank>%s</a></td>\n", $hubUrl, $accessionId, $commonName; printf "<td align=center><a href='https://genome.ucsc.edu/h/%s' target=_blank>%s</a></td>\n", $accessionId, $commonName; printf " <td align=center><a href='%s/' target=_blank>%s</a></td>\n", $hubUrl, $sciName; printf " <td align=left><a href='https://www.ncbi.nlm.nih.gov/assembly/%s_%s/' target=_blank>%s</a></td>\n", $gcPrefix, $asmAcc, $asmId; if ( $bioSample ne "notFound" ) { printf " <td align=left><a href='https://www.ncbi.nlm.nih.gov/biosample/?term=%s' target=_blank>%s</a></td>\n", $bioSample, $bioSample; } else { printf " <td align=left>n/a</td>\n"; } + if ($bioProject eq "notFound") { + printf " <td align=left>%s</td>\n", $bioProject; + } else { printf " <td align=left><a href='https://www.ncbi.nlm.nih.gov/bioproject/?term=%s' target=_blank>%s</a></td>\n", $bioProject, $bioProject; + } printf " <td align=center><a href='%s' target=_blank>%s</a></td>\n", $ncbiFtpLink, $asmDate; if ($vgpIndex) { my $sciNameUnderscore = $sciName; $sciNameUnderscore =~ s/ /_/g; $sciNameUnderscore = "Strigops_habroptilus" if ($sciName =~ m/Strigops habroptila/); + if (! defined($vgpClass{$asmId})) { + printf STDERR "# ERROR: no 'class' defined for VGP assembly %s\n", $asmId; + exit 255; + } printf " <td align=center><a href='https://vgp.github.io/genomeark/%s/' target=_blank>%s</a></td>\n", $sciNameUnderscore, $vgpClass{$asmId} } printf "</tr>\n"; } } # sub tableContents() ############################################################################## ### main() ############################################################################## open (FH, "<$toolsDir/${commonNameOrder}") or die "can not read ${commonNameOrder}"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; - my ($commonName, $asmId) = split('\t', $line); + my ($asmId, $commonName) = split('\t', $line); push @orderList, $asmId; - $betterName{$asmId} = $commonName; + $commonName{$asmId} = $commonName; ++$assemblyCount; } close (FH); startHtml(); startTable(); tableContents(); endTable(); endHtml();