982d69ac4a666c4ff32f7ed167f788b81d960285
hiram
Wed Jun 17 12:30:05 2020 -0700
reduce the vertical space consumed by options and explanations refs #25555
diff --git src/hg/makeDb/doc/asmHubs/mkHubIndex.pl src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
index e2ee199..719c1a2 100755
--- src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
+++ src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
@@ -1,344 +1,349 @@
#!/usr/bin/env perl
#
# mkHubIndex.pl - construct index.html page for a set of assemblies in a hub
#
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use commonHtml;
my $argc = scalar(@ARGV);
# All four positional arguments are mandatory.  When the count is wrong,
# describe the calling convention on STDERR and stop with exit status 255.
unless ($argc == 4) {
  my @usageLines = (
    "mkHubIndex.pl Name asmName defaultAsmId [two column name list] > index.html\n",
    "e.g.: mkHubIndex Primates primates GCF_000001405.39_GRCh38.p13 primates.commonName.asmId.orderList.tsv\n",
    "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n",
    "\nthe two columns are 1: asmId (accessionId_assemblyName)\n",
    "column 2: common name for species, columns separated by tab\n",
    "The result prints to stdout the index.html page for this set of assemblies\n",
  );
  print STDERR @usageLines;
  exit 255;
}
# The default order lists live in the kent source tree under $HOME.
my $home = $ENV{'HOME'};
my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs";

# Positional arguments, in the order given by the usage message above.
my $Name = shift;		# hub title, e.g. Primates
my $asmHubName = shift;		# hub directory name, e.g. primates
my $defaultAssembly = shift;	# e.g. GCF_000001405.39_GRCh38.p13
my $inputList = shift;		# two column list: asmId <tab> common name

# Use the list as given when it exists and is not empty, otherwise fall
# back to the file of the same name in $toolsDir.
my $orderList = $inputList;
if ( ! -s "$orderList" ) {
  # The path has to be built inside a string: a bare $toolsDir/$inputList
  # is a numeric division of the two values, not a path join.
  $orderList = "$toolsDir/$inputList";
}
printf STDERR "# mkHubIndex %s %s %s %s\n", $Name, $asmHubName, $defaultAssembly, $orderList;

# VGP mode is selected by a hub name containing 'vgp' (any case).
my $vgpIndex = 0;
$vgpIndex = 1 if ($Name =~ m/vgp/i);
my %vgpClass; # key is asmId, value is taxon 'class' as set by VGP project
if ($vgpIndex) {
  my $vgpClassFile = "$home/kent/src/hg/makeDb/doc/vgpAsmHub/vgp.taxId.asmId.class.txt";
  open (my $classFh, '<', $vgpClassFile) or die "can not read $vgpClassFile: $!";
  while (my $line = <$classFh>) {
    chomp $line;	# keep the line ending out of the stored class value
    # three tab separated columns: taxId, asmId, class (taxId is not used)
    my (undef, $asmId, $class) = split('\t', $line);
    next if (! defined($asmId));	# blank or single column line
    $vgpClass{$asmId} = defined($class) ? $class : "";
  }
  close ($classFh);
}

my @orderList; # asmId of the assemblies, in the order read from $orderList
my $assemblyCount = 0;
my %commonName; # key is asmId, value is a common name, perhaps more appropriate
# than found in assembly_report file
##############################################################################
# from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters:
##############################################################################
sub commify($) {
  # Return the argument with a comma between each group of three digits
  # of the integer part, e.g. 1234567 becomes 1,234,567.  The work is done
  # on the reversed string so the groups are counted from the right; the
  # negative look-ahead leaves digits after a decimal point alone.
  my ($number) = @_;
  (my $backwards = scalar reverse($number)) =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
  return scalar reverse($backwards);
}
##############################################################################
### start the HTML output
##############################################################################
sub startHtml() {
# Print the opening part of the index page to stdout: an introduction
# (VGP wording when $vgpIndex is set, otherwise built from $Name and
# $subSetMessage) followed by the "How to view the hub" instructions.
# Takes no arguments; reads the file-level $Name, $asmHubName, $orderList
# and $vgpIndex.
#
# today's date from the shell 'date "+%F"' command
# NOTE(review): $timeStamp is not referenced again in the code visible here
my $timeStamp = `date "+%F"`;
chomp $timeStamp;
#
my $subSetMessage = "subset of $asmHubName only";
# the hub named exactly "vertebrate" gets its own wording
if ($asmHubName eq "vertebrate") {
$subSetMessage = "subset of other ${asmHubName}s only";
}
if ($vgpIndex) {
# The order list file name selects the description of the VGP subset,
# primary assemblies being the default.
# NOTE(review): the '.' in these patterns is not escaped, so it matches
# any single character, not only a literal dot.
my $vgpSubset = "(set of primary assemblies)";
if ($orderList =~ m/vgp.alternate/) {
$vgpSubset = "(set of alternate/haplotype assemblies)";
} elsif ($orderList =~ m/vgp.trio/) {
$vgpSubset = "(set of trio assemblies, maternal/paternal)";
} elsif ($orderList =~ m/vgp.legacy/) {
$vgpSubset = "(set of legacy/superseded assemblies)";
}
print <<"END"
VGP - Vertebrate Genomes Project assembly hub

This assembly hub contains assemblies released
by the
Vertebrate Genomes Project. $vgpSubset
END
} else {
print <<"END"
$Name Genomes assembly hubs
Assemblies from NCBI/Genbank/Refseq sources, $subSetMessage.
END
}
# Instructions common to both kinds of hub; $asmHubName and $Name are
# interpolated into the text.
# NOTE(review): the text printed by the heredocs in this sub looks like it
# has lost its HTML markup, and some lines below start with what appear to
# be diff '-'/'+' prefixes - confirm against the upstream file.
print <<"END"
How to view the hub
Options:
- The links to the genome browser in the table below will attach that
one specific assembly to the genome browser. This is most likely what
- you want.
- - Instead, you can attach the entire set of assemblies as one group to
- the genome browser with the following links depending upon which of
- our mirror site browsers you prefer to use:
-
+ you want. Alternatively, the entire set of assemblies can be attached
+ as one group to the genome browser with the following links depending
+ upon which of our mirror site browsers you prefer to use:
+
- To manually attach all the assemblies in this hub to genome browsers
that are not one of the three UCSC mirror sites:
- From the blue navigation bar, go to
My Data -> Track Hubs
- Then select the My Hubs tab and enter this URL into
the textbox:
https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt
- Once you have added the URL to the entry form,
press the Add Hub button to add the hub.
After adding the hub, you will be redirected to the gateway page. The
genome assemblies can be selected from the
${Name} Hub Assembly dropdown menu.
Instead of adding all the assemblies in one collected group, use the individual
view in browser in the table below.
Data resource links
-
NOTE: Click on the column headers to sort the table by that column
-The common name/view in browser will attach only that single assembly to
+The common name and view in browser will attach only that single assembly to
the genome browser.
-The scientific name/and data download link provides access to the files for that one
+The scientific name and data download link provides access to the files for that one
assembly hub.
-The class/VGP link provides access to the VGP GenomeArk page for that genome
+The class VGP link provides access to the VGP GenomeArk page for that genome.
The other links provide access to NCBI resources for these assemblies.
-
END
} # sub startHtml()
##############################################################################
### start the table output
##############################################################################
sub startTable() {
# Print the table header to stdout: one label per column, plus an extra
# 'class VGP link' column when the file-level $vgpIndex is set.
# Takes no arguments.
# NOTE(review): the literal below looks like it has lost its HTML markup and
# carries what appear to be diff '-'/'+' prefixes on the BioSample line -
# confirm against the upstream file.
print '
count |
common name and view in browser |
scientific name and data download |
NCBI assembly |
- bioSample | bioProject |
+ BioSample | BioProject |
assembly date, source link |
';
# the extra column is only present on VGP index pages
if ($vgpIndex) {
printf "class VGP link | \n";
}
print "
\n";
} # sub startTable()
##############################################################################
### end the table output
##############################################################################
sub endTable() {
# Print whatever closes the table opened by startTable().  Takes no
# arguments.
# NOTE(review): the heredoc is empty as shown here, so nothing is printed;
# presumably the closing table markup belongs in it - confirm against the
# upstream file.
print <<"END"
END
} # sub endTable()
##############################################################################
### end the HTML output
##############################################################################
sub endHtml() {
# Finish the page by calling commonHtml::otherHubLinks and then
# commonHtml::htmlFooter, each with the file-level $vgpIndex and
# $asmHubName.  Takes no arguments.
# The calls use the &name(...) form, which bypasses any prototype declared
# on the called subs.
&commonHtml::otherHubLinks($vgpIndex, $asmHubName);
&commonHtml::htmlFooter($vgpIndex, $asmHubName);
} # sub endHtml()
##############################################################################
### tableContents()
##############################################################################
sub tableContents() {
  # Print one table row to stdout for each assembly in the file-level
  # @orderList, in that order.  Assemblies without a non-empty trackDb.txt
  # in their build directory are skipped as not yet built.  The values in a
  # row come from the assembly_report.txt file in the build directory; the
  # common name is replaced by the file-level %commonName entry when one
  # exists.  In VGP mode (file-level $vgpIndex) a class column is added from
  # %vgpClass, and a missing class stops the script with exit status 255.
  # Takes no arguments.  Dies when an assembly_report file can not be opened.
  #
  # NOTE(review): several printf format strings below have more arguments
  # than conversions; the HTML markup that used those arguments appears to
  # be missing from this copy of the file and has to be restored from the
  # upstream source.  The strings are left exactly as found.
  my $itemsWanted = 5;	# Date, BioSample, BioProject, Organism name, Taxid
  my $rowCount = 0;
  foreach my $asmId (@orderList) {
    # asmId is accession_assemblyName, e.g. GCF_000001405.39_GRCh38.p13;
    # the limit of 3 keeps underscores inside the assembly name out of
    # $asmAcc
    my ($gcPrefix, $asmAcc) = split('_', $asmId, 3);
    my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc);
    # prefix plus three groups of three characters, e.g. GCF/000/001/405
    my $accessionDir = join("/", map { substr($asmId, $_, 3) } (0, 4, 7, 10));
    my $ncbiFtpLink = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$accessionDir/$asmId";
    my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId";
    if ($gcPrefix eq "GCA") {
      $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId";
    }
    my $asmReport="$buildDir/download/${asmId}_assembly_report.txt";
    my $trackDb="$buildDir/${asmId}.trackDb.txt";
    next if (! -s "$trackDb");	# assembly build not complete
    my $sciName = "notFound";
    my $commonName = "notFound";
    my $bioSample = "notFound";
    my $bioProject = "notFound";
    my $taxId = "notFound";
    my $asmDate = "notFound";
    my $itemsFound = 0;
    open (my $reportFh, '<', $asmReport) or die "can not read $asmReport: $!";
    while (my $line = <$reportFh>) {
      # Every item is recorded once only, so nothing can change after all
      # of them are found (the former test of '> 5' could never be true
      # with five items and the whole file was always read).
      last if ($itemsFound >= $itemsWanted);
      chomp $line;
      $line =~ s/[\r\n]//g;	# remove any carriage return or line feed
      $line =~ s/\s+$//g;
      if ($line =~ m/Date:/) {
        if ($asmDate =~ m/notFound/) {
          ++$itemsFound;
          $line =~ s/.*:\s+//;
          my @a = split('-', $line);
          $asmDate = sprintf("%04d-%02d-%02d", $a[0], $a[1], $a[2]);
        }
      } elsif ($line =~ m/BioSample:/) {
        if ($bioSample =~ m/notFound/) {
          ++$itemsFound;
          $bioSample = $line;
          $bioSample =~ s/.*:\s+//;
        }
      } elsif ($line =~ m/BioProject:/) {
        if ($bioProject =~ m/notFound/) {
          ++$itemsFound;
          $bioProject = $line;
          $bioProject =~ s/.*:\s+//;
        }
      } elsif ($line =~ m/Organism name:/) {
        if ($sciName =~ m/notFound/) {
          ++$itemsFound;
          # the line has the form: label: scientific name (common name)
          $commonName = $line;
          $sciName = $line;
          $commonName =~ s/.*\(//;
          $commonName =~ s/\)//;
          # a name from the order list takes precedence
          $commonName = $commonName{$asmId} if (exists($commonName{$asmId}));
          $sciName =~ s/.*:\s+//;
          $sciName =~ s/\s+\(.*//;
        }
      } elsif ($line =~ m/Taxid:/) {
        if ($taxId =~ m/notFound/) {
          ++$itemsFound;
          $taxId = $line;
          $taxId =~ s/.*:\s+//;
        }
      }
    }
    close ($reportFh);
    my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId";
    printf "%d | \n", ++$rowCount;
    ### printf "%s | \n", $hubUrl, $accessionId, $commonName;
    printf "%s | \n", $accessionId, $commonName;
    printf " %s | \n", $hubUrl, $sciName;
    printf " %s | \n", $gcPrefix, $asmAcc, $asmId;
    if ( $bioSample ne "notFound" ) {
      printf " %s | \n", $bioSample, $bioSample;
    } else {
      printf " n/a | \n";
    }
    # one broken assembly_report
    $bioProject= "PRJEB25768" if ($accessionId eq "GCA_900324465.2");
    if ($bioProject eq "notFound") {
      printf " %s | \n", $bioProject;
    } else {
      printf " %s | \n", $bioProject, $bioProject;
    }
    printf " %s | \n", $ncbiFtpLink, $asmDate;
    if ($vgpIndex) {
      my $sciNameUnderscore = $sciName;
      $sciNameUnderscore =~ s/ /_/g;
      # this one species is given a different spelling for the VGP link
      $sciNameUnderscore = "Strigops_habroptilus" if ($sciName =~ m/Strigops habroptila/);
      if (! defined($vgpClass{$asmId})) {
        printf STDERR "# ERROR: no 'class' defined for VGP assembly %s\n", $asmId;
        exit 255;
      }
      printf " %s | \n", $sciNameUnderscore, $vgpClass{$asmId};
    }
    printf "
\n";
  }
} # sub tableContents()
##############################################################################
### main()
##############################################################################
# Read the order list: column 1 is the asmId, column 2 the common name,
# separated by a tab.  Lines starting with '#' are comments.
open (my $listFh, '<', $orderList) or die "can not read ${orderList}: $!";
while (my $line = <$listFh>) {
  next if ($line =~ m/^#/);
  chomp $line;
  next if ($line =~ m/^\s*$/);	# a blank line carries no asmId
  my ($asmId, $commonName) = split('\t', $line);
  push @orderList, $asmId;
  # Only record a name that is actually present, so that a line without a
  # second column leaves the name from the assembly_report in use.
  $commonName{$asmId} = $commonName if (defined($commonName) && length($commonName));
  ++$assemblyCount;
}
close ($listFh);

# The page is written to stdout in this order.
startHtml();
startTable();
tableContents();
endTable();
endHtml();