22a9ec8b34dd60a14257bdef408f21c03db601b4 hiram Mon Feb 17 16:06:38 2020 -0800 now using the shorter assembly name for genome identifier and fix some common names refs #23891 diff --git src/hg/makeDb/doc/asmHubs/mkHubIndex.pl src/hg/makeDb/doc/asmHubs/mkHubIndex.pl index 4b22d5c..2fe5b20 100755 --- src/hg/makeDb/doc/asmHubs/mkHubIndex.pl +++ src/hg/makeDb/doc/asmHubs/mkHubIndex.pl @@ -1,273 +1,275 @@ #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 3) { printf STDERR "mkAsmStats Name asmName\n"; printf STDERR "e.g.: mkHubIndex Primates primates GCF_000001405.39_GRCh38.p13\n"; exit 255; } my $Name = shift; my $asmHubName = shift; my $defaultAssembly = shift; my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; my $commonNameOrder = "$asmHubName.commonName.asmId.orderList.tsv"; my @orderList; # asmId of the assemblies in order from the *.list files # the order to read the different .list files: my $assemblyCount = 0; my %betterName; # key is asmId, value is a common name better than found # in assembly_report file ############################################################################## # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: ############################################################################## sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } ############################################################################## ### start the HTML output ############################################################################## sub startHtml() { my $timeStamp = `date "+%F"`; chomp $timeStamp; # my $subSetMessage = "subset of $asmHubName only"; if ($asmHubName eq "vertebrate") { $subSetMessage = "subset of other ${asmHubName}s only"; } print <<"END"

$Name Genomes assembly hubs

Assemblies from NCBI/GenBank/RefSeq sources, $subSetMessage.

How to view the hub

You can load this hub from our Public Hubs page or by clicking these assembly links to any of our official websites:

To manually attach all the assemblies in this hub to other genome browsers:

  1. From the blue navigation bar, go to My Data -> Track Hubs
  2. Then select the My Hubs tab and enter this URL into the textbox:
    https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt
  3. Once you have added the URL to the entry form, press the Add Hub button to add the hub.

After adding the hub, you will be redirected to the gateway page. The genome assemblies can be selected from the ${Name} Hub Assembly dropdown menu. Instead of adding all the assemblies in one collected group, use the individual link to genome browser in the table below.

See also: assembly statistics


Data resource links

NOTE: Click on the column headers to sort the table by that column
The link to genome browser will attach only that single assembly to the genome browser. END } # sub startHtml() ############################################################################## ### start the table output ############################################################################## sub startTable() { print <<"END" END } # sub startTable() ############################################################################## ### end the table output ############################################################################## sub endTable() { print <<"END"
count common name
link to genome browser
scientific name
and data download
NCBI assembly bioSample bioProject assembly date,
source link
END } # sub endTable() ############################################################################## ### end the HTML output ############################################################################## sub endHtml() { if ($asmHubName ne "viral") { printf "

\nOther assembly hubs available:
\n\n"; printf "\n" if ($asmHubName ne "primates"); printf "\n" if ($asmHubName ne "mammals"); printf "\n" if ($asmHubName ne "birds"); printf "\n" if ($asmHubName ne "fish"); printf "\n" if ($asmHubName ne "vertebrate"); printf "\n
Primates Mammals Birds Fish other vertebrates
\n

\n"; } print <<"END" END } # sub endHtml() ############################################################################## ### tableContents() ############################################################################## sub tableContents() { my $rowCount = 0; foreach my $asmId (reverse(@orderList)) { my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); my $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); my $ncbiFtpLink = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$accessionDir/$asmId"; my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; my $trackDb="$buildDir/${asmId}.trackDb.txt"; next if (! -s "$trackDb"); my $chromSizes="${buildDir}/${asmId}.chrom.sizes"; my $sciName = "notFound"; my $commonName = "notFound"; my $bioSample = "notFound"; my $bioProject = "notFound"; my $taxId = "notFound"; my $asmDate = "notFound"; my $itemsFound = 0; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = ) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; $line =~ s/\s+$//g;; if ($line =~ m/Date:/) { if ($asmDate =~ m/notFound/) { ++$itemsFound; $line =~ s/.*:\s+//; my @a = split('-', $line); $asmDate = sprintf("%04d-%02d-%02d", $a[0], $a[1], $a[2]); } } elsif ($line =~ m/BioSample:/) { if ($bioSample =~ m/notFound/) { ++$itemsFound; $bioSample = $line; $bioSample =~ s/.*:\s+//; } } elsif ($line =~ m/BioProject:/) { if ($bioProject =~ m/notFound/) { ++$itemsFound; $bioProject = $line; $bioProject =~ s/.*:\s+//; } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*\(//; $commonName =~ s/\)//; $commonName = $betterName{$asmId} if (exists($betterName{$asmId})); $sciName =~ s/.*:\s+//; 
$sciName =~ s/\s+\(.*//; } } elsif ($line =~ m/Taxid:/) { if ($taxId =~ m/notFound/) { ++$itemsFound; $taxId = $line; $taxId =~ s/.*:\s+//; } } } close (FH); my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; printf "%d\n", ++$rowCount; - printf "%s\n", $hubUrl, $asmId, $commonName; +### printf "%s\n", $hubUrl, $accessionId, $commonName; + printf "%s\n", $accessionId, $commonName; printf " %s\n", $hubUrl, $sciName; printf " %s\n", $gcPrefix, $asmAcc, $asmId; if ( $bioSample ne "notFound" ) { printf " %s\n", $bioSample, $bioSample; } else { printf " n/a\n"; } printf " %s\n", $bioProject, $bioProject; printf " %s\n", $ncbiFtpLink, $asmDate; printf "\n"; } } # sub tableContents() ############################################################################## ### main() ############################################################################## open (FH, "<$toolsDir/${commonNameOrder}") or die "can not read ${commonNameOrder}"; while (my $line = ) { + next if ($line =~ m/^#/); chomp $line; my ($commonName, $asmId) = split('\t', $line); push @orderList, $asmId; $betterName{$asmId} = $commonName; ++$assemblyCount; } close (FH); startHtml(); startTable(); tableContents(); endTable(); endHtml();