631a7882f8c689b66f3d81fc8f25fbf98f73499a hiram Mon Oct 2 16:55:41 2023 -0700 remove the all genomes at once attachment option refs #29545 diff --git src/hg/makeDb/doc/asmHubs/mkHubIndex.pl src/hg/makeDb/doc/asmHubs/mkHubIndex.pl index 4414bf6..e72a403 100755 --- src/hg/makeDb/doc/asmHubs/mkHubIndex.pl +++ src/hg/makeDb/doc/asmHubs/mkHubIndex.pl @@ -1,561 +1,523 @@ #!/usr/bin/env perl # # mkHubIndex.pl - construct index.html page for a set of assemblies in a hub # use strict; use warnings; use File::Basename; use FindBin qw($Bin); use lib "$Bin"; use commonHtml; my $argc = scalar(@ARGV); if ($argc != 4) { printf STDERR "mkHubIndex.pl Name asmName defaultAsmId [two column name list] > index.html\n"; printf STDERR "e.g.: mkHubIndex Primates primates GCF_000001405.39_GRCh38.p13 primates.commonName.asmId.orderList.tsv\n"; printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n"; printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n"; printf STDERR "column 2: common name for species, columns separated by tab\n"; printf STDERR "The result prints to stdout the index.html page for this set of assemblies\n"; exit 255; } my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; my $Name = shift; my $asmHubName = shift; my $defaultAssembly = shift; my $inputList = shift; my $orderList = $inputList; if ( ! -s "$orderList" ) { $orderList = $toolsDir/$inputList; } my %cladeId; # value is asmId, value is clade, useful for 'legacy' index page printf STDERR "# mkHubIndex %s %s %s %s\n", $Name, $asmHubName, $defaultAssembly, $orderList; my $hprcIndex = 0; my $ccgpIndex = 0; my $vgpIndex = 0; $hprcIndex = 1 if ($Name =~ m/hprc/i); $ccgpIndex = 1 if ($Name =~ m/ccgp/i); $vgpIndex = 1 if ($Name =~ m/vgp/i); my %extraClass; # key is asmId, value is taxon 'class' as set by VGP project if ($vgpIndex || $ccgpIndex) { my $whichIndex = "vgp"; $whichIndex = "ccgp" if ($ccgpIndex); my $extraClass = "$home/kent/src/hg/makeDb/doc/${whichIndex}AsmHub/${whichIndex}.taxId.asmId.class.txt"; open (FH, "<$extraClass") or die "can not read $extraClass"; while (my $line = ) { my ($taxId, $asmId, $class) = split('\t', $line); $extraClass{$asmId} = $class; } close (FH); } my @orderList; # asmId of the assemblies in order from the *.list files # the order to read the different .list files: my $assemblyTotal = 0; my %commonName; # key is asmId, value is a common name, perhaps more appropriate # than found in assembly_report file ############################################################################## # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: ############################################################################## sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } ############################################################################## ### start the HTML output ############################################################################## sub startHtml() { my $timeStamp = `date "+%F"`; chomp $timeStamp; # my $subSetMessage = "subset of $asmHubName only"; if ($asmHubName eq "vertebrate") { $subSetMessage = "subset of other ${asmHubName}s only"; } if ($vgpIndex) { my $vgpSubset = "(set of primary assemblies)"; if ($orderList =~ m/vgp.alternate/) { $vgpSubset = "(set of alternate/haplotype assemblies)"; } elsif ($orderList =~ m/vgp.trio/) { $vgpSubset = "(set of trio assemblies, maternal/paternal)"; } elsif ($orderList =~ m/vgp.legacy/) { $vgpSubset = "(set of legacy/superseded assemblies)"; } print <<"END";

VGP - Vertebrate Genomes Project assembly hub

This assembly hub contains assemblies released by the Vertebrate Genomes Project. $vgpSubset

END } else { if ($ccgpIndex) { print <<"END";

CCGP - California Conservation Genomics Project assembly hub

This assembly hub contains assemblies released by the California Conservation Genomics Project.

END } elsif ($hprcIndex) { print <<"END";

HPRC - Human Pangenome Reference Consortium assembly hub

This assembly hub contains assemblies released by the Human Pangenome Reference Consortium.

END } else { print <<"END";

$Name Genomes assembly hubs

Assemblies from NCBI/Genbank/Refseq sources, $subSetMessage.

END } } print <<"END"; -

How to view the hub

How to view the assembly of interest

-Options: -

The links to the genome browser in the table below will attach that - one specific assembly to the genome browser. This is most likely what - you want. Alternatively, the entire set of assemblies can be attached - as one group to the genome browser with the following links depending - upon which of our mirror site browsers you prefer to use: - - - - - - - - - - -
attach all assemblies to selected site: genome.ucsc.edu genome-euro.ucsc.edu genome-asia.ucsc.edu
-
To manually attach all the assemblies in this hub to genome browsers - that are not one of the three UCSC mirror sites: -
1. From the blue navigation bar, go to - My Data -> Track Hubs
2. Then select the My Hubs tab and enter this URL into - the textbox: -
  https://hgdownload.soe.ucsc.edu/hubs/$asmHubName/hub.txt
3. Once you have added the URL to the entry form, - press the Add Hub button to add the hub.
-

+The links to the genome browser in the table below will attach that +one specific assembly to the genome browser. Use the links in +the column labeled common name and view in browser to view that +assembly in the genome browser.

-After adding the hub, you will be redirected to the gateway page. The -genome assemblies can be selected from the -${Name} Hub Assembly dropdown menu. -Instead of adding all the assemblies in one collected group, use the individual -view in browser in the table below. -

Listings:

(from RepeatModeler masking)

families fasta.gz list of URLs for the custom library created by the RepeatModeler run
assembly 2bit file list of URLs as masked with the RepeatModeler + TRF/simpleRepeats with period of 12 or less
the rmod.log files from each RepeatModeler run
default GenArk 2bit file list of URLs as masked with the ordinary RepeatMasker + TRF/simpleRepeats with period of 12 or less
this data table in tab-separated file text format (including TBD not working yet, or in VGP collection but not on the alignment list)

END } print <<"END";

Data resource links

NOTE: Click on the column headers to sort the table by that column

The common name and view in browser will attach only that single assembly to the genome browser.
The scientific name and data download link provides access to the files for that one assembly hub.
END if ($vgpIndex) { print <<"END"; The class VGP link provides access to the VGP GenomeArk page for that genome.
END } print <<"END"; The other links provide access to NCBI resources for these assemblies. END } # sub startHtml() ############################################################################## ### start the table output ############################################################################## sub startTable() { print ' '; if ("viral" ne $asmHubName) { printf " \n"; } printf "\n"; if ("legacy" eq $asmHubName) { printf "\n"; } if ($ccgpIndex) { printf "\n"; } elsif ($vgpIndex) { printf "\n"; } print "\n"; } # sub startTable() ############################################################################## ### end the table output ############################################################################## sub endTable() { print <<"END";

count	common name and view in browser	scientific name and data download	NCBI assembly	BioSample	BioProject	assembly date, source link	clade	class CCGP link	class VGP link

END } # sub endTable() ############################################################################## ### end the HTML output ############################################################################## sub endHtml() { &commonHtml::otherHubLinks($vgpIndex, $asmHubName); &commonHtml::htmlFooter($vgpIndex, $asmHubName); } # sub endHtml() ############################################################################## ### tableContents() ############################################################################## sub tableContents() { my $rowCount = 0; foreach my $asmId (@orderList) { my $gcPrefix = "GCx"; my $asmAcc = "asmAcc"; my $asmName = "asmName"; my $accessionId = "GCx_098765432.1"; my $accessionDir = substr($asmId, 0 ,3); my $configRa = "n/a"; if ($asmId !~ m/^GC/) { # looking at a UCSC database, not a hub $configRa = "/hive/data/genomes/$asmId/$asmId.config.ra"; $accessionId = `grep ^genBankAccessionID "${configRa}" | cut -d' ' -f2`; chomp $accessionId; $asmName = `grep ^ncbiAssemblyName "${configRa}" | cut -d' ' -f2`; chomp $asmName; $accessionDir = substr($accessionId, 0 ,3); $accessionDir .= "/" . substr($accessionId, 4 ,3); $accessionDir .= "/" . substr($accessionId, 7 ,3); $accessionDir .= "/" . substr($accessionId, 10 ,3); ($gcPrefix, $asmAcc) = split('_', $accessionId, 2); printf STDERR "# not an assembly hub: '%s' - '%s' '%s' '%s'\n", $asmId, $accessionId, $gcPrefix, $asmAcc; } else { ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); } $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); my $ncbiFtpLink = "https://ftp.ncbi.nlm.nih.gov/genomes/all/$accessionDir/$asmId"; my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; if ($asmId =~ m/^GCA/) { $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId"; $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; } elsif ($asmId !~ m/^GC/) { $buildDir="/hive/data/outside/ncbi/genomes/$accessionDir/${accessionId}_${asmName}"; $asmReport="$buildDir/${accessionId}_${asmName}_assembly_report.txt"; } my $trackDb="$buildDir/${asmId}.trackDb.txt"; # next if (! -s "$trackDb"); # assembly build not complete my $commonName = "notFound(${asmId})"; my $sciName = "notFound"; my $bioSample = "notFound"; my $bioProject = "notFound"; my $taxId = "notFound"; my $asmDate = "notFound"; my $itemsFound = 0; if ( -s "${asmReport}" ) { open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = ) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; $line =~ s/\s+$//g;; if ($line =~ m/Date:/) { if ($asmDate =~ m/notFound/) { ++$itemsFound; $line =~ s/.*:\s+//; my @a = split('-', $line); $asmDate = sprintf("%04d-%02d-%02d", $a[0], $a[1], $a[2]); } } elsif ($line =~ m/BioSample:/) { if ($bioSample =~ m/notFound/) { ++$itemsFound; $bioSample = $line; $bioSample =~ s/.*:\s+//; } } elsif ($line =~ m/BioProject:/) { if ($bioProject =~ m/notFound/) { ++$itemsFound; $bioProject = $line; $bioProject =~ s/.*:\s+//; } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*$//; $commonName =~ s/$//; $commonName = $commonName{$asmId} if (exists($commonName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; } } elsif ($line =~ m/Taxid:/) { if ($taxId =~ m/notFound/) { ++$itemsFound; $taxId = $line; $taxId =~ s/.*:\s+//; } } } close (FH); } elsif ( -s "${configRa}" ) { # if ( -s "${asmReport}" ) # ncbiAssemblyName Sscrofa10.2 # genBankAccessionID GCA_000003025.4 # ncbiBioProject 13421 # assemblyDate Aug. 2011 $asmName = `grep ^ncbiAssemblyName "${configRa}" | cut -d' ' -f2`; chomp $asmName; $taxId = `grep ^taxId "${configRa}" | cut -d' ' -f2`; chomp $taxId; $commonName = `grep ^commonName "${configRa}" | cut -d' ' -f2-`; chomp $commonName; $sciName = `grep ^scientificName "${configRa}" | cut -d' ' -f2-`; chomp $sciName; $asmDate = `grep ^assemblyDate "${configRa}" | cut -d' ' -f2-`; chomp $asmDate; $bioProject = `grep ^ncbiBioProject "${configRa}" | cut -d' ' -f2-`; chomp $bioProject; $bioSample = `grep ^ncbiBioSample "${configRa}" | cut -d' ' -f2-`; chomp $bioSample; $ncbiFtpLink = "https://ftp.ncbi.nlm.nih.gov/genomes/all/$accessionDir/${accessionId}_${asmName}"; } my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; my $browserName = $commonName; my $browserUrl = "https://genome.ucsc.edu/h/$accessionId"; if ($asmId !~ m/^GC/) { $hubUrl = "https://hgdownload.soe.ucsc.edu/goldenPath/$asmId/bigZips"; $browserUrl = "https://genome.ucsc.edu/cgi-bin/hgTracks?db=$asmId"; $browserName = "$commonName ($asmId)"; } printf "%d\n", ++$rowCount; printf "%s\n", $browserUrl, $browserName; printf " %s\n", $hubUrl, $sciName; if ($asmId !~ m/^GC/) { printf " %s_%s\n", $gcPrefix, $asmAcc, $accessionId, $asmName; } else { printf " %s\n", $accessionId, $asmId; } # viruses do not appear to have BioSample if ($asmHubName ne "viral") { if ( $bioSample ne "notFound" ) { printf " %s\n", $bioSample, $bioSample; } else { printf " n/a\n"; } } # one broken assembly_report $bioProject= "PRJEB25768" if ($accessionId eq "GCA_900324465.2"); if ($bioProject eq "notFound") { printf " %s\n", $bioProject; } else { printf " %s\n", $bioProject, $bioProject; } printf " %s\n", $ncbiFtpLink, $asmDate; if ("legacy" eq $asmHubName) { if (! defined($cladeId{$asmId})) { printf STDERR "# ERROR: missing clade definition for %s\n", $asmId; exit 255; } else { printf " %s\n", $cladeId{$asmId}; } } if ($ccgpIndex) { my $sciNameUnderscore = $sciName; $sciNameUnderscore =~ s/ /_/g; $sciNameUnderscore = "Strigops_habroptilus" if ($sciName =~ m/Strigops habroptila/); if (! defined($extraClass{$asmId})) { printf STDERR "# ERROR: no 'class' defined for CCGP assembly %s\n", $asmId; exit 255; } # it isn't clear how we can get these names # https://www.ccgproject.org/species/corynorhinus-townsendii-townsends-big-eared-bat printf " %s\n", $sciNameUnderscore, $extraClass{$asmId} } elsif ($vgpIndex) { my $sciNameUnderscore = $sciName; $sciNameUnderscore =~ s/ /_/g; $sciNameUnderscore = "Strigops_habroptilus" if ($sciName =~ m/Strigops habroptila/); if (! defined($extraClass{$asmId})) { printf STDERR "# ERROR: no 'class' defined for VGP/CCGP assembly %s\n", $asmId; exit 255; } printf " %s\n", $sciNameUnderscore, $extraClass{$asmId} } printf "\n"; } } # sub tableContents() ############################################################################## ### main() ############################################################################## # if there is a 'promoted' list, it has been taken out of the 'orderList' # so will need to stuff it back in at the correct ordered location my %promotedList; # key is asmId, value is common name my $promotedList = dirname(${orderList}) . "/promoted.list"; my @promotedList; # contents are asmIds, in order by lc(common name) my $promotedIndex = -1; # to walk through @promotedList; if ( -s "${promotedList}" ) { open (FH, "<${promotedList}" ) or die "can not read ${promotedList}"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($asmId, $commonName) = split('\t', $line); $promotedList{$asmId} = $commonName; } close (FH); foreach my $asmId ( sort { lc($promotedList{$a}) cmp lc($promotedList{$b}) } keys %promotedList) { push @promotedList, $asmId; } $promotedIndex = 0; } my $cladeList = dirname(${orderList}) . "/$asmHubName.clade.txt"; if ( -s "${cladeList}" ) { open (FH, "<$cladeList") or die "can not read ${cladeList}"; while (my $clade = ) { chomp $clade; my @a = split('\t', $clade); $cladeId{$a[0]} = $a[1]; } close (FH); } open (FH, "<${orderList}") or die "can not read ${orderList}"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($asmId, $commonName) = split('\t', $line); if ( ($promotedIndex > -1) && ($promotedIndex < scalar(@promotedList))) { my $checkInsertAsmId = $promotedList[$promotedIndex]; my $checkInsertName = $promotedList{$checkInsertAsmId}; # insert before this commonName when alphabetic before if (lc($checkInsertName) lt lc($commonName)) { push @orderList, $checkInsertAsmId; $commonName{$checkInsertAsmId} = $checkInsertName; ++$assemblyTotal; printf STDERR "# inserting '%s' before '%s' at # %03d\n", $checkInsertName, $commonName, $assemblyTotal; ++$promotedIndex; # only doing one at this time # TBD: will need to improve this for more inserts } } push @orderList, $asmId; $commonName{$asmId} = $commonName; ++$assemblyTotal; } close (FH); # TBD: and would need to check if all promoted assemblies have been included startHtml(); startTable(); tableContents(); endTable(); endHtml();