#!/usr/bin/env perl

############################################################################
### generate the hubAndGenome.txt file and the trackDb.txt from
###  an existing GenArk assembly.  The trackDb.txt text file should
###  have previously been checked into the source tree as, for example:
###     rat/rn8/genark.trackDb.ra
###  And when this script makes this file, it should be compared to the
###  source tree copy to see if it has been changing.
###  This process also expects the dbDb table entry to exist for this
###  stated assembly 'db' name
###
### The hub and genome stanzas are written to stdout, the track stanzas
###  are written to the trackDb file named on the command line.  All
###  diagnostics go to stderr so they never end up in the redirected
###  hubAndGenome.txt output.
###
### origin: src/hg/makeDb/trackDb/hubTxtFromGenArk.pl
###  commit 05d27bee6baaa2a68c516dc3b2ae31841479c8e8
###  hiram Wed Jun 18 13:01:40 2025 -0700
###  helper script to get a hub.txt and trackDb.txt out of an existing
###  GenArk hub refs #35898
############################################################################

use strict;
use warnings;

my $argc = scalar(@ARGV);
if ($argc != 3) {
  printf STDERR "usage: ./hubTxtFromGenArk.pl <db> <GCx_...> <genark.trackDb.txt> > hubAndGenome.txt\n";
  printf STDERR " where db is the curated hub UCSC name (and dbDb table entry)\n";
  printf STDERR " where the given GCx_ identifier is the accession name of the GenArk hub.\n";
  printf STDERR " where genark.trackDb.txt is a file to write the hub track definitions.\n";
  printf STDERR "e.g. ./hubTxtFromGenArk.pl rn8 GCF_036323735.1 rn8.genark.trackDb.txt > hubAndGenome.txt\n";
  exit 255;
}

############################################################################
### addPath - rewrite a 'tag value' line so the value is rooted at gbdbPath
###   oneLine  - a hub.txt line of the form: <tag><white space><value>
###   gbdbPath - directory to prefix onto the relative value
### returns the string: '<tag> <gbdbPath>/<value>'
### Any amount of white space between tag and value is accepted.
### A value that is already an absolute path or a URL with a scheme
### is returned as-is since prefixing it would produce a broken path.
############################################################################
sub addPath {
  my ($oneLine, $gbdbPath) = @_;
  # \s+ so that 'tag   value' does not leave white space in the path
  my ($tag, $shortUrl) = split(/\s+/, $oneLine, 2);
  $shortUrl = "" if (! defined($shortUrl));
  if ($shortUrl =~ m{\A(?:/|[A-Za-z][A-Za-z0-9+.-]*://)}) {
    return sprintf("%s %s", $tag, $shortUrl);
  }
  return sprintf("%s %s/%s", $tag, $gbdbPath, $shortUrl);
}

#############################################################################
### begin main()
#############################################################################
my $db = shift;
my $accession = shift;
my $trackDbOut = shift;

# $db is placed into an SQL statement run through the shell below,
#   refuse anything that is not a plain database name
if ($db !~ m/\A[A-Za-z0-9_.-]+\z/) {
  printf STDERR "ERROR: db name has unexpected characters: '%s'\n", $db;
  exit 255;
}

# the /gbdb/genark/ directory hierarchy is built from the three letter
#   prefix and the nine digits of the accession, three digits per level
if ($accession !~ m/\A(GC[AF])_(\d{3})(\d{3})(\d{3})\.\d+\z/) {
  printf STDERR "ERROR: accession is not of the form GCx_nnnnnnnnn.n: '%s'\n", $accession;
  exit 255;
}
my ($gcX, $d0, $d1, $d2) = ($1, $2, $3, $4);
my $gbDbPath = "/gbdb/genark/${gcX}/${d0}/${d1}/${d2}/$accession";
if ( ! -d "${gbDbPath}" ) {
  printf STDERR "ERROR: can not find directory:\n%s\n", $gbDbPath;
  exit 255;
}

my $hubTxt = "${gbDbPath}/hub.txt";
if ( ! -s "${hubTxt}" ) {
  printf STDERR "ERROR: can not find hub.txt:\n%s\n", $hubTxt;
  exit 255;
}

# hub labels come from the dbDb row for this db, obtain them before any
#   output is started so a failure here leaves no partial files behind
my $dbDbRow = `hgsql -N -e 'SELECT description, sourceName FROM dbDb WHERE name="${db}";' hgcentraltest`;
if ($? != 0) {
  printf STDERR "ERROR: hgsql query of hgcentraltest.dbDb failed for db '%s'\n", $db;
  exit 255;
}
$dbDbRow = "" if (! defined($dbDbRow));
chomp $dbDbRow;
# limit of 2 keeps an empty sourceName as an empty string
my ($hubShortLabel, $hubLongLabel) = split(/\t/, $dbDbRow, 2);
if (! defined($hubShortLabel) || ! defined($hubLongLabel)) {
  printf STDERR "ERROR: can not find dbDb entry in hgcentraltest for db '%s'\n", $db;
  exit 255;
}

open (my $tdb, ">", $trackDbOut) or die "can not write to $trackDbOut: $!";

my $stanza = "";	# type of stanza the current line belongs to
my $stanzaLine = 0;	# count of lines processed in the current stanza
my $firstTrack = 1;	# no blank line before the very first track stanza

open (my $fh, "<", ${hubTxt}) or die "can not read $hubTxt: $!";
while (my $line = <$fh>) {
  chomp $line;
  # trim before the empty test so white space only lines are also skipped,
  #   blank lines between stanzas are generated here as needed
  $line =~ s/^\s+//;
  if (length($line) < 1) {
     next;
  }
  # the first tag of a stanza determines how following lines are handled
  if ($line =~ m/^hub\s/) {
     $stanza = "hub";
     $stanzaLine = 0;
  } elsif ($line =~ m/^genome\s/) {
     $stanza = "genome";
     $stanzaLine = 0;
  } elsif ($line =~ m/^track\s/) {
     $stanza = "track";
     $stanzaLine = 0;
  } elsif ($line =~ m/^include\s/) {
     # include lines are recognized only so they are not output
     $stanza = "include";
     $stanzaLine = 0;
  }
  if ($stanza eq "hub") {
    # hub stanza is rewritten for this db, only the email and
    #   descriptionUrl lines of the GenArk hub stanza are used
    if ($stanzaLine < 1) {
      printf "hub %s genome assembly\n", $db;
      printf "shortLabel %s\n", $hubShortLabel;
      printf "longLabel %s\n", $hubLongLabel;
      printf "useOneFile on\n";
    }
    if ($line =~ m/^email\s/) {
      printf "email genome-www\@soe.ucsc.edu\n";
    } elsif ($line =~ m/^descriptionUrl\s/) {
      printf "%s\n", addPath($line, $gbDbPath);
    }
    ++$stanzaLine;
  } elsif ($stanza eq "genome") {
    # blank line separates the genome stanza from the hub stanza
    printf "\n" if ($stanzaLine < 1);
    ++$stanzaLine;
    if ($line =~ m/^genome\s/) {
      printf "genome %s\n", $db;
    } elsif ($line =~ m/^groups\s|^twoBitPath\s|^twoBitBptUrl\s|^chromSizes\s|^chromAliasBb\s|^htmlPath\s|^liftOver/) {
      printf "%s\n", addPath($line, $gbDbPath);
    } else {
      printf "%s\n", $line;
    }
  } elsif ($stanza eq "track") {
    # track stanzas go to the trackDb file, separated by one blank line
    if ($firstTrack) {
      $firstTrack = 0;
    } else {
      printf $tdb "\n" if ($stanzaLine < 1);
    }
    ++$stanzaLine;
    if ($line =~ m/^html\s|^bigDataUrl\s|^linkDataUrl\s|^searchTrix\s|^summary\s|^xrefDataUrl\s/) {
      printf $tdb "%s\n", addPath($line, $gbDbPath);
    } else {
      printf $tdb "%s\n", $line;
    }
  }
}
close ($fh);
printf "\n";
# buffered write errors show up at close time
close ($tdb) or die "error closing $trackDbOut: $!";

__END__

 hgsql -e 'select * from dbDb where name="rn8"\G' hgcentraltest

*************************** 1. row ***************************
          name: rn8
   description: Jan. 2024 (GRCr8/rn8)
       nibPath: hub:/gbdb/rn8/hubs
      organism: Rat
    defaultPos: NC_086019.1:90172726-90182726
        active: 1
      orderKey: 18017
        genome: Rat
scientificName: Rattus norvegicus
      htmlPath: /gbdb/genark/GCF/036/323/735/GCF_036323735.1/html/GCF_036323735.1_GRCr8.description.html
      hgNearOk: 0
        hgPbOk: 0
    sourceName: BN/NHsdMcwi 2024 refseq (GCF_036323735.1)
         taxId: 10116