312be12573fb0c4823e6e30ef5a58e10ae604f8c
hiram
  Wed Jul 30 15:30:07 2025 -0700
new scripts to replace pushRR.sh and alphaBetaPush.pl for dramatic performance improvement refs #34917

diff --git src/hg/utils/otto/genArk/quickPush.pl src/hg/utils/otto/genArk/quickPush.pl
new file mode 100755
index 00000000000..7bdd054c9b9
--- /dev/null
+++ src/hg/utils/otto/genArk/quickPush.pl
@@ -0,0 +1,377 @@
+#!/usr/bin/env perl
+
+#############################################################################
+###  The 'stat' file listings used here: dev.todayList.gz, hgw1.todayList.gz
+###    and hgwbeta.todayList.gz are made by cron jobs previous to this
+###    script running.  They are a listing of files in /gbdb/genark/GC[AF]/
+###    with the 'mtime' - last modified time.  This time will be compared
+###    to decide if anything needs to go out.
+#############################################################################
+
+use strict;
+use warnings;
+use File::Basename;
+
+# destination machines for a push to 'all'
+my @machList = qw( hgw0 hgw1 hgw2 Genome-Browser-Mirror-3.dhcp.uni-bielefeld.de );
+my $lf;		# going to become the output handle for the logfile
+
+# this script is only allowed to run on hgwdev, anywhere else is an error exit
+my $expectName = "hgwdev";
+chomp (my $hostName = `hostname -s`);
+unless ($hostName eq $expectName) {
+  printf STDERR "ERROR: must run this on %s !  This is: %s\n", $expectName, $hostName;
+  exit 255;
+}
+
+##############################################################################
+### rsync the /gbdb/genark/<contribDir>/ directory to the same path on
+###  machine '$dest'.  The command is logged; a failed rsync is logged too.
+sub rsyncContrib($$) {
+  my ($dest, $contribDir) = @_;
+  my $contribPath = "/gbdb/genark/" . $contribDir;
+  my $cmd = qq(rsync --stats -a -L --itemize-changes "$contribPath/" "qateam\@$dest:$contribPath/" 2>&1);
+  printf $lf "%s\n", $cmd;
+  my $cmdOut = `$cmd` || "";	# 2>&1 in $cmd: rsync error text arrives here too
+  if ($? != 0) {	# non-zero wait status: rsync (or the shell) failed
+    printf $lf "# ERROR: rsync failed, wait status %d: '%s'\n%s\n", $?, $cmd, $cmdOut;
+  } elsif (length($cmdOut) < 2) {
+    printf $lf "# rsync output mysteriously empty ? '%s'\n", $cmd;
+  }
+}
+
+##############################################################################
+### push every directory named by the keys of hash ref '$contribHash'
+###  (names relative to /gbdb/genark/) out via rsyncContrib()
+### '$which' is either 'beta' for hgwbeta only, or 'all' for each machine
+###  in @machList; any other value is logged as an error, nothing is pushed
+sub contribOut($$) {
+  my ($which, $contribHash) = @_;
+  # destination machines for each valid value of '$which'
+  my %destFor = ( "beta" => [ "hgwbeta" ], "all" => \@machList );
+  if (! exists($destFor{$which})) {
+    printf $lf "# ERROR: contribOut given '%s' which is not 'beta' or 'all'\n", $which;
+    return;
+  }
+  for my $destMach (@{$destFor{$which}}) {
+    # sorted so the order of the pushes in the log is reproducible
+    for my $contribDir (sort keys %$contribHash) {
+      rsyncContrib($destMach, $contribDir);
+    }
+  }
+}
+
+##############################################################################
+# rsync the single file '$from' on this machine to the path '$to' on
+#  machine '$dest'.  The command is logged to $lf, and when rsync exits
+#  non-zero its wait status and output are logged as an error.
+sub rsyncHubTxt($$$) {
+  my ($dest, $from, $to) = @_;
+  my $cmd = qq(rsync --stats -a -L --itemize-changes "$from" "qateam\@${dest}:$to" 2>&1);
+  printf $lf "%s\n", $cmd;
+  my $cmdOut = `$cmd` || "";	# 2>&1 in $cmd: rsync error text arrives here too
+  if ($? != 0) {	# non-zero wait status: rsync (or the shell) failed
+    printf $lf "# ERROR: rsync failed, wait status %d: '%s'\n%s\n", $?, $cmd, $cmdOut;
+  } elsif (length($cmdOut) < 2) {
+    printf $lf "# rsync output mysteriously empty ? '%s'\n", $cmd;
+  }
+}
+
+##############################################################################
+# send the file '$from' to path '$to' on the destination machine(s) via
+#  rsyncHubTxt().  '$which' selects the destination: 'beta' is hgwbeta only,
+#  'all' is every machine in @machList; any other value is logged as an error
+sub sendHubTxt($$$) {
+  my ($which, $from, $to) = @_;
+  my @targets;	# stays empty when '$which' is not recognized
+  if ($which eq "all") {
+    @targets = @machList;
+  } elsif ($which eq "beta") {
+    @targets = ("hgwbeta");
+  } else {
+    printf $lf "# ERROR: sendHubTxt given '%s' which is not 'beta' or 'all'\n", $which;
+  }
+  foreach my $mach (@targets) { rsyncHubTxt($mach, $from, $to); }
+}
+##############################################################################
+### begin script main()
+##############################################################################
+
+# time stamps for the log; $Y and $M are taken from $DS so that the log
+#  directory and the log file name agree even when run across midnight
+my $DS = `date "+%F"`; chomp $DS;
+my $TS = `date "+%T"`; chomp $TS;
+my ($Y, $M) = split('-', $DS);	# $DS is YYYY-MM-DD
+my $logDir = "/hive/data/inside/GenArk/pushRR/logs/${Y}/${M}";
+system("mkdir", "-p", $logDir) == 0 or die "can not mkdir -p '$logDir'";
+my $logFile="${logDir}/quickBetaPublic.${DS}.gz";
+open ($lf, "|-", "gzip -c > '$logFile'") or die "can not write to logFile '$logFile': $!";
+printf $lf "### starting quickPush.pl at: %s %s\n", $DS, $TS;
+
+my %betaContrib;	# key is name of contrib track, the directory under
+                        #  <buildDir>/contrib/<contribName>
+my $betaTrackCount = 0;
+my %publicContrib;	# key is name of contrib track, the directory under
+                        #  <buildDir>/contrib/<contribName>
+my $publicTrackCount = 0;
+my $home = $ENV{'HOME'};
+
+my $fh;	# file handle used in all open() calls (one at a time of course)
+
+### check if any 'public' or 'beta' contrib tracks are defined for release
+###  the betaGenArk.txt and publicGenArk.txt are in the source tree and
+###  control the release of 'contrib' tracks.  One track name per line,
+###  '#' comment lines and blank lines are ignored.
+### This is just getting the listing of 'contrib' directories from those
+### source tree files.  These two lists: 'publicContrib' and 'betaContrib' will
+### be used later to identify files in the /contrib/<trackName>/ directories
+
+if ( -s "$home/kent/src/hg/makeDb/trackDb/betaGenArk.txt" ) {
+  open ($fh, "<", "$home/kent/src/hg/makeDb/trackDb/betaGenArk.txt") or die "can not read ~/kent/src/hg/makeDb/trackDb/betaGenArk.txt";
+  while (my $line = <$fh>) {
+    $line =~ s/^\s+|\s+$//g;	# trim white space, this takes the newline too
+    next if ( ($line eq "") || ($line =~ m/^#/) );	# not a track name
+    $betaContrib{$line} = 1;
+    $betaTrackCount += 1;
+    $DS = `date "+%F"`; chomp $DS;
+    $TS = `date "+%T"`; chomp $TS;
+    printf $lf "# beta track: '%s' specified %s %s\n", $line, $DS, $TS;
+  }
+  close ($fh);
+}
+if ( -s "$home/kent/src/hg/makeDb/trackDb/publicGenArk.txt" ) {
+  open ($fh, "<", "$home/kent/src/hg/makeDb/trackDb/publicGenArk.txt") or die "can not read ~/kent/src/hg/makeDb/trackDb/publicGenArk.txt";
+  while (my $line = <$fh>) {
+    $line =~ s/^\s+|\s+$//g;	# trim white space, this takes the newline too
+    next if ( ($line eq "") || ($line =~ m/^#/) );	# not a track name
+    $publicContrib{$line} = 1;
+    $publicTrackCount += 1;
+    $DS = `date "+%F"`; chomp $DS;
+    $TS = `date "+%T"`; chomp $TS;
+    printf $lf "# public track: '%s' specified %s %s\n", $line, $DS, $TS;
+  }
+  close ($fh);
+}
+
+# nothing to do when neither list names a track (in practice at least the
+#  'tiberius' track is always defined, so this exit is not expected to happen)
+# NOTE(review): this exit also skips the hub.txt pushes below - confirm intended
+exit 0 if ( ! ( ($betaTrackCount >= 1) || ($publicTrackCount >= 1) ) );
+
+printf {$lf} "# %d beta contrib tracks defined\n", $betaTrackCount;
+printf {$lf} "# %d public contrib tracks defined\n", $publicTrackCount;
+
+### the %devAccession list is going to be the set of 'accession' directories
+###  on hgwdev to allow verifying it is the same set as exists on hgw1
+
+my %devAccession;	# key is an assembly 'accession' found on hgwdev, value
+			# is the 'mtime' of the hub.txt file on hgwdev
+my %contribPublic;	# key is file name under /contrib/ value is mtime for public tracks
+my %contribBeta;	# key is file name under /contrib/ value is mtime for beta tracks
+my %devList;	# key is file name, value is mtime == last modified time
+my %hubList;	# same thing for file names hub.txt
+my %publicList;	# same thing for file names public.hub.txt, name changed to hub.txt
+my %betaHubList;	# same thing for file names beta.hub.txt, name changed to hub.txt
+
+### this is going to establish the listings of 'contrib' and 'hub.txt'
+### files that exist on hgwdev.  'public' and 'beta' contrib files will
+###  be on separate listings from the 'hub.txt' listings.
+###  each listing line is: mtime <tab> file name
+open ($fh, "-|", "zegrep '/contrib/|hub.txt' dev.todayList.gz") or die "can not read dev.todayList.gz";
+while (my $line = <$fh>) {
+  chomp $line;
+  my ($mtime, $fileName) = split('\t', $line);
+  next if (! defined($fileName));	# not a 'mtime <tab> file name' line
+  $devList{$fileName} = $mtime;
+  if ($line =~ m#/contrib/#) {
+    foreach my $pubContrib (keys %publicContrib) {
+       if ($line =~ m#/contrib/\Q$pubContrib\E/#) {	# \Q: the name is literal text
+          $contribPublic{$fileName} = $mtime;
+          last;
+       }
+    }
+    foreach my $betaContrib (keys %betaContrib) {
+       if ($line =~ m#/contrib/\Q$betaContrib\E/#) {	# \Q: the name is literal text
+          $contribBeta{$fileName} = $mtime;
+          last;
+       }
+    }
+  } elsif ($line =~ m#/hub\.txt#) {
+    $hubList{$fileName} = $mtime;
+    my $dirName = dirname($fileName);
+    my $accession = basename($dirName);	# directory holding the hub.txt
+    $devAccession{$accession} = $mtime;
+  } elsif ($line =~ m#/public\.hub\.txt#) {
+    $fileName =~ s#/public\.hub\.txt#/hub.txt#;	# only the file name itself is renamed
+    $publicList{$fileName} = $mtime;
+  } elsif ($line =~ m#/beta\.hub\.txt#) {
+    $fileName =~ s#/beta\.hub\.txt#/hub.txt#;	# only the file name itself is renamed
+    $betaHubList{$fileName} = $mtime;
+  }
+}
+close ($fh);
+
+my $publicContribCount = scalar keys %contribPublic;
+my $betaContribCount = scalar keys %contribBeta;
+printf $lf "# %d contrib files available for 'public' release\n", $publicContribCount;
+printf $lf "# %d contrib files available for 'beta' release\n", $betaContribCount;
+
+### establish the list of hgw1 files with their mtime
+### the %hgw1Accession list is going to be the set of 'accession' directories
+###  on hgw1 to allow verifying it is the same set as exists on hgwdev
+
+my %hgw1List;	# file name => mtime (last modified time) as listed for hgw1
+my %hgw1Accession;	# assembly 'accession' directory name on hgw1 =>
+			#  mtime of the hub.txt file in that directory
+# only hub.txt and /contrib/ lines are read from the listing,
+#  lines matching GCA_019395325.1 are filtered out
+open ($fh, "-|", "zegrep '/contrib/|hub.txt' hgw1.todayList.gz | grep -v GCA_019395325.1") or die "can not read hgw1.todayList.gz";
+while (my $entry = <$fh>) {
+  chomp $entry;
+  my ($stamp, $path) = split('\t', $entry);
+  $hgw1List{$path} = $stamp;
+  next if ($path !~ m#/hub.txt#);
+  # the accession is the name of the directory that holds this hub.txt,
+  #  remembered with the mtime of that hub.txt
+  $hgw1Accession{basename(dirname($path))} = $stamp;
+}
+close ($fh);
+
+###  count the accessions found on hgwdev that have no hub.txt entry
+###   in the hgw1 listing
+my $toPushCount = scalar(grep { ! defined($hgw1Accession{$_}) } keys %devAccession);
+
+###  count the accessions in the hgw1 listing that hgwdev does not have,
+###   there should not be any
+my $extraExisting = scalar(grep { ! defined($devAccession{$_}) } keys %hgw1Accession);
+
+# the push count is always logged
+printf {$lf} "### %d assemblies to push from hgwdev out\n", $toPushCount;
+# the extra count is only logged when there is something to complain about
+if ($extraExisting) {
+  printf {$lf} "### %d assemblies on hgw1 not on hgwdev - ERROR should not be present.\n", $extraExisting;
+}
+
+### establish the list of hgwbeta files with their mtime
+
+my %betaList;	# file name => mtime (last modified time) as listed for hgwbeta
+# only hub.txt and /contrib/ lines are read; lines matching GCA_019395325.1 are dropped
+open ($fh, "-|", "zegrep '/contrib/|hub.txt' hgwbeta.todayList.gz | grep -v GCA_019395325.1") or die "can not read hgwbeta.todayList.gz";
+while (my $entry = <$fh>) {
+  chomp $entry;
+  my ($stamp, $path) = split('\t', $entry);	# mtime <tab> file name
+  $betaList{$path} = $stamp;
+}
+close ($fh);
+
+###  check for updated or new 'contrib' directory files, public and beta
+
+my %publicContribList;	# key is contrib directory name, value is number of files
+
+# check /contrib/... files on the 'public' release list against the hgw1
+#  listing.  A file counts when hgw1 does not have it (new file) or has
+#  it with a different mtime (updated file), mtimes compared as strings.
+#  The directory of each such file is what gets recorded.
+foreach my $path (keys %contribPublic) {
+   my $rrTime = $hgw1List{$path};	# undef when hgw1 does not have the file
+   my $upToDate = 0;
+   if (defined($rrTime)) {
+     if ($rrTime eq $contribPublic{$path}) {
+       $upToDate = 1;
+     }
+   }
+   if (! $upToDate) {
+     my $contribDir = dirname($path);
+     $publicContribList{$contribDir} += 1;
+   }
+}
+
+my %betaContribList;	# key is contrib directory name, value is number of files
+
+# check /contrib/... files on the 'beta' release list against the hgwbeta
+#  listing.  A file counts when hgwbeta does not have it (new file) or
+#  has it with a different mtime (updated file), mtimes compared as
+#  strings.  The directory of each such file is what gets recorded.
+foreach my $path (keys %contribBeta) {
+   my $betaTime = $betaList{$path};	# undef when hgwbeta does not have the file
+   my $upToDate = 0;
+   if (defined($betaTime)) {
+     if ($betaTime eq $contribBeta{$path}) {
+       $upToDate = 1;
+     }
+   }
+   if (! $upToDate) {
+     my $contribDir = dirname($path);
+     $betaContribList{$contribDir} += 1;
+   }
+}
+
+### push out any new or modified 'contrib' directories (beta: hgwbeta, public: @machList)
+if (%betaContribList) {
+  chomp ($DS = `date "+%F"`);
+  chomp ($TS = `date "+%T"`);
+  printf {$lf} "# pushing %d beta /contrib/ directories %s %s\n", scalar(keys %betaContribList), $DS, $TS;
+  contribOut("beta", \%betaContribList);
+}
+if (%publicContribList) {
+  chomp ($DS = `date "+%F"`);
+  chomp ($TS = `date "+%T"`);
+  printf {$lf} "# pushing %d public /contrib/ directories %s %s\n", scalar(keys %publicContribList), $DS, $TS;
+  contribOut("all", \%publicContribList);
+}
+
+$DS = `date "+%F"`; chomp $DS;
+$TS = `date "+%T"`; chomp $TS;
+printf $lf "# checking %d hub.txt files for pushing %s %s\n", scalar(grep { m#(public|beta).hub.txt# } keys %devList), $DS, $TS;	# count of the files the loop below examines
+
+### push out any new or modified hub.txt files
+### when beta.hub.txt exists it goes to hgwbeta as 'hub.txt'
+### and the public.hub.txt goes out to all RR machines as 'hub.txt'
+foreach my $fileName (keys %devList) {
+   my $devTime = $devList{$fileName};
+   if ($fileName =~ m#public.hub.txt#) {
+      my $pubHub = $fileName;	# name the file has on the RR machines
+      $pubHub =~ s#/public.#/#;
+      my $yesUpdate = 0;
+      if (defined($hgw1List{$pubHub})) {
+        my $hubTime = $hgw1List{$pubHub};
+        if ($hubTime ne $devTime) {	# mtime differs: updated file
+          $yesUpdate = 1;
+        }
+      } else {	# new file
+          $yesUpdate = 1;
+      }
+      if ($yesUpdate) {
+          my $dirName = dirname($pubHub);
+          # the listing file names are used as relative to /gbdb/genark/
+          my $pathDir = sprintf("/gbdb/genark/%s", $dirName);
+          my $src = "$pathDir/public.hub.txt";
+          my $dest = "$pathDir/hub.txt";
+          sendHubTxt("all", $src, $dest);
+      }
+   } elsif ($fileName =~ m#beta.hub.txt#) {
+      my $betaHub = $fileName;	# name the file has on hgwbeta
+      $betaHub =~ s#/beta.#/#;
+      my $yesUpdate = 0;
+      if (defined($betaList{$betaHub})) {
+        my $hubTime = $betaList{$betaHub};
+        if ($hubTime ne $devTime) {	# mtime differs: updated file
+          $yesUpdate = 1;
+        }
+      } else {	# new file
+          $yesUpdate = 1;
+      }
+      if ($yesUpdate) {
+          my $dirName = dirname($betaHub);
+          # same directory rule as the public case: $dirName already ends in the accession
+          my $pathDir = sprintf("/gbdb/genark/%s", $dirName);
+          my $src = "$pathDir/beta.hub.txt";
+          my $dest = "$pathDir/hub.txt";
+          sendHubTxt("beta", $src, $dest);
+      }
+   }	#	elsif ($fileName =~ m#beta.hub.txt#)
+}	#	foreach my $fileName (keys %devList)
+
+$DS = `date "+%F"`; chomp $DS; $TS = `date "+%T"`; chomp $TS;
+printf $lf "### quickPush.pl finished at: %s %s\n", $DS, $TS;
+close ($lf) or warn "log pipe to gzip did not close cleanly, status $?\n";	# close waits for gzip to finish