d95edf23464d1e0768072f9ed96d6d237ff9da83
hiram
  Thu May 14 10:41:48 2026 -0700
push script for any GenArk hub update refs #31811

diff --git src/hg/utils/otto/userRequests/ottoLib.py src/hg/utils/otto/userRequests/ottoLib.py
new file mode 100644
index 00000000000..6cc951b1e42
--- /dev/null
+++ src/hg/utils/otto/userRequests/ottoLib.py
@@ -0,0 +1,212 @@
+"""
+ottoLib.py - shared helpers for the otto userRequests scripts.
+
+Provides clade lookup (via hgsql -> genark and the dbDb.name.clade.tsv
+checked in next to this file), the filter that writes tsv.otto into the
+matching cladeAsmHub source directory, and the make-command sequence
+that (re)builds a GenArk assembly hub.  Also wraps the per-asm
+doTrackDb.bash invocation used by ottoRequestWatch.sh.
+
+Used by:
+  ottoRequestPush2.py     (lib-using rewrite of ottoRequestPush.py)
+  ottoBuildGenArkHub.py   (manual driver: rebuild hub files for a list
+                           of GenArk accessions)
+"""
+
+import fcntl
+import os
+import re
+import subprocess
+import sys
+from collections import defaultdict
+
+libDir = os.path.dirname(os.path.abspath(__file__))  # dir holding this file
+cladeTsv = os.path.join(libDir, "dbDb.name.clade.tsv")  # db name -> clade
+gcPattern = re.compile(r"^GC[AF]_")  # matches a leading GCA_ or GCF_
+
+
+def acquireSingletonLock(lockPath, exitOnLocked=True):
+    """Ensure only one instance holding lockPath runs at a time.  Takes
+    an exclusive non-blocking flock and writes our PID into the file;
+    the kernel releases the lock on exit (including crash / kill -9),
+    so no stale-lock cleanup is needed.  Returns the open file handle,
+    which the caller must keep alive (see holder: lsof <lockPath>).
+
+    exitOnLocked=True (cron-style): sys.exit(0) when the lock is held.
+    exitOnLocked=False (manual-style): close the handle and return None
+    so the caller can print a message and exit non-zero.
+    """
+    # "a+" opens read+write without truncating (and creates if missing),
+    # so a second instance that fails to lock doesn't wipe the running
+    # instance's PID from the file before exiting.
+    fh = open(lockPath, "a+")
+    try:
+        fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
+    except BlockingIOError:
+        fh.close()  # losing handle is useless; don't leave it to the GC
+        if exitOnLocked:
+            sys.exit(0)
+        return None
+    fh.seek(0)
+    fh.truncate()
+    fh.write("%d\n" % os.getpid())
+    fh.flush()
+    return fh
+
+
+def hgsql(query, db="hgcentraltest"):
+    """Run hgsql -N -B -e query on db; return rows as a list of tab-split
+    tuples.  A non-zero hgsql exit raises subprocess.CalledProcessError."""
+    argv = ["/cluster/bin/x86_64/hgsql", "-N", "-B", "-e", query, db]
+    proc = subprocess.run(argv, check=True, capture_output=True, text=True)
+    rows = (row for row in proc.stdout.splitlines() if row)
+    return [tuple(row.split("\t")) for row in rows]
+
+
+def loadDbDbClades():
+    """Parse the dbDb.name.clade.tsv at cladeTsv into {dbName: clade},
+    taking the first two tab-separated columns of each line and
+    skipping '#' comment lines and blank lines."""
+    with open(cladeTsv) as tsv:
+        rows = (
+            row.rstrip("\n").split("\t")[:2] for row in tsv
+            if row.strip() and not row.startswith("#")
+        )
+        return {dbName: clade for dbName, clade in rows}
+
+
+def lookupGenark(accessions):
+    """Bulk-lookup GenArk accessions -> {acc: (asmName, clade)}.  Names
+    with characters outside [A-Za-z0-9_.] are never put into the SQL."""
+    safe = sorted(a for a in accessions or ()
+                  if re.fullmatch(r"[A-Za-z0-9_.]+", a))
+    if not safe:
+        return {}
+    rows = hgsql("SELECT gcAccession, asmName, clade FROM genark "
+                 "WHERE gcAccession IN ('%s');" % "','".join(safe))
+    return {acc: (asmName, clade) for acc, asmName, clade in rows}
+
+
+def groupByClade(dbs, dbDbClades, genarkInfo):
+    """Group assembly ids by clade -> {clade: sorted [assemblyId, ...]}.
+    GenArk accessions (gcPattern) become '<acc>_<asmName>' via genarkInfo;
+    other names are UCSC native dbs resolved via dbDbClades (may be empty)."""
+    byClade = defaultdict(set)
+    for asm in dbs:
+        if not gcPattern.match(asm):
+            nativeClade = dbDbClades.get(asm)
+            if nativeClade is not None:
+                byClade[nativeClade].add(asm)
+            else:
+                print("WARNING: %s not in %s" % (asm, cladeTsv), file=sys.stderr)
+            continue
+        hit = genarkInfo.get(asm)
+        if hit is not None:
+            asmName, genarkClade = hit
+            byClade[genarkClade].add("%s_%s" % (asm, asmName))
+        else:
+            print("WARNING: %s not in genark table" % asm, file=sys.stderr)
+    return {key: sorted(members) for key, members in byClade.items()}
+
+
+def writeCladeTsv(clade, asmIds):
+    """Write tsv.otto for one clade: the subset of <clade>.orderList.tsv
+    lines that contain any of the given GenArk asmIds.  Mirrors:
+        cd ~/kent/src/hg/makeDb/doc/<clade>AsmHub
+        egrep '<id1>|<id2>|...' <clade>.orderList.tsv > tsv.otto
+    asmIds that do not match gcPattern (UCSC native dbs) are ignored,
+    since the AsmHub orderList files do not list them.
+
+    Returns the clade directory when tsv.otto was written, so the make
+    sequence can be run there.  Returns None -- with a WARNING on stderr
+    in the last two cases -- when there are no GenArk ids, the orderList
+    file is missing, or no line matched.
+    """
+    wanted = [asmId for asmId in asmIds if gcPattern.match(asmId)]
+    if not wanted:
+        return None
+    cladeDir = os.path.expanduser(
+        "~/kent/src/hg/makeDb/doc/%sAsmHub" % clade)
+    orderList = os.path.join(cladeDir, "%s.orderList.tsv" % clade)
+    if not os.path.isfile(orderList):
+        print("WARNING: missing %s" % orderList, file=sys.stderr)
+        return None
+    # The orderList files sometimes carry Latin-1 bytes (Scandinavian
+    # fish names, for example) that are not valid UTF-8.  Decoding and
+    # encoding with surrogateescape passes such bytes through unchanged
+    # rather than raising UnicodeDecodeError.
+    codec = {"encoding": "utf-8", "errors": "surrogateescape"}
+    with open(orderList, **codec) as src:
+        keep = [row for row in src
+                if any(asmId in row for asmId in wanted)]
+    if not keep:
+        print("WARNING: no matches in %s" % orderList, file=sys.stderr)
+        return None
+    with open(os.path.join(cladeDir, "tsv.otto"), "w", **codec) as dst:
+        dst.writelines(keep)
+    return cladeDir
+
+
+# Run in order (cwd = clade AsmHub dir) by runGenArkMake() once tsv.otto
+# is written; stops at the first failure.  Output is appended to logs.
+genArkMakeCommands = [
+    "time (make symLinks orderList=tsv.otto) >> dbg 2>&1",
+    "time (make mkGenomes orderList=tsv.otto) >> dbg 2>&1",
+    "time (make symLinks orderList=tsv.otto) >> dbg 2>&1",  # again, after mkGenomes
+    "time (make verifyTestDownload orderList=tsv.otto) >> test.down.log 2>&1",
+    "time (make sendDownload orderList=tsv.otto) >> send.down.log 2>&1",
+    "time (make verifyDownload orderList=tsv.otto) >> verify.down.log 2>&1",
+]
+
+
+def runGenArkMake(cladeDir):
+    """Execute each genArkMakeCommands entry, in order, with cladeDir as
+    the working directory.  /bin/bash is used because the commands rely
+    on 'time (...)' around a subshell and on '>>' / '2>&1' redirection.
+    Returns True when every step exits 0; the first non-zero exit is
+    reported on stderr and ends the chain with False."""
+    for step in genArkMakeCommands:
+        status = subprocess.run(
+            step, cwd=cladeDir, shell=True, executable="/bin/bash").returncode
+        if status != 0:
+            print("# ERROR: exit %d from: %s -- stopping chain"
+                  % (status, step), file=sys.stderr)
+            return False
+    return True
+
+
+def genArkBuildDir(asmId):
+    """Map a GenArk asmId (GC[AF]_<digits.version>_<asmName>) to its
+    build directory:
+      /hive/data/genomes/asmHubs/allBuild/<3>/<3>/<3>/<3>/<asmId>
+    The four 3-char levels are asmId[0:3] followed by asmId[4:13] in
+    three chunks, so they depend on the accession part only; the final
+    component is asmId itself, exactly as passed in."""
+    levels = [asmId[0:3]] + [asmId[i:i + 3] for i in (4, 7, 10)]
+    return "/hive/data/genomes/asmHubs/allBuild/" + "/".join(levels + [asmId])
+
+
+def runDoTrackDb(asmId, logPath=None):
+    """Invoke doTrackDb.bash from the top of asmId's GenArk hub build
+    directory (genArkBuildDir(asmId)), the same way ottoRequestWatch.sh
+    does.  asmId must be the full <acc>_<asmName>.
+
+    The script's stdout and stderr are appended to logPath when one is
+    given, and discarded to /dev/null otherwise.
+
+    Returns True when the script exits 0.  Returns False, after an
+    ERROR line on stderr, when the script is missing or not executable
+    or when it exits non-zero."""
+    script = os.path.join(genArkBuildDir(asmId), "doTrackDb.bash")
+    if not os.access(script, os.X_OK):
+        print("# ERROR: cannot find executable %s" % script, file=sys.stderr)
+        return False
+    # one sink for both streams: the append-mode log, or the null device
+    sink = open(logPath, "a") if logPath else open(os.devnull, "w")
+    with sink:
+        status = subprocess.run(
+            [script], stdout=sink, stderr=subprocess.STDOUT).returncode
+    if status == 0:
+        return True
+    print("# ERROR: %s exited %d" % (script, status), file=sys.stderr)
+    return False