79837f643d9b21431d679ea7216f7e68150b2adf
hiram
  Mon May 11 15:34:40 2026 -0700
making the ottoRequestView CGI efficient and avoiding repeatedly scanning files to get numbers that are not changing refs #31811

diff --git src/hg/utils/otto/userRequests/featureBitsSnapshot.py src/hg/utils/otto/userRequests/featureBitsSnapshot.py
new file mode 100755
index 00000000000..e6f444b9dd0
--- /dev/null
+++ src/hg/utils/otto/userRequests/featureBitsSnapshot.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+"""
+featureBitsSnapshot.py - maintain an append-only snapshot of
+featureBits coverage percentages used by ottoRequestView.cgi.
+
+For every (fromDb, toDb) pair in hgcentraltest.ottoRequest, reads the
+fb.<src>.chain<Qry>Link.txt produced by the lastz/chain/net pipeline
+and records its "(X.Y%)" value in
+  /data/apache/trash/ottoRequestFeatureBitsPct.json
+keyed as "<src>\\t<qry>".  Once a pair is recorded the value never
+changes, so subsequent runs only inspect pairs that aren't already in
+the snapshot.
+
+Intended cadence: invoked from ottoRequestWatch.sh on every cron tick.
+With nothing new to measure, a tick costs one hgsql query and a file read.
+"""
+
+import fcntl
+import json
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import time
+
+scriptDir     = os.path.dirname(os.path.abspath(__file__))
+lockPath      = os.path.join(scriptDir, "featureBitsSnapshot.lock")  # flock file
+SNAPSHOT_PATH = "/data/apache/trash/ottoRequestFeatureBitsPct.json"  # output
+# filesystem roots that hubBuildDir() builds its paths from
+HIVE_GENOMES  = "/hive/data/genomes"
+ASMHUB_ROOT   = HIVE_GENOMES + "/asmHubs"
+# gcPattern: GCA_/GCF_ accession prefix; pctRegex: captures N in "(N%)"
+gcPattern = re.compile(r"^GC[AF]_")
+pctRegex  = re.compile(r"\(([\d.]+)%\)")
+
+
+def acquireSingletonLock():
+    """Grab a non-blocking exclusive flock on lockPath and return the
+    open handle, whose file now holds this process's pid.  When the
+    lock is already held elsewhere, exit 0 quietly so cron stays silent."""
+    handle = open(lockPath, "a+")
+    try:
+        fcntl.flock(handle, fcntl.LOCK_NB | fcntl.LOCK_EX)
+    except BlockingIOError:
+        sys.exit(0)
+    handle.seek(0)
+    handle.truncate()
+    handle.write("%d\n" % os.getpid())
+    handle.flush()
+    return handle
+
+
+def hgsql(sql, db="hgcentraltest"):
+    """Run `hgsql -N -B -e sql db` (raises on non-zero exit) and return
+    each non-empty output line as a tuple of its tab-separated fields."""
+    cmd = ["/cluster/bin/x86_64/hgsql", "-N", "-B", "-e", sql, db]
+    done = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    lines = filter(None, done.stdout.splitlines())
+    return [tuple(row.split("\t")) for row in lines]
+
+
+def loadSnapshot():
+    """Return the "pct" dict stored in SNAPSHOT_PATH, or {} when the
+    file is missing, unparsable, or not shaped like {"pct": {...}}."""
+    try:
+        with open(SNAPSHOT_PATH) as f:
+            pct = json.load(f).get("pct")
+    except (OSError, ValueError, AttributeError):
+        return {}   # AttributeError: top-level JSON value is not an object
+    return pct if isinstance(pct, dict) else {}
+
+
+def writeSnapshot(pct):
+    """Atomically replace SNAPSHOT_PATH with {"ts": now, "pct": pct}.
+    The JSON goes to a temp file beside the target and is renamed over
+    it with os.replace(), so readers never observe a partial file.
+    On any failure the temp file is removed and the error re-raised."""
+    targetDir, targetName = os.path.split(SNAPSHOT_PATH)
+    fd, tmpPath = tempfile.mkstemp(dir=targetDir, prefix=targetName + ".")
+    try:
+        payload = {
+            "ts":  time.strftime("%Y-%m-%d %H:%M:%S"),
+            "pct": pct,
+        }
+        with os.fdopen(fd, "w") as tmp:
+            json.dump(payload, tmp, indent=2, sort_keys=True)
+        # mkstemp creates the file owner-only; open it up for readers
+        os.chmod(tmpPath, 0o664)
+        os.replace(tmpPath, SNAPSHOT_PATH)
+    except BaseException:
+        try:
+            os.unlink(tmpPath)
+        except OSError:
+            pass
+        raise
+
+
+def pendingPairs():
+    """Fetch every ottoRequest row whose fromDb and toDb are both
+    non-empty and return (src, qry) tuples covering both orientations."""
+    query = (
+        "SELECT fromDb, toDb FROM ottoRequest "
+        "WHERE fromDb != '' AND toDb != '';"
+    )
+    bothWays = set()
+    for a, b in hgsql(query):
+        # one entry per direction: (from, to) and (to, from)
+        bothWays.update(((a, b), (b, a)))
+    return bothWays
+
+
+def lookupGenarkNames(accessions):
+    """One hgsql call mapping gcAccession -> asmName via the genark
+    table; {} if nothing to look up.  Accessions with characters
+    outside [A-Za-z0-9_.] are skipped so they cannot alter the SQL."""
+    safe = sorted(a for a in accessions if re.fullmatch(r"[A-Za-z0-9_.]+", a))
+    if not safe:
+        return {}
+    rows = hgsql(
+        "SELECT gcAccession, asmName FROM genark "
+        "WHERE gcAccession IN (%s);" % ",".join("'%s'" % a for a in safe)
+    )
+    return {acc: asmName for acc, asmName in rows}
+
+
+def hubBuildDir(acc, genarkAsmName):
+    """Resolve the build directory for one assembly, or None.
+       GenArk accession (GCA_/GCF_ prefix, 13+ chars):
+         asmHubs/{genbank,refseq}Build/GC[AF]/ddd/ddd/ddd/<acc>_<asmName>
+         None if genarkAsmName has no name for acc or the part of acc
+         between "GCx_" and the first "." is under 9 characters.
+       Anything else: /hive/data/genomes/<acc>, None if not a directory."""
+    if not acc:
+        return None
+    isGenark = len(acc) >= 13 and gcPattern.match(acc) is not None
+    if not isGenark:
+        nativeDir = "%s/%s" % (HIVE_GENOMES, acc)
+        return nativeDir if os.path.isdir(nativeDir) else None
+    name = genarkAsmName.get(acc)
+    number = acc[4:].split(".", 1)[0]
+    if not name or len(number) < 9:
+        return None
+    prefix = acc[:3]
+    buildKind = "refseqBuild" if prefix == "GCF" else "genbankBuild"
+    # nine characters of the accession number become three directory levels
+    return "%s/%s/%s/%s/%s/%s/%s_%s" % (
+        ASMHUB_ROOT, buildKind, prefix,
+        number[0:3], number[3:6], number[6:9],
+        acc, name,
+    )
+
+
+def measurePct(src, qry, genarkAsmName):
+    """Return the first "(N%)" value in src's
+    fb.<src>.chain<Qry>Link.txt as the string "N%".  None when the
+    build dir can't be resolved, the file can't be opened (e.g. the
+    alignment hasn't finished), or no percentage is found in it."""
+    buildDir = hubBuildDir(src, genarkAsmName)
+    if not buildDir:
+        return None
+    # assembly-hub builds use trackData/, everything else uses bed/
+    dataDir = "trackData" if "/asmHubs/" in buildDir else "bed"
+    fbPath = "%s/%s/lastz.%s/fb.%s.chain%sLink.txt" % (
+        buildDir, dataDir, qry, src, qry[:1].upper() + qry[1:],
+    )
+    try:
+        with open(fbPath) as fbFile:
+            found = pctRegex.search(fbFile.read())
+    except OSError:
+        return None
+    return None if found is None else found.group(1) + "%"
+
+
+def main():
+    """Add newly measurable pairs to the snapshot, writing only on change."""
+    lockFh = acquireSingletonLock()  # noqa: F841 - keep ref alive
+    snapshot = loadSnapshot()
+    # recorded values never change, so skip keys already present
+    todo = {"%s\t%s" % p: p for p in pendingPairs()}
+    todo = {k: p for k, p in todo.items() if k not in snapshot}
+    if not todo:
+        return
+    gcAccs = {db for p in todo.values() for db in p if gcPattern.match(db)}
+    genarkAsmName = lookupGenarkNames(gcAccs)
+    fresh = {}
+    for key, (src, qry) in todo.items():
+        pct = measurePct(src, qry, genarkAsmName)
+        if pct is not None:
+            fresh[key] = pct
+    if fresh:
+        snapshot.update(fresh)
+        writeSnapshot(snapshot)
+
+
+if __name__ == "__main__":
+    main()  # run only when executed as a script