# Provenance: commit d95edf23464d1e0768072f9ed96d6d237ff9da83
#   hiram  Thu May 14 10:41:48 2026 -0700
#   push script for any GenArk hub update  refs #31811
#   new file: src/hg/utils/otto/userRequests/ottoLib.py
"""
ottoLib.py - shared helpers for the otto userRequests scripts.

Provides clade lookup (via hgsql -> genark and the dbDb.name.clade.tsv
checked in next to this file), the filter that writes tsv.otto into the
matching cladeAsmHub source directory, and the make-command sequence
that (re)builds a GenArk assembly hub.  Also wraps the per-asm
doTrackDb.bash invocation used by ottoRequestWatch.sh.

Used by:
    ottoRequestPush2.py     (lib-using rewrite of ottoRequestPush.py)
    ottoBuildGenArkHub.py   (manual driver: rebuild hub files for a list
                             of GenArk accessions)
"""

import fcntl
import os
import re
import subprocess
import sys
from collections import defaultdict

libDir = os.path.dirname(os.path.abspath(__file__))
cladeTsv = os.path.join(libDir, "dbDb.name.clade.tsv")
gcPattern = re.compile(r"^GC[AF]_")

# Only values made entirely of these characters are ever interpolated
# into an SQL string literal by lookupGenark(); anything else (quotes,
# whitespace, semicolons, ...) is rejected before the query is built.
_sqlSafePattern = re.compile(r"[A-Za-z0-9_.]+")


def acquireSingletonLock(lockPath, exitOnLocked=True):
    """Ensure only one instance holding lockPath runs at a time.  Holds
    an exclusive flock for the lifetime of the process; the kernel
    releases it on exit (including crash / kill -9), so no stale-lock
    cleanup is needed.  Returns the open file handle, which the caller
    must keep alive.

    exitOnLocked=True  (cron-style):   sys.exit(0) when the lock is held.
    exitOnLocked=False (manual-style): return None so the caller can
                                       print a message and exit non-zero.

    On success the lock file is rewritten to contain this process's PID
    followed by a newline.
    FYI: can also see the locking process via: lsof <lockPath>
    """
    # "a+" opens read+write without truncating (and creates if missing),
    # so a second instance that fails to lock doesn't wipe the running
    # instance's PID from the file before exiting.
    fh = open(lockPath, "a+")
    try:
        fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        # Someone else holds the lock.  Close our handle so a long-lived
        # caller that keeps retrying does not accumulate descriptors.
        fh.close()
        if exitOnLocked:
            sys.exit(0)
        return None
    except OSError:
        # Any other locking failure: don't leak the handle, let the
        # caller see the original error.
        fh.close()
        raise
    # We own the lock: replace whatever PID was recorded with ours.
    fh.seek(0)
    fh.truncate()
    fh.write("%d\n" % os.getpid())
    fh.flush()
    return fh


def hgsql(query, db="hgcentraltest"):
    """Run hgsql -N -B and return rows as list of tuples (tab-split).

    Empty output lines are dropped.  Raises
    subprocess.CalledProcessError when hgsql exits non-zero
    (check=True)."""
    out = subprocess.run(
        ["/cluster/bin/x86_64/hgsql", "-N", "-B", "-e", query, db],
        check=True, capture_output=True, text=True,
    ).stdout
    return [tuple(line.split("\t")) for line in out.splitlines() if line]


def loadDbDbClades(path=None):
    """Read dbDb.name.clade.tsv -> {dbName: clade}.

    path defaults to cladeTsv (the file checked in next to this
    module).  Lines starting with '#' and blank lines are ignored; only
    the first two tab-separated columns are used.  A line with fewer
    than two columns is reported on stderr and skipped instead of
    aborting the whole load."""
    if path is None:
        path = cladeTsv
    result = {}
    with open(path) as fh:
        for lineNo, line in enumerate(fh, 1):
            if line.startswith("#") or not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                print("WARNING: %s line %d: expected name<TAB>clade, skipping"
                      % (path, lineNo), file=sys.stderr)
                continue
            name, clade = fields[:2]
            result[name] = clade
    return result


def lookupGenark(accessions):
    """Bulk-lookup GenArk accessions -> {acc: (asmName, clade)}.

    Accessions not present in the genark table are simply absent from
    the result.  Because the accessions come from user requests and are
    placed inside an SQL string literal, any accession containing
    characters outside [A-Za-z0-9_.] is reported on stderr and left out
    of the query rather than being interpolated."""
    if not accessions:
        return {}
    safe = []
    for acc in sorted(set(accessions)):
        if _sqlSafePattern.fullmatch(acc):
            safe.append(acc)
        else:
            print("WARNING: skipping malformed accession %r" % (acc,),
                  file=sys.stderr)
    if not safe:
        # Nothing valid to ask about; don't run an "IN ()" query.
        return {}
    quoted = ",".join("'%s'" % a for a in safe)
    rows = hgsql(
        "SELECT gcAccession, asmName, clade FROM genark "
        "WHERE gcAccession IN (%s);" % quoted
    )
    return {acc: (asmName, clade) for acc, asmName, clade in rows}


def groupByClade(dbs, dbDbClades, genarkInfo):
    """Build {clade: [assemblyId, ...]}.  dbs may mix GenArk accessions
    and UCSC native db names; dbDbClades may be empty when no native
    dbs are expected.

    GenArk accessions (matching gcPattern) are looked up in genarkInfo
    ({acc: (asmName, clade)}) and recorded as "<acc>_<asmName>"; all
    other names are looked up in dbDbClades and recorded unchanged.
    Names found in neither produce a WARNING on stderr and are skipped.
    Each clade's list is de-duplicated and sorted."""
    grouped = defaultdict(set)
    for db in dbs:
        if gcPattern.match(db):
            info = genarkInfo.get(db)
            if info is None:
                print("WARNING: %s not in genark table" % db,
                      file=sys.stderr)
                continue
            asmName, clade = info
            grouped[clade].add("%s_%s" % (db, asmName))
        else:
            clade = dbDbClades.get(db)
            if clade is None:
                print("WARNING: %s not in %s" % (db, cladeTsv),
                      file=sys.stderr)
                continue
            grouped[clade].add(db)
    return {clade: sorted(ids) for clade, ids in grouped.items()}


def writeCladeTsv(clade, asmIds):
    """Filter <clade>.orderList.tsv down to lines matching any asmId and
    write the result to tsv.otto in the same directory.  Mirrors:
        cd ~/kent/src/hg/makeDb/doc/<clade>AsmHub
        egrep '<id1>|<id2>|...' <clade>.orderList.tsv > tsv.otto
    Only GenArk identifiers are used; UCSC native dbs are not in the
    AsmHub orderList files.

    Returns cladeDir on success (so the caller can chain the make
    sequence), or None if there is nothing to do for this clade
    (no GenArk ids, missing orderList file, or no matching lines).
    """
    genarkIds = [a for a in asmIds if gcPattern.match(a)]
    if not genarkIds:
        return None
    cladeDir = os.path.expanduser(
        "~/kent/src/hg/makeDb/doc/%sAsmHub" % clade)
    orderList = os.path.join(cladeDir, "%s.orderList.tsv" % clade)
    outPath = os.path.join(cladeDir, "tsv.otto")
    if not os.path.isfile(orderList):
        print("WARNING: missing %s" % orderList, file=sys.stderr)
        return None
    # orderList.tsv files occasionally contain Latin-1 bytes (e.g. in
    # Scandinavian fish names) that aren't valid UTF-8.  surrogateescape
    # round-trips those bytes through read+write byte-for-byte instead of
    # raising UnicodeDecodeError.
    with open(orderList, encoding="utf-8", errors="surrogateescape") as fh:
        # Plain substring test, same as the egrep alternation above.
        matched = [line for line in fh
                   if any(asmId in line for asmId in genarkIds)]
    if not matched:
        print("WARNING: no matches in %s" % orderList, file=sys.stderr)
        return None
    with open(outPath, "w", encoding="utf-8",
              errors="surrogateescape") as fh:
        fh.writelines(matched)
    return cladeDir


# Sequence of make commands run in the clade AsmHub directory after
# tsv.otto is written.  Stops on the first failure.
genArkMakeCommands = [
    "time (make symLinks orderList=tsv.otto) >> dbg 2>&1",
    "time (make mkGenomes orderList=tsv.otto) >> dbg 2>&1",
    "time (make symLinks orderList=tsv.otto) >> dbg 2>&1",
    "time (make verifyTestDownload orderList=tsv.otto) >> test.down.log 2>&1",
    "time (make sendDownload orderList=tsv.otto) >> send.down.log 2>&1",
    "time (make verifyDownload orderList=tsv.otto) >> verify.down.log 2>&1",
]


def runGenArkMake(cladeDir):
    """Run the genArkMakeCommands sequence in cladeDir.  Uses bash so
    'time (...)' (a builtin on a subshell) and '>>' / '2>&1' work as
    written.  Returns True on success, False if any step fails (the
    chain stops at the first failure)."""
    for cmd in genArkMakeCommands:
        # shell=True is intentional: the commands are fixed strings
        # defined above and rely on shell redirection.
        result = subprocess.run(
            cmd, shell=True, executable="/bin/bash", cwd=cladeDir,
        )
        if result.returncode != 0:
            print("# ERROR: exit %d from: %s -- stopping chain"
                  % (result.returncode, cmd), file=sys.stderr)
            return False
    return True


def genArkBuildDir(asmId):
    """Return /hive/data/genomes/asmHubs/allBuild/<3>/<3>/<3>/<3>/<asmId>
    for a full GenArk asmId (GC[AF]_<digits.version>_<asmName>).

    The four 3-char path segments come from the accession part only, so
    callers that only have the bare accession still get a valid parent
    directory, but the final segment requires the asmName too."""
    # asmId[3] is the '_' after GCA/GCF, hence the jump from 3 to 4.
    return ("/hive/data/genomes/asmHubs/allBuild/"
            "%s/%s/%s/%s/%s"
            % (asmId[0:3], asmId[4:7], asmId[7:10], asmId[10:13], asmId))


def runDoTrackDb(asmId, logPath=None):
    """Run the doTrackDb.bash script that sits at the top of asmId's
    GenArk hub build dir, the same way ottoRequestWatch.sh does.
    asmId must be the full <acc>_<asmName>.  When logPath is given,
    stdout+stderr are appended to that file; otherwise they go to
    /dev/null.  Returns True on success, False on failure (including
    when the script can't be found or isn't executable)."""
    hubBuild = genArkBuildDir(asmId)
    doTdb = os.path.join(hubBuild, "doTrackDb.bash")
    if not os.access(doTdb, os.X_OK):
        print("# ERROR: cannot find executable %s" % doTdb, file=sys.stderr)
        return False
    # Append to the caller's log, or discard output entirely.
    if logPath:
        logFh = open(logPath, "a")
    else:
        logFh = open(os.devnull, "w")
    with logFh:
        result = subprocess.run(
            [doTdb], stdout=logFh, stderr=subprocess.STDOUT,
        )
    if result.returncode != 0:
        print("# ERROR: %s exited %d" % (doTdb, result.returncode),
              file=sys.stderr)
        return False
    return True