c3fc03a88ff8fcd8fffadcdd960693268c2a6f5f hiram Thu May 14 11:43:43 2026 -0700 correctly use a dedicated clean otto kent source tree for operations in the source tree refs #31811 diff --git src/hg/utils/otto/userRequests/ottoLib.py src/hg/utils/otto/userRequests/ottoLib.py index 6cc951b1e42..2ceb5599b06 100644 --- src/hg/utils/otto/userRequests/ottoLib.py +++ src/hg/utils/otto/userRequests/ottoLib.py @@ -1,71 +1,110 @@ """ ottoLib.py - shared helpers for the otto userRequests scripts. Provides clade lookup (via hgsql -> genark and the dbDb.name.clade.tsv checked in next to this file), the filter that writes tsv.otto into the -matching cladeAsmHub source directory, and the make-command sequence -that (re)builds a GenArk assembly hub. Also wraps the per-asm -doTrackDb.bash invocation used by ottoRequestWatch.sh. +matching cladeAsmHub source directory inside the otto kent working tree, +the 'git pull' helper that brings that tree up to date, and the make- +command sequence that (re)builds a GenArk assembly hub. Also wraps the +per-asm doTrackDb.bash invocation used by ottoRequestWatch.sh. + +The otto kent working tree is resolved the same way chainNetTrackDb.pl +resolves it: + $OTTO_KENT_TREE (default /hive/data/outside/genark/ottoKent/kent) Used by: ottoRequestPush2.py (lib-using rewrite of ottoRequestPush.py) ottoBuildGenArkHub.py (manual driver: rebuild hub files for a list of GenArk accessions) """ import fcntl import os import re import subprocess import sys from collections import defaultdict libDir = os.path.dirname(os.path.abspath(__file__)) cladeTsv = os.path.join(libDir, "dbDb.name.clade.tsv") gcPattern = re.compile(r"^GC[AF]_") +# Otto's kent working tree: where cladeAsmHub directories live and where +# the make chain runs. Matches chainNetTrackDb.pl's $kentTree resolution. 
# Root of the kent checkout used by the helpers below; the
# OTTO_KENT_TREE environment variable overrides the built-in default.
kentTree = os.environ.get(
    "OTTO_KENT_TREE", "/hive/data/outside/genark/ottoKent/kent")


def acquireSingletonLock(lockPath, exitOnLocked=True):
    """Ensure only one instance holding lockPath runs at a time.

    Takes a non-blocking exclusive flock on lockPath and, once the lock
    is held, replaces the file's contents with this process's PID.  The
    lock lives as long as the returned file handle stays open, so the
    caller must keep that handle alive; the kernel drops the lock when
    the process exits (including crash / kill -9), so no stale-lock
    cleanup is needed.

    lockPath      path of the lock file (created if missing).
    exitOnLocked  True (cron-style): sys.exit(0) when the lock is
                  already held.  False (manual-style): return None so
                  the caller can print a message and exit non-zero.

    Returns the open file handle on success, None when the lock is held
    elsewhere and exitOnLocked is False.
    """
    # "a+" opens read+write without truncating (and creates if missing),
    # so a second instance that fails to lock doesn't wipe the running
    # instance's PID from the file before exiting.
    fh = open(lockPath, "a+")
    try:
        fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        # We never got the lock: close our own handle instead of leaking
        # it.  This cannot disturb the lock held through the other
        # instance's separately opened handle.
        fh.close()
        if exitOnLocked:
            sys.exit(0)
        return None
    except BaseException:
        # Any other failure: don't leak the handle, let the error surface.
        fh.close()
        raise
    # Lock is ours: only now is it safe to overwrite the recorded PID.
    fh.seek(0)
    fh.truncate()
    fh.write("%d\n" % os.getpid())
    fh.flush()
    return fh
### FYI: can also see the locking process via:  lsof <lockPath>


def gitPullKentTree():
    """Run 'git -C <kentTree> pull' so make commands run against an
    up-to-date checkout.

    Returns True on success, False otherwise (with the error printed to
    stderr).  False covers: kentTree has no ".git" entry, the git
    executable cannot be started, or 'git pull' exits non-zero.  A
    failing pull is reported rather than worked around, so that makes
    are never silently run against a tree git refused to update.

    The "already up to date" message is suppressed to keep cron output
    quiet; any other text 'git pull' writes to stdout is surfaced on
    stderr.
    """
    # exists() rather than isdir(): in a linked worktree or a submodule
    # ".git" is a regular file that points at the real git directory.
    if not os.path.exists(os.path.join(kentTree, ".git")):
        print("ERROR: not a git working tree: %s" % kentTree,
              file=sys.stderr)
        return False
    try:
        result = subprocess.run(
            ["git", "-C", kentTree, "pull"],
            capture_output=True, text=True,
        )
    except OSError as err:
        # git missing from PATH / not executable: honour the documented
        # "return False" contract instead of raising out of the helper.
        print("ERROR: unable to run 'git pull' in %s: %s" % (kentTree, err),
              file=sys.stderr)
        return False
    if result.returncode != 0:
        print("ERROR: 'git pull' failed in %s:\n%s%s"
              % (kentTree, result.stdout, result.stderr),
              file=sys.stderr)
        return False
    out = result.stdout.strip()
    # Older git releases spell the no-op message with hyphens.
    quietMessages = ("Already up to date.", "Already up-to-date.")
    if out and out not in quietMessages:
        print("# git pull in %s:\n%s" % (kentTree, out), file=sys.stderr)
    return True


def hgsql(query, db="hgcentraltest"):
    """Run hgsql -N -B and return rows as list of tuples (tab-split).

    query  SQL text handed to hgsql via -e.
    db     database name passed as the final hgsql argument.

    Empty output lines are skipped.  Raises
    subprocess.CalledProcessError when hgsql exits non-zero (check=True).
    """
    out = subprocess.run(
        ["/cluster/bin/x86_64/hgsql", "-N", "-B", "-e", query, db],
        check=True, capture_output=True, text=True,
    ).stdout
    return [tuple(line.split("\t")) for line in out.splitlines() if line]


# ---------------------------------------------------------------------
# NOTE(review): the text below is cut off in the excerpt under review
# (a diff hunk boundary falls inside it), so it is carried over verbatim
# and unmodified rather than rewritten.
#
# def loadDbDbClades():
#     """Read dbDb.name.clade.tsv -> {dbName: clade}."""
#     result = {}
#     with open(cladeTsv) as fh:
#         for line in fh:
#             if line.startswith("#") or not line.strip():
# @@ -111,32 +150,32 @@ def writeCladeTsv(clade, asmIds):
#     """Filter <clade>.orderList.tsv down to lines matching any asmId and
#     write the result to tsv.otto in the same directory.
#
#     Mirrors:
#         cd ~/kent/src/hg/makeDb/doc/<clade>AsmHub
#         egrep '<id1>|<id2>|...' <clade>.orderList.tsv > tsv.otto
#
#     Only GenArk identifiers are used; UCSC native dbs are not in the
#     AsmHub orderList files.
#
#     Returns cladeDir on success (so the caller can chain the make
#     sequence), or None if there is nothing to do for this clade.
""" genarkIds = [a for a in asmIds if gcPattern.match(a)] if not genarkIds: return None - cladeDir = os.path.expanduser( - "~/kent/src/hg/makeDb/doc/%sAsmHub" % clade) + cladeDir = os.path.join( + kentTree, "src/hg/makeDb/doc/%sAsmHub" % clade) orderList = os.path.join(cladeDir, "%s.orderList.tsv" % clade) outPath = os.path.join(cladeDir, "tsv.otto") if not os.path.isfile(orderList): print("WARNING: missing %s" % orderList, file=sys.stderr) return None # orderList.tsv files occasionally contain Latin-1 bytes (e.g. in # Scandinavian fish names) that aren't valid UTF-8. surrogateescape # round-trips those bytes through read+write byte-for-byte instead of # raising UnicodeDecodeError. matched = [] with open(orderList, encoding="utf-8", errors="surrogateescape") as fh: for line in fh: if any(asmId in line for asmId in genarkIds): matched.append(line) if not matched: