# Commit 6ddaedca96e314bcd1607e69bcfa8fb228d22a3c
# hiram Thu May 21 21:53:56 2026 -0700
# common functions moved to ottoLib.py and shared with
# ottoBuildGenArkHub.py and fixup the legacy clade situation refs #31811
# File: src/hg/utils/otto/userRequests/ottoLib.py
#       (index 2ceb5599b06..25e369027d2, hunk @@ -144,55 +144,88 @@)
#
# Diff context that precedes this function: the tail of a definition whose
# start is outside this excerpt (indentation reconstructed, reference only):
#
#         if clade is None:
#             print("WARNING: %s not in %s" % (db, cladeTsv), file=sys.stderr)
#             continue
#         grouped[clade].add(db)
#     return {clade: sorted(ids) for clade, ids in grouped.items()}


def _matchOrderListLines(orderList, genarkIds):
    """Return the lines of orderList that contain any of genarkIds.

    Matching is a plain substring test per line.  Returns None (not an
    empty list) when orderList does not exist, so the caller can tell a
    missing file apart from a file with no hits.
    """
    if not os.path.isfile(orderList):
        return None
    # orderList.tsv files occasionally contain Latin-1 bytes (e.g. in
    # Scandinavian fish names) that aren't valid UTF-8.  surrogateescape
    # round-trips those bytes through read+write byte-for-byte instead of
    # raising UnicodeDecodeError.
    with open(orderList, encoding="utf-8", errors="surrogateescape") as fh:
        return [line for line in fh
                if any(asmId in line for asmId in genarkIds)]


def _writeOttoTsv(outPath, lines):
    """Write the matched orderList lines to outPath, preserving raw bytes."""
    with open(outPath, "w", encoding="utf-8", errors="surrogateescape") as fh:
        fh.writelines(lines)


def _warnUnmatched(genarkIds, lines, orderList):
    """Warn on stderr about requested ids that appear in none of lines."""
    skipped = [asmId for asmId in genarkIds
               if not any(asmId in line for line in lines)]
    if skipped:
        print("WARNING: %s skipped, no match in %s" % (skipped, orderList),
              file=sys.stderr)


def writeCladeTsv(clade, asmIds):
    """Filter <clade>.orderList.tsv down to lines matching any asmId and
    write the result to tsv.otto in the same directory.  Mirrors:

        cd ~/kent/src/hg/makeDb/doc/<clade>AsmHub
        egrep '<id1>|<id2>|...' <clade>.orderList.tsv > tsv.otto

    Only GenArk identifiers (those accepted by gcPattern) are used; UCSC
    native dbs are not in the AsmHub orderList files.

    If no matches are found in the expected clade directory, falls back
    to checking legacyAsmHub/legacy.orderList.tsv and works there instead.
    The legacy list is consulted only when the clade list yields nothing;
    ids that match in neither the list actually used are reported on
    stderr rather than being dropped silently.

    Returns the directory that received tsv.otto (the clade directory, or
    the legacyAsmHub directory on fallback) so the caller can chain the
    make sequence, or None if there is nothing to do for this clade.
    """
    genarkIds = [a for a in asmIds if gcPattern.match(a)]
    if not genarkIds:
        return None

    # First try the expected clade directory.
    cladeDir = os.path.join(
        kentTree, "src/hg/makeDb/doc/%sAsmHub" % clade)
    orderList = os.path.join(cladeDir, "%s.orderList.tsv" % clade)
    cladeMatched = _matchOrderListLines(orderList, genarkIds)
    if cladeMatched:
        # legacy.orderList.tsv is not checked on this path, so say which
        # requested ids are being left out of tsv.otto.
        _warnUnmatched(genarkIds, cladeMatched, orderList)
        _writeOttoTsv(os.path.join(cladeDir, "tsv.otto"), cladeMatched)
        return cladeDir

    # Nothing in the expected clade (file missing or no hits): try legacy.
    legacyDir = os.path.join(
        kentTree, "src/hg/makeDb/doc/legacyAsmHub")
    legacyOrderList = os.path.join(legacyDir, "legacy.orderList.tsv")
    legacyMatched = _matchOrderListLines(legacyOrderList, genarkIds)
    if legacyMatched:
        # Found matches in legacy - work there instead.
        _warnUnmatched(genarkIds, legacyMatched, legacyOrderList)
        _writeOttoTsv(os.path.join(legacyDir, "tsv.otto"), legacyMatched)
        return legacyDir

    # No matches found anywhere: None means the file was absent, an empty
    # list means it was read but held none of the ids.
    if cladeMatched is None:
        print("WARNING: missing %s" % orderList, file=sys.stderr)
    if legacyMatched is None:
        print("WARNING: missing %s" % legacyOrderList, file=sys.stderr)
    if cladeMatched is not None or legacyMatched is not None:
        print("WARNING: no matches for %s in %s or legacy.orderList.tsv" %
              (genarkIds, clade), file=sys.stderr)
    return None


# Sequence of make commands run in the clade AsmHub directory after
# tsv.otto is written.  Stops on the first failure.
# Ordered shell command lines, one make target per entry.  Every entry
# passes orderList=tsv.otto, wraps the make call in `time ( ... )`, and
# appends (>>) stdout and stderr to a log file given by a relative path:
# dbg for the first three, then one *.down.log per download step.
# symLinks is listed twice: once before and once after mkGenomes.
# NOTE(review): the code that executes this list is not visible in this
# excerpt - confirm there how a failing command is handled.
genArkMakeCommands = [
    "time (make symLinks orderList=tsv.otto) >> dbg 2>&1",
    "time (make mkGenomes orderList=tsv.otto) >> dbg 2>&1",
    "time (make symLinks orderList=tsv.otto) >> dbg 2>&1",
    "time (make verifyTestDownload orderList=tsv.otto) >> test.down.log 2>&1",
    "time (make sendDownload orderList=tsv.otto) >> send.down.log 2>&1",
    "time (make verifyDownload orderList=tsv.otto) >> verify.down.log 2>&1",
]