b6aee4c6471cddebd638fec8dbb988c29a69bc22 markd Thu Apr 23 21:58:41 2026 -0700 import of GENCODE V50, MV39, and V50lift37; added a command to do import with a single command diff --git src/hg/makeDb/outside/gencode/bin/gencodeBuildRelease src/hg/makeDb/outside/gencode/bin/gencodeBuildRelease new file mode 100755 index 00000000000..93aaf115876 --- /dev/null +++ src/hg/makeDb/outside/gencode/bin/gencodeBuildRelease @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 +import sys +import os +import os.path as osp +import re +import argparse +import pipettor + +BIN_DIR = osp.abspath(osp.normpath(osp.dirname(sys.argv[0]))) +sys.path.insert(0, osp.join(BIN_DIR, "../lib")) +sys.path.insert(0, osp.expanduser("/hive/groups/browser/pycbio/lib")) +from pycbio.sys import fileOps + +KENT_SRC_DIR = osp.normpath(osp.join(BIN_DIR, "../../../../..")) +ALL_JOINER = osp.join(KENT_SRC_DIR, "hg/makeDb/schema/all.joiner") +TRACKDB_DIR = osp.join(KENT_SRC_DIR, "hg/makeDb/trackDb") + +GENCODE_SRC_DIR = osp.dirname(osp.dirname(__file__)) +GENCODE_MAKEFILE = osp.join(GENCODE_SRC_DIR, "gencodeLoad.mk") +NUM_PROCS = 16 + +def parseArgs(): + desc = """Run steps to build a GENCODE Versions release. + + This runs the gencodeLoad.mk makefile and other tasks and tracks completion. + +""" + parser = argparse.ArgumentParser(description=desc) + parser.add_argument("--noJoinerCheck", action="store_true", + help="skip running joinerCheck") + parser.add_argument("db", choices=("hg38", "hg19", "mm39"), + help="which UCSC database/organism") + parser.add_argument("version", + help="GENCODE version, 47, M38, 47lift37") + parser.add_argument("relType", choices=("pre", "final"), + help="which version") + parser.add_argument("releaseDate", + help="Month and year of GENCODE release, in the form `August 2014'") + parser.add_argument("ensemblVersion", + help="what Ensembl version does this correspond to?") + args = parser.parse_args() + if not parseVersion(args.version): + parser.error("invalid version, should be in the form 47, M38, or 47lift37") + return args + + +def parseVersion(version): + "returns (M|"", num, lift37|"") or None if not valid" + + # M38 -> ('M', '38', None) + # 47lift37 -> (None, '47', 'lift37') + m = re.match("^(M)?([0-9]+)(lift37)?$", version) + if m is None: + return None + g = m.groups() + if (g[0] is not None) and (g[2] is not None): + return None # no mouse backmap + return (g[0] if g[0] is not None else '', + g[1], + g[2] if g[2] is not None else '') + +def status(msg): + sys.stdout.flush() + print("===>", msg, file=sys.stderr, flush=True) + +def slurpFile(path): + with open(path) as fh: + return fh.read() + +def replaceFile(path, content): + with open(path, "w") as fh: + fh.write(content) + +def getAsmTrackDbDir(db): + org = "human" if db.startswith("hg") else "mouse" + return osp.join(TRACKDB_DIR, org, db) + +class GencodeSpec: + "bag for GENCODE release information" + def __init__(self, db, version, prevVersion, baseVersion, relType, + releaseDate, ensemblVersion): + self.db = db + self.version = version + self.ucscVersion = "V" + version + self.prevVersion = prevVersion + self.baseVersion = baseVersion + self.relType = relType + self.preRelease = "yes" if relType == "pre" else "no" + self.releaseDate = releaseDate + self.ensemblVersion = ensemblVersion + +def getBuildDir(spec): + relTypeDir = "hgcImport" if spec.relType == "final" else "hgcImportPre" + return f"/hive/data/genomes/{spec.db}/bed/gencodeV{spec.version}/{relTypeDir}" + +def getPrevVersion(version): + verinfo = parseVersion(version) + prevNum = int(verinfo[1]) - 1 + return f"{verinfo[0]}{prevNum}{verinfo[2]}" + +def getBaseVersion(version): + verinfo = parseVersion(version) + return f"{verinfo[0]}{verinfo[1]}" + +def runMake(target, spec, buildDir): + cmd = ["make", "-f", GENCODE_MAKEFILE, '-R', + "-C", buildDir, "-k", "-j", NUM_PROCS, + target, + f"db={spec.db}", + f"version={spec.version}", + f"relType={spec.relType}", + f"prevVersion={spec.prevVersion}", + f"baseVersion={spec.baseVersion}", + f"prevVersion={spec.prevVersion}"] + pipettor.run(cmd, stderr=2) + +def downloadAndBuild(buildDir, spec): + status("downloading and building tracks") + runMake("all", spec, buildDir) + gencode_cmp = osp.join(buildDir, "gencode-cmp.tsv") + sys.stdout.flush() + print("---> Please review", gencode_cmp, file=sys.stderr) + with open(gencode_cmp) as fh: + print(fh.read(), file=sys.stderr) + sys.stderr.flush() + + +DB_TO_ALL_JOINER_VARS = { + "hg38": ("gencodeHg38Vers", + "gencodeHg38GeneSymbolVers", + "gencodeHg38EntrezGeneVers", + "gencodeHg38RefSeqToRefGeneVers",), + "hg19": ("gencodeHg19LiftVers", + "gencodeHg19LiftGeneSymbolVers", + "gencodeHg19LiftEntrezGeneVers", + "gencodeHg19LiftRefSeqToRefGeneVers",), + "mm39": ("gencodeMm39Vers", + "gencodeMm39RefSeqToRefGeneVers",), +} + +def addGencodeVersion(joinerText, setName, spec): + pattern = rf'^(set\s+{re.escape(setName)}\s+)(\S+)' + m = re.search(pattern, joinerText, re.MULTILINE) + if not m: + raise ValueError(f"set {setName} not found") + versions = m.group(2).split(',') + if spec.ucscVersion in versions: + print(f"Note: all.joiner {setName} already has {spec.ucscVersion}", file=sys.stderr) + return joinerText + else: + versions.append(spec.ucscVersion) + return joinerText[:m.start(2)] + ','.join(versions) + joinerText[m.end(2):] + +def editJoinerCheck(spec): + status("updating all.joiner") + newAllJoiner = allJoiner = slurpFile(ALL_JOINER) + + for setName in DB_TO_ALL_JOINER_VARS[spec.db]: + newAllJoiner = addGencodeVersion(newAllJoiner, setName, spec) + + if newAllJoiner != allJoiner: + replaceFile(ALL_JOINER, newAllJoiner) + +def generateTrackDb(spec): + status("added trackDb and description") + cwd = os.getcwd() + try: + os.chdir(TRACKDB_DIR) + dateDesc = spec.releaseDate + if spec.relType == "pre": + dateDesc += " (pre-release)" + cmd = [osp.join(BIN_DIR, "gencodeGenerateTrackDbs"), + spec.db, spec.version, spec.ensemblVersion, + dateDesc] + pipettor.run(cmd) + finally: + os.chdir(cwd) + +def makeReleaseNotesEntry(spec): + label = f"GENCODE version {spec.version}" + if spec.db == "hg19": + label += " (mapped from GRCh38 to GRCh37)" + return (f'

\n' + f'{label}\n' + f'corresponds to Ensembl {spec.ensemblVersion}.\n' + f'

\n') + +def editReleaseNotes(spec): + status("Editing super track release notes") + superHtml = osp.join(getAsmTrackDbDir(spec.db), "wgEncodeGencodeSuper.html") + html = slurpFile(superHtml) + label = f"GENCODE version {spec.version}" + if spec.db == "hg19": + label += " (mapped from GRCh38 to GRCh37)" + spanRe = re.compile(rf']*>\s*{re.escape(label)}\s*') + if spanRe.search(html): + print(f"Note: {superHtml} already has {label}", file=sys.stderr) + else: + header = "

Release Notes

\n" + idx = html.find(header) + if idx < 0: + raise ValueError(f"'

Release Notes

' not found in {superHtml}") + ins = idx + len(header) + newHtml = html[:ins] + makeReleaseNotesEntry(spec) + html[ins:] + replaceFile(superHtml, newHtml) + +def editTrackDbGencodeRa(spec): + status("updating trackDb.gencode.ra") + raPath = osp.join(getAsmTrackDbDir(spec.db), "trackDb.gencode.ra") + ra = slurpFile(raPath) + incLine = f"include wgEncodeGencode{spec.ucscVersion}.ra" + if re.search(rf'^{re.escape(incLine)}\b', ra, re.MULTILINE): + print(f"Note: {raPath} already has {incLine}", file=sys.stderr) + return + if not ra.endswith("\n"): + ra += "\n" + replaceFile(raPath, ra + incLine + "\n") + +def joinerCheck(buildDir, spec): + status("running joinerCheck") + pipettor.run([osp.join(BIN_DIR, "gencodeJoinerCheck"), spec.db, spec.version], stderr=None) + +def loadTrackDb(spec): + status("loading trackDb") + cwd = os.getcwd() + try: + os.chdir(TRACKDB_DIR) + cmd = ["make", f"DBS={spec.db}"] + pipettor.run(cmd, stderr=None) + finally: + os.chdir(cwd) + + +DONE_MSG = """ +*********************************************************** +Finish loading GENCODE Versions {ucscVersion} + +1) check your sandbox +2) commit +3) make alpha DBS={db} +3) push +4) 🍷🍷🍷 +*********************************************************** +""" + +def gencodeBuildVersion(spec, noJoinerCheck): + buildDir = getBuildDir(spec) + fileOps.ensureDir(buildDir) + downloadAndBuild(buildDir, spec) + generateTrackDb(spec) + editTrackDbGencodeRa(spec) + if spec.relType != "pre": + editReleaseNotes(spec) + loadTrackDb(spec) + editJoinerCheck(spec) + if not noJoinerCheck: + joinerCheck(buildDir, spec) + print(DONE_MSG.format(db=spec.db, ucscVersion=spec.ucscVersion), file=sys.stderr) + +def main(): + args = parseArgs() + spec = GencodeSpec(args.db, args.version, + getPrevVersion(args.version), + getBaseVersion(args.version), + args.relType, args.releaseDate, + args.ensemblVersion) + try: + gencodeBuildVersion(spec, args.noJoinerCheck) + except pipettor.ProcessException as ex: + print("Error: " + str(ex), file=sys.stderr) + exit(1) + + +main()