#!/usr/bin/env python3
# Source: src/hg/makeDb/outside/gencode/bin/buildGencodeToUcscLift (mode 100755)
# Commit b6aee4c6471cddebd638fec8dbb988c29a69bc22, markd, Thu Apr 23 21:58:41 2026 -0700:
#   import of GENCODE V50, MV39, and V50lift37; added a command to do import
#   with a single command
# Diff index 4d68b14488a..c292e6646f9, hunk @@ -1,117 +1,117 @@.
# The text below is the post-commit ("+") side of that diff.

import sys
import os
import re
from collections import defaultdict

myBinDir = os.path.normpath(os.path.dirname(sys.argv[0]))
# The commit replaced sys.path.append(...) with sys.path.insert(0, ...), which
# places the shared pycbio library first on the module search path.
sys.path.insert(0, os.path.expanduser("/hive/groups/browser/pycbio/lib"))
import argparse
from pycbio.db import mysqlOps
from pycbio.hgdata import hgDb
from pycbio.hgdata.chromInfo import ChromInfoTbl

mysqlOps.mySqlSetErrorOnWarn()

DEBUG = False

# UCSC database name -> reference assembly name; the keys are also the only
# values accepted for the ucscDbName command line argument.
ucscDbToRefAsm = {
    "hg19": "GRCh37",
    "hg38": "GRCh38",
    "mm10": "GRCm38",
    "mm39": "GRCm39",
}

# UCSC chromosomes for which no chains are written.
badChroms = frozenset([
    "chrUn_KI270752v1"   # contamination in GRCh38
])


def parseArgs():
    """Parse the command line.

    Returns the argparse namespace with two attributes: ``ucscDbName`` (one
    of the keys of ucscDbToRefAsm) and ``gencodeToUcscLift`` (the output
    chain file path).
    """
    desc = """Generate liftOver chains of GENCODE chromosome names to UCSC chromosomes.
    For GRCh37, this does special handling of chrM to handle mapping NC_012920 to
    UCSC chrM.  It also creates mappings for the chr* names used by GENCODE and
    ENCODE, with the exclusion of chrM on GRCh37.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("ucscDbName", choices=ucscDbToRefAsm.keys(),
                        help="UCSC database name for reference assembly name")
    parser.add_argument("gencodeToUcscLift",
                        help="GENCODE to UCSC liftOver chains")
    return parser.parse_args()


# template for 1-to-1 chain
# NOTE(review): the line breaks inside this template were reconstructed from
# the UCSC chain format (header line, one ungapped block line, blank
# separator line) -- confirm against the repository copy.
chainTmpl = """chain {size} {qName} {size} + 0 {size} {tName} {size} + 0 {size} {chainId}
{size}

"""

# Alignment of chrMs on hg19.  GRCh37 did not initially have a chrM,
# so UCSC chose a different one.
# Source-side sequence names that all get the hg19 chrM alignment chain.
hg19ChrMQNames = ("NC_012920", "MT", "chrM")

# Gapped chain from a 16569-base source sequence onto the 16571-base UCSC
# chrM: the block sizes plus the gaps add up to 16569 on one side and 16571
# on the other.
# NOTE(review): the line breaks inside this template were reconstructed from
# the UCSC chain format (header line, "size dt dq" block lines, final size,
# blank separator line) -- confirm against the repository copy.
hg19ChrMChain = """chain 16493 {qName} 16569 + 0 16569 chrM 16571 + 0 16571 {chainId}
309 0 1
6 0 1
2791 1 0
13081 1 2
380

"""

# chr1..chr99, chrX, chrY, chrM and chrMT; compiled once and applied with
# fullmatch() so that a trailing newline is not silently accepted.
_primaryChromRe = re.compile(r"chr(?:[1-9][0-9]?|[XYM]|MT)")


def isPrimaryChrom(chrom):
    """Return True if chrom is exactly chr<1-99>, chrX, chrY, chrM or chrMT."""
    return _primaryChromRe.fullmatch(chrom) is not None


def loadChromAlias(conn):
    """Load the chromAlias table into a dict of chrom -> set of aliases.

    The result is a defaultdict whose default_factory is cleared before it is
    returned, so looking up a chromosome that has no aliases raises KeyError
    instead of silently creating an empty entry.
    """
    # other ucsc sources
    chromAliases = defaultdict(set)
    for row in mysqlOps.query(conn, 'SELECT alias, chrom FROM chromAlias'):
        # columns follow the SELECT order: row[0] is alias, row[1] is chrom
        chromAliases[row[1]].add(row[0])
    chromAliases.default_factory = None
    return chromAliases


def writeHg19ChrM(nextChainId, fh):
    """Write one hg19 chrM alignment chain per name in hg19ChrMQNames.

    Returns the next unused chain id.
    """
    for chrMName in hg19ChrMQNames:
        fh.write(hg19ChrMChain.format(qName=chrMName, chainId=nextChainId))
        nextChainId += 1
    return nextChainId


def writeStdChain(chromName, ucscChrom, nextChainId, fh):
    """Write a 1-to-1 chain from chromName to ucscChrom.chrom.

    The chain covers the full ucscChrom.size.  Returns the next unused chain
    id.
    """
    fh.write(chainTmpl.format(size=ucscChrom.size, qName=chromName,
                              tName=ucscChrom.chrom, chainId=nextChainId))
    return nextChainId + 1


def writeChain(ucscDbName, ucscChrom, chromAliases, nextChainId, fh):
    """Write the chain(s) for one UCSC chromosome.

    ucscChrom must provide ``chrom`` and ``size`` attributes.  Returns the
    next unused chain id; it is unchanged when nothing is written.
    """
    chrom = ucscChrom.chrom
    if chrom not in chromAliases:
        # a missing alias is reported but is not fatal; the chromosome is skipped
        print(f"WARNING: UCSC chrom {chrom} not in chromAlias table", file=sys.stderr)
        return nextChainId
    if ucscDbName == "hg19":
        if chrom == "chrM":
            return writeHg19ChrM(nextChainId, fh)
        if chrom == "chrMT":
            # don't create a constant lift for chrM -> chrMT, this is handled
            # by gencodeGxfToGenePred since liftOver can't handle multiple lifts
            return nextChainId
    if isPrimaryChrom(chrom):
        # primary chromosomes only get the identity chr* -> chr* chain
        return writeStdChain(chrom, ucscChrom, nextChainId, fh)
    for chromName in sorted(chromAliases[chrom]):
        nextChainId = writeStdChain(chromName, ucscChrom, nextChainId, fh)
    return nextChainId


def writeChains(ucscDbName, chromInfos, chromAliases, fh):
    """Write chains for every chromosome in chromInfos that is not in badChroms.

    Chromosomes are processed in sorted order and chain ids start at 1.
    """
    nextChainId = 1
    for ucscChrom in sorted(chromInfos.values()):
        if ucscChrom.chrom not in badChroms:
            nextChainId = writeChain(ucscDbName, ucscChrom, chromAliases, nextChainId, fh)


def buildGencodeToUcscLift(opts):
    """Load chromosome info and aliases from the UCSC database and write the chain file.

    opts is the namespace returned by parseArgs().
    """
    with hgDb.connect(opts.ucscDbName) as conn:
        chromInfos = ChromInfoTbl.loadDb(conn)
        chromAliases = loadChromAlias(conn)
        with open(opts.gencodeToUcscLift, "w") as fh:
            writeChains(opts.ucscDbName, chromInfos, chromAliases, fh)


# only run when executed as a program, so the functions can be imported
if __name__ == "__main__":
    buildGencodeToUcscLift(parseArgs())