b6aee4c6471cddebd638fec8dbb988c29a69bc22
markd
  Thu Apr 23 21:58:41 2026 -0700
import of GENCODE V50, MV39, and V50lift37; added a command to do import with a single command

diff --git src/hg/makeDb/outside/gencode/bin/buildGencodeToUcscLift src/hg/makeDb/outside/gencode/bin/buildGencodeToUcscLift
index 4d68b14488a..c292e6646f9 100755
--- src/hg/makeDb/outside/gencode/bin/buildGencodeToUcscLift
+++ src/hg/makeDb/outside/gencode/bin/buildGencodeToUcscLift
@@ -1,117 +1,117 @@
 #!/usr/bin/env python3
 
 import sys
 import os
 import re
 from collections import defaultdict
 myBinDir = os.path.normpath(os.path.dirname(sys.argv[0]))
-sys.path.append(os.path.expanduser("/hive/groups/browser/pycbio/lib"))
+sys.path.insert(0, os.path.expanduser("/hive/groups/browser/pycbio/lib"))
 import argparse
 from pycbio.db import mysqlOps
 from pycbio.hgdata import hgDb
 from pycbio.hgdata.chromInfo import ChromInfoTbl
 
 # NOTE(review): judging by the name, this presumably turns MySQL warnings
 # into errors -- confirm against pycbio.db.mysqlOps
 mysqlOps.mySqlSetErrorOnWarn()
 
 # debugging flag; not referenced by any of the code visible in this file
 DEBUG = False
 
 # UCSC database name -> reference assembly name.  The keys are the allowed
 # values of the ucscDbName command line argument.
 ucscDbToRefAsm = {
     "hg19": "GRCh37",
     "hg38": "GRCh38",
     "mm10": "GRCm38",
     "mm39": "GRCm39",
 }
 
 # UCSC chromosomes that writeChains skips, so no chains are written for them
 badChroms = frozenset([
     "chrUn_KI270752v1"   # contamination in GRCh38
 ])
 
 def parseArgs():
     """Parse the command line (sys.argv).
 
     Returns the argparse namespace with two attributes:
       ucscDbName - UCSC database name, restricted to the keys of ucscDbToRefAsm
       gencodeToUcscLift - path of the liftOver chain file to write
     """
     desc = """Generate liftOver chains of GENCODE chromosome names to UCSC chromosomes.
 
     For GRCh37, this does special handling of chrM to handle mapping
     NC_012920 to UCSC chrM.
 
     It also creates mappings for the chr* names used by GENCODE and ENCODE,
     with the exclusion of chrM on GRCh37.
 """
     parser = argparse.ArgumentParser(description=desc)
     parser.add_argument("ucscDbName", choices=ucscDbToRefAsm.keys(),
                         help="UCSC database name for reference assembly name")
     parser.add_argument("gencodeToUcscLift",
                         help="output file for GENCODE to UCSC liftOver chains")
     return parser.parse_args()
 
 
 # template for 1-to-1 chain: both sequences have the same {size}, both ranges
 # run from 0 to {size} on the + strand, and the only alignment line is a
 # single block of {size}.  Format fields: size, qName, tName, chainId.
 chainTmpl = """chain {size} {qName} {size} + 0 {size} {tName} {size} + 0 {size} {chainId}
 {size}
 """
 
 # alignment of chrMs on hg19. GRCh37 did not initially have a chrM,
 # so UCSC chose a different one.
 # sequence names for which writeHg19ChrM writes a chain to hg19 chrM
 hg19ChrMQNames = ("NC_012920", "MT", "chrM")
 
 # gapped chain between a 16569-base sequence named {qName} and the 16571-base
 # hg19 chrM.  Format fields: qName, chainId.  The alignment lines are
 # tab-separated.
 hg19ChrMChain = """chain 16493 {qName} 16569 + 0 16569 chrM 16571 + 0 16571 {chainId}
 309	0	1
 6	0	1
 2791	1	0
 13081	1	2
 380
 """
 
 def isPrimaryChrom(chrom):
     """Test if chrom is a primary chromosome name.
 
     A name qualifies when it is "chr" followed by a one- or two-digit number
     without a leading zero, or by X, Y, M, or MT.  Returns the regular
     expression match object when it qualifies and None otherwise.
     """
     primaryChromRe = re.compile("^chr(([1-9][0-9]?)|[XYM]|MT)$")
     return primaryChromRe.match(chrom)
 
 def loadChromAlias(conn):
     """Load the chromAlias table into a dict of UCSC chrom name to set of aliases.
 
     The returned defaultdict has its default_factory cleared, so looking up
     a chromosome that has no aliases raises KeyError instead of silently
     adding an empty entry.
     """
     aliasesByChrom = defaultdict(set)
     # aliases come from other (non-UCSC) naming sources
     for alias, chrom in mysqlOps.query(conn, 'SELECT alias, chrom FROM chromAlias'):
         aliasesByChrom[chrom].add(alias)
     aliasesByChrom.default_factory = None
     return aliasesByChrom
 
 def writeHg19ChrM(nextChainId, fh):
     """Write the hg19 chrM chain once for each name in hg19ChrMQNames.
 
     Chain ids are assigned consecutively starting at nextChainId.  Returns
     the first chain id that was not used.
     """
     for chainId, qName in enumerate(hg19ChrMQNames, start=nextChainId):
         fh.write(hg19ChrMChain.format(qName=qName, chainId=chainId))
     return nextChainId + len(hg19ChrMQNames)
 
 def writeStdChain(chromName, ucscChrom, nextChainId, fh):
     """Write one 1-to-1 chain from chromName to the UCSC chromosome.
 
     The chain uses the size and name of ucscChrom and the id nextChainId.
     Returns the next unused chain id.
     """
     chainRec = chainTmpl.format(size=ucscChrom.size,
                                 qName=chromName,
                                 tName=ucscChrom.chrom,
                                 chainId=nextChainId)
     fh.write(chainRec)
     return nextChainId + 1
 
 def writeChain(ucscDbName, ucscChrom, chromAliases, nextChainId, fh):
     """Write the chains that map to one UCSC chromosome.
 
     Cases, checked in this order:
       - chromosome absent from chromAliases: warn on stderr, write nothing
       - hg19 chrM: write the special chrM alignment chains
       - hg19 chrMT: write nothing
       - primary chromosome: write one chain from its own name
       - anything else: write one chain per alias, in sorted alias order
 
     Returns the next unused chain id.
     """
     chrom = ucscChrom.chrom
     if chrom not in chromAliases:
         # reported as a warning rather than raised as an error; the
         # chromosome is left without a chain
         print(f"WARNING: UCSC chrom {ucscChrom.chrom} not in chromAlias table", file=sys.stderr)
         return nextChainId
 
     if ucscDbName == "hg19":
         if chrom == "chrM":
             return writeHg19ChrM(nextChainId, fh)
         if chrom == "chrMT":
             # don't create a constant lift for chrM -> chrMT, this is handled
             # by gencodeGxfToGenePred since liftOver can't handle multiple lifts
             return nextChainId
 
     if isPrimaryChrom(chrom):
         return writeStdChain(chrom, ucscChrom, nextChainId, fh)
 
     for aliasName in sorted(chromAliases[chrom]):
         nextChainId = writeStdChain(aliasName, ucscChrom, nextChainId, fh)
     return nextChainId
 
 def writeChains(ucscDbName, chromInfos, chromAliases, fh):
     """Write chains for every chromosome in chromInfos, in sorted order.
 
     Chromosomes listed in badChroms are skipped.  Chain ids start at 1 and
     are threaded through the writeChain calls.
     """
     chainId = 1
     for chromInfo in sorted(chromInfos.values()):
         if chromInfo.chrom in badChroms:
             continue
         chainId = writeChain(ucscDbName, chromInfo, chromAliases, chainId, fh)
 
 def buildGencodeToUcscLift(opts):
     """Build the GENCODE to UCSC liftOver chain file.
 
     opts is the parsed command line; opts.ucscDbName selects the database
     to read chromosome information and aliases from, and
     opts.gencodeToUcscLift is the path of the chain file to write.
     """
     ucscDb = opts.ucscDbName
     # load everything needed from the database before opening the output
     with hgDb.connect(ucscDb) as dbConn:
         chromSizes = ChromInfoTbl.loadDb(dbConn)
         aliasTbl = loadChromAlias(dbConn)
 
     with open(opts.gencodeToUcscLift, "w") as liftFh:
         writeChains(ucscDb, chromSizes, aliasTbl, liftFh)
 
 
 # script entry point: parse the command line and write the chain file;
 # this runs unconditionally when the file is executed
 buildGencodeToUcscLift(parseArgs())