b6aee4c6471cddebd638fec8dbb988c29a69bc22 markd Thu Apr 23 21:58:41 2026 -0700 import of GENCODE V50, MV39, and V50lift37; added a command to do import with a single command diff --git src/hg/makeDb/outside/gencode/bin/gencodeBackMapMetadataIds src/hg/makeDb/outside/gencode/bin/gencodeBackMapMetadataIds index b6d7ad052dd..23e50d95a1d 100755 --- src/hg/makeDb/outside/gencode/bin/gencodeBackMapMetadataIds +++ src/hg/makeDb/outside/gencode/bin/gencodeBackMapMetadataIds @@ -1,105 +1,105 @@ #!/usr/bin/env python3 import sys import os myBinDir = os.path.normpath(os.path.dirname(sys.argv[0])) -sys.path.append(os.path.join(myBinDir, "../lib")) -sys.path.append(os.path.expanduser("/hive/groups/browser/pycbio/lib")) +sys.path.insert(0, os.path.join(myBinDir, "../lib")) +sys.path.insert(0, os.path.expanduser("/hive/groups/browser/pycbio/lib")) from collections import defaultdict import argparse from pycbio.sys import fileOps from pycbio.tsv import TsvReader, TabFileReader # Note: # hg19 backmap generates join errors because of this transcript # missing # ENSG00000168939.6 / ENST00000302805.2 / SPRY3 # # This is the case of SPRV3 which has now has a transcript past the PAR. # While addressed in hg38, the state of GENCODE V19 causes issues, # so we added these to the metadata. # def parseArgs(): desc = """Edit GENCODE metadata tab file use mapped ids. When there are multiple mappings,lines are duplicated.""" parser = argparse.ArgumentParser(description=desc) parser.add_argument("--verbose", dest="verbose", action="store_true", default=False, help="verbose output for debugging") parser.add_argument("--drop", action="append", default=[], help="drop rows with this id, maybe repeated") parser.add_argument("targetDb", help="""target UCSC database""") parser.add_argument("srcAttrsTsv", help="""attributes from src (newer) database""") parser.add_argument("targetAttrsTsv", help="""attributes from target (older) database""") parser.add_argument("inTab", help="""tab file to edit""") parser.add_argument("outTab", nargs='?', default="/dev/stdout", help="""output tab file""") return parser.parse_args() class GencodeBackMapIdMap(defaultdict): """Map GENCODE standard transcript and gene ids to the possible back-mapped ids. Used to update metadata tables""" def __init__(self, mappedAttrsTsv): super(GencodeBackMapIdMap, self).__init__(set) for rec in TsvReader(mappedAttrsTsv): self.__addIds(rec) @staticmethod def __getSrcId(mappedId): return mappedId.split('_')[0] def __addId(self, mappedId): self[self.__getSrcId(mappedId)].add(mappedId) def __addIds(self, row): self.__addId(row.geneId) self.__addId(row.transcriptId) def dump(self, fh): for id in self.keys(): sys.stderr.write("{} = {}\n".format(id, " ".join(self[id]))) def mapTabFileRow(srcIdMap, targetIdMap, row, dupCheck, outTabFh): rowId = row[0] idMap = srcIdMap if rowId not in idMap: idMap = targetIdMap if rowId not in idMap: raise Exception("id {} not found in attributes table".format(rowId)) for backMapId in idMap[rowId]: outRow = tuple([backMapId] + row[1:]) if outRow not in dupCheck: fileOps.prRow(outTabFh, outRow) dupCheck.add(outRow) def isBackMapGeneSource(targetDb, inTab): # gencode.v39lift37.metadata.Gene_source.gz return ((targetDb == "hg19") and (inTab.find(".metadata.Gene_source.") >= 0)) def isBackMapTranscriptSource(targetDb, inTab): # gencode.v39lift37.metadata.Transcript_source.gz return ((targetDb == "hg19") and (inTab.find(".metadata.Transcript_source.") >= 0)) def mapTabFileIds(targetDb, srcIdMap, targetIdMap, inTab, dropIds, outTabFh): # at least one release had some metadata entries accidentally duplicated, so check # for entire row being the same dupCheck = set() for row in TabFileReader(inTab): if row[0] not in dropIds: mapTabFileRow(srcIdMap, targetIdMap, row, dupCheck, outTabFh) def gencodeBackMapMetadataIds(opts): dropIds = frozenset(opts.drop) srcIdMap = GencodeBackMapIdMap(opts.srcAttrsTsv) targetIdMap = GencodeBackMapIdMap(opts.targetAttrsTsv) if opts.verbose: sys.stderr.write(">>> srcAttrs\n") srcIdMap.dump(sys.stderr) sys.stderr.write(">>> targetAttrs\n") targetIdMap.dump(sys.stderr) with open(opts.outTab, "w") as outTabFh: mapTabFileIds(opts.targetDb, srcIdMap, targetIdMap, opts.inTab, dropIds, outTabFh) gencodeBackMapMetadataIds(parseArgs())