da323191c57c36932ad1751af87b59b71b836501 markd Fri Jul 25 23:20:00 2025 -0700 fixed issues created by the pain of parsing gene2accession #36073 diff --git src/hg/makeDb/outside/hgnc/hgncToBigBed src/hg/makeDb/outside/hgnc/hgncToBigBed new file mode 100755 index 00000000000..4ca53088e19 --- /dev/null +++ src/hg/makeDb/outside/hgnc/hgncToBigBed @@ -0,0 +1,374 @@ +#!/usr/bin/env python3 + +import sys +import re +import os.path as osp +import argparse +import subprocess +from collections import defaultdict, namedtuple + +sys.path.append('/hive/groups/browser/pycbio/lib') +from pycbio.sys.objDict import ObjDict +from pycbio.tsv import TsvReader + +# these are for analysis, select which location sources to use +INCL_GENCODE = True +INCL_REFSEQ_CURATED = True +INCL_REFSEQ_OTHER = True + +def parseArgs(): + desc = """Generate bigBed and trix indexes for HGNC gene identifiers. + This uses the GENCODE SQL tables, and RefSeq curated and other data to + find locations of the HGNC entries. RefSeq predicted is not used, as + these wouldn't not be used to define a HGNC locus, only add new + transcripts. """ + + parser = argparse.ArgumentParser(description=desc) + parser.add_argument("--unmappedTsv", + help="Write HGNC entries that couldn't be mapped to this file. " + "Some are expected, include genes not in genome and types not with cooordinates") + parser.add_argument("ucscDbName", choices=['hg38', 'hg19'], + help="UCSC database name") + parser.add_argument("gencodeVersion", + help="GENCODE version, e.g. V47 on hg38 and V47lift37 oh hg19") + parser.add_argument("hgncFile", + help="Path to the HGNC complete set file from: " + "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt") + parser.add_argument("hgncBed", + help="Output BED file name") + parser.add_argument("hgncBigBed", + help="Output bigBed file name") + parser.add_argument("trixPrefix", + help="prefix file name for Trix input") + parser.add_argument("locusTypeFilter", + help="text for locus_type filter, with RNAs first") + args = parser.parse_args() + if not re.fullmatch(r'V\d+(lift37)?', args.gencodeVersion): + parser.error(f"invalid GENCODE version '{args.gencodeVersion}', " + "expected string like `V47' or `V47lift37'") + return args + +# these must match hgncBig62.as; TSV 'name' is renamed to 'geneName" +stdBedFields = ("chrom", "chromStart", "chromEnd", "name", "score", "strand", + "thickStart", "thickEnd", "itemRgb") + +hgncBedFields = ("symbol", "geneName", "locus_group", "locus_type", "status", + "location", "location_sortable", "alias_symbol", "alias_name", + "prev_symbol", "prev_name", "gene_group", "gene_group_id", + "date_approved_reserved", "date_symbol_changed", "date_name_changed", + "date_modified", "entrez_id", "ensembl_gene_id", "vega_id", "ucsc_id", + "ena", "refseq_accession", "ccds_id", "uniprot_ids", "pubmed_id", "mgd_id", + "rgd_id", "lsdb", "cosmic", "omim_id", "mirbase", "homeodb", "snornabase", + "bioparadigms_slc", "orphanet", "pseudogene.org", "horde_id", "merops", + "imgt", "iuphar", "kznf_gene_catalog", "mamit-trnadb", "cd", "lncrnadb", + "enzyme_id", "intermediate_filament_db", "rna_central_id", "lncipedia", + "gtrnadb", "agr", "mane_select", "gencc") +allFields = stdBedFields + hgncBedFields + +class HGNCBed(ObjDict): + "hgncBig62.as in memory, both a dict and an object with fields" + + def __init__(self, chrom, chromStart, chromEnd, name, strand, itemRgb): + self.chrom = chrom + self.chromStart = chromStart + self.chromEnd = chromEnd + self.name = name + self.score = 0 + self.strand = strand + self.thickStart = chromStart + self.thickEnd = chromEnd + self.itemRgb = itemRgb + + def __str__(self): + row = [str(self[f]) for f in allFields] + return "\t".join(row) + +class HgncCoords(namedtuple("Coords", + ("hgncId", "chrom", "start", "end", "strand"))): + pass + +RNA_COLOR = "9,99,11" +PROTEIN_CODING_COLOR = "13,20,118" +PSEUDOGENE_COLOR = "253,66,253" +OTHER_COLOR = "0,0,0" + +# post-comma removal color coding +LOCUS_TYPE_COLOR_MAP = { + "Y RNA": RNA_COLOR, + "cluster RNA": RNA_COLOR, + "long non-coding RNA": RNA_COLOR, + "micro RNA": RNA_COLOR, + "misc RNA": RNA_COLOR, + "ribosomal RNA": RNA_COLOR, + "small nuclear RNA": RNA_COLOR, + "small nucleolar RNA": RNA_COLOR, + "transfer RNA": RNA_COLOR, + "vault RNA": RNA_COLOR, + "T cell receptor gene": PROTEIN_CODING_COLOR, + "T cell receptor pseudogene": PSEUDOGENE_COLOR, + "complex locus constituent": OTHER_COLOR, + "endogenous retrovirus": OTHER_COLOR, + "fragile site": OTHER_COLOR, + "gene with protein product": PROTEIN_CODING_COLOR, + "immunoglobulin gene": OTHER_COLOR, + "immunoglobulin pseudogene": PSEUDOGENE_COLOR, + "protocadherin": OTHER_COLOR, + "pseudogene": PSEUDOGENE_COLOR, + "readthrough": OTHER_COLOR, + "region": OTHER_COLOR, + "unknown": OTHER_COLOR, + "virus integration site": OTHER_COLOR +} + +#### +# reading hgnc to coordinates from various sournces +#### + +def cmdTsvReader(cmd, columnNameMapper=None): + typeMap = {"txStart": int, + "txEnd": int} + with subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True) as proc: + for row in TsvReader(cmd[0], inFh=proc.stdout, typeMap=typeMap, columnNameMapper=columnNameMapper): + yield row + proc.wait() + if proc.returncode != 0: + raise subprocess.CalledProcessError(proc.returncode, cmd) + + +def readHgncTranscriptCoords(tsvReader): + """Read hgnc id and coordinates for transcript mapped to genome + into a two dimensional dict in the form [hgncId][chrom]. + TSVReader object can be reading from a database of big bed. + Needs the columns: hgncId, chrom, txStart, txEnd, strand. + """ + hgncIdToTransCoords = defaultdict(lambda: defaultdict(list)) + for row in tsvReader: + hgncIdToTransCoords[row.hgncId][row.chrom].append(row) + hgncIdToTransCoords.default_factory = None + return hgncIdToTransCoords + +def transCoordsToGeneCoords(transChromRows): + """"convert a set of transcripts for a given HGNC to gene coordinates""" + row0 = transChromRows[0] + start = row0.txStart + end = row0.txEnd + for row in transChromRows[1:]: + start = min(start, row.txStart) + end = max(end, row.txEnd) + return HgncCoords(row0.hgncId, row0.chrom, start, end, row0.strand) + +def buildHgncCoordsMap(tsvReader): + """get a reader of HGNC id to transcript coordinates, dict + of HGNC id to coordinates. Alts and patches mean a HGNC can be on + multiple chromosomes""" + hgncIdToTransCoords = readHgncTranscriptCoords(tsvReader) + hgncIdCoordsMap = defaultdict(list) + for hgncId, hgncIdsRows in hgncIdToTransCoords.items(): + for transChromRows in hgncIdsRows.values(): + hgncIdCoordsMap[hgncId].append(transCoordsToGeneCoords(transChromRows)) + return hgncIdCoordsMap + +def sqlHgncTransTsvReader(ucscDbName, sql): + "read using an hgsql query" + cmd = ["hgsql", ucscDbName, "-e", sql] + yield from cmdTsvReader(cmd) + +def gencodeHgncTransTsvReader(ucscDbName, gencodeVersion): + "read from GENCODE SQL tables" + sql = f""" + SELECT geneId as hgncId, chrom, txStart, txEnd, strand + FROM wgEncodeGencodeComp{gencodeVersion} comp, + wgEncodeGencodeGeneSymbol{gencodeVersion} as sym + WHERE (comp.name = sym.transcriptId) + UNION + SELECT geneId as hgncId, chrom, txStart, txEnd, strand + FROM wgEncodeGencodePseudoGene{gencodeVersion} comp, + wgEncodeGencodeGeneSymbol{gencodeVersion} as sym + WHERE (comp.name = sym.transcriptId) + """ + return sqlHgncTransTsvReader(ucscDbName, sql) + +def refSeqCuratedHgncTransTsvReader(ucscDbName): + "read from RefSeq curate tables" + sql = """ + SELECT concat("HGNC:", hgnc) as hgncId, chrom, txStart, txEnd, strand + FROM ncbiRefSeqCurated rsc, ncbiRefSeqLink rsl + WHERE (rsc.name = rsl.id) AND (rsl.hgnc != "") + """ + return sqlHgncTransTsvReader(ucscDbName, sql) + +def refSeqOtherNameMapper(colName): + "bigBed to genePred column names" + if colName == "chromStart": + return "txStart" + elif colName == "chromEnd": + return "txEnd" + elif colName == "HGNC": + return "hgncId" + else: + return colName + +def refSeqOtherHgncTransTsvReader(ucscDbName): + "read from RefSeq other bigbed" + bigBed = f"/gbdb/{ucscDbName}/ncbiRefSeq/ncbiRefSeqOther.bb" + cmd = ["bigBedToBed", "-tsv", bigBed, "stdout"] + yield from cmdTsvReader(cmd, refSeqOtherNameMapper) + + +class HgncIdCoordsMap: + """Read HGNC coordinates from various sources""" + def __init__(self, ucscDbName, gencodeVersion): + self.gencodeMap = self.refseqCuratedMap = self.refseqOtherMap = None + if INCL_GENCODE: + self.gencodeMap = buildHgncCoordsMap(gencodeHgncTransTsvReader(ucscDbName, gencodeVersion)) + if INCL_REFSEQ_CURATED: + self.refseqCuratedMap = buildHgncCoordsMap(refSeqCuratedHgncTransTsvReader(ucscDbName)) + if INCL_REFSEQ_OTHER: + self.refseqOtherMap = buildHgncCoordsMap(refSeqOtherHgncTransTsvReader(ucscDbName)) + + def get(self, hgncId, default=None): + """get list HgncCoords object for a HGNC ID or None""" + hgncCoords = None + if (hgncCoords is None) and INCL_GENCODE: + hgncCoords = self.gencodeMap.get(hgncId) + if (hgncCoords is None) and INCL_REFSEQ_CURATED: + hgncCoords = self.refseqCuratedMap.get(hgncId) + if (hgncCoords is None) and INCL_REFSEQ_OTHER: + hgncCoords = self.refseqOtherMap.get(hgncId) + if hgncCoords is None: + return default + return hgncCoords + +### +# process of HGNC data +### +def loadHgncGenes(hgncFile): + return list(TsvReader(hgncFile)) + +def hgncGenePrevSymbols(prevSymbols): + "split into a list" + return prevSymbols.strip('"').split('|') + +def getLocusColor(locusType): + color = LOCUS_TYPE_COLOR_MAP.get(locusType, None) + if color is None: + raise Exception(f"No color for locus_type `{locusType}', most likely HGNC added a new type") + return color + +def renameLocusType(locusType): + """bigBed filters can't include commas due to trackDb syntax, + 'RNA, micro' -> 'micro RNA'""" + if locusType.startswith("RNA, "): + parts = locusType.split(", ") + return f"{parts[1]} RNA" + return locusType + +def hgncGeneToBed(hgncGene, coords): + locusType = renameLocusType(hgncGene.locus_type) + bed = HGNCBed(coords.chrom, coords.start, coords.end, hgncGene.hgnc_id, coords.strand, + getLocusColor(locusType)) + + # copy remaining fields with a couple of special cases + for fld in hgncBedFields: + if fld == "locus_type": + bed[fld] = locusType + elif fld == "geneName": + bed[fld] = hgncGene.name + else: + # remove quotes + bed[fld] = getattr(hgncGene, fld).strip('"') + return bed + +def hgncGeneToBeds(hgncGene, hgncIdCoordsMap, hgncMappedIds): + all_coords = hgncIdCoordsMap.get(hgncGene.hgnc_id) + if all_coords is None: + return [] + else: + hgncMappedIds.add(hgncGene.hgnc_id) + return [hgncGeneToBed(hgncGene, coords) for coords in all_coords] + +def generateHgncBeds(hgncGenes, hgncIdCoordsMap, hgncMappedIds): + hgncBeds = [] + for hgncGene in hgncGenes: + beds = hgncGeneToBeds(hgncGene, hgncIdCoordsMap, hgncMappedIds) + hgncBeds.extend(beds) + return hgncBeds + +def writeHgncBeds(hgncBeds, bedFh): + for bed in sorted(hgncBeds, key=lambda b: (b.chrom, b.chromStart, b.chromEnd)): + print(str(bed), file=bedFh) + +def buildBigBed(ucscDbName, hgncBedFile, hgncBigBed): + # .as relative to this file + libDir = osp.join(osp.dirname(__file__), "../../../lib") + asFile = osp.join(libDir, "hgncBig62.as") + chromSizes = f"/cluster/data/{ucscDbName}/chrom.sizes" + cmd = ["bedToBigBed", "-extraIndex=name", "-tab", "-type=bed9+53", + "-as=" + asFile, hgncBedFile, chromSizes, hgncBigBed] + subprocess.run(cmd, check=True) + +def trixValues(hgncBed): + # prev_symbol MSK16|CSPG1|AGC1 + prev_symbols = hgncGenePrevSymbols(hgncBed.prev_symbol) + return hgncBed.symbol + " " + " ".join(prev_symbols) + ';' + +def buildTrixInput(hgncBeds, trixInFh): + # alts & patches can result in multiple entries + done = set() + for hgncBed in hgncBeds: + if hgncBed.name not in done: + print(hgncBed.name, trixValues(hgncBed), sep='\t', file=trixInFh) + done.add(hgncBed.name) + +def buildTrix(hgncBeds, trixPrefix): + trixIn = trixPrefix + ".trixin" + with open(trixIn, 'w') as trixInFh: + buildTrixInput(hgncBeds, trixInFh) + cmd = ["ixIxx", "-maxWordLength=32", trixIn, trixPrefix + ".ix", trixPrefix + ".ixx"] + subprocess.run(cmd, check=True) + +def writeLocusTypeFilter(hgncBeds, locusTypeFilter): + rnaTypes = set() + otherTypes = set() + # split by RNA, and others + for hgncBed in hgncBeds: + if "RNA" in hgncBed.locus_type: + rnaTypes.add(hgncBed.locus_type) + else: + otherTypes.add(hgncBed.locus_type) + + filterVal = (", ".join(sorted(rnaTypes)) + ", " + + ", ".join(sorted(otherTypes))) + with open(locusTypeFilter, 'w') as fh: + print(filterVal, file=fh) + +def writeUnmappedTsv(hgncGenes, hgncMappedIds, tsvFh): + hgncGenes[0].writeHeader(tsvFh) + for row in hgncGenes: + if row.hgnc_id not in hgncMappedIds: + row.write(tsvFh) + +def hgncToBigBed(ucscDbName, gencodeVersion, hgncFile, hgncBedFile, hgncBigBed, + trixPrefix, locusTypeFilter, unmappedTsv): + hgncIdCoordsMap = HgncIdCoordsMap(ucscDbName, gencodeVersion) + hgncGenes = loadHgncGenes(hgncFile) + hgncMappedIds = set() + hgncBeds = generateHgncBeds(hgncGenes, hgncIdCoordsMap, hgncMappedIds) + + with open(hgncBedFile, 'w') as bedFh: + writeHgncBeds(hgncBeds, bedFh) + buildBigBed(ucscDbName, hgncBedFile, hgncBigBed) + buildTrix(hgncBeds, trixPrefix) + writeLocusTypeFilter(hgncBeds, locusTypeFilter) + if unmappedTsv is not None: + with open(unmappedTsv, 'w') as tsvFh: + writeUnmappedTsv(hgncGenes, hgncMappedIds, tsvFh) + +def main(): + args = parseArgs() + hgncToBigBed(args.ucscDbName, args.gencodeVersion, args.hgncFile, + args.hgncBed, args.hgncBigBed, args.trixPrefix, + args.locusTypeFilter, args.unmappedTsv) + +if __name__ == "__main__": + main()