bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvHprc2VcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvHprc2VcfToBed.py index 6807b9f7d8d..474bcbed15f 100644 --- src/hg/makeDb/scripts/lrSv/lrSvHprc2VcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvHprc2VcfToBed.py @@ -1,180 +1,197 @@ #!/usr/bin/env python3 """Convert the HPRC release-2 wave-decomposed pangenome VCF to BED9+. Filters to SV-sized alleles (|LEN| >= 50 bp) and inversions (INV flag). Expands multi-allelic rows so each ALT becomes its own BED row. Usage: lrSvHprc2VcfToBed.py input.vcf.gz output.bed Source: https://humanpangenome.org/hprc-data-release-2/ Graph/VCF from the HPRC v2.0 minigraph-cactus GRCh38 release (see https://github.com/human-pangenomics/hprc_intermediate_assembly/ blob/main/data_tables/pangenomes/alignments_v2.0.csv). """ import gzip +import os import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from lrSvCommon import svName, normalizeSvType + MIN_SV_LEN = 50 SV_COLORS = { "INS": "0,0,200", # blue "DEL": "200,0,0", # red - "COMPLEX": "140,0,200", # purple + "CPX": "140,0,200", # purple "INV": "230,140,0", # orange } def parseInfo(infoStr): d = {} for item in infoStr.split(";"): if "=" in item: k, v = item.split("=", 1) d[k] = v else: d[item] = True return d def splitMulti(s, n): """Split a per-allele INFO value; pad with '' if missing.""" if s is None: return [""] * n parts = s.split(",") if len(parts) < n: parts = parts + [""] * (n - len(parts)) return parts def deriveType(ref, alt): """Derive allele type when TYPE isn't in INFO.""" dr, da = len(ref), len(alt) if dr == da: return "snp" if dr == 1 else "mnp" if dr == 1 and da > 1: return "ins" if da == 1 and dr > 1: return "del" return "complex" def main(): if len(sys.argv) != 3: print(__doc__, file=sys.stderr) sys.exit(1) inPath, outPath = sys.argv[1], sys.argv[2] opener = gzip.open if inPath.endswith(".gz") else open with opener(inPath, "rt") as fIn, open(outPath, "w") as fOut: for line in fIn: if line.startswith("#"): continue fields = line.rstrip("\n").split("\t") chrom = fields[0] pos = int(fields[1]) - name = fields[2] + rowId = fields[2] ref = fields[3] alts = fields[4].split(",") info = parseInfo(fields[7]) n = len(alts) typeList = splitMulti(info.get("TYPE"), n) lenList = splitMulti(info.get("LEN"), n) acList = splitMulti(info.get("AC"), n) afList = splitMulti(info.get("AF"), n) isInv = "INV" in info try: nSamples = int(info.get("NS", "0")) except ValueError: nSamples = 0 try: an = int(info.get("AN", "0")) except ValueError: an = 0 try: snarlLevel = int(info.get("LV", "0")) except ValueError: snarlLevel = 0 for i, alt in enumerate(alts): if alt in (".", "*"): continue - t = (typeList[i] or "").strip().lower() + t = (typeList[i] or "").strip() if not t: t = deriveType(ref, alt) lenStr = (lenList[i] or "").strip() try: svLenVal = int(lenStr) except ValueError: svLenVal = abs(len(alt) - len(ref)) # Filter: keep SV-sized alleles plus INV flag records. if svLenVal < MIN_SV_LEN and not isInv: continue - # Normalize type for display. - svType = t.upper() - if svType not in ("INS", "DEL", "COMPLEX"): - svType = "COMPLEX" + # Normalize type for display (complex -> CPX via lrSvCommon). + svType = normalizeSvType(t) + if svType not in ("INS", "DEL", "CPX"): + svType = "CPX" if isInv: svType = "INV" color = SV_COLORS.get(svType, "100,100,100") chromStart = pos - 1 - # INS: 1-bp anchor; DEL/COMPLEX/INV: span the REF interval. + # INS: 1-bp anchor; DEL/CPX/INV: span the REF interval. if svType == "INS": chromEnd = chromStart + 1 else: chromEnd = chromStart + len(ref) if chromEnd <= chromStart: chromEnd = chromStart + 1 try: ac = int(acList[i]) except ValueError: ac = 0 try: af = float(afList[i]) except ValueError: af = 0.0 - # Name: VCF ID if informative, else constructed. Truncate + svLen = chromEnd - chromStart + # insLen: for INS use abs(ALT-REF) / svLenVal; for DEL/CPX/INV 0. + if svType == "INS": + insLen = abs(len(alt) - len(ref)) + if insLen == 0: + insLen = svLenVal + else: + insLen = 0 + + # ID: VCF ID if informative, else constructed. Truncate # very long snarl/ORIGIN-chain IDs to stay within bigBed's # 255-char limit. - if name and name != ".": - rowName = f"{name}.{i+1}" if n > 1 else name + if rowId and rowId != ".": + baseId = f"{rowId}.{i+1}" if n > 1 else rowId else: - rowName = f"{chrom}:{pos}:{svType}:{svLenVal}" - if len(rowName) > 200: - rowName = f"{chrom}:{pos}:{svType}:{svLenVal}" + baseId = f"{chrom}:{pos}:{svType}:{svLenVal}" + if len(baseId) > 200: + baseId = f"{chrom}:{pos}:{svType}:{svLenVal}" + + featLen = insLen if svType in ("INS", "MEI") else svLen + name = svName(svType, featLen, ac) row = [ chrom, str(chromStart), str(chromEnd), - rowName, + name, "0", ".", str(chromStart), str(chromEnd), color, svType, - str(svLenVal), + str(svLen), + str(insLen), str(ac), str(an), f"{af:.6f}", str(nSamples), str(snarlLevel), ] fOut.write("\t".join(row) + "\n") if __name__ == "__main__": main()