9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py index 57efda1917b..57ee8aa61cf 100644 --- src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py @@ -1,217 +1,221 @@ #!/usr/bin/env python3 """Convert the HPRC release-2.1 minigraph-cactus pangenome VCF to BED9+. This is the v2.1 "gref95.ro" callset (Glenn Hickey, HPRC). Unlike the v2.0 "wave" VCF, this file is the raw `vg deconstruct` output: it carries graph traversals (AT), nested snarls (LV) and a parent-snarl pointer (PS), and it has NO per-allele TYPE or LEN fields. We therefore classify each ALT by parsimony-trimming the shared prefix/suffix of REF/ALT and looking at the net length change: trimmed ref empty -> INS (pure insertion) trimmed alt empty -> DEL (pure deletion) both non-empty, alt longer -> INS (net insertion, ragged ends) both non-empty, ref longer -> DEL (net deletion, ragged ends) both non-empty, equal length-> CPX (balanced substitution / MNP) An allele is kept when abs(len(ALT) - len(REF)) >= 50 bp, matching the length threshold used for the v2.0 track. AC / AF / AN / NS / LV are taken from the INFO fields. There is no INV annotation in this file, so no inversions are emitted (balanced events fall into CPX). Usage: lrSvHprc2RoVcfToBed.py input.vcf.gz output.bed Source: https://humanpangenome.org/hprc-data-release-2/ hprc-v2.1-mc-grch38.gref95.ro.vcf.gz (233-sample minigraph-cactus graph). """ import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName +from lrSvCommon import svName, svColor MIN_SV_LEN = 50 -SV_COLORS = { - "INS": "0,0,200", # blue - "DEL": "200,0,0", # red - "CPX": "140,0,200", # purple -} - def parseInfo(infoStr): d = {} for item in infoStr.split(";"): if "=" in item: k, v = item.split("=", 1) d[k] = v else: d[item] = True return d def splitMulti(s, n): """Split a per-allele INFO value; pad with '' if missing.""" if s is None: return [""] * n parts = s.split(",") if len(parts) < n: parts = parts + [""] * (n - len(parts)) return parts def classify(ref, alt): """Parsimony-trim REF/ALT and classify; return (svType, refSpan). refSpan is the number of trimmed reference bases the event spans, used to position chromEnd. The original chromStart is shifted by the trimmed prefix length so the feature sits on the actual changed bases. """ # trim shared prefix p = 0 while p < len(ref) and p < len(alt) and ref[p] == alt[p]: p += 1 r = ref[p:] a = alt[p:] # trim shared suffix s = 0 while s < len(r) and s < len(a) and r[-1 - s] == a[-1 - s]: s += 1 if s: r = r[:len(r) - s] a = a[:len(a) - s] lr, la = len(r), len(a) if la > lr: svType = "INS" elif lr > la: svType = "DEL" else: svType = "CPX" return svType, p, lr def main(): if len(sys.argv) != 3: print(__doc__, file=sys.stderr) sys.exit(1) inPath, outPath = sys.argv[1], sys.argv[2] opener = gzip.open if inPath.endswith(".gz") else open counts = {"INS": 0, "DEL": 0, "CPX": 0} nested = 0 + seen = set() + nDup = 0 with opener(inPath, "rt") as fIn, open(outPath, "w") as fOut: for line in fIn: if line.startswith("#"): continue fields = line.rstrip("\n").split("\t") chrom = fields[0] pos = int(fields[1]) rowId = fields[2] ref = fields[3] alts = fields[4].split(",") info = parseInfo(fields[7]) n = len(alts) acList = splitMulti(info.get("AC"), n) afList = splitMulti(info.get("AF"), n) try: nSamples = int(info.get("NS", "0")) except ValueError: nSamples = 0 try: an = int(info.get("AN", "0")) except ValueError: an = 0 try: snarlLevel = int(info.get("LV", "0")) except ValueError: snarlLevel = 0 for i, alt in enumerate(alts): if alt in (".", "*"): continue svLenVal = abs(len(alt) - len(ref)) # Filter: keep SV-sized alleles by net length change. if svLenVal < MIN_SV_LEN: continue svType, prefixLen, refSpan = classify(ref, alt) - color = SV_COLORS.get(svType, "100,100,100") + color = svColor(svType) # chromStart shifts past the shared prefix to the first # changed base; INS gets a 1-bp anchor, DEL/CPX span the # trimmed reference interval. chromStart = (pos - 1) + prefixLen if svType == "INS": chromEnd = chromStart + 1 else: chromEnd = chromStart + max(1, refSpan) if chromEnd <= chromStart: chromEnd = chromStart + 1 try: ac = int(acList[i]) except ValueError: ac = 0 try: af = float(afList[i]) except ValueError: af = 0.0 svLen = chromEnd - chromStart if svType == "INS": insLen = abs(len(alt) - len(ref)) else: insLen = 0 if rowId and rowId != ".": baseId = f"{rowId}.{i + 1}" if n > 1 else rowId else: baseId = f"{chrom}:{pos}:{svType}:{svLenVal}" if len(baseId) > 200: baseId = f"{chrom}:{pos}:{svType}:{svLenVal}" featLen = insLen if svType == "INS" else svLen name = svName(svType, featLen, ac) - counts[svType] += 1 - if snarlLevel > 0: - nested += 1 - row = [ chrom, str(chromStart), str(chromEnd), name, "0", ".", str(chromStart), str(chromEnd), color, svType, str(svLen), str(insLen), str(ac), str(an), f"{af:.6f}", str(nSamples), str(snarlLevel), ] - fOut.write("\t".join(row) + "\n") + # The raw vg deconstruct output decomposes some snarls into + # byte-identical SV rows; collapse them. + line_out = "\t".join(row) + if line_out in seen: + nDup += 1 + continue + seen.add(line_out) + + counts[svType] += 1 + if snarlLevel > 0: + nested += 1 + fOut.write(line_out + "\n") total = sum(counts.values()) print(f"kept {total} SV-sized alleles: " f"{counts['INS']} INS, {counts['DEL']} DEL, {counts['CPX']} CPX " - f"({nested} at nested snarl levels LV>0)", file=sys.stderr) + f"({nested} at nested snarl levels LV>0); " + f"{nDup} duplicate rows dropped", file=sys.stderr) if __name__ == "__main__": main()