9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py index 57efda1917b..57ee8aa61cf 100644 --- src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py @@ -20,40 +20,34 @@ inversions are emitted (balanced events fall into CPX). Usage: lrSvHprc2RoVcfToBed.py input.vcf.gz output.bed Source: https://humanpangenome.org/hprc-data-release-2/ hprc-v2.1-mc-grch38.gref95.ro.vcf.gz (233-sample minigraph-cactus graph). """ import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName +from lrSvCommon import svName, svColor MIN_SV_LEN = 50 -SV_COLORS = { - "INS": "0,0,200", # blue - "DEL": "200,0,0", # red - "CPX": "140,0,200", # purple -} - def parseInfo(infoStr): d = {} for item in infoStr.split(";"): if "=" in item: k, v = item.split("=", 1) d[k] = v else: d[item] = True return d def splitMulti(s, n): """Split a per-allele INFO value; pad with '' if missing.""" if s is None: @@ -92,30 +86,32 @@ else: svType = "CPX" return svType, p, lr def main(): if len(sys.argv) != 3: print(__doc__, file=sys.stderr) sys.exit(1) inPath, outPath = sys.argv[1], sys.argv[2] opener = gzip.open if inPath.endswith(".gz") else open counts = {"INS": 0, "DEL": 0, "CPX": 0} nested = 0 + seen = set() + nDup = 0 with opener(inPath, "rt") as fIn, open(outPath, "w") as fOut: for line in fIn: if line.startswith("#"): continue fields = line.rstrip("\n").split("\t") chrom = fields[0] pos = int(fields[1]) rowId = fields[2] ref = fields[3] alts = fields[4].split(",") info = parseInfo(fields[7]) n = len(alts) acList = splitMulti(info.get("AC"), n) @@ -132,31 +128,31 @@ try: snarlLevel = int(info.get("LV", "0")) except ValueError: snarlLevel = 0 for i, alt in enumerate(alts): if alt in (".", "*"): continue svLenVal = abs(len(alt) - len(ref)) # Filter: keep SV-sized alleles by net length change. if svLenVal < MIN_SV_LEN: continue svType, prefixLen, refSpan = classify(ref, alt) - color = SV_COLORS.get(svType, "100,100,100") + color = svColor(svType) # chromStart shifts past the shared prefix to the first # changed base; INS gets a 1-bp anchor, DEL/CPX span the # trimmed reference interval. chromStart = (pos - 1) + prefixLen if svType == "INS": chromEnd = chromStart + 1 else: chromEnd = chromStart + max(1, refSpan) if chromEnd <= chromStart: chromEnd = chromStart + 1 try: ac = int(acList[i]) except ValueError: @@ -170,48 +166,56 @@ if svType == "INS": insLen = abs(len(alt) - len(ref)) else: insLen = 0 if rowId and rowId != ".": baseId = f"{rowId}.{i + 1}" if n > 1 else rowId else: baseId = f"{chrom}:{pos}:{svType}:{svLenVal}" if len(baseId) > 200: baseId = f"{chrom}:{pos}:{svType}:{svLenVal}" featLen = insLen if svType == "INS" else svLen name = svName(svType, featLen, ac) - counts[svType] += 1 - if snarlLevel > 0: - nested += 1 - row = [ chrom, str(chromStart), str(chromEnd), name, "0", ".", str(chromStart), str(chromEnd), color, svType, str(svLen), str(insLen), str(ac), str(an), f"{af:.6f}", str(nSamples), str(snarlLevel), ] - fOut.write("\t".join(row) + "\n") + # The raw vg deconstruct output decomposes some snarls into + # byte-identical SV rows; collapse them. + line_out = "\t".join(row) + if line_out in seen: + nDup += 1 + continue + seen.add(line_out) + + counts[svType] += 1 + if snarlLevel > 0: + nested += 1 + fOut.write(line_out + "\n") total = sum(counts.values()) print(f"kept {total} SV-sized alleles: " f"{counts['INS']} INS, {counts['DEL']} DEL, {counts['CPX']} CPX " - f"({nested} at nested snarl levels LV>0)", file=sys.stderr) + f"({nested} at nested snarl levels LV>0); " + f"{nDup} duplicate rows dropped", file=sys.stderr) if __name__ == "__main__": main()