9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published, which also removes its >5 Mb breakend artifacts
from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py
index 57efda1917b..57ee8aa61cf 100644
--- src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py
@@ -20,40 +20,34 @@
 inversions are emitted (balanced events fall into CPX).
 
 Usage:
     lrSvHprc2RoVcfToBed.py input.vcf.gz output.bed
 
 Source:
     https://humanpangenome.org/hprc-data-release-2/
     hprc-v2.1-mc-grch38.gref95.ro.vcf.gz (233-sample minigraph-cactus graph).
 """
 
 import gzip
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from lrSvCommon import svName
+from lrSvCommon import svName, svColor
 
 MIN_SV_LEN = 50
 
-SV_COLORS = {
-    "INS": "0,0,200",      # blue
-    "DEL": "200,0,0",      # red
-    "CPX": "140,0,200",    # purple
-}
-
 
 def parseInfo(infoStr):
     d = {}
     for item in infoStr.split(";"):
         if "=" in item:
             k, v = item.split("=", 1)
             d[k] = v
         else:
             d[item] = True
     return d
 
 
 def splitMulti(s, n):
     """Split a per-allele INFO value; pad with '' if missing."""
     if s is None:
@@ -92,30 +86,32 @@
     else:
         svType = "CPX"
     return svType, p, lr
 
 
 def main():
     if len(sys.argv) != 3:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
 
     inPath, outPath = sys.argv[1], sys.argv[2]
     opener = gzip.open if inPath.endswith(".gz") else open
 
     counts = {"INS": 0, "DEL": 0, "CPX": 0}
     nested = 0
+    seen = set()
+    nDup = 0
 
     with opener(inPath, "rt") as fIn, open(outPath, "w") as fOut:
         for line in fIn:
             if line.startswith("#"):
                 continue
             fields = line.rstrip("\n").split("\t")
             chrom = fields[0]
             pos = int(fields[1])
             rowId = fields[2]
             ref = fields[3]
             alts = fields[4].split(",")
             info = parseInfo(fields[7])
 
             n = len(alts)
             acList = splitMulti(info.get("AC"), n)
@@ -132,31 +128,31 @@
             try:
                 snarlLevel = int(info.get("LV", "0"))
             except ValueError:
                 snarlLevel = 0
 
             for i, alt in enumerate(alts):
                 if alt in (".", "*"):
                     continue
 
                 svLenVal = abs(len(alt) - len(ref))
                 # Filter: keep SV-sized alleles by net length change.
                 if svLenVal < MIN_SV_LEN:
                     continue
 
                 svType, prefixLen, refSpan = classify(ref, alt)
-                color = SV_COLORS.get(svType, "100,100,100")
+                color = svColor(svType)
 
                 # chromStart shifts past the shared prefix to the first
                 # changed base; INS gets a 1-bp anchor, DEL/CPX span the
                 # trimmed reference interval.
                 chromStart = (pos - 1) + prefixLen
                 if svType == "INS":
                     chromEnd = chromStart + 1
                 else:
                     chromEnd = chromStart + max(1, refSpan)
                 if chromEnd <= chromStart:
                     chromEnd = chromStart + 1
 
                 try:
                     ac = int(acList[i])
                 except ValueError:
@@ -170,48 +166,56 @@
                 if svType == "INS":
                     insLen = abs(len(alt) - len(ref))
                 else:
                     insLen = 0
 
                 if rowId and rowId != ".":
                     baseId = f"{rowId}.{i + 1}" if n > 1 else rowId
                 else:
                     baseId = f"{chrom}:{pos}:{svType}:{svLenVal}"
                 if len(baseId) > 200:
                     baseId = f"{chrom}:{pos}:{svType}:{svLenVal}"
 
                 featLen = insLen if svType == "INS" else svLen
                 name = svName(svType, featLen, ac)
 
-                counts[svType] += 1
-                if snarlLevel > 0:
-                    nested += 1
-
                 row = [
                     chrom,
                     str(chromStart),
                     str(chromEnd),
                     name,
                     "0",
                     ".",
                     str(chromStart),
                     str(chromEnd),
                     color,
                     svType,
                     str(svLen),
                     str(insLen),
                     str(ac),
                     str(an),
                     f"{af:.6f}",
                     str(nSamples),
                     str(snarlLevel),
                 ]
-                fOut.write("\t".join(row) + "\n")
+                # The raw vg deconstruct output decomposes some snarls into
+                # byte-identical SV rows; collapse them.
+                line_out = "\t".join(row)
+                if line_out in seen:
+                    nDup += 1
+                    continue
+                seen.add(line_out)
+
+                counts[svType] += 1
+                if snarlLevel > 0:
+                    nested += 1
+                fOut.write(line_out + "\n")
 
     total = sum(counts.values())
     print(f"kept {total} SV-sized alleles: "
           f"{counts['INS']} INS, {counts['DEL']} DEL, {counts['CPX']} CPX "
-          f"({nested} at nested snarl levels LV>0)", file=sys.stderr)
+          f"({nested} at nested snarl levels LV>0); "
+          f"{nDup} duplicate rows dropped", file=sys.stderr)
 
 
 if __name__ == "__main__":
     main()