src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py 9eb4e0937782954c19d664e7d384d210bffb3b25

9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published, which also removes its >5 Mb breakend artifacts
from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py
index 277bc06d601..252c2a8ef17 100755
--- src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py
@@ -31,39 +31,31 @@
          insLen = max inserted-sequence length for INS alts (0 otherwise)
          AC     = sum of per-alt CPC AC
          alleleNumber = CPC AN
          alleleFreq   = AC / alleleNumber
          numSamples   = CPC NS
 
 Usage:
     lrSvCpc1VcfToBed.py input.vcf.gz output.bed chrom.sizes
 """
 
 import gzip
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from lrSvCommon import svName, normalizeSvType
-
-# Colors per SV type
-COLORS = {
-    "INS": "0,0,200",       # blue
-    "DEL": "200,0,0",       # red
-    "CPX": "230,140,0",     # orange
-    "MIXED": "120,120,120", # grey
-}
+from lrSvCommon import svName, normalizeSvType, svColor
 
 SIZE_THRESHOLD = 50
 
 
 def is_cpc_sample(name):
     """Return True when the sample name starts with a CPC prefix: HIFI032 (Chinese, 47) or RY (Chinese, 11)."""
     return name.startswith(("HIFI032", "RY"))
 
 
 def classify(ref_len, alt_len):
     d = alt_len - ref_len
     if d >= SIZE_THRESHOLD:
         return "INS", d
     if d <= -SIZE_THRESHOLD:
         return "DEL", -d
@@ -84,31 +76,31 @@
             if a == ".":
                 continue
             an += 1
             has_called = True
             if a == "1":
                 ac += 1
         if has_called:
             ns += 1
     return ac, an, ns
 
 
 def emit(site, fout):
     classes = site["types"]
     sv_type = next(iter(classes)) if len(classes) == 1 else "MIXED"
     sv_type = normalizeSvType(sv_type)
-    rgb = COLORS.get(sv_type, "120,120,120")
+    rgb = svColor(sv_type)
     chrom = site["chrom"]
     start = site["pos0"]
     end = start + max(site["ref_len"], 1)
     af = (site["ac_sum"] / site["an"]) if site["an"] else 0.0
     score = min(1000, max(0, int(round(af * 1000))))
     svLen = end - start
     insLen = site["max_ins"] if sv_type == "INS" else 0
     featLen = insLen if sv_type in ("INS", "MEI") else svLen
     name = svName(sv_type, featLen, site["ac_sum"])
     row = [
         chrom, str(start), str(end),
         name,
         str(score),
         ".",
         str(start), str(end),