9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published. Dropping it also removes its >5 Mb breakend
artifacts from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvAprVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvAprVcfToBed.py
index 78b439bae4e..11742f0293c 100755
--- src/hg/makeDb/scripts/lrSv/lrSvAprVcfToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvAprVcfToBed.py
@@ -22,38 +22,31 @@
          insLen = max inserted-sequence length across passing INS alts (0 otherwise)
          AC     = sum of AC values for passing alts
          alleleNumber = AN (constant)
          alleleFreq   = AC / alleleNumber
      Rows with zero passing alts are skipped.
 
 Usage:
     lrSvAprVcfToBed.py input.vcf.gz output.bed chrom.sizes
 """
 
 import gzip
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from lrSvCommon import svName, normalizeSvType
-
-COLORS = {
-    "INS": "0,0,200",       # blue
-    "DEL": "200,0,0",       # red
-    "CPX": "230,140,0",     # orange
-    "MIXED": "120,120,120", # grey
-}
+from lrSvCommon import svName, normalizeSvType, svColor
 
 SIZE_THRESHOLD = 50
 
 
 def parse_info(info_str):
     d = {}
     for token in info_str.split(";"):
         if not token:
             continue
         if "=" in token:
             k, v = token.split("=", 1)
             d[k] = v
         else:
             d[token] = True
     return d
@@ -132,31 +125,31 @@
                     max_mag = mag
                 if sv_type == "INS":
                     d = len(alt_seq) - ref_len
                     if d > max_ins:
                         max_ins = d
                 if i < len(ac_list):
                     ac_sum += _int(ac_list[i])
                 num_pass += 1
 
             if num_pass == 0:
                 skipped_no_sv_alt += 1
                 continue
 
             sv_type = next(iter(types)) if len(types) == 1 else "MIXED"
             sv_type = normalizeSvType(sv_type)
-            rgb = COLORS.get(sv_type, "120,120,120")
+            rgb = svColor(sv_type)
 
             pos0 = int(pos) - 1
             start = pos0
             end = start + max(ref_len, 1)
             af = (ac_sum / an) if an else 0.0
             score = min(1000, max(0, int(round(af * 1000))))
 
             svLen = end - start
             if sv_type == "INS":
                 insLen = max_ins
             else:
                 insLen = 0
 
             featLen = insLen if sv_type in ("INS", "MEI") else svLen
             name = svName(sv_type, featLen, ac_sum)