9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published, which also removes its >5 Mb breakend artifacts
from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvCommon.py src/hg/makeDb/scripts/lrSv/lrSvCommon.py
index 0cdfdd61014..eadd7979ff6 100644
--- src/hg/makeDb/scripts/lrSv/lrSvCommon.py
+++ src/hg/makeDb/scripts/lrSv/lrSvCommon.py
@@ -1,138 +1,173 @@
 """Shared helpers for lrSv subtrack converters.
 
 The lrSv supertrack aggregates SVs from many studies. To enable uniform
 supertrack-level filtering and mouseovers, every subtrack's bigBed must:
 
   1. Store these four fields, named exactly like this:
         svType  - string, uppercase (DEL, INS, INV, CPX, DUP, CNV, CTX,
                   INSDEL, MIXED, BND, MEI, TRA, LOSS, GAIN, BOTH, ...).
         svLen   - int,  feature length on the reference (chromEnd-chromStart).
         insLen  - int,  length of inserted sequence (0 for DEL/INV/CPX/INSDEL;
                   reported absolute length for INS).
          AC      - allele count; an int where the study publishes one. Tracks
                    without a real AC leave it blank (deCODE: string field, no 50).
 
   2. Use a canonical `name` column of the form:
         TYPE-LEN[:AC]
      where LEN is a short form of the feature length (svLen for DEL/INV/...,
      insLen for INS, omitted for CTX/BND):
         len <  1000 bp  -> integer bp, e.g. "200"
         len >= 1000 bp  -> "Xk" or "X.Xk" with at most one decimal,
                            trailing ".0" stripped, e.g. "1k", "5.5k", "15k"
      :AC is appended when AC is known and >= 0; otherwise omitted.
 
 Importing:
     from lrSvCommon import svName, svColor, shortLen, normalizeSvType, insLenFor
 
 Everything in this module has no external deps.
 """
 
 from __future__ import annotations
 from typing import Optional
 
 # Raw (already upper-cased) type -> canonical svType, consulted by
 # normalizeSvType(). Anything not listed here is upper-cased and passed
 # through unchanged, so the self-mapping entries mainly serve as the list of
 # canonical type names.
 TYPE_ALIASES = {
     "COMPLEX": "CPX",  # the only entry that actually renames a type
     "CPX":     "CPX",
     "DEL":     "DEL",
     "INS":     "INS",
     "INV":     "INV",
     "DUP":     "DUP",
     "CNV":     "CNV",
     "CTX":     "CTX",
     "TRA":     "TRA",
     "BND":     "BND",
     "MEI":     "MEI",
     "INSDEL":  "INSDEL",
     "MIXED":   "MIXED",
     "LOSS":    "LOSS",
     "GAIN":    "GAIN",
     "BOTH":    "BOTH",
 }
 
 # Canonical types for which svName() drops the length segment of the name:
 # CTX/BND have no meaningful single-position length.
 NO_LEN_TYPES = {"CTX", "BND"}
 
 
+# Canonical SV-type -> itemRgb color string ("R,G,B"). Every lrSv subtrack
+# converter (and the lrSvAll merge) MUST use this single map so a given SV
+# type has one color across the whole supertrack. Two rules are easy to get
+# wrong: CPX is purple (NOT orange), so it never collides with INV's orange;
+# and INSDEL and the TRA/BND/CTX group stay distinct from CPX in the merged
+# track (TRA, BND and CTX share one dark grey). Look colors up through
+# svColor(), which accepts a raw or canonical type.
+SV_COLORS = {
+    "DEL":    "200,0,0",      # red
+    "INS":    "0,0,200",      # blue
+    "DUP":    "0,160,0",      # green
+    "INV":    "230,140,0",    # orange
+    "CPX":    "140,0,200",    # purple
+    "MIXED":  "120,120,120",  # grey
+    "INSDEL": "100,100,150",  # slate (deCODE combined ins/del)
+    "MEI":    "0,160,160",    # teal
+    "CNV":    "200,0,160",    # magenta
+    "TRA":    "90,90,90",     # dark grey (translocation)
+    "BND":    "90,90,90",     # dark grey (breakend)
+    "CTX":    "90,90,90",     # dark grey (chromosomal translocation)
+}
+
+# Fallback that svColor() returns for any type not in SV_COLORS.
+DEFAULT_SV_COLOR = "100,100,100"
+
+
 def normalizeSvType(raw: Optional[str]) -> str:
     """Map a raw VCF/TSV type value to its canonical upper-case svType.
 
     None, blank, "." and "?" all become "UNK". Anything else is stripped,
     upper-cased and run through TYPE_ALIASES; values with no alias entry
     are returned upper-cased as-is.
     """
     text = "" if raw is None else str(raw).strip()
     if text in ("", ".", "?"):
         return "UNK"
     key = text.upper()
     return TYPE_ALIASES.get(key, key)
 
 
+def svColor(svType: Optional[str]) -> str:
+    """Return the canonical itemRgb color string for an SV type.
+
+    Normalizes the type first, so aliases (e.g. COMPLEX -> CPX) get the
+    right color. Unknown types fall back to DEFAULT_SV_COLOR.
+    """
+    return SV_COLORS.get(normalizeSvType(svType), DEFAULT_SV_COLOR)
+
+
 def shortLen(lenBp: Optional[int]) -> str:
     """Return the short display form of a length in bp.
 
     <1000 bp  -> integer bp as string, e.g. "200"
     >=1000 bp -> kb rounded to one decimal, written "Xk" or "X.Xk" (no
                  trailing ".0"), e.g. "1k", "5.5k", "15k", "248956.4k"
     None, non-numeric, or <=0 -> empty string.
     """
     try:
         # int(None) raises TypeError, so None needs no separate check.
         n = int(lenBp)
     except (TypeError, ValueError):
         return ""
     if n <= 0:
         return ""
     if n < 1000:
         return str(n)
     k = round(n / 1000.0, 1)
     if k.is_integer():
         return f"{int(k)}k"
     # Fixed-point rather than ":g": "g" keeps only 6 significant digits, so
     # it drops the decimal from 100 Mb upward and switches to exponent
     # notation ("1.23457e+06k") past 1 Gb.
     return f"{k:.1f}k"
 
 
 def svName(svType: str, featLen: Optional[int], ac: Optional[int] = None) -> str:
     """Assemble the canonical `name` column value, TYPE[-LEN][:AC].
 
     svType  - raw or canonical type; normalized before use.
     featLen - length to show (typically svLen for DEL/INV/CPX/DUP/...,
               insLen for INS). The "-LEN" segment is left out when the
               short form is empty (None or <=0) and always for CTX/BND.
     ac      - allele count; ":AC" is appended only when it converts to an
               int >= 0. None, negative or non-numeric values add nothing.
     """
     canon = normalizeSvType(svType)
     # CTX/BND never carry a length; shortLen is not even consulted for them.
     lenText = "" if canon in NO_LEN_TYPES else shortLen(featLen)
     label = f"{canon}-{lenText}" if lenText else canon
     if ac is None:
         return label
     try:
         count = int(ac)
     except (TypeError, ValueError):
         return label
     return f"{label}:{count}" if count >= 0 else label
 
 
 def insLenFor(svType: str, refLen: int, altLen: int,
               svlenField: Optional[int] = None) -> int:
     """Return the insLen field value for one record.
 
     Only INS and MEI (after type normalization) carry an inserted length;
     every other type yields 0. For those two, a positive integer svlenField
     (the caller-reported SVLEN) is used as-is. Otherwise the result is
     altLen - refLen floored at 0; no absolute value is taken, so an ALT
     shorter than REF gives 0.
     """
     if normalizeSvType(svType) not in ("INS", "MEI"):
         return 0
     if svlenField is not None:
         try:
             reported = int(svlenField)
         except (TypeError, ValueError):
             # Unparseable SVLEN: fall back to the allele-length difference.
             reported = 0
         if reported > 0:
             return reported
     delta = int(altLen) - int(refLen)
     return delta if delta > 0 else 0