bac95a147f49cd331052e597006e04b3deee40fc
max
  Wed Apr 22 10:43:20 2026 -0700
lrSv/srSv: human-readable SV type filter labels, script cleanups

Add human-readable labels to the supertrack-level svType filter on
both the lrSv and srSv supertracks using the "CODE|CODE (Long name)"
filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)",
etc. Labels keep the short code up front so users can match what
hgTracks shows next to each feature.

Also sweep in the in-progress converter/as-file cleanups under
scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py
helpers, consistent insLen / svLen / AC column naming, tightened
field-description text) that had been piling up unstaged in the
working tree.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvCommon.py src/hg/makeDb/scripts/lrSv/lrSvCommon.py
new file mode 100644
index 00000000000..0cdfdd61014
--- /dev/null
+++ src/hg/makeDb/scripts/lrSv/lrSvCommon.py
@@ -0,0 +1,138 @@
+"""Shared helpers for lrSv subtrack converters.
+
+The lrSv supertrack aggregates SVs from many studies. To enable uniform
+supertrack-level filtering and mouseovers, every subtrack's bigBed must:
+
+  1. Store these four fields, named exactly like this:
+        svType  - string, uppercase (DEL, INS, INV, CPX, DUP, CNV, CTX,
+                  INSDEL, MIXED, BND, MEI, TRA, LOSS, GAIN, BOTH, ...).
+        svLen   - int,  feature length on the reference (chromEnd-chromStart).
+        insLen  - int,  length of inserted sequence (0 for DEL/INV/CPX/INSDEL;
+                  reported absolute length for INS).
+        AC      - int,  allele count. Required. Tracks that don't publish
+                  AC use a documented sentinel (e.g. deCODE: 50).
+
+  2. Use a canonical `name` column of the form:
+        TYPE-LEN[:AC]
+     where LEN is a short form of the feature length (svLen for DEL/INV/...,
+     insLen for INS, omitted for CTX/BND):
+        len <  1000 bp  -> integer bp, e.g. "200"
+        len >= 1000 bp  -> "Xk" or "X.Xk" with at most one decimal,
+                           trailing ".0" stripped, e.g. "1k", "5.5k", "15k"
+     :AC is appended when AC is known and >= 0; otherwise omitted.
+
+Importing:
+    from lrSvCommon import svName, shortLen, normalizeSvType, insLenFor
+
+This module has no external dependencies; it uses only the standard library.
+"""
+
+from __future__ import annotations
+from typing import Optional
+
+# Canonical types. Anything not listed here is upper-cased and passed through.
+TYPE_ALIASES = {
+    "COMPLEX": "CPX",
+    "CPX":     "CPX",
+    "DEL":     "DEL",
+    "INS":     "INS",
+    "INV":     "INV",
+    "DUP":     "DUP",
+    "CNV":     "CNV",
+    "CTX":     "CTX",
+    "TRA":     "TRA",
+    "BND":     "BND",
+    "MEI":     "MEI",
+    "INSDEL":  "INSDEL",
+    "MIXED":   "MIXED",
+    "LOSS":    "LOSS",
+    "GAIN":    "GAIN",
+    "BOTH":    "BOTH",
+}
+
+# Types for which the length segment in the name is dropped:
+# CTX/BND have no meaningful single-position length.
+NO_LEN_TYPES = {"CTX", "BND"}
+
+
+def normalizeSvType(raw: Optional[str]) -> str:
+    """Return the canonical upper-case svType string for a raw VCF/TSV value."""
+    if raw is None:
+        return "UNK"
+    s = str(raw).strip()
+    if not s or s == "." or s == "?":
+        return "UNK"
+    upper = s.upper()
+    return TYPE_ALIASES.get(upper, upper)
+
+
+def shortLen(lenBp: Optional[int]) -> str:
+    """Short text form for a length in bp.
+
+    <1000 bp  -> integer bp as string, e.g. "200"
+    >=1000 bp -> "Xk" or "X.Xk" (at most one decimal, no trailing .0),
+                  e.g. "1k", "5.5k", "15k", "1.5k"
+    None or <=0 -> empty string.
+    """
+    if lenBp is None:
+        return ""
+    try:
+        n = int(lenBp)
+    except (TypeError, ValueError):
+        return ""
+    if n <= 0:
+        return ""
+    if n < 1000:
+        return str(n)
+    k = round(n / 1000.0, 1)
+    if k == int(k):
+        return f"{int(k)}k"
+    return f"{k:g}k"
+
+
+def svName(svType: str, featLen: Optional[int], ac: Optional[int] = None) -> str:
+    """Build the canonical name column value.
+
+    svType  - raw or canonical (will be normalized to upper-case)
+    featLen - length to display; typically svLen for DEL/INV/CPX/DUP/...,
+              insLen for INS. Pass None (or <=0) and the length segment
+              is omitted entirely. For CTX/BND, the length is always
+              dropped regardless of featLen.
+    ac      - None or negative -> the ":AC" suffix is omitted.
+    """
+    t = normalizeSvType(svType)
+    core = t
+    if t not in NO_LEN_TYPES:
+        lenStr = shortLen(featLen)
+        if lenStr:
+            core = f"{t}-{lenStr}"
+    if ac is not None:
+        try:
+            acInt = int(ac)
+            if acInt >= 0:
+                return f"{core}:{acInt}"
+        except (TypeError, ValueError):
+            pass
+    return core
+
+
+def insLenFor(svType: str, refLen: int, altLen: int,
+              svlenField: Optional[int] = None) -> int:
+    """Compute the insLen value for a record.
+
+    For INS / MEI: uses abs(altLen - refLen) as the inserted-sequence length,
+      but if svlenField is provided and positive, prefers that (matches
+      SVLEN reported by the caller).
+    For everything else: 0.
+    """
+    t = normalizeSvType(svType)
+    if t in ("INS", "MEI"):
+        if svlenField is not None:
+            try:
+                v = int(svlenField)
+                if v > 0:
+                    return v
+            except (TypeError, ValueError):
+                pass
+        return max(0, int(altLen) - int(refLen))
+    return 0