9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvCommon.py src/hg/makeDb/scripts/lrSv/lrSvCommon.py index 0cdfdd61014..eadd7979ff6 100644 --- src/hg/makeDb/scripts/lrSv/lrSvCommon.py +++ src/hg/makeDb/scripts/lrSv/lrSvCommon.py @@ -1,138 +1,173 @@ """Shared helpers for lrSv subtrack converters. The lrSv supertrack aggregates SVs from many studies. To enable uniform supertrack-level filtering and mouseovers, every subtrack's bigBed must: 1. Store these four fields, named exactly like this: svType - string, uppercase (DEL, INS, INV, CPX, DUP, CNV, CTX, INSDEL, MIXED, BND, MEI, TRA, LOSS, GAIN, BOTH, ...). svLen - int, feature length on the reference (chromEnd-chromStart). 
insLen - int, length of inserted sequence (0 for DEL/INV/CPX/INSDEL; reported absolute length for INS). AC - int, allele count. Tracks that don't publish AC leave it blank instead of using a sentinel (e.g. deCODE, where AC is a string field and is omitted from the name and mouseOver). 2. Use a canonical `name` column of the form: TYPE-LEN[:AC] where LEN is a short form of the feature length (svLen for DEL/INV/..., insLen for INS, omitted for CTX/BND): len < 1000 bp -> integer bp, e.g. "200" len >= 1000 bp -> "Xk" or "X.Xk" with at most one decimal, trailing ".0" stripped, e.g. "1k", "5.5k", "15k" :AC is appended when AC is known and >= 0; otherwise omitted. Importing: from lrSvCommon import svName, shortLen, normalizeSvType, insLenFor, svColor Everything in this module has no external deps. """ from __future__ import annotations from typing import Optional # Canonical types. Anything not listed here is upper-cased and passed through. TYPE_ALIASES = { "COMPLEX": "CPX", "CPX": "CPX", "DEL": "DEL", "INS": "INS", "INV": "INV", "DUP": "DUP", "CNV": "CNV", "CTX": "CTX", "TRA": "TRA", "BND": "BND", "MEI": "MEI", "INSDEL": "INSDEL", "MIXED": "MIXED", "LOSS": "LOSS", "GAIN": "GAIN", "BOTH": "BOTH", } # Types for which the length segment in the name is dropped: # CTX/BND have no meaningful single-position length. NO_LEN_TYPES = {"CTX", "BND"} +# Canonical SV-type -> itemRgb color. Every lrSv subtrack converter (and the +# lrSvAll merge) MUST use this single map so a given SV type has one color +# across the whole supertrack. The scheme is the long-standing flat-color set +# (red/blue/green/orange/purple); the rule that is easy to get wrong is that +# CPX is purple (NOT orange) so it never collides with INV's orange, and +# INSDEL/TRA/BND get their own colors so they stay distinct from CPX in the +# merged track. Pass a raw or canonical type to svColor(). 
# Canonical SV-type -> itemRgb color map, looked up through svColor().
# CPX is purple (not orange) so it never collides with INV's orange.
# TRA, BND and CTX intentionally share one dark grey.
SV_COLORS = {
    "DEL":    "200,0,0",      # red
    "INS":    "0,0,200",      # blue
    "DUP":    "0,160,0",      # green
    "INV":    "230,140,0",    # orange
    "CPX":    "140,0,200",    # purple
    "MIXED":  "120,120,120",  # grey
    "INSDEL": "100,100,150",  # slate (deCODE combined ins/del)
    "MEI":    "0,160,160",    # teal
    "CNV":    "200,0,160",    # magenta
    "TRA":    "90,90,90",     # dark grey (translocation)
    "BND":    "90,90,90",     # dark grey (breakend)
    "CTX":    "90,90,90",     # dark grey (chromosomal translocation)
}

# Fallback for any type not in SV_COLORS.
DEFAULT_SV_COLOR = "100,100,100"


def normalizeSvType(raw: Optional[str]) -> str:
    """Return the canonical upper-case svType string for a raw VCF/TSV value.

    None, the empty string (after stripping whitespace), "." and "?" all map
    to "UNK". Any other value is upper-cased and looked up in TYPE_ALIASES;
    values without an alias are returned upper-cased as-is.
    """
    if raw is None:
        return "UNK"
    s = str(raw).strip()
    if not s or s == "." or s == "?":
        return "UNK"
    upper = s.upper()
    return TYPE_ALIASES.get(upper, upper)


def svColor(svType: Optional[str]) -> str:
    """Return the canonical itemRgb color string for an SV type.

    The type is normalized first, so aliases (e.g. COMPLEX -> CPX) get the
    right color. Types missing from SV_COLORS fall back to DEFAULT_SV_COLOR.
    """
    return SV_COLORS.get(normalizeSvType(svType), DEFAULT_SV_COLOR)


def shortLen(lenBp: Optional[int]) -> str:
    """Short text form for a length in bp.

    <1000 bp   -> integer bp as string, e.g. "200"
    >=1000 bp  -> "Xk" or "X.Xk" (at most one decimal, no trailing .0),
                  e.g. "1k", "5.5k", "15k", "1.5k"
    None, non-numeric or <=0 -> empty string.
    """
    if lenBp is None:
        return ""
    try:
        n = int(lenBp)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers float("inf"); treat it like any other
        # unusable length instead of crashing the converter.
        return ""
    if n <= 0:
        return ""
    if n < 1000:
        return str(n)
    k = round(n / 1000.0, 1)
    if k == int(k):
        return f"{int(k)}k"
    # Fixed-point, one decimal. The "g" presentation type keeps only six
    # significant digits, which drops the decimal for lengths >= 100 Mb and
    # switches to scientific notation for lengths >= ~1 Gb.
    return f"{k:.1f}k"


def svName(svType: str, featLen: Optional[int], ac: Optional[int] = None) -> str:
    """Build the canonical name column value, "TYPE-LEN[:AC]".

    svType  - raw or canonical (normalized via normalizeSvType).
    featLen - length to display; typically svLen for DEL/INV/CPX/DUP/...,
              insLen for INS. When shortLen() yields an empty string for it
              (None, <=0, non-numeric) the length segment is omitted.
              For types in NO_LEN_TYPES the length is always dropped,
              regardless of featLen.
    ac      - None, negative, or not convertible to int -> the ":AC" suffix
              is omitted.
    """
    t = normalizeSvType(svType)
    core = t
    if t not in NO_LEN_TYPES:
        lenStr = shortLen(featLen)
        if lenStr:
            core = f"{t}-{lenStr}"
    if ac is None:
        return core
    try:
        acInt = int(ac)
    except (TypeError, ValueError):
        # Blank or non-numeric AC (e.g. an empty string field): no suffix.
        return core
    if acInt < 0:
        return core
    return f"{core}:{acInt}"


def insLenFor(svType: str, refLen: int, altLen: int,
              svlenField: Optional[int] = None) -> int:
    """Compute the insLen value for a record.

    For INS / MEI: if svlenField is provided and converts to a positive int,
    that value is returned. Otherwise the result is altLen - refLen, clamped
    at 0 (so a record whose ALT is not longer than its REF yields 0).
    For everything else: 0.
    """
    if normalizeSvType(svType) not in ("INS", "MEI"):
        return 0
    if svlenField is not None:
        try:
            v = int(svlenField)
        except (TypeError, ValueError):
            v = 0  # unusable SVLEN: fall through to the REF/ALT difference
        if v > 0:
            return v
    return max(0, int(altLen) - int(refLen))