9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvCommon.py src/hg/makeDb/scripts/lrSv/lrSvCommon.py index 0cdfdd61014..eadd7979ff6 100644 --- src/hg/makeDb/scripts/lrSv/lrSvCommon.py +++ src/hg/makeDb/scripts/lrSv/lrSvCommon.py @@ -43,41 +43,76 @@ "TRA": "TRA", "BND": "BND", "MEI": "MEI", "INSDEL": "INSDEL", "MIXED": "MIXED", "LOSS": "LOSS", "GAIN": "GAIN", "BOTH": "BOTH", } # Types for which the length segment in the name is dropped: # CTX/BND have no meaningful single-position length. NO_LEN_TYPES = {"CTX", "BND"} +# Canonical SV-type -> itemRgb color. Every lrSv subtrack converter (and the +# lrSvAll merge) MUST use this single map so a given SV type has one color +# across the whole supertrack. 
The scheme is the long-standing flat-color set +# (red/blue/green/orange/purple); the rule that is easy to get wrong is that +# CPX is purple (NOT orange) so it never collides with INV's orange, and +# INSDEL and TRA/BND/CTX (which share one dark grey) stay distinct from CPX in +# the merged track. Pass a raw or canonical type to svColor(). +SV_COLORS = { + "DEL": "200,0,0", # red + "INS": "0,0,200", # blue + "DUP": "0,160,0", # green + "INV": "230,140,0", # orange + "CPX": "140,0,200", # purple + "MIXED": "120,120,120", # grey + "INSDEL": "100,100,150", # slate (deCODE combined ins/del) + "MEI": "0,160,160", # teal + "CNV": "200,0,160", # magenta + "TRA": "90,90,90", # dark grey (translocation) + "BND": "90,90,90", # dark grey (breakend) + "CTX": "90,90,90", # dark grey (chromosomal translocation) +} + +# Fallback for any type not in SV_COLORS. +DEFAULT_SV_COLOR = "100,100,100" + + def normalizeSvType(raw: Optional[str]) -> str: """Return the canonical upper-case svType string for a raw VCF/TSV value.""" if raw is None: return "UNK" s = str(raw).strip() if not s or s == "." or s == "?": return "UNK" upper = s.upper() return TYPE_ALIASES.get(upper, upper) +def svColor(svType: Optional[str]) -> str: + """Return the canonical itemRgb color string for an SV type. + + Normalizes the type first, so aliases (e.g. COMPLEX -> CPX) get the + right color. Unknown types fall back to DEFAULT_SV_COLOR. + """ + return SV_COLORS.get(normalizeSvType(svType), DEFAULT_SV_COLOR) + + def shortLen(lenBp: Optional[int]) -> str: """Short text form for a length in bp. <1000 bp -> integer bp as string, e.g. "200" >=1000 bp -> "Xk" or "X.Xk" (at most one decimal, no trailing .0), e.g. "1k", "5.5k", "15k", "1.5k" None or <=0 -> empty string. """ if lenBp is None: return "" try: n = int(lenBp) except (TypeError, ValueError): return "" if n <= 0: