9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py index 277bc06d601..252c2a8ef17 100755 --- src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py @@ -31,39 +31,31 @@ insLen = max inserted-sequence length for INS alts (0 otherwise) AC = sum of per-alt CPC AC alleleNumber = CPC AN alleleFreq = AC / alleleNumber numSamples = CPC NS Usage: lrSvCpc1VcfToBed.py input.vcf.gz output.bed chrom.sizes """ import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName, normalizeSvType - -# Colors per SV type -COLORS = { - "INS": "0,0,200", # blue - "DEL": "200,0,0", # red - "CPX": "230,140,0", # orange - "MIXED": "120,120,120", # grey -} +from lrSvCommon import svName, normalizeSvType, svColor SIZE_THRESHOLD = 50 def is_cpc_sample(name): """CPC samples are HIFI032* (Chinese, 47) and RY* (Chinese, 11).""" return name.startswith("HIFI032") or name.startswith("RY") def classify(ref_len, alt_len): d = alt_len - ref_len if d >= SIZE_THRESHOLD: return "INS", d if d <= -SIZE_THRESHOLD: return "DEL", -d @@ -84,31 +76,31 @@ if a == ".": continue an += 1 has_called = True if a == "1": ac += 1 if has_called: ns += 1 return ac, an, ns def emit(site, fout): classes = site["types"] sv_type = next(iter(classes)) if len(classes) == 1 else "MIXED" sv_type = normalizeSvType(sv_type) - rgb = COLORS.get(sv_type, "120,120,120") + rgb = svColor(sv_type) chrom = site["chrom"] start = site["pos0"] end = start + max(site["ref_len"], 1) af = (site["ac_sum"] / site["an"]) if site["an"] else 0.0 score = min(1000, max(0, int(round(af * 1000)))) svLen = end - start insLen = site["max_ins"] if sv_type == "INS" else 0 featLen = insLen if sv_type in ("INS", "MEI") else svLen name = svName(sv_type, featLen, site["ac_sum"]) row = [ chrom, str(start), str(end), name, str(score), ".", str(start), str(end),