src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py 9eb4e0937782954c19d664e7d384d210bffb3b25

9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published, which also removes its >5 Mb breakend artifacts
from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py
index 49d7d1aa535..1c20d1223eb 100644
--- src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py
+++ src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py
@@ -10,45 +10,34 @@
 
 Usage:
   python3 lrSvMergeAll.py
   python3 lrSvMergeAll.py --region chr22       # quick test on one chromosome
 """
 
 import argparse
 import os
 import re
 import subprocess
 import sys
 from collections import OrderedDict, defaultdict
 from multiprocessing import Pool
 
 SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
-ALL_CHROMOSOMES = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
+sys.path.insert(0, SCRIPTS_DIR)
+from lrSvCommon import svColor
 
-SVTYPE_COLOR = {
-    "DEL":     "200,0,0",
-    "INS":     "0,0,200",
-    "DUP":     "0,150,0",
-    "INV":     "200,100,0",
-    "CPX":     "150,0,150",
-    "MIXED":   "150,0,150",
-    "INSDEL":  "100,100,150",
-    "BND":     "100,100,100",
-    "TRA":     "100,100,100",
-    "MEI":     "0,100,200",
-    "CNV":     "0,150,150",
-}
+ALL_CHROMOSOMES = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
 
 # ---------------------------------------------------------------------------
 # Config + autoSql parsing
 # ---------------------------------------------------------------------------
 
 def sanitize_id(s):
     """Convert an arbitrary key to a valid autoSql / C identifier.
     Non-alphanumeric chars become underscores; leading digit gets a 'd' prefix.
     """
     out = re.sub(r'[^A-Za-z0-9_]', '_', s)
     if out and out[0].isdigit():
         out = 'd' + out
     return out
 
 
@@ -291,60 +280,71 @@
         # Take max - same site shouldn't have varying sample counts but be safe
         try:
             v = max(int(pval) if pval.isdigit() else 0,
                     int(nval) if nval.isdigit() else 0)
             return (str(v), af, pins)
         except ValueError:
             return (pval or nval, af, pins)
     # Numeric AC: sum
     return (_add_int(pval, nval), af, pins)
 
 
 def merge_chrom(args):
     chrom, dbs, extract_dir, out_dir = args
     # variant_key (start, end, svType, svLen, insLen) -> {db_key: (value, af, insLen)}
     variants = defaultdict(dict)
+    # (key, db_key) -> set of (value, af) already incorporated, so we only sum
+    # AC across genuinely distinct records and never across byte-identical
+    # duplicate rows (which would inflate the per-database and total AC).
+    seen_combo = defaultdict(set)
     n_in = 0
+    n_dup = 0
 
     db_by_key = {db["key"]: db for db in dbs}
 
     for db in dbs:
         path = os.path.join(extract_dir, db["key"], f"{chrom}.tsv")
         if not os.path.exists(path):
             continue
         with open(path) as f:
             for line in f:
                 parts = line.rstrip("\n").split("\t")
                 if len(parts) < 7:
                     continue
                 start, end, svType, svLen, insLen, value, af = parts
                 # Use insLen=0 for non-INS to avoid spurious key splits
                 if svType not in ("INS",):
                     insLen_key = "0"
                 else:
                     insLen_key = insLen
                 try:
                     key = (int(start), int(end), svType,
                            int(svLen), int(insLen_key))
                 except ValueError:
                     continue
                 n_in += 1
+                combo = (value, af)
                 if db["key"] in variants[key]:
+                    if combo in seen_combo[(key, db["key"])]:
+                        # identical record already counted for this db; skip
+                        n_dup += 1
+                        continue
                     variants[key][db["key"]] = _combine(
                         db, variants[key][db["key"]], (value, af, insLen))
                 else:
                     variants[key][db["key"]] = (value, af, insLen)
+                seen_combo[(key, db["key"])].add(combo)
 
     out_path = os.path.join(out_dir, f"{chrom}.bed")
     n_out = 0
     n_split_aff = 0  # variants with at least one Kim PD affected AC
 
     with open(out_path, "w") as out:
         for key in sorted(variants.keys()):
             start, end, svType, svLen, _insLen_key, = key
             per_db = variants[key]
 
             sources = sorted(per_db.keys())
             source_count = len(sources)
 
             # Compute totalAC: sum over numeric values from regular dbs
             total_ac = 0
@@ -385,31 +385,31 @@
                 if f_af > 0 and (min_af is None or f_af < min_af):
                     min_af = f_af
 
             score = min(1000, int(max_af * 1000)) if max_af > 0 else 0
 
             # Use the original insLen from any contributing INS record
             ins_len_out = "0"
             for db in dbs:
                 if db["key"] in per_db:
                     _v, _af, ins = per_db[db["key"]]
                     ins_len_out = ins
                     break
 
             short = svType[:6]
             name = f"{short}-{svLen}:{source_count}"
-            color = SVTYPE_COLOR.get(svType, "128,128,128")
+            color = svColor(svType)
 
             fields = [
                 chrom, str(start), str(end), name, str(score), ".",
                 str(start), str(end), color,
                 svType, str(svLen), ins_len_out, svType,
                 f"{max_af:.6g}" if max_af > 0 else "0",
                 f"{min_af:.6g}" if min_af is not None else "0",
                 str(total_ac),
                 str(source_count),
                 ",".join(sources),
             ]
 
             # Per-database value columns (SPLIT contributes 2 columns)
             for db in dbs:
                 if db["valueField"] == "SPLIT":
@@ -417,31 +417,32 @@
                         value, _af, _i = per_db[db["key"]]
                         aff, heal = value.split("|", 1)
                         if aff:
                             n_split_aff += 1
                     else:
                         aff, heal = "", ""
                     fields.extend([aff, heal])
                 else:
                     value = per_db.get(db["key"], ("", "", ""))[0]
                     fields.append(value)
 
             out.write("\t".join(fields) + "\n")
             n_out += 1
 
     print(f"  {chrom}: {n_in:,} input rows -> {n_out:,} unique variants "
-          f"({n_in - n_out:,} merged)", file=sys.stderr)
+          f"({n_in - n_out:,} merged; {n_dup:,} identical dup rows skipped)",
+          file=sys.stderr)
     return out_path, n_out
 
 
 # ---------------------------------------------------------------------------
 # AutoSql + trackDb fragment generation
 # ---------------------------------------------------------------------------
 
 def write_autosql(out_path, dbs):
     with open(out_path, "w") as f:
         f.write('table lrSvAll\n')
         f.write('"Combined long-read structural variants from all lrSv subtracks"\n')
         f.write('(\n')
         # BED9
         f.write('    string  chrom;          "Chromosome"\n')
         f.write('    uint    chromStart;     "Start position (0-based)"\n')
@@ -494,31 +495,30 @@
     src_parts = [f"{db['key']}|{db['label'].replace(',', '')}" for db in dbs]
     with open(out_path, "w") as f:
         f.write("# AUTO-GENERATED by ~/kent/src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py\n")
         f.write("# Do not edit by hand - re-run the merge script and re-commit.\n\n")
         f.write("    track lrSvAll\n")
         f.write("    parent lrSv\n")
         f.write("    bigDataUrl /gbdb/$D/lrSv/lrSvAll.bb\n")
         f.write("    shortLabel All LR SVs merged\n")
         f.write("    longLabel All long-read SVs merged across subtracks "
                 "by exact position, with per-database AC\n")
         f.write("    type bigBed 9 +\n")
         f.write("    itemRgb on\n")
         f.write("    visibility pack\n")
         f.write("    mouseOver <b>$name</b> ($svType) svLen=$svLen insLen=$insLen "
                 "sources=$sources AF=$minAF-$maxAF AC=$AC\n")
-        f.write("    searchIndex name\n")
         # Source filter
         f.write("    filterValues.sources " + ",".join(src_parts) + "\n")
         f.write("    filterType.sources multipleListOr\n")
         f.write("    filterLabel.sources Source Database\n")
         # SV type
         f.write("    filterValues.svType DEL,INS,DUP,INV,CPX,MIXED,INSDEL,"
                 "CNV,BND,TRA,MEI\n")
         f.write("    filterType.svType multipleListOr\n")
         f.write("    filterLabel.svType SV Type\n")
         # Range filters
         f.write("    filter.svLen 0:30000000\n")
         f.write("    filterByRange.svLen on\n")
         f.write("    filterLabel.svLen SV Length (bp)\n")
         f.write("    filter.insLen 0:600000\n")
         f.write("    filterByRange.insLen on\n")