9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py index 49d7d1aa535..1c20d1223eb 100644 --- src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py +++ src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py @@ -10,45 +10,34 @@ Usage: python3 lrSvMergeAll.py python3 lrSvMergeAll.py --region chr22 # quick test on one chromosome """ import argparse import os import re import subprocess import sys from collections import OrderedDict, defaultdict from multiprocessing import Pool SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__)) -ALL_CHROMOSOMES = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY", "chrM"] +sys.path.insert(0, SCRIPTS_DIR) +from lrSvCommon import svColor -SVTYPE_COLOR = { - "DEL": "200,0,0", - "INS": "0,0,200", - "DUP": "0,150,0", - "INV": "200,100,0", - "CPX": "150,0,150", - "MIXED": "150,0,150", - "INSDEL": "100,100,150", - "BND": "100,100,100", - "TRA": "100,100,100", - "MEI": "0,100,200", - "CNV": "0,150,150", -} +ALL_CHROMOSOMES = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY", "chrM"] # --------------------------------------------------------------------------- # Config + autoSql parsing # --------------------------------------------------------------------------- def sanitize_id(s): """Convert an arbitrary key to a valid autoSql / C identifier. Non-alphanumeric chars become underscores; leading digit gets a 'd' prefix. """ out = re.sub(r'[^A-Za-z0-9_]', '_', s) if out and out[0].isdigit(): out = 'd' + out return out @@ -291,60 +280,71 @@ # Take max - same site shouldn't have varying sample counts but be safe try: v = max(int(pval) if pval.isdigit() else 0, int(nval) if nval.isdigit() else 0) return (str(v), af, pins) except ValueError: return (pval or nval, af, pins) # Numeric AC: sum return (_add_int(pval, nval), af, pins) def merge_chrom(args): chrom, dbs, extract_dir, out_dir = args # variant_key (start, end, svType, svLen, insLen) -> {db_key: (value, af, insLen)} variants = defaultdict(dict) + # (key, db_key) -> set of (value, af) already incorporated, so we only sum + # AC across genuinely distinct records and never across byte-identical + # duplicate rows (which would inflate the per-database and total AC). + seen_combo = defaultdict(set) n_in = 0 + n_dup = 0 db_by_key = {db["key"]: db for db in dbs} for db in dbs: path = os.path.join(extract_dir, db["key"], f"{chrom}.tsv") if not os.path.exists(path): continue with open(path) as f: for line in f: parts = line.rstrip("\n").split("\t") if len(parts) < 7: continue start, end, svType, svLen, insLen, value, af = parts # Use insLen=0 for non-INS to avoid spurious key splits if svType not in ("INS",): insLen_key = "0" else: insLen_key = insLen try: key = (int(start), int(end), svType, int(svLen), int(insLen_key)) except ValueError: continue n_in += 1 + combo = (value, af) if db["key"] in variants[key]: + if combo in seen_combo[(key, db["key"])]: + # identical record already counted for this db; skip + n_dup += 1 + continue variants[key][db["key"]] = _combine( db, variants[key][db["key"]], (value, af, insLen)) else: variants[key][db["key"]] = (value, af, insLen) + seen_combo[(key, db["key"])].add(combo) out_path = os.path.join(out_dir, f"{chrom}.bed") n_out = 0 n_split_aff = 0 # variants with at least one Kim PD affected AC with open(out_path, "w") as out: for key in sorted(variants.keys()): start, end, svType, svLen, _insLen_key, = key per_db = variants[key] sources = sorted(per_db.keys()) source_count = len(sources) # Compute totalAC: sum over numeric values from regular dbs total_ac = 0 @@ -385,31 +385,31 @@ if f_af > 0 and (min_af is None or f_af < min_af): min_af = f_af score = min(1000, int(max_af * 1000)) if max_af > 0 else 0 # Use the original insLen from any contributing INS record ins_len_out = "0" for db in dbs: if db["key"] in per_db: _v, _af, ins = per_db[db["key"]] ins_len_out = ins break short = svType[:6] name = f"{short}-{svLen}:{source_count}" - color = SVTYPE_COLOR.get(svType, "128,128,128") + color = svColor(svType) fields = [ chrom, str(start), str(end), name, str(score), ".", str(start), str(end), color, svType, str(svLen), ins_len_out, svType, f"{max_af:.6g}" if max_af > 0 else "0", f"{min_af:.6g}" if min_af is not None else "0", str(total_ac), str(source_count), ",".join(sources), ] # Per-database value columns (SPLIT contributes 2 columns) for db in dbs: if db["valueField"] == "SPLIT": @@ -417,31 +417,32 @@ value, _af, _i = per_db[db["key"]] aff, heal = value.split("|", 1) if aff: n_split_aff += 1 else: aff, heal = "", "" fields.extend([aff, heal]) else: value = per_db.get(db["key"], ("", "", ""))[0] fields.append(value) out.write("\t".join(fields) + "\n") n_out += 1 print(f" {chrom}: {n_in:,} input rows -> {n_out:,} unique variants " - f"({n_in - n_out:,} merged)", file=sys.stderr) + f"({n_in - n_out:,} merged; {n_dup:,} identical dup rows skipped)", + file=sys.stderr) return out_path, n_out # --------------------------------------------------------------------------- # AutoSql + trackDb fragment generation # --------------------------------------------------------------------------- def write_autosql(out_path, dbs): with open(out_path, "w") as f: f.write('table lrSvAll\n') f.write('"Combined long-read structural variants from all lrSv subtracks"\n') f.write('(\n') # BED9 f.write(' string chrom; "Chromosome"\n') f.write(' uint chromStart; "Start position (0-based)"\n') @@ -494,31 +495,30 @@ src_parts = [f"{db['key']}|{db['label'].replace(',', '')}" for db in dbs] with open(out_path, "w") as f: f.write("# AUTO-GENERATED by ~/kent/src/hg/makeDb/scripts/lrSv/lrSvMergeAll.py\n") f.write("# Do not edit by hand - re-run the merge script and re-commit.\n\n") f.write(" track lrSvAll\n") f.write(" parent lrSv\n") f.write(" bigDataUrl /gbdb/$D/lrSv/lrSvAll.bb\n") f.write(" shortLabel All LR SVs merged\n") f.write(" longLabel All long-read SVs merged across subtracks " "by exact position, with per-database AC\n") f.write(" type bigBed 9 +\n") f.write(" itemRgb on\n") f.write(" visibility pack\n") f.write(" mouseOver <b>$name</b> ($svType) svLen=$svLen insLen=$insLen " "sources=$sources AF=$minAF-$maxAF AC=$AC\n") - f.write(" searchIndex name\n") # Source filter f.write(" filterValues.sources " + ",".join(src_parts) + "\n") f.write(" filterType.sources multipleListOr\n") f.write(" filterLabel.sources Source Database\n") # SV type f.write(" filterValues.svType DEL,INS,DUP,INV,CPX,MIXED,INSDEL," "CNV,BND,TRA,MEI\n") f.write(" filterType.svType multipleListOr\n") f.write(" filterLabel.svType SV Type\n") # Range filters f.write(" filter.svLen 0:30000000\n") f.write(" filterByRange.svLen on\n") f.write(" filterLabel.svLen SV Length (bp)\n") f.write(" filter.insLen 0:600000\n") f.write(" filterByRange.insLen on\n")