bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py index 13f01328ff0..277bc06d601 100755 --- src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py @@ -15,43 +15,48 @@ Pipeline: 1. Parse the #CHROM header to identify the 58 CPC sample column indices by prefix (HIFI032* or RY*). 2. Strip the "CHM13v2." contig prefix for hs1 chrom names. 3. For each VCF row, read the GT field for each CPC sample and compute CPC-only AC (count of "1" alleles), AN (count of non-missing alleles), and NS (CPC samples with at least one called allele). 4. Classify each alt allele by length delta d = len(ALT)-len(REF): d >= +50 -> INS d <= -50 -> DEL |d| < 50 and max(len) >= 50 -> CPX otherwise -> dropped (below 50bp threshold) 5. Drop rows where CPC AC == 0 (HPRC-specific alts). 6. Collapse all remaining alts sharing the same snarl ID into one output row: svType = that class if all alts agree, else MIXED - svLen = max |d| across alts - alleleCount = sum of per-alt CPC AC + svLen = reference span (chromEnd - chromStart) + insLen = max inserted-sequence length for INS alts (0 otherwise) + AC = sum of per-alt CPC AC alleleNumber = CPC AN - alleleFreq = alleleCount / alleleNumber + alleleFreq = AC / alleleNumber numSamples = CPC NS Usage: lrSvCpc1VcfToBed.py input.vcf.gz output.bed chrom.sizes """ import gzip +import os import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from lrSvCommon import svName, normalizeSvType + # Colors per SV type COLORS = { "INS": "0,0,200", # blue "DEL": "200,0,0", # red "CPX": "230,140,0", # orange "MIXED": "120,120,120", # grey } SIZE_THRESHOLD = 50 def is_cpc_sample(name): """CPC samples are HIFI032* (Chinese, 47) and RY* (Chinese, 11).""" return name.startswith("HIFI032") or name.startswith("RY") @@ -78,47 +83,53 @@ for a in gt.replace("/", "|").split("|"): if a == ".": continue an += 1 has_called = True if a == "1": ac += 1 if has_called: ns += 1 return ac, an, ns def emit(site, fout): classes = site["types"] sv_type = next(iter(classes)) if len(classes) == 1 else "MIXED" - rgb = COLORS[sv_type] + sv_type = normalizeSvType(sv_type) + rgb = COLORS.get(sv_type, "120,120,120") chrom = site["chrom"] start = site["pos0"] end = start + max(site["ref_len"], 1) af = (site["ac_sum"] / site["an"]) if site["an"] else 0.0 score = min(1000, max(0, int(round(af * 1000)))) + svLen = end - start + insLen = site["max_ins"] if sv_type == "INS" else 0 + featLen = insLen if sv_type in ("INS", "MEI") else svLen + name = svName(sv_type, featLen, site["ac_sum"]) row = [ chrom, str(start), str(end), - site["name"], + name, str(score), ".", str(start), str(end), rgb, sv_type, - str(site["svlen"]), - str(site["num_alts"]), + str(svLen), + str(insLen), str(site["ac_sum"]), + str(site["num_alts"]), str(site["an"]), f"{af:.6f}", str(site["ns"]), ] fout.write("\t".join(row) + "\n") def main(): if len(sys.argv) < 4: sys.exit("usage: lrSvCpc1VcfToBed.py input.vcf.gz output.bed chrom.sizes") in_file = sys.argv[1] out_file = sys.argv[2] sizes_file = sys.argv[3] chrom_sizes = {} @@ -184,40 +195,45 @@ continue pos0 = int(pos) - 1 key = (chrom, pos0, vid) if key != prev_key: if site is not None: emit(site, fout) flushed_sites += 1 site = { "chrom": chrom, "pos0": pos0, "ref_len": ref_len, "name": vid, "types": set(), "svlen": 0, + "max_ins": 0, "num_alts": 0, "ac_sum": 0, "an": an, "ns": ns, } prev_key = key site["types"].add(sv_type) if mag > site["svlen"]: site["svlen"] = mag + if sv_type == "INS": + d = alt_len - ref_len + if d > site["max_ins"]: + site["max_ins"] = d site["num_alts"] += 1 site["ac_sum"] += ac if an > site["an"]: site["an"] = an if ns > site["ns"]: site["ns"] = ns kept_rows += 1 if site is not None: emit(site, fout) flushed_sites += 1 print(f"kept alt rows (CPC carrier): {kept_rows}", file=sys.stderr) print(f"dropped no CPC carrier: {dropped_no_cpc_carrier}", file=sys.stderr) print(f"dropped <{SIZE_THRESHOLD}bp alt rows: {dropped_small}", file=sys.stderr)