bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/srSv/abelSvVcfToBed.py src/hg/makeDb/scripts/srSv/abelSvVcfToBed.py index 1869080093e..d54cb5bbd74 100755 --- src/hg/makeDb/scripts/srSv/abelSvVcfToBed.py +++ src/hg/makeDb/scripts/srSv/abelSvVcfToBed.py @@ -1,30 +1,35 @@ #!/usr/bin/env python3 """Convert an Abel et al. 2020 CCDG structural-variant VCF (Build38 or Build37) to the bed14+ representation used by the abelSv track. Inputs: gzipped VCF on stdin (pass through zcat). Output: tab-separated bed rows on stdout. Usage: zcat Build38.public.v2.vcf.gz | vcfToBed.py B38 > B38.bed zcat Build37.public.v2.vcf.gz | vcfToBed.py B37 > B37.prelift.bed """ import sys import gzip +import os import re +# Share the lrSv svName/normalizeSvType helpers without duplicating the module. +sys.path.insert(0, os.path.expanduser("~/kent/src/hg/makeDb/scripts/lrSv")) +from lrSvCommon import svName, normalizeSvType + # RGB colors per SV type (flat solid colors) COLORS = { "DEL": "220,50,32", # red "DUP": "0,120,200", # blue "INV": "230,140,0", # orange "MEI": "140,80,180", # purple "BND": "120,120,120", # grey } # Population codes present in INFO (per-ancestry AC/AN) POPS = ["AFR", "AMR", "NFE", "FE", "EAS", "SAS", "PI", "Other"] # Integer fields we extract and report as int INT_FIELDS = ["AC", "AN", "NSAMP", "NFAM"] FLOAT_FIELDS = ["MSQ", "AF"] @@ -97,30 +102,33 @@ chrom, pos, vid, ref, alt, qual, filt, info = f[:8] info_d = parse_info(info) # Skip secondary BND records; we keep only the primary. if info_d.get("SECONDARY"): continue svtype_raw = info_d.get("SVTYPE", "") # Also check ALT for richer subtype info alt_sv = get_alt_svtype(alt) svclass = classify_svtype(alt_sv or svtype_raw) if svclass not in COLORS: continue # skip unknown types + # Canonicalize via lrSvCommon (DEL/DUP/INV/MEI/BND are already canonical). + svType = normalizeSvType(svclass) + start = int(pos) - 1 # VCF is 1-based mate_chrom = "" mate_pos = -1 sv_length = -1 end = start + 1 # default single-width if svclass == "BND": mate_chrom, mate_pos = parse_bnd_alt(alt) if add_chr and mate_chrom and not mate_chrom.startswith("chr"): mate_chrom = "chr" + mate_chrom end = start + 1 else: if "END" in info_d: end = int(info_d["END"]) if "SVLEN" in info_d: @@ -150,43 +158,65 @@ except ValueError: msq = 0.0 nsamp = int(info_d.get("NSAMP", "0")) nfam = int(info_d.get("NFAM", "0")) # Score for bigBed: scale AF to 0-1000 score = min(1000, max(0, int(round(af * 1000)))) # Per-population AC/AN (missing -> 0) pop_vals = [] for pop in POPS: pop_ac = int(info_d.get(f"AC_{pop}", "0").split(",")[0]) pop_an = int(info_d.get(f"AN_{pop}", "0")) pop_vals.append((pop_ac, pop_an)) + # svLen canonical = reference span; keep legacy abs(SVLEN) as the displayed + # length too by writing it into svLen when non-BND (matches legacy abelSv + # filter range). For BND, use -1 (a sentinel preserved in the as schema). + if svclass == "BND": + svLen = -1 + insLen = 0 + else: + svLen = end - start + if svclass == "MEI": + # For MEI, abs(SVLEN) is the inserted-element size. + # abelSv VCF reports MEI with negative SVLEN (e.g. -303), + # so test != 0 rather than > 0. + insLen = abs(sv_length) if sv_length != 0 and sv_length != -1 else 0 + else: + insLen = 0 + rgb = COLORS[svclass] + + featLen = insLen if svType in ("INS", "MEI") else (svLen if svLen > 0 else None) + name = svName(svType, featLen, ac) + row = [ chrom, str(start), str(end), - vid, + name, str(score), ".", str(start), str(end), rgb, - svclass, - str(sv_length), + svType, + str(svLen), + str(insLen), + str(ac), out_tag, filt, - str(ac), str(an), + str(an), f"{af:.6f}", f"{msq:.2f}", str(nsamp), str(nfam), mate_chrom, str(mate_pos), ] # Append per-pop AC/AN in the order: AFR, AMR, NFE, FE, EAS, SAS, PI, Other for ac_p, an_p in pop_vals: row.append(str(ac_p)) row.append(str(an_p)) print("\t".join(row)) if __name__ == "__main__": main()