bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/srSv/abelSvVcfToBed.py src/hg/makeDb/scripts/srSv/abelSvVcfToBed.py index 1869080093e..d54cb5bbd74 100755 --- src/hg/makeDb/scripts/srSv/abelSvVcfToBed.py +++ src/hg/makeDb/scripts/srSv/abelSvVcfToBed.py @@ -1,192 +1,222 @@ #!/usr/bin/env python3 """Convert an Abel et al. 2020 CCDG structural-variant VCF (Build38 or Build37) to the bed14+ representation used by the abelSv track. Inputs: gzipped VCF on stdin (pass through zcat). Output: tab-separated bed rows on stdout. Usage: zcat Build38.public.v2.vcf.gz | vcfToBed.py B38 > B38.bed zcat Build37.public.v2.vcf.gz | vcfToBed.py B37 > B37.prelift.bed """ import sys import gzip +import os import re +# Share the lrSv svName/normalizeSvType helpers without duplicating the module. +sys.path.insert(0, os.path.expanduser("~/kent/src/hg/makeDb/scripts/lrSv")) +from lrSvCommon import svName, normalizeSvType + # RGB colors per SV type (flat solid colors) COLORS = { "DEL": "220,50,32", # red "DUP": "0,120,200", # blue "INV": "230,140,0", # orange "MEI": "140,80,180", # purple "BND": "120,120,120", # grey } # Population codes present in INFO (per-ancestry AC/AN) POPS = ["AFR", "AMR", "NFE", "FE", "EAS", "SAS", "PI", "Other"] # Integer fields we extract and report as int INT_FIELDS = ["AC", "AN", "NSAMP", "NFAM"] FLOAT_FIELDS = ["MSQ", "AF"] def parse_info(info_str): """Parse INFO column into dict; flags become True.""" d = {} for token in info_str.split(";"): if not token: continue if "=" in token: k, v = token.split("=", 1) d[k] = v else: d[token] = True return d BND_ALT_RE = re.compile(r"([\[\]])([^:]+):(\d+)[\[\]]") def parse_bnd_alt(alt): """Extract mate chrom and position from a BND ALT string.""" m = BND_ALT_RE.search(alt) if m: return m.group(2), int(m.group(3)) return "", -1 def classify_svtype(svtype_raw): """Collapse SVTYPE string to one of DEL/DUP/INV/MEI/BND. The VCF has composite types like that should map to MEI (mobile-element-insertion-derived deletion).""" if svtype_raw == "BND": return "BND" if svtype_raw.startswith("DEL"): if ":ME" in svtype_raw: return "MEI" return "DEL" if svtype_raw == "MEI" or svtype_raw.startswith("INS:ME") or svtype_raw.startswith("ME"): return "MEI" if svtype_raw.startswith("DUP"): return "DUP" if svtype_raw.startswith("INV"): return "INV" return svtype_raw def get_alt_svtype(alt): """ALT column often holds the detailed SV type for non-BND, e.g. .""" alt = alt.strip() if alt.startswith("<") and alt.endswith(">"): return alt[1:-1] return "" def main(): if len(sys.argv) < 2 or sys.argv[1] not in ("B38", "B37", "B37lift"): sys.exit("usage: vcfToBed.py B38|B37|B37lift") tag = sys.argv[1] # Will we need the chr-prefix? B37 uses bare names. add_chr = (tag == "B37") out_tag = "B37lift" if tag == "B37" else tag for line in sys.stdin: if line.startswith("#"): continue f = line.rstrip("\n").split("\t") chrom, pos, vid, ref, alt, qual, filt, info = f[:8] info_d = parse_info(info) # Skip secondary BND records; we keep only the primary. if info_d.get("SECONDARY"): continue svtype_raw = info_d.get("SVTYPE", "") # Also check ALT for richer subtype info alt_sv = get_alt_svtype(alt) svclass = classify_svtype(alt_sv or svtype_raw) if svclass not in COLORS: continue # skip unknown types + # Canonicalize via lrSvCommon (DEL/DUP/INV/MEI/BND are already canonical). + svType = normalizeSvType(svclass) + start = int(pos) - 1 # VCF is 1-based mate_chrom = "" mate_pos = -1 sv_length = -1 end = start + 1 # default single-width if svclass == "BND": mate_chrom, mate_pos = parse_bnd_alt(alt) if add_chr and mate_chrom and not mate_chrom.startswith("chr"): mate_chrom = "chr" + mate_chrom end = start + 1 else: if "END" in info_d: end = int(info_d["END"]) if "SVLEN" in info_d: try: # SVLEN can be comma-list or negative sv_length = int(info_d["SVLEN"].split(",")[0]) except ValueError: sv_length = -1 if end <= start: end = start + 1 if add_chr and not chrom.startswith("chr"): chrom = "chr" + chrom # Numeric fields ac = int(info_d.get("AC", "0").split(",")[0]) an = int(info_d.get("AN", "0")) if "AF" in info_d: try: af = float(info_d["AF"].split(",")[0]) except ValueError: af = (ac / an) if an else 0.0 else: af = (ac / an) if an else 0.0 try: msq = float(info_d.get("MSQ", "0")) except ValueError: msq = 0.0 nsamp = int(info_d.get("NSAMP", "0")) nfam = int(info_d.get("NFAM", "0")) # Score for bigBed: scale AF to 0-1000 score = min(1000, max(0, int(round(af * 1000)))) # Per-population AC/AN (missing -> 0) pop_vals = [] for pop in POPS: pop_ac = int(info_d.get(f"AC_{pop}", "0").split(",")[0]) pop_an = int(info_d.get(f"AN_{pop}", "0")) pop_vals.append((pop_ac, pop_an)) + # svLen canonical = reference span; keep legacy abs(SVLEN) as the displayed + # length too by writing it into svLen when non-BND (matches legacy abelSv + # filter range). For BND, use -1 (a sentinel preserved in the as schema). + if svclass == "BND": + svLen = -1 + insLen = 0 + else: + svLen = end - start + if svclass == "MEI": + # For MEI, abs(SVLEN) is the inserted-element size. + # abelSv VCF reports MEI with negative SVLEN (e.g. -303), + # so test != 0 rather than > 0. + insLen = abs(sv_length) if sv_length != 0 and sv_length != -1 else 0 + else: + insLen = 0 + rgb = COLORS[svclass] + + featLen = insLen if svType in ("INS", "MEI") else (svLen if svLen > 0 else None) + name = svName(svType, featLen, ac) + row = [ chrom, str(start), str(end), - vid, + name, str(score), ".", str(start), str(end), rgb, - svclass, - str(sv_length), + svType, + str(svLen), + str(insLen), + str(ac), out_tag, filt, - str(ac), str(an), + str(an), f"{af:.6f}", f"{msq:.2f}", str(nsamp), str(nfam), mate_chrom, str(mate_pos), ] # Append per-pop AC/AN in the order: AFR, AMR, NFE, FE, EAS, SAS, PI, Other for ac_p, an_p in pop_vals: row.append(str(ac_p)) row.append(str(an_p)) print("\t".join(row)) if __name__ == "__main__": main()