4d9c385e230b4cc8bd55815aa7ba37dea97d4c0a max Mon Apr 20 08:49:42 2026 -0700 abelSv: new track for CCDG 17,795 structural variants (hg38) Add site-frequency track for the Abel et al. 2020 Nature SV callset (PMID 32460305): 737,998 SVs from 17,795 deeply sequenced genomes. Includes the B38 native callset (14,623 samples) and B37 lifted to hg38 (8,417 samples). Track is colored by SV type (DEL/DUP/INV/MEI/BND) with filters for type, callset, filter status, AC, AF, SV length, and MSQ. BND secondary mate records are suppressed so translocation pairs appear only once. refs #36258 diff --git src/hg/makeDb/scripts/abelSv/vcfToBed.py src/hg/makeDb/scripts/abelSv/vcfToBed.py new file mode 100755 index 00000000000..1869080093e --- /dev/null +++ src/hg/makeDb/scripts/abelSv/vcfToBed.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +"""Convert an Abel et al. 2020 CCDG structural-variant VCF (Build38 or Build37) +to the bed14+ representation used by the abelSv track. + +Inputs: gzipped VCF on stdin (pass through zcat). +Output: tab-separated bed rows on stdout. + +Usage: + zcat Build38.public.v2.vcf.gz | vcfToBed.py B38 > B38.bed + zcat Build37.public.v2.vcf.gz | vcfToBed.py B37 > B37.prelift.bed +""" +import sys +import gzip +import re + +# RGB colors per SV type (flat solid colors) +COLORS = { + "DEL": "220,50,32", # red + "DUP": "0,120,200", # blue + "INV": "230,140,0", # orange + "MEI": "140,80,180", # purple + "BND": "120,120,120", # grey +} + +# Population codes present in INFO (per-ancestry AC/AN) +POPS = ["AFR", "AMR", "NFE", "FE", "EAS", "SAS", "PI", "Other"] + +# Integer fields we extract and report as int +INT_FIELDS = ["AC", "AN", "NSAMP", "NFAM"] +FLOAT_FIELDS = ["MSQ", "AF"] + + +def parse_info(info_str): + """Parse INFO column into dict; flags become True.""" + d = {} + for token in info_str.split(";"): + if not token: + continue + if "=" in token: + k, v = token.split("=", 1) + d[k] = v + else: + d[token] = True + return d + + +BND_ALT_RE = re.compile(r"([\[\]])([^:]+):(\d+)[\[\]]") + + +def parse_bnd_alt(alt): + """Extract mate chrom and position from a BND ALT string.""" + m = BND_ALT_RE.search(alt) + if m: + return m.group(2), int(m.group(3)) + return "", -1 + + +def classify_svtype(svtype_raw): + """Collapse SVTYPE string to one of DEL/DUP/INV/MEI/BND. + The VCF has composite types like that should + map to MEI (mobile-element-insertion-derived deletion).""" + if svtype_raw == "BND": + return "BND" + if svtype_raw.startswith("DEL"): + if ":ME" in svtype_raw: + return "MEI" + return "DEL" + if svtype_raw == "MEI" or svtype_raw.startswith("INS:ME") or svtype_raw.startswith("ME"): + return "MEI" + if svtype_raw.startswith("DUP"): + return "DUP" + if svtype_raw.startswith("INV"): + return "INV" + return svtype_raw + + +def get_alt_svtype(alt): + """ALT column often holds the detailed SV type for non-BND, e.g. .""" + alt = alt.strip() + if alt.startswith("<") and alt.endswith(">"): + return alt[1:-1] + return "" + + +def main(): + if len(sys.argv) < 2 or sys.argv[1] not in ("B38", "B37", "B37lift"): + sys.exit("usage: vcfToBed.py B38|B37|B37lift") + tag = sys.argv[1] + # Will we need the chr-prefix? B37 uses bare names. + add_chr = (tag == "B37") + out_tag = "B37lift" if tag == "B37" else tag + + for line in sys.stdin: + if line.startswith("#"): + continue + f = line.rstrip("\n").split("\t") + chrom, pos, vid, ref, alt, qual, filt, info = f[:8] + info_d = parse_info(info) + + # Skip secondary BND records; we keep only the primary. + if info_d.get("SECONDARY"): + continue + + svtype_raw = info_d.get("SVTYPE", "") + # Also check ALT for richer subtype info + alt_sv = get_alt_svtype(alt) + svclass = classify_svtype(alt_sv or svtype_raw) + + if svclass not in COLORS: + continue # skip unknown types + + start = int(pos) - 1 # VCF is 1-based + mate_chrom = "" + mate_pos = -1 + sv_length = -1 + end = start + 1 # default single-width + + if svclass == "BND": + mate_chrom, mate_pos = parse_bnd_alt(alt) + if add_chr and mate_chrom and not mate_chrom.startswith("chr"): + mate_chrom = "chr" + mate_chrom + end = start + 1 + else: + if "END" in info_d: + end = int(info_d["END"]) + if "SVLEN" in info_d: + try: + # SVLEN can be comma-list or negative + sv_length = int(info_d["SVLEN"].split(",")[0]) + except ValueError: + sv_length = -1 + if end <= start: + end = start + 1 + + if add_chr and not chrom.startswith("chr"): + chrom = "chr" + chrom + + # Numeric fields + ac = int(info_d.get("AC", "0").split(",")[0]) + an = int(info_d.get("AN", "0")) + if "AF" in info_d: + try: + af = float(info_d["AF"].split(",")[0]) + except ValueError: + af = (ac / an) if an else 0.0 + else: + af = (ac / an) if an else 0.0 + try: + msq = float(info_d.get("MSQ", "0")) + except ValueError: + msq = 0.0 + nsamp = int(info_d.get("NSAMP", "0")) + nfam = int(info_d.get("NFAM", "0")) + + # Score for bigBed: scale AF to 0-1000 + score = min(1000, max(0, int(round(af * 1000)))) + + # Per-population AC/AN (missing -> 0) + pop_vals = [] + for pop in POPS: + pop_ac = int(info_d.get(f"AC_{pop}", "0").split(",")[0]) + pop_an = int(info_d.get(f"AN_{pop}", "0")) + pop_vals.append((pop_ac, pop_an)) + + rgb = COLORS[svclass] + row = [ + chrom, str(start), str(end), + vid, + str(score), + ".", + str(start), str(end), + rgb, + svclass, + str(sv_length), + out_tag, + filt, + str(ac), str(an), + f"{af:.6f}", + f"{msq:.2f}", + str(nsamp), str(nfam), + mate_chrom, str(mate_pos), + ] + # Append per-pop AC/AN in the order: AFR, AMR, NFE, FE, EAS, SAS, PI, Other + for ac_p, an_p in pop_vals: + row.append(str(ac_p)) + row.append(str(an_p)) + + print("\t".join(row)) + + +if __name__ == "__main__": + main()