5e4ca58df1b5bfe554fe5cc3309a39736ca256ee max Tue Apr 21 08:08:52 2026 -0700 cpc1Sv: restrict to the 58 CPC samples, drop HPRC-specific SVs Rewrite lrSvCpc1VcfToBed.py to identify the 58 CPC sample columns by name prefix (HIFI032* or RY*), recompute AC/AN/NS from those GT columns only, and skip any snarl that no CPC sample carries. The HPRC portion is already represented elsewhere in lrSv, so this keeps the track population-consistent with its label. Rebuild results: 46,092 snarl sites on hs1 (down from 97,205 when combined with HPRC), 36,030 lifted to hg38 (down from 81,261; 10,062 unmapped). Updates cpc1Sv.html, lrSv.ra labels, and the makeDoc. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py index 5faecd7ce9c..13f01328ff0 100755 --- src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py @@ -1,204 +1,229 @@ #!/usr/bin/env python3 """Convert the CPC+HPRC Phase 1 pangenome VCF (on T2T-CHM13v2) to a -bed9+7 table for the lrSv/cpc1Sv track. +bed9+7 table for the lrSv/cpc1Sv track, RECOMPUTING AC/AN/NS using only +the 58 CPC (Chinese Pangenome Consortium) sample columns and dropping +snarls with no CPC carriers. The input VCF is produced by vcfwave/bcftools norm from a pangenome graph, so: * contigs are named "CHM13v2.chrN" * variant IDs are graph snarl traversals like ">2541>2547" * REF and ALT are always explicit sequences (no symbolic ALTs) * each snarl may appear as several rows, one per decomposed alt allele - -This script: - 1. strips the "CHM13v2." contig prefix (hs1 chrom names) - 2. classifies each alt allele by length delta d = len(ALT)-len(REF): + * the 105 sample columns are 47 HPRC (HG*, NA*) + 58 CPC (HIFI032*, RY*) + +Pipeline: + 1. Parse the #CHROM header to identify the 58 CPC sample column indices + by prefix (HIFI032* or RY*). + 2. Strip the "CHM13v2." contig prefix for hs1 chrom names. + 3. 
# itemRgb colors keyed by SV class.
COLORS = {
    "INS": "0,0,200",  # blue
    "DEL": "200,0,0",  # red
    "CPX": "230,140,0",  # orange
    "MIXED": "120,120,120",  # grey
}

# Minimum |length delta| in bp for an alt allele to count as a structural variant.
SIZE_THRESHOLD = 50


def is_cpc_sample(name):
    """True for the 58 CPC sample columns: HIFI032* (47) plus RY* (11)."""
    return name.startswith(("HIFI032", "RY"))


def classify(ref_len, alt_len):
    """Classify one decomposed alt allele by its REF/ALT lengths.

    Returns (svType, magnitude):
      ("INS", d)       when the allele grows by >= SIZE_THRESHOLD bp
      ("DEL", -d)      when it shrinks by >= SIZE_THRESHOLD bp
      ("CPX", longest) when |d| < SIZE_THRESHOLD but either side is
                       >= SIZE_THRESHOLD bp (balanced complex change)
      (None, 0)        for sub-threshold alleles (caller drops these)
    """
    delta = alt_len - ref_len
    longest = max(ref_len, alt_len)
    if delta >= SIZE_THRESHOLD:
        return "INS", delta
    if -delta >= SIZE_THRESHOLD:
        return "DEL", -delta
    if longest >= SIZE_THRESHOLD:
        return "CPX", longest
    return None, 0
"1|0", ".|1", ".|.", "0/1", etc. + has_called = False + for a in gt.replace("/", "|").split("|"): + if a == ".": + continue + an += 1 + has_called = True + if a == "1": + ac += 1 + if has_called: + ns += 1 + return ac, an, ns + + def emit(site, fout): - """site is dict: chrom, pos0, ref_len, name, types (set), svlen, - ac_sum, an, ns.""" classes = site["types"] - if len(classes) == 1: - sv_type = next(iter(classes)) - else: - sv_type = "MIXED" + sv_type = next(iter(classes)) if len(classes) == 1 else "MIXED" rgb = COLORS[sv_type] chrom = site["chrom"] start = site["pos0"] end = start + max(site["ref_len"], 1) af = (site["ac_sum"] / site["an"]) if site["an"] else 0.0 score = min(1000, max(0, int(round(af * 1000)))) row = [ chrom, str(start), str(end), site["name"], str(score), ".", str(start), str(end), rgb, sv_type, str(site["svlen"]), str(site["num_alts"]), str(site["ac_sum"]), str(site["an"]), f"{af:.6f}", str(site["ns"]), ] fout.write("\t".join(row) + "\n") def main(): if len(sys.argv) < 4: sys.exit("usage: lrSvCpc1VcfToBed.py input.vcf.gz output.bed chrom.sizes") in_file = sys.argv[1] out_file = sys.argv[2] sizes_file = sys.argv[3] - # Load chrom sizes for validation / clipping chrom_sizes = {} with open(sizes_file) as f: for line in f: c, s = line.strip().split("\t") chrom_sizes[c] = int(s) opener = gzip.open if in_file.endswith(".gz") else open - # Accumulator: key = (chrom, pos0, snarl_id) - # We stream the file and flush a site whenever we encounter a new key, - # because the VCF is sorted and decomposed alts of the same snarl are - # adjacent. 
def main():
    """CLI entry point: convert the pangenome VCF to bed9+7 (module docstring).

    argv: input.vcf[.gz] output.bed chrom.sizes

    Streams the sorted VCF, grouping adjacent decomposed alt rows that
    share (chrom, pos0, snarl id) into one output site.
    """
    if len(sys.argv) < 4:
        sys.exit("usage: lrSvCpc1VcfToBed.py input.vcf.gz output.bed chrom.sizes")
    in_file = sys.argv[1]
    out_file = sys.argv[2]
    sizes_file = sys.argv[3]

    # chrom name -> size; used to filter rows to known hs1 chroms.
    chrom_sizes = {}
    with open(sizes_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (previously a ValueError)
            c, s = line.split("\t")[:2]
            chrom_sizes[c] = int(s)

    opener = gzip.open if in_file.endswith(".gz") else open

    cpc_col_idx = None  # CPC sample column indices, set at the #CHROM header
    prev_key = None     # (chrom, pos0, snarl id) of the site being accumulated
    site = None
    kept_rows = 0
    dropped_small = 0
    dropped_no_cpc_carrier = 0
    dropped_chrom = 0
    flushed_sites = 0

    with opener(in_file, "rt") as fin, open(out_file, "w") as fout:
        for line in fin:
            if line.startswith("##"):
                continue
            if line.startswith("#CHROM"):
                # Sample columns start at index 9; pick out the CPC ones.
                cols = line.rstrip("\n").split("\t")
                sample_names = cols[9:]
                cpc_col_idx = [
                    9 + i for i, n in enumerate(sample_names) if is_cpc_sample(n)
                ]
                print(f"CPC samples found: {len(cpc_col_idx)}", file=sys.stderr)
                if len(cpc_col_idx) == 0:
                    sys.exit("ERROR: no CPC sample columns (HIFI032* or RY*) detected")
                continue

            if cpc_col_idx is None:
                sys.exit("ERROR: data line before #CHROM header")

            fields = line.rstrip("\n").split("\t")
            chrom_raw, pos, vid, ref, alt = fields[0], fields[1], fields[2], fields[3], fields[4]

            # Graph contigs are "CHM13v2.chrN"; hs1 wants bare "chrN".
            if chrom_raw.startswith("CHM13v2."):
                chrom = chrom_raw[len("CHM13v2."):]
            else:
                chrom = chrom_raw
            if chrom not in chrom_sizes:
                dropped_chrom += 1
                continue

            ref_len = len(ref)
            alt_len = len(alt)
            sv_type, mag = classify(ref_len, alt_len)
            if sv_type is None:
                dropped_small += 1
                continue

            # CPC-only AC/AN/NS; drop HPRC-specific alts (no CPC carrier).
            gt_cols = [fields[i] for i in cpc_col_idx]
            ac, an, ns = compute_cpc_counts(gt_cols)
            if ac == 0:
                dropped_no_cpc_carrier += 1
                continue

            pos0 = int(pos) - 1
            key = (chrom, pos0, vid)
            if key != prev_key:
                # New snarl: flush the previous site (VCF is sorted, so
                # decomposed alts of one snarl are adjacent).
                if site is not None:
                    emit(site, fout)
                    flushed_sites += 1
                site = {
                    "chrom": chrom,
                    "pos0": pos0,
                    "ref_len": ref_len,
                    "name": vid,
                    "types": set(),
                    "svlen": 0,
                    "num_alts": 0,
                    "ac_sum": 0,
                    "an": an,
                    "ns": ns,
                }
                prev_key = key

            site["types"].add(sv_type)
            if mag > site["svlen"]:
                site["svlen"] = mag
            # vcfwave-normalized alts of one snarl can carry different REFs;
            # span the longest so the BED interval covers every alt (the max,
            # like AN/NS below, rather than whichever kept alt came first).
            if ref_len > site["ref_len"]:
                site["ref_len"] = ref_len
            site["num_alts"] += 1
            site["ac_sum"] += ac
            if an > site["an"]:
                site["an"] = an
            if ns > site["ns"]:
                site["ns"] = ns
            kept_rows += 1

        # Flush the final accumulated site.
        if site is not None:
            emit(site, fout)
            flushed_sites += 1

    print(f"kept alt rows (CPC carrier): {kept_rows}", file=sys.stderr)
    print(f"dropped no CPC carrier: {dropped_no_cpc_carrier}", file=sys.stderr)
    print(f"dropped <{SIZE_THRESHOLD}bp alt rows: {dropped_small}", file=sys.stderr)
    print(f"dropped bad chrom: {dropped_chrom}", file=sys.stderr)
    print(f"output sites: {flushed_sites}", file=sys.stderr)


if __name__ == "__main__":
    main()