bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvAprVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvAprVcfToBed.py index 51a4a75bdd4..78b439bae4e 100755 --- src/hg/makeDb/scripts/lrSv/lrSvAprVcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvAprVcfToBed.py @@ -1,169 +1,190 @@ #!/usr/bin/env python3 """Convert Arabic Pangenome Reference (APR) pangenome VCF on T2T-CHM13v2 to a bed9+7 table for the lrSv/aprSv track. Input VCF characteristics (unlike the CPC VCF this one is NOT decomposed): * contigs are named "chr1", "chr2", ... with CHM13v2 lengths * variant IDs are graph snarl IDs like "<951452<1012008" * REF/ALT are explicit sequences * each snarl appears on a single row, with multi-allelic ALT (comma-separated). INFO AC/AF are comma lists of the same length. This script: 1. iterates the comma-separated alt alleles within each VCF row 2. classifies each alt by length delta d = len(ALT)-len(REF): d >= +50 -> INS d <= -50 -> DEL |d| < 50 and max(len) >= 50 -> CPX otherwise -> dropped 3. emits one bed row per snarl (row) with alts merged: svType = agreed class or MIXED if alts differ - svLen = max |d| across passing alts - alleleCount = sum of AC values for passing alts + svLen = reference span (chromEnd - chromStart) + insLen = max inserted-sequence length across passing INS alts (0 otherwise) + AC = sum of AC values for passing alts alleleNumber = AN (constant) - alleleFreq = alleleCount / alleleNumber + alleleFreq = AC / alleleNumber Rows with zero passing alts are skipped. Usage: lrSvAprVcfToBed.py input.vcf.gz output.bed chrom.sizes """ import gzip +import os import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from lrSvCommon import svName, normalizeSvType + COLORS = { "INS": "0,0,200", # blue "DEL": "200,0,0", # red "CPX": "230,140,0", # orange "MIXED": "120,120,120", # grey } SIZE_THRESHOLD = 50 def parse_info(info_str): d = {} for token in info_str.split(";"): if not token: continue if "=" in token: k, v = token.split("=", 1) d[k] = v else: d[token] = True return d def classify(ref_len, alt_len): d = alt_len - ref_len if d >= SIZE_THRESHOLD: return "INS", d if d <= -SIZE_THRESHOLD: return "DEL", -d if max(ref_len, alt_len) >= SIZE_THRESHOLD: return "CPX", max(ref_len, alt_len) return None, 0 def _int(val, default=0): try: return int(str(val)) except (ValueError, TypeError): return default def main(): if len(sys.argv) < 4: sys.exit("usage: lrSvAprVcfToBed.py input.vcf.gz output.bed chrom.sizes") in_file = sys.argv[1] out_file = sys.argv[2] sizes_file = sys.argv[3] chrom_sizes = {} with open(sizes_file) as f: for line in f: c, s = line.strip().split("\t") chrom_sizes[c] = int(s) opener = gzip.open if in_file.endswith(".gz") else open emitted = 0 skipped_no_sv_alt = 0 skipped_chrom = 0 total_alts_seen = 0 with opener(in_file, "rt") as fin, open(out_file, "w") as fout: for line in fin: if line.startswith("#"): continue f = line.rstrip("\n").split("\t", 8) chrom, pos, vid, ref, alt, qual, filt, info = f[:8] if chrom not in chrom_sizes: skipped_chrom += 1 continue alts = alt.split(",") total_alts_seen += len(alts) info_d = parse_info(info) ac_list = info_d.get("AC", "").split(",") af_list = info_d.get("AF", "").split(",") an = _int(info_d.get("AN", "0")) ns = _int(info_d.get("NS", "0")) ref_len = len(ref) types = set() max_mag = 0 + max_ins = 0 ac_sum = 0 num_pass = 0 for i, alt_seq in enumerate(alts): sv_type, mag = classify(ref_len, len(alt_seq)) if sv_type is None: continue types.add(sv_type) if mag > max_mag: max_mag = mag + if sv_type == "INS": + d = len(alt_seq) - ref_len + if d > max_ins: + max_ins = d if i < len(ac_list): ac_sum += _int(ac_list[i]) num_pass += 1 if num_pass == 0: skipped_no_sv_alt += 1 continue sv_type = next(iter(types)) if len(types) == 1 else "MIXED" - rgb = COLORS[sv_type] + sv_type = normalizeSvType(sv_type) + rgb = COLORS.get(sv_type, "120,120,120") pos0 = int(pos) - 1 start = pos0 end = start + max(ref_len, 1) af = (ac_sum / an) if an else 0.0 score = min(1000, max(0, int(round(af * 1000)))) + svLen = end - start + if sv_type == "INS": + insLen = max_ins + else: + insLen = 0 + + featLen = insLen if sv_type in ("INS", "MEI") else svLen + name = svName(sv_type, featLen, ac_sum) + row = [ chrom, str(start), str(end), - vid, + name, str(score), ".", str(start), str(end), rgb, sv_type, - str(max_mag), - str(num_pass), + str(svLen), + str(insLen), str(ac_sum), + str(num_pass), str(an), f"{af:.6f}", str(ns), ] fout.write("\t".join(row) + "\n") emitted += 1 print(f"total alt alleles seen: {total_alts_seen}", file=sys.stderr) print(f"emitted sites (>=1 SV alt): {emitted}", file=sys.stderr) print(f"skipped rows (no SV alt): {skipped_no_sv_alt}", file=sys.stderr) print(f"skipped rows (bad chrom): {skipped_chrom}", file=sys.stderr) if __name__ == "__main__": main()