7594507ca126d5242346787e42e13c52ea7709b1 max Fri Apr 17 08:40:31 2026 -0700 Add lrSv supertrack: long-read structural variants from 9 studies (hg38). #Preview2 week - bugs introduced now will need a build patch to fix Sub-tracks (all bigBed 9+): han945Sv - 945 Han Chinese, ONT (Gong 2025, PMID 39929826) lrSv1kgOnt - 1019 1000 Genomes, ONT, SVAN-annotated (Schloissnig 2025, PMID 40702182; lifted from hs1) tommoJpSv - 333 Japanese (111 trios), ONT (Otsuki 2022, PMID 36127505) aou1kSv - 1027 All of Us, PacBio HiFi (Garimella 2025, PMID 41256123) ga4kSv - 502 GA4K pediatric rare disease, PacBio HiFi (Cohen 2022, PMID 35305867) decodeSv - 3622 Icelanders, ONT (Beyter 2021, PMID 33972781) hgsvc3Sv - 65 HGSVC3 diverse haplotype-resolved assemblies, HiFi+ONT (Logsdon 2025, PMID 40702183; merges insdel+inv tables) kwanhoSv - 100 post-mortem brains (PD/ILBD/HC), PacBio HiFi (Kim 2026, PMID 41929179) chirmade101Sv - 101 long-read WGS GWAS SVatalog cohort (Chirmade 2026, PMID 41203876) Includes per-track conversion scripts and autoSql under scripts/lrSv/, the supertrack summary table in lrSv.html, and a consolidated makeDoc at doc/hg38/lrSv.txt. refs #36258 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/lrSv/lrSvHan945SuppVecToVcf.py src/hg/makeDb/scripts/lrSv/lrSvHan945SuppVecToVcf.py new file mode 100644 index 00000000000..83547d00055 --- /dev/null +++ src/hg/makeDb/scripts/lrSv/lrSvHan945SuppVecToVcf.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +"""Convert Han 945 site-only SV VCF with SUPP_VEC to a VCF with per-sample genotypes. + +The input VCF has no sample columns but contains a SUPP_VEC INFO field: +a 945-character binary string where '1' means the sample supports the SV. +This script reconstructs per-sample GT columns (0/1 for carriers, 0/0 for non-carriers). + +Sample names are generated as Sample_001 through Sample_945 since +the original VCF does not include sample identifiers. 
# Usage:
#     lrSvHan945SuppVecToVcf.py input.vcf.gz output.vcf

import gzip
import sys


def _extractSuppVec(infoStr):
    """Return the value of the SUPP_VEC key in a VCF INFO string, or "" if absent."""
    for item in infoStr.split(";"):
        if item.startswith("SUPP_VEC="):
            return item.split("=", 1)[1]
    return ""


def main():
    """Rewrite a site-only VCF as a VCF with one GT column per sample.

    Reads the input and output paths from sys.argv[1] and sys.argv[2]; with
    any other argument count the module docstring is printed to stderr and
    the process exits with status 1. Input whose name ends in ".gz" is read
    through gzip, anything else as plain text.

    Output layout:
      * "##" meta lines are copied through unchanged.
      * A "##FORMAT=<ID=GT,...>" line is written just before the "#CHROM"
        line unless the input already declares one, so the GT key used in
        every output record is described in the header.
      * The "#CHROM" line keeps its first 8 columns and gains "FORMAT" plus
        Sample_001 .. Sample_945.
      * Each record keeps its first 8 columns and gains "GT" plus one
        genotype per SUPP_VEC character: "0/1" for '1', "0/0" otherwise.

    Records whose SUPP_VEC is missing or not exactly 945 characters long, and
    non-blank lines with fewer than 8 columns, are reported on stderr and
    skipped. Blank lines are skipped silently.
    """
    if len(sys.argv) != 3:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    inFile, outFile = sys.argv[1], sys.argv[2]
    nSamples = 945
    # The input carries no sample identifiers, so synthetic names are used.
    sampleNames = [f"Sample_{i:03d}" for i in range(1, nSamples + 1)]

    gtHeader = '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
    hasGtHeader = False

    opener = gzip.open if inFile.endswith(".gz") else open

    with opener(inFile, "rt") as fIn, open(outFile, "w") as fOut:
        for line in fIn:
            if line.startswith("##"):
                if line.startswith("##FORMAT=<ID=GT,"):
                    hasGtHeader = True
                fOut.write(line)
                continue

            if line.startswith("#CHROM"):
                # Declare GT before the column header, where meta lines end.
                if not hasGtHeader:
                    fOut.write(gtHeader)
                    hasGtHeader = True
                baseCols = line.rstrip("\n").split("\t")[:8]
                fOut.write("\t".join(baseCols + ["FORMAT"] + sampleNames) + "\n")
                continue

            # fields: CHROM POS ID REF ALT QUAL FILTER INFO
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 8:
                # Blank or truncated line: there is no INFO column to read.
                if line.strip():
                    print(f"Warning: expected at least 8 columns, got "
                          f"{len(fields)}, skipping: {line.rstrip()}",
                          file=sys.stderr)
                continue

            suppVec = _extractSuppVec(fields[7])
            if len(suppVec) != nSamples:
                print(f"Warning: SUPP_VEC length {len(suppVec)} != {nSamples} "
                      f"at {fields[0]}:{fields[1]}, skipping", file=sys.stderr)
                continue

            # Carriers ('1') become heterozygous calls, everything else 0/0.
            gts = ["0/1" if c == "1" else "0/0" for c in suppVec]
            fOut.write("\t".join(fields[:8] + ["GT"] + gts) + "\n")


if __name__ == "__main__":
    main()