7594507ca126d5242346787e42e13c52ea7709b1 max Fri Apr 17 08:40:31 2026 -0700 Add lrSv supertrack: long-read structural variants from 9 studies (hg38). #Preview2 week - bugs introduced now will need a build patch to fix Sub-tracks (all bigBed 9+): han945Sv - 945 Han Chinese, ONT (Gong 2025, PMID 39929826) lrSv1kgOnt - 1019 1000 Genomes, ONT, SVAN-annotated (Schloissnig 2025, PMID 40702182; lifted from hs1) tommoJpSv - 333 Japanese (111 trios), ONT (Otsuki 2022, PMID 36127505) aou1kSv - 1027 All of Us, PacBio HiFi (Garimella 2025, PMID 41256123) ga4kSv - 502 GA4K pediatric rare disease, PacBio HiFi (Cohen 2022, PMID 35305867) decodeSv - 3622 Icelanders, ONT (Beyter 2021, PMID 33972781) hgsvc3Sv - 65 HGSVC3 diverse haplotype-resolved assemblies, HiFi+ONT (Logsdon 2025, PMID 40702183; merges insdel+inv tables) kwanhoSv - 100 post-mortem brains (PD/ILBD/HC), PacBio HiFi (Kim 2026, PMID 41929179) chirmade101Sv - 101 long-read WGS GWAS SVatalog cohort (Chirmade 2026, PMID 41203876) Includes per-track conversion scripts and autoSql under scripts/lrSv/, the supertrack summary table in lrSv.html, and a consolidated makeDoc at doc/hg38/lrSv.txt. refs #36258 Co-Authored-By: Claude Opus 4.7 (1M context) diff --git src/hg/makeDb/scripts/lrSv/lrSvGa4kSvVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvGa4kSvVcfToBed.py new file mode 100644 index 00000000000..082f128d7f5 --- /dev/null +++ src/hg/makeDb/scripts/lrSv/lrSvGa4kSvVcfToBed.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +"""Convert a GA4K Jasmine-merged SV VCF (site-only) to BED9+ for bigBed. 
"""Convert a GA4K Jasmine-merged SV VCF (site-only) to BED9+ for bigBed.

Usage:
    lrSvGa4kSvVcfToBed.py input.vcf.gz output.bed
"""

import gzip
import sys

# itemRgb string per SVTYPE; any other type falls back to _DEFAULT_COLOR.
SV_COLORS = {
    "DEL": "200,0,0",       # red
    "INS": "0,0,200",       # blue
    "DUP": "0,160,0",       # green
    "INV": "230,140,0",     # orange
}
_DEFAULT_COLOR = "100,100,100"


def parseInfo(infoStr):
    """Parse a VCF INFO column string into a dict.

    "KEY=VALUE" items map KEY to the VALUE string (split on the first "="
    only, so VALUE may itself contain "="). Items without "=" are treated
    as flags and map to True.
    """
    d = {}
    for item in infoStr.split(";"):
        key, sep, value = item.partition("=")
        d[key] = value if sep else True
    return d


def _infoValue(info, key, default):
    """Return info[key], or default when the key has no usable value.

    A key that is absent, present only as a bare flag (True), empty, or set
    to the VCF missing-value marker "." yields the default, so the numeric
    conversions in the caller never see a non-numeric placeholder.
    """
    value = info.get(key, default)
    if value is True or value in ("", "."):
        return default
    return value


def _vcfLineToBedRow(line):
    """Convert one VCF data line into the list of BED9+5 column strings.

    Columns produced, in order: chrom, chromStart, chromEnd, name (VCF ID),
    score ("0"), strand ("."), thickStart, thickEnd, itemRgb, SVTYPE,
    abs(SVLEN), SVF formatted with 6 decimals, SVC, SVN.

    Raises:
        ValueError: if the line has fewer than 8 tab-separated columns or a
            numeric field cannot be parsed.
    """
    fields = line.rstrip("\n").split("\t")
    if len(fields) < 8:
        raise ValueError(
            f"expected at least 8 tab-separated VCF columns, got {len(fields)}"
        )

    chrom = fields[0]
    pos = int(fields[1])
    name = fields[2]
    info = parseInfo(fields[7])

    svType = _infoValue(info, "SVTYPE", ".")
    end = int(_infoValue(info, "END", pos))
    # SVLEN goes through float() first so values such as "100.0" are accepted.
    svLen = int(float(_infoValue(info, "SVLEN", "0")))
    af = float(_infoValue(info, "SVF", "0"))
    svc = int(_infoValue(info, "SVC", "0"))
    svn = int(_infoValue(info, "SVN", "0"))

    # VCF POS is 1-based; BED start is 0-based, end is exclusive.
    chromStart = pos - 1
    chromEnd = end
    # Guarantee a non-empty interval when END is not past the start.
    if chromEnd <= chromStart:
        chromEnd = chromStart + 1

    return [
        chrom,
        str(chromStart),
        str(chromEnd),
        name,
        "0",
        ".",
        str(chromStart),
        str(chromEnd),
        SV_COLORS.get(svType, _DEFAULT_COLOR),
        svType,
        str(abs(svLen)),
        f"{af:.6f}",
        str(svc),
        str(svn),
    ]


def main():
    """Command-line entry point: convert sys.argv[1] (VCF) to sys.argv[2] (BED).

    Prints the module docstring to stderr and exits with status 1 unless
    exactly two arguments are given. The input is read through gzip when its
    name ends in ".gz", otherwise as plain text.

    Raises:
        ValueError: on a malformed data line; the message is prefixed with
            the input file name and 1-based line number.
    """
    if len(sys.argv) != 3:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    inFile, outFile = sys.argv[1], sys.argv[2]
    opener = gzip.open if inFile.endswith(".gz") else open

    with opener(inFile, "rt") as fIn, open(outFile, "w") as fOut:
        for lineNo, line in enumerate(fIn, 1):
            # Skip header/meta lines, and blank lines (e.g. a trailing empty
            # line) which would otherwise fail on the column count.
            if line.startswith("#") or not line.strip():
                continue
            try:
                row = _vcfLineToBedRow(line)
            except ValueError as err:
                raise ValueError(f"{inFile}:{lineNo}: {err}") from err
            fOut.write("\t".join(row) + "\n")


if __name__ == "__main__":
    main()