#!/usr/bin/env python3
"""Convert HGSVC3 final annotation TSVs (insdel + inv) to a merged BED9+ file.

Usage:
    lrSvHgsvc3TsvToBed.py insdel.tsv.gz inv.tsv.gz output.bed
"""

import csv
import gzip
import sys

# itemRgb values keyed by SVTYPE; unknown types fall back to gray.
SV_COLORS = {
    "DEL": "200,0,0",  # red
    "INS": "0,0,200",  # blue
    "INV": "230,140,0",  # orange
}


def openTsv(path):
    """Open a plain or gzip-compressed TSV for text-mode reading."""
    return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "rt")


def countSamples(mergeSamples):
    """Return (haplotypeCount, uniqSampleCount).

    MERGE_SAMPLES is a comma-separated list of haplotype IDs like
    HG00096-h1,HG00096-h2,HG00514-un. Haplotype count is the length of that
    list. Unique sample count strips the -h1/-h2/-un suffix.
    """
    if not mergeSamples:
        return 0, 0
    parts = mergeSamples.split(",")
    uniq = set()
    for p in parts:
        p = p.strip()
        if not p:
            continue
        for suf in ("-h1", "-h2", "-un"):
            if p.endswith(suf):
                p = p[: -len(suf)]
                break
        uniq.add(p)
    return len(parts), len(uniq)


def emit(outF, row, typeExtra):
    """Write one BED9+ row to outF.

    row is a dict from csv.DictReader over one source TSV line. typeExtra
    holds the fields that differ between insdel and inv tables (homRef,
    homTig, te for insdel; regionRefInner for inv); fields absent from
    typeExtra are emitted as empty strings so the column count stays fixed.
    """
    chrom = row["#CHROM"]
    pos = int(row["POS"])  # 0-based in source TSV based on sample inspection
    end = int(row["END"])
    svType = row["SVTYPE"]
    # SVLEN may be absent or non-numeric (e.g. "NA"); treat both as length 0
    # rather than crashing partway through a conversion.
    try:
        svLen = int(row.get("SVLEN") or 0)
    except ValueError:
        svLen = 0
    absSvLen = abs(svLen)

    # Source coords are already 0-based half-open (chr1 POS=10669 END=10901
    # matches a 232-bp deletion starting at 10670 in 1-based). Use as-is.
    chromStart = pos
    chromEnd = end
    # Insertions can have END == POS; give them a 1-bp footprint so that
    # chromEnd > chromStart always holds in the output BED.
    if chromEnd <= chromStart:
        chromEnd = chromStart + 1

    haploCount, sampleCount = countSamples(row.get("MERGE_SAMPLES", ""))

    color = SV_COLORS.get(svType, "100,100,100")

    refTrf = row.get("REF_TRF", "")
    # REF_SD may be empty or non-numeric; default to 0.0.
    try:
        refSd = float(row.get("REF_SD", "0") or 0)
    except ValueError:
        refSd = 0.0

    bedRow = [
        chrom,
        str(chromStart),
        str(chromEnd),
        row["ID"],
        "0",
        ".",
        str(chromStart),
        str(chromEnd),
        color,
        svType,
        str(absSvLen),
        str(haploCount),
        str(sampleCount),
        typeExtra.get("homRef", ""),
        typeExtra.get("homTig", ""),
        typeExtra.get("regionRefInner", ""),
        typeExtra.get("te", ""),
        refTrf,
        f"{refSd:.6f}",
        row.get("MERGE_SAMPLES", ""),
        row.get("WIN_500", ""),
        row.get("WIN_2K", ""),
    ]
    outF.write("\t".join(bedRow) + "\n")


def convertTable(tsvPath, outF, extraCols):
    """Stream one source TSV into outF as BED rows.

    extraCols maps typeExtra key -> source column name for the columns that
    exist only in this table (insdel vs inv).
    """
    with openTsv(tsvPath) as fIn:
        # QUOTE_NONE: the source is plain tab-separated text; the default csv
        # dialect would quote-process literal double quotes inside fields.
        # restval="": short rows yield "" (not None) for missing columns, so
        # the join in emit() cannot hit a TypeError.
        reader = csv.DictReader(fIn, delimiter="\t",
                                quoting=csv.QUOTE_NONE, restval="")
        for row in reader:
            emit(outF, row,
                 {key: row.get(col, "") for key, col in extraCols.items()})


def main():
    """Parse argv and write the merged BED file (insdel rows, then inv)."""
    if len(sys.argv) != 4:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    insdelPath, invPath, outPath = sys.argv[1], sys.argv[2], sys.argv[3]

    with open(outPath, "w") as outF:
        # insertions + deletions
        convertTable(insdelPath, outF, {
            "homRef": "HOM_REF",
            "homTig": "HOM_TIG",
            "te": "TE",
        })
        # inversions
        convertTable(invPath, outF, {
            "regionRefInner": "RGN_REF_INNER",
        })


if __name__ == "__main__":
    main()