7594507ca126d5242346787e42e13c52ea7709b1 max Fri Apr 17 08:40:31 2026 -0700 Add lrSv supertrack: long-read structural variants from 9 studies (hg38). #Preview2 week - bugs introduced now will need a build patch to fix Sub-tracks (all bigBed 9+): han945Sv - 945 Han Chinese, ONT (Gong 2025, PMID 39929826) lrSv1kgOnt - 1019 1000 Genomes, ONT, SVAN-annotated (Schloissnig 2025, PMID 40702182; lifted from hs1) tommoJpSv - 333 Japanese (111 trios), ONT (Otsuki 2022, PMID 36127505) aou1kSv - 1027 All of Us, PacBio HiFi (Garimella 2025, PMID 41256123) ga4kSv - 502 GA4K pediatric rare disease, PacBio HiFi (Cohen 2022, PMID 35305867) decodeSv - 3622 Icelanders, ONT (Beyter 2021, PMID 33972781) hgsvc3Sv - 65 HGSVC3 diverse haplotype-resolved assemblies, HiFi+ONT (Logsdon 2025, PMID 40702183; merges insdel+inv tables) kwanhoSv - 100 post-mortem brains (PD/ILBD/HC), PacBio HiFi (Kim 2026, PMID 41929179) chirmade101Sv - 101 long-read WGS GWAS SVatalog cohort (Chirmade 2026, PMID 41203876) Includes per-track conversion scripts and autoSql under scripts/lrSv/, the supertrack summary table in lrSv.html, and a consolidated makeDoc at doc/hg38/lrSv.txt. refs #36258 Co-Authored-By: Claude Opus 4.7 (1M context) diff --git src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py new file mode 100644 index 00000000000..6e698057a2e --- /dev/null +++ src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +"""Convert 1KG ONT SVAN VCF to BED9+ for bigBed, adding allele counts from phased VCF. + +Usage: + lrSv1kgOntVcfToBed.py svan.vcf.gz output.bed chrom.sizes [phased.vcf.gz] + +The optional phased VCF provides AC, AN, and AF per variant (matched by ID). +SVs without a match get alleleCount=-1, alleleNumber=-1, alleleFreq=-1. +""" + +import gzip +import sys + +# Colors by SV class (R,G,B) +SV_COLORS = { + "INS": "0,0,200", # blue + "DEL": "200,0,0", # red + "COMPLEX": "230,140,0", # orange +} + +def parseInfo(infoStr): + """Parse INFO field into a dict.""" + d = {} + for item in infoStr.split(";"): + if "=" in item: + k, v = item.split("=", 1) + d[k] = v + else: + d[item] = True + return d + +def getSvClass(varId): + """Extract SV class (INS, DEL, COMPLEX) from the variant ID.""" + if "-INS-" in varId: + return "INS" + elif "-DEL-" in varId: + return "DEL" + elif "-COMPLEX-" in varId: + return "COMPLEX" + return "OTHER" + +def loadPhasedAC(phasedFile): + """Load AC, AN, AF from phased VCF, keyed by variant ID.""" + acMap = {} + opener = gzip.open if phasedFile.endswith(".gz") else open + with opener(phasedFile, "rt") as f: + for line in f: + if line.startswith("#"): + continue + fields = line.split("\t", 9) + varId = fields[2] + info = parseInfo(fields[7]) + ac = int(info.get("AC", "0")) + an = int(info.get("AN", "0")) + af = float(info.get("AF", "0")) + acMap[varId] = (ac, an, af) + return acMap + +def main(): + if len(sys.argv) < 4: + print(__doc__, file=sys.stderr) + sys.exit(1) + + inFile = sys.argv[1] + outFile = sys.argv[2] + chromSizesFile = sys.argv[3] + phasedFile = sys.argv[4] if len(sys.argv) > 4 else None + opener = gzip.open if inFile.endswith(".gz") else open + + # Load chrom sizes for clipping + chromSizes = {} + with open(chromSizesFile) as f: + for line in f: + c, s = line.strip().split("\t") + chromSizes[c] = int(s) + + # Load allele counts from phased VCF + acMap = {} + if phasedFile: + print(f"Loading allele counts from {phasedFile}...", file=sys.stderr) + acMap = loadPhasedAC(phasedFile) + print(f"Loaded {len(acMap)} variants from phased VCF", file=sys.stderr) + + skipped = 0 + matched = 0 + unmatched = 0 + with opener(inFile, "rt") as fIn, open(outFile, "w") as fOut: + for line in fIn: + if line.startswith("#"): + continue + + fields = line.rstrip("\n").split("\t") + chrom = fields[0] + pos = int(fields[1]) + varId = fields[2] + ref = fields[3] + info = parseInfo(fields[7]) + + svClass = getSvClass(varId) + + # BED coordinates: 0-based half-open + chromStart = pos - 1 + chromEnd = chromStart + len(ref) + + # SV length + svLen = int(info.get("INS_LEN", info.get("DEL_LEN", "0"))) + + # Readable name: e.g. "DEL 258bp" or "INS Alu 330bp" + famName = info.get("FAM_N", "") + if famName: + name = f"{svClass} {famName} {svLen}bp" + else: + name = f"{svClass} {svLen}bp" + + # For INS, the item spans only the anchor base; expand by 1 for visibility + if svClass == "INS" and chromEnd <= chromStart + 1: + chromEnd = chromStart + 1 + + # Insertion/deletion type + insType = info.get("ITYPE_N", info.get("DTYPE_N", "")) + + # Transposon family + family = info.get("FAM_N", "") + + # Percent resolved + try: + percResolved = float(info.get("PERC_RESOLVED", "0")) + except ValueError: + percResolved = 0.0 + + # TSD length + try: + tsdLen = int(info.get("TSD_LEN", "0")) + except ValueError: + tsdLen = 0 + + # Poly-A length + try: + polyaLen = int(info.get("POLYA_LEN", "0")) + except ValueError: + polyaLen = 0 + + # Conformation + conformation = info.get("CONFORMATION", "") + + # Retrotransposon length + try: + rtLen = int(info.get("RT_LEN", "0")) + except ValueError: + rtLen = 0 + + # VNTR motif count + try: + nbMotifs = int(info.get("NB_MOTIFS", "0")) + except ValueError: + nbMotifs = 0 + + # Source gene (for processed pseudogenes) + srcGene = info.get("SRC_GENE", "") + + # Number of exons retrotransposed + try: + nbExons = int(info.get("NB_EXONS", "0")) + except ValueError: + nbExons = 0 + + # Non-canonical MEI flag + notCanonical = "Yes" if info.get("NOT_CANONICAL") else "" + + # Strand + strand = info.get("STRAND", ".") + if strand not in ("+", "-"): + strand = "." + + color = SV_COLORS.get(svClass, "100,100,100") + + # Allele counts from phased VCF + if varId in acMap: + ac, an, af = acMap[varId] + matched += 1 + else: + ac, an, af = -1, -1, -1.0 + unmatched += 1 + + # Clip to chrom sizes; skip records on unknown chroms + if chrom not in chromSizes: + skipped += 1 + continue + if chromEnd > chromSizes[chrom]: + skipped += 1 + continue + + row = [ + chrom, + str(chromStart), + str(chromEnd), + name, + "0", + strand, + str(chromStart), # thickStart + str(chromEnd), # thickEnd + color, + svClass, + str(svLen), + insType, + family, + f"{percResolved:.2f}", + str(tsdLen), + str(polyaLen), + conformation, + str(rtLen), + str(nbMotifs), + srcGene, + str(nbExons), + notCanonical, + str(ac), + str(an), + f"{af:.4f}" if af >= 0 else "-1.0000", + ] + fOut.write("\t".join(row) + "\n") + + print(f"Skipped {skipped} records outside chrom sizes", file=sys.stderr) + if acMap: + print(f"Allele counts: {matched} matched, {unmatched} unmatched " + f"({100*matched/(matched+unmatched):.1f}% matched)", file=sys.stderr) + +if __name__ == "__main__": + main()