7594507ca126d5242346787e42e13c52ea7709b1 max Fri Apr 17 08:40:31 2026 -0700 Add lrSv supertrack: long-read structural variants from 9 studies (hg38). #Preview2 week - bugs introduced now will need a build patch to fix Sub-tracks (all bigBed 9+): han945Sv - 945 Han Chinese, ONT (Gong 2025, PMID 39929826) lrSv1kgOnt - 1019 1000 Genomes, ONT, SVAN-annotated (Schloissnig 2025, PMID 40702182; lifted from hs1) tommoJpSv - 333 Japanese (111 trios), ONT (Otsuki 2022, PMID 36127505) aou1kSv - 1027 All of Us, PacBio HiFi (Garimella 2025, PMID 41256123) ga4kSv - 502 GA4K pediatric rare disease, PacBio HiFi (Cohen 2022, PMID 35305867) decodeSv - 3622 Icelanders, ONT (Beyter 2021, PMID 33972781) hgsvc3Sv - 65 HGSVC3 diverse haplotype-resolved assemblies, HiFi+ONT (Logsdon 2025, PMID 40702183; merges insdel+inv tables) kwanhoSv - 100 post-mortem brains (PD/ILBD/HC), PacBio HiFi (Kim 2026, PMID 41929179) chirmade101Sv - 101 long-read WGS GWAS SVatalog cohort (Chirmade 2026, PMID 41203876) Includes per-track conversion scripts and autoSql under scripts/lrSv/, the supertrack summary table in lrSv.html, and a consolidated makeDoc at doc/hg38/lrSv.txt. refs #36258 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/lrSv/lrSvTommoJpVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvTommoJpVcfToBed.py new file mode 100644 index 00000000000..f5b7cbd5a67 --- /dev/null +++ src/hg/makeDb/scripts/lrSv/lrSvTommoJpVcfToBed.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Convert ToMMo JSV1 SURVIVOR-merged SV VCF to BED9+ for bigBed. + +Usage: + lrSvTommoJpVcfToBed.py input.vcf.gz output.bed +""" + +import gzip +import sys + +# Colors by SV type (R,G,B) +SV_COLORS = { + "DEL": "200,0,0", # red + "INS": "0,0,200", # blue +} + +def parseInfo(infoStr): + """Parse INFO field into a dict.""" + d = {} + for item in infoStr.split(";"): + if "=" in item: + k, v = item.split("=", 1) + d[k] = v + else: + d[item] = True + return d + +def main(): + if len(sys.argv) != 3: + print(__doc__, file=sys.stderr) + sys.exit(1) + + inFile, outFile = sys.argv[1], sys.argv[2] + opener = gzip.open if inFile.endswith(".gz") else open + + with opener(inFile, "rt") as fIn, open(outFile, "w") as fOut: + for line in fIn: + if line.startswith("#"): + continue + + fields = line.rstrip("\n").split("\t") + chrom = fields[0] + pos = int(fields[1]) + qual = fields[5] + info = parseInfo(fields[7]) + + svType = info.get("SVTYPE", ".") + end = int(info.get("END", pos)) + svLen = int(float(info.get("SVLEN", "0"))) + af = info.get("AF", "0") + ac = int(info.get("AC", "0")) + an = int(info.get("AN", "0")) + + # Mendelian error fields + errRatio = info.get("ERROR_FAMILY_RATIO", ".") + errNum = info.get("ERROR_FAMILY_NUM", "0") + famNum = info.get("FAMILY_NUM", "0") + + # Handle nan/missing AF + try: + afFloat = float(af) + except ValueError: + afFloat = 0.0 + + # BED is 0-based half-open + chromStart = pos - 1 + + # For INS, END == POS so the item has zero width; expand by 1 bp + chromEnd = end + if chromEnd <= chromStart: + chromEnd = chromStart + 1 + + # Absolute SV length + absSvLen = abs(svLen) + + # Readable name + name = f"{svType} {absSvLen}bp" + + # Score: map QUAL to 0-1000 + try: + score = min(int(round(float(qual) * 2)), 1000) + except ValueError: + score = 0 + + strand = "." + color = SV_COLORS.get(svType, "100,100,100") + + # Handle missing error ratio + try: + errRatioFloat = float(errRatio) + except ValueError: + errRatioFloat = 0.0 + + try: + errNumInt = int(errNum) + except ValueError: + errNumInt = 0 + + try: + famNumInt = int(famNum) + except ValueError: + famNumInt = 0 + + row = [ + chrom, + str(chromStart), + str(chromEnd), + name, + str(score), + strand, + str(chromStart), # thickStart + str(chromEnd), # thickEnd + color, + svType, + str(absSvLen), + f"{afFloat:.6f}", + str(ac), + str(an), + f"{errRatioFloat:.3f}", + str(errNumInt), + str(famNumInt), + ] + fOut.write("\t".join(row) + "\n") + +if __name__ == "__main__": + main()