#!/usr/bin/env python3
"""Convert a SURVIVOR-merged SV VCF (site-only) to BED9+ for bigBed.

Usage:
    lrSvVcfToBed.py input.vcf.gz output.bed
"""

# Provenance: introduced as src/hg/makeDb/scripts/lrSv/lrSvVcfToBed.py
# (new file, mode 100644) by commit
# 7594507ca126d5242346787e42e13c52ea7709b1 (max, Fri Apr 17 08:40:31 2026 -0700):
#
#   Add lrSv supertrack: long-read structural variants from 9 studies (hg38).
#   #Preview2 week - bugs introduced now will need a build patch to fix
#
#   Sub-tracks (all bigBed 9+):
#     han945Sv      - 945 Han Chinese, ONT (Gong 2025, PMID 39929826)
#     lrSv1kgOnt    - 1019 1000 Genomes, ONT, SVAN-annotated
#                     (Schloissnig 2025, PMID 40702182; lifted from hs1)
#     tommoJpSv     - 333 Japanese (111 trios), ONT (Otsuki 2022, PMID 36127505)
#     aou1kSv       - 1027 All of Us, PacBio HiFi (Garimella 2025, PMID 41256123)
#     ga4kSv        - 502 GA4K pediatric rare disease, PacBio HiFi
#                     (Cohen 2022, PMID 35305867)
#     decodeSv      - 3622 Icelanders, ONT (Beyter 2021, PMID 33972781)
#     hgsvc3Sv      - 65 HGSVC3 diverse haplotype-resolved assemblies, HiFi+ONT
#                     (Logsdon 2025, PMID 40702183; merges insdel+inv tables)
#     kwanhoSv      - 100 post-mortem brains (PD/ILBD/HC), PacBio HiFi
#                     (Kim 2026, PMID 41929179)
#     chirmade101Sv - 101 long-read WGS GWAS SVatalog cohort
#                     (Chirmade 2026, PMID 41203876)
#
#   Includes per-track conversion scripts and autoSql under scripts/lrSv/,
#   the supertrack summary table in lrSv.html, and a consolidated makeDoc at
#   doc/hg38/lrSv.txt.
#
#   refs #36258
#   Co-Authored-By: Claude Opus 4.7 (1M context)

import gzip
import math
import sys

# Colors by SV type (R,G,B)
SV_COLORS = {
    "DEL": "200,0,0",      # red
    "INS": "0,0,200",      # blue
    "DUP": "0,160,0",      # green
    "INV": "230,140,0",    # orange
    "TRA": "140,0,200",    # purple
}

# Color used for any SV type not listed in SV_COLORS (grey)
DEFAULT_COLOR = "100,100,100"

# Minimum number of tab-separated columns read from a VCF record
# (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO)
MIN_VCF_COLUMNS = 8


def parseInfo(infoStr):
    """Parse a VCF INFO string into a dict.

    "key=value" items map key to the value string (split on the first "=");
    bare flags map to True.  Empty items, e.g. from a trailing ";", are
    skipped so they do not create an empty-string key.
    """
    d = {}
    for item in infoStr.split(";"):
        if not item:
            continue
        key, sep, value = item.partition("=")
        d[key] = value if sep else True
    return d


def suppVecToList(suppVec):
    """Convert binary support vector to comma-separated 1-based sample indices.

    Returns an empty string when no position holds a "1".
    """
    return ",".join(str(i) for i, c in enumerate(suppVec, 1) if c == "1")


def _infoValue(info, key, default, cast):
    """Return cast(info[key]), using default when the key is absent or its
    value is empty or the VCF missing-value marker "."."""
    raw = info.get(key, default)
    if raw in ("", "."):
        raw = default
    return cast(raw)


def _qualToScore(qual):
    """Map a VCF QUAL string to a BED score: QUAL * 2, clamped to 0-1000.

    Non-numeric QUAL (including ".") and NaN give 0.
    """
    try:
        q = float(qual)
    except ValueError:
        return 0
    if math.isnan(q):
        return 0
    # Clamp before rounding so +/-inf cannot overflow int().
    return int(round(min(max(q * 2, 0.0), 1000.0)))


def _vcfLineToBedRow(line):
    """Convert one VCF data line into the list of BED9+ output column strings.

    Raises ValueError when the line has fewer than MIN_VCF_COLUMNS columns or
    a numeric field cannot be parsed.
    """
    fields = line.rstrip("\r\n").split("\t")
    if len(fields) < MIN_VCF_COLUMNS:
        raise ValueError(
            f"expected at least {MIN_VCF_COLUMNS} tab-separated VCF columns, "
            f"got {len(fields)}"
        )

    chrom = fields[0]
    pos = int(fields[1])
    name = fields[2]
    info = parseInfo(fields[7])

    svType = info.get("SVTYPE", ".")
    end = _infoValue(info, "END", pos, int)
    # SVLEN goes through float() first so values like "-100.0" are accepted.
    svLen = _infoValue(info, "SVLEN", "0", lambda v: int(float(v)))
    af = _infoValue(info, "AF", "0", float)
    supp = _infoValue(info, "SUPP", "0", int)
    strands = info.get("STRANDS", "+-")

    # BED is 0-based half-open
    chromStart = pos - 1

    if svType == "TRA":
        # For TRA, END is the position on the chr2 side, so the item itself
        # is drawn as a single base on the chr1 side.
        chromEnd = chromStart + 1
        chr2Out = info.get("CHR2", ".")
        end2 = str(end)
    else:
        # Never emit a zero- or negative-width item; expand to 1 bp.
        chromEnd = max(end, chromStart + 1)
        # Empty for non-TRA so skipEmptyFields hides them
        chr2Out = end2 = ""

    # Strand from first character of STRANDS field
    firstStrand = strands[:1]
    strand = firstStrand if firstStrand in ("+", "-") else "."

    return [
        chrom,
        str(chromStart),
        str(chromEnd),
        name,
        str(_qualToScore(fields[5])),
        strand,
        str(chromStart),                        # thickStart
        str(chromEnd),                          # thickEnd
        SV_COLORS.get(svType, DEFAULT_COLOR),
        svType,
        str(abs(svLen)),                        # absolute SV length
        f"{af:.6f}",
        str(supp),
        info.get("CIPOS", "0,0"),
        info.get("CIEND", "0,0"),
        chr2Out,
        end2,
        suppVecToList(info.get("SUPP_VEC", "")),  # sampleList
    ]


def main():
    """Read the VCF named in argv[1] and write BED9+ rows to argv[2].

    Prints the usage text and exits with status 1 unless exactly two
    arguments are given.  Input whose name ends in ".gz" is read through
    gzip.  Header ("#") and blank lines are skipped.  A malformed record
    raises ValueError prefixed with the input file name and line number.
    """
    if len(sys.argv) != 3:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    inFile, outFile = sys.argv[1], sys.argv[2]
    opener = gzip.open if inFile.endswith(".gz") else open

    with opener(inFile, "rt") as fIn, open(outFile, "w") as fOut:
        for lineNum, line in enumerate(fIn, 1):
            if line.startswith("#") or not line.strip():
                continue
            try:
                row = _vcfLineToBedRow(line)
            except ValueError as err:
                raise ValueError(f"{inFile}:{lineNum}: {err}") from err
            fOut.write("\t".join(row) + "\n")


if __name__ == "__main__":
    main()