#!/usr/bin/env python3
# Provenance: added as src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py in
# commit 9a11061ca6b40fe16bdfd09b1af53192f6c7c85b (max, Tue Apr 21 08:13:02
# 2026 -0700), "lrSv: add HTML doc pages and conversion scripts for recent
# subtracks, + hs1 HGSVC3", refs #36258.
"""Convert the Gustafson 2024 1000G ONT Jasmine-merged SV VCF to BED9+.

Usage:
    lrSvGustafsonVcfToBed.py input.vcf.gz output.bed

Source:
    https://s3.amazonaws.com/1000g-ont/Gustafson_etal_2024_preprint_SUPPLEMENTAL/
    20240423_jasmine_intrasample_noBND_custom_suppvec_alphanumeric_header_JASMINE.vcf.gz
Paper:
    Gustafson et al. 2024, bioRxiv / Genome Res, PMID 39358015.
"""

import gzip
import subprocess  # NOTE(review): appears unused in this script; kept as-is.
import sys

# itemRgb value written for each SVTYPE.
SV_COLORS = {
    "DEL": "200,0,0",      # red
    "INS": "0,0,200",      # blue
    "DUP": "0,160,0",      # green
    "INV": "230,140,0",    # orange
}

# Jasmine END on chrM can overshoot by one base; clip to chrM length.
CHRM_LEN = 16569

# itemRgb used when SVTYPE is not a key of SV_COLORS.
_DEFAULT_COLOR = "100,100,100"

# CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO: INFO is the last column read.
_MIN_VCF_FIELDS = 8


def openVcf(path):
    """Open a local .vcf.gz via gzip; everything else as plain text."""
    if path.endswith(".gz"):
        return gzip.open(path, "rt")
    return open(path, "rt")


def parseInfo(infoStr):
    """Parse a VCF INFO column into a dict.

    ``KEY=VALUE`` items map KEY to the string VALUE (only the first ``=``
    splits, so values may themselves contain ``=``). Bare flags map to True.
    Empty items, such as the one left by a trailing ``;``, are skipped.
    """
    d = {}
    for item in infoStr.split(";"):
        if not item:
            continue
        key, sep, value = item.partition("=")
        d[key] = value if sep else True
    return d


def _toInt(value, default=0):
    """Return *value* as an int, or *default* if it is not a finite number.

    Integer strings and float-formatted strings ("250.0", "1e3") are
    accepted. Anything that is not a string -- a missing key (None) or a
    bare flag stored as True by parseInfo -- yields *default*.
    """
    if not isinstance(value, str):
        return default
    try:
        return int(value)
    except ValueError:
        pass
    try:
        return int(float(value))
    except (ValueError, OverflowError):
        # ValueError: garbage or "nan"; OverflowError: "inf".
        return default


def _infoText(info, key, default):
    """Return info[key] if it is a string, else *default*.

    Guards against a key that appears as a bare flag (stored as True), which
    could not be joined into the tab-separated output row.
    """
    value = info.get(key, default)
    return value if isinstance(value, str) else default


def _vcfLineToBedRow(line, lineNum=0):
    """Convert one VCF data line into the list of output column strings.

    Columns, in order: chrom, chromStart, chromEnd, name, score ("0"),
    strand ("."), thickStart, thickEnd, itemRgb, svType, abs(svLen), supp,
    varCalls, precise (1/0), strands.

    Raises ValueError if the line has fewer than eight tab-separated fields
    or POS is not an integer.
    """
    fields = line.rstrip("\n").split("\t")
    if len(fields) < _MIN_VCF_FIELDS:
        raise ValueError(
            "line %d: expected at least %d tab-separated VCF fields, got %d"
            % (lineNum, _MIN_VCF_FIELDS, len(fields))
        )

    chrom = fields[0]
    pos = int(fields[1])
    name = fields[2]
    info = parseInfo(fields[7])

    svType = _infoText(info, "SVTYPE", ".")
    # A missing or unparsable END falls back to POS, like the other numeric
    # INFO keys fall back to 0.
    end = _toInt(info.get("END"), default=pos)
    svLen = _toInt(info.get("SVLEN"))
    supp = _toInt(info.get("SUPP"))
    varCalls = _toInt(info.get("VARCALLS"))
    precise = 1 if "PRECISE" in info else 0
    strands = _infoText(info, "STRANDS", "")
    if strands == "??":
        strands = ""

    # VCF POS is 1-based; BED chromStart is 0-based.
    chromStart = pos - 1
    # Guarantee an interval of at least one base.
    chromEnd = max(end, chromStart + 1)
    if chrom == "chrM":
        chromEnd = min(chromEnd, CHRM_LEN)

    return [
        chrom,
        str(chromStart),
        str(chromEnd),
        name,
        "0",
        ".",
        str(chromStart),
        str(chromEnd),
        SV_COLORS.get(svType, _DEFAULT_COLOR),
        svType,
        str(abs(svLen)),
        str(supp),
        str(varCalls),
        str(precise),
        strands,
    ]


def main(argv=None):
    """Read the input VCF and write one BED9+ line per variant record.

    argv: optional [input, output] list; defaults to sys.argv[1:]. With any
    other number of arguments the module docstring is printed to stderr and
    the process exits with status 1.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    inPath, outPath = args

    # Header lines are skipped here instead of piping through
    # `bcftools view -H`; reading with gzip avoids the external dependency.
    with openVcf(inPath) as fIn, open(outPath, "w") as fOut:
        for lineNum, line in enumerate(fIn, 1):
            if line.startswith("#") or not line.strip():
                continue
            fOut.write("\t".join(_vcfLineToBedRow(line, lineNum)) + "\n")


if __name__ == "__main__":
    main()