9a11061ca6b40fe16bdfd09b1af53192f6c7c85b max Tue Apr 21 08:13:02 2026 -0700

lrSv: add HTML doc pages and conversion scripts for recent subtracks, + hs1 HGSVC3

Subtrack stanzas for these SV callsets landed in earlier commits, but the
conversion scripts and per-track HTML description pages were never added;
trackDb therefore had no doc to serve. This commit catches up.

Docs (new):
- colorsDbSv.html   CoLoRSdb 1,427-sample long-read SVs
- gustafsonSv.html  1KG ONT 100 (Gustafson 2024, PMID 39358015)
- hgsvc2Sv.html     HGSVC2 (Ebert 2021, PMID 33632895)
- hprc2Sv.html      HPRC release-2 pangenome SVs (no PMID yet; see
                    humanpangenome.org/hprc-data-release-2/)
- onekg3202Sr.html  1KG 3202 Illumina SHORT-READ GATK-SV
                    (Byrska-Bishop 2022, PMID 36055201)

Scripts (new):
- lrSvGustafson.as / lrSvGustafsonVcfToBed.py
- lrSvHgsvc2.as / lrSvHgsvc2TsvToBed.py (merges insdel + inv tables)
- lrSvHprc2.as / lrSvHprc2VcfToBed.py (streams the wave-decomposed VCF,
  explodes multi-allelic rows, filters to SV-sized alleles or INV; see the
  worked example after this message)
- lrSv1kg3202Sr.as / lrSv1kg3202SrVcfToBed.py

HGSVC3 also on hs1:
- hgsvc3Sv.html: note that the hs1 build is native (not lifted): HGSVC3
  aligned all assemblies to both GRCh38 and T2T-CHM13 and released separate
  annotation tables per reference. Added the T2T-CHM13 source URL to the
  Methods section and the hs1 hgsvc3.bb download link to Data Access.
- doc/hs1/lrSv.txt (new): hs1-specific wget + build steps; refers back to
  doc/hg38/lrSv.txt for the full process.

refs #36258

Co-Authored-By: Claude Opus 4.7 (1M context)
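To make the "explodes multi-allelic rows" step concrete, here is a minimal
sketch of the per-allele bookkeeping that lrSvHprc2VcfToBed.py performs. The
two-allele record is invented for the example; the per-allele split and the
MIN_SV_LEN = 50 threshold mirror the script in the diff below.

    #!/usr/bin/env python3
    # Invented record: ALT 1 is a 212 bp insertion, ALT 2 a SNP. Per-allele
    # INFO values (TYPE, LEN, AC, AF) are comma-separated and line up
    # positionally with the ALT alleles.
    info = dict(item.split("=", 1) for item in
                "TYPE=ins,snp;LEN=212,1;AC=12,3;AF=0.026,0.006;AN=462".split(";"))
    alts = ["A" + "ACGT" * 53, "G"]
    for i, alt in enumerate(alts):
        svLen = int(info["LEN"].split(",")[i])
        keep = abs(svLen) >= 50  # MIN_SV_LEN filter; no INV flag on this record
        print(f"allele {i+1}: TYPE={info['TYPE'].split(',')[i]} "
              f"LEN={svLen} AC={info['AC'].split(',')[i]} keep={keep}")
    # -> allele 1 becomes one BED row; allele 2 (LEN=1) is dropped.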
+""" + +import gzip +import sys + +MIN_SV_LEN = 50 + +SV_COLORS = { + "INS": "0,0,200", # blue + "DEL": "200,0,0", # red + "COMPLEX": "140,0,200", # purple + "INV": "230,140,0", # orange +} + + +def parseInfo(infoStr): + d = {} + for item in infoStr.split(";"): + if "=" in item: + k, v = item.split("=", 1) + d[k] = v + else: + d[item] = True + return d + + +def splitMulti(s, n): + """Split a per-allele INFO value; pad with '' if missing.""" + if s is None: + return [""] * n + parts = s.split(",") + if len(parts) < n: + parts = parts + [""] * (n - len(parts)) + return parts + + +def deriveType(ref, alt): + """Derive allele type when TYPE isn't in INFO.""" + dr, da = len(ref), len(alt) + if dr == da: + return "snp" if dr == 1 else "mnp" + if dr == 1 and da > 1: + return "ins" + if da == 1 and dr > 1: + return "del" + return "complex" + + +def main(): + if len(sys.argv) != 3: + print(__doc__, file=sys.stderr) + sys.exit(1) + + inPath, outPath = sys.argv[1], sys.argv[2] + opener = gzip.open if inPath.endswith(".gz") else open + + with opener(inPath, "rt") as fIn, open(outPath, "w") as fOut: + for line in fIn: + if line.startswith("#"): + continue + fields = line.rstrip("\n").split("\t") + chrom = fields[0] + pos = int(fields[1]) + name = fields[2] + ref = fields[3] + alts = fields[4].split(",") + info = parseInfo(fields[7]) + + n = len(alts) + typeList = splitMulti(info.get("TYPE"), n) + lenList = splitMulti(info.get("LEN"), n) + acList = splitMulti(info.get("AC"), n) + afList = splitMulti(info.get("AF"), n) + + isInv = "INV" in info + try: + nSamples = int(info.get("NS", "0")) + except ValueError: + nSamples = 0 + try: + an = int(info.get("AN", "0")) + except ValueError: + an = 0 + try: + snarlLevel = int(info.get("LV", "0")) + except ValueError: + snarlLevel = 0 + + for i, alt in enumerate(alts): + if alt in (".", "*"): + continue + + t = (typeList[i] or "").strip().lower() + if not t: + t = deriveType(ref, alt) + + lenStr = (lenList[i] or "").strip() + try: + svLenVal = int(lenStr) + except ValueError: + svLenVal = abs(len(alt) - len(ref)) + + # Filter: keep SV-sized alleles plus INV flag records. + if svLenVal < MIN_SV_LEN and not isInv: + continue + + # Normalize type for display. + svType = t.upper() + if svType not in ("INS", "DEL", "COMPLEX"): + svType = "COMPLEX" + if isInv: + svType = "INV" + + color = SV_COLORS.get(svType, "100,100,100") + + chromStart = pos - 1 + # INS: 1-bp anchor; DEL/COMPLEX/INV: span the REF interval. + if svType == "INS": + chromEnd = chromStart + 1 + else: + chromEnd = chromStart + len(ref) + if chromEnd <= chromStart: + chromEnd = chromStart + 1 + + try: + ac = int(acList[i]) + except ValueError: + ac = 0 + try: + af = float(afList[i]) + except ValueError: + af = 0.0 + + # Name: VCF ID if informative, else constructed. Truncate + # very long snarl/ORIGIN-chain IDs to stay within bigBed's + # 255-char limit. + if name and name != ".": + rowName = f"{name}.{i+1}" if n > 1 else name + else: + rowName = f"{chrom}:{pos}:{svType}:{svLenVal}" + if len(rowName) > 200: + rowName = f"{chrom}:{pos}:{svType}:{svLenVal}" + + row = [ + chrom, + str(chromStart), + str(chromEnd), + rowName, + "0", + ".", + str(chromStart), + str(chromEnd), + color, + svType, + str(svLenVal), + str(ac), + str(an), + f"{af:.6f}", + str(nSamples), + str(snarlLevel), + ] + fOut.write("\t".join(row) + "\n") + + +if __name__ == "__main__": + main()