9a11061ca6b40fe16bdfd09b1af53192f6c7c85b max Tue Apr 21 08:13:02 2026 -0700 lrSv: add HTML doc pages and conversion scripts for recent subtracks, + hs1 HGSVC3 Subtrack stanzas for these SV callsets landed in earlier commits but the conversion scripts and per-track HTML description pages were never added; trackDb therefore had no doc to serve. This commit catches up. Docs (new): - colorsDbSv.html CoLoRSdb 1,427-sample long-read SVs - gustafsonSv.html 1KG ONT 100 (Gustafson 2024, PMID 39358015) - hgsvc2Sv.html HGSVC2 (Ebert 2021, PMID 33632895) - hprc2Sv.html HPRC release-2 pangenome SVs (no PMID yet; see humanpangenome.org/hprc-data-release-2/) - onekg3202Sr.html 1KG 3202 Illumina SHORT-READ GATK-SV (Byrska-Bishop 2022, PMID 36055201) Scripts (new): - lrSvGustafson.as / lrSvGustafsonVcfToBed.py - lrSvHgsvc2.as / lrSvHgsvc2TsvToBed.py (merges insdel + inv tables) - lrSvHprc2.as / lrSvHprc2VcfToBed.py (streams wave-decomposed VCF, explodes multi-allelic rows, filters to SV-sized or INV) - lrSv1kg3202Sr.as / lrSv1kg3202SrVcfToBed.py HGSVC3 also on hs1: - hgsvc3Sv.html: note that the hs1 build is native (not lifted): HGSVC3 aligned all assemblies to both GRCh38 and T2T-CHM13 and released separate annotation tables per reference. Added the T2T-CHM13 source URL to the Methods section and the hs1 hgsvc3.bb download link to Data Access. - doc/hs1/lrSv.txt (new): hs1-specific wget + build steps; refers back to doc/hg38/lrSv.txt for the full process. refs #36258 Co-Authored-By: Claude Opus 4.7 (1M context) diff --git src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py new file mode 100644 index 00000000000..7492432ddac --- /dev/null +++ src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +"""Convert HGSVC2 v2.0 integrated annotation TSVs (insdel + inv) to merged BED9+. + +Usage: + lrSvHgsvc2TsvToBed.py insdel.tsv.gz inv.tsv.gz output.bed + +The two TSVs share an envelope (chrom, pos, end, svtype, svlen, MERGE_AC / +MERGE_SAMPLES, cytoband, REF_SD, REF_TRF, REFSEQ_*, PLI/LOEUF, WIN_500/2K) +but diverge on type-specific fields: insdel has POP_*_AF population allele +frequencies, inv has RGN_REF_INNER. Both columns are emitted; type-specific +ones are empty where not applicable. + +Source: + https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v2.0/integrated_callset/ +Paper: + Ebert et al. 2021, Science, PMID 33632895. +""" + +import csv +import gzip +import sys + +SV_COLORS = { + "DEL": "200,0,0", # red + "INS": "0,0,200", # blue + "INV": "230,140,0", # orange +} + + +def openTsv(path): + return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "rt") + + +def na(val): + if val is None: + return "" + v = val.strip() + if v in ("", "NA", ".", "nan"): + return "" + return v + + +def toInt(s): + if not s or s in ("NA", "."): + return 0 + try: + return int(float(s)) + except ValueError: + return 0 + + +def toFloat(s): + if not s or s in ("NA", "."): + return 0.0 + try: + return float(s) + except ValueError: + return 0.0 + + +def countSamples(mergeSamples): + """Return unique-sample count, stripping -h1/-h2/-un haplotype suffixes.""" + if not mergeSamples: + return 0 + uniq = set() + for p in mergeSamples.split(","): + p = p.strip() + if not p: + continue + for suf in ("-h1", "-h2", "-un", "-1", "-2"): + if p.endswith(suf): + p = p[: -len(suf)] + break + uniq.add(p) + return len(uniq) + + +def emit(outF, row, typeExtra): + chrom = row["#CHROM"] + if not chrom.startswith("chr"): + chrom = "chr" + chrom + # HGSVC2 TSV coords are 0-based half-open (matches HGSVC3). + chromStart = toInt(row["POS"]) + chromEnd = toInt(row["END"]) + if chromEnd <= chromStart: + chromEnd = chromStart + 1 + + svType = row["SVTYPE"] + svLen = abs(toInt(row["SVLEN"])) + color = SV_COLORS.get(svType, "100,100,100") + + alleleCount = toInt(row.get("MERGE_AC", "0")) + sampleCount = countSamples(row.get("MERGE_SAMPLES", "")) + + bed = [ + chrom, + str(chromStart), + str(chromEnd), + row["ID"], + "0", + ".", + str(chromStart), + str(chromEnd), + color, + svType, + str(svLen), + str(alleleCount), + str(sampleCount), + na(row.get("BAND", "")), + f"{toFloat(row.get('REF_SD', '0')):.6f}", + na(row.get("REF_TRF", "")), + str(toInt(row.get("REFSEQ_CDS", "0"))), + str(toInt(row.get("REFSEQ_UTR3", "0"))), + str(toInt(row.get("REFSEQ_UTR5", "0"))), + str(toInt(row.get("REFSEQ_INTRON", "0"))), + str(toInt(row.get("REFSEQ_NCRNA", "0"))), + str(toInt(row.get("REFSEQ_UP_5K", "0"))), + str(toInt(row.get("REFSEQ_DN_5K", "0"))), + na(row.get("PLI_MAX", "")), + na(row.get("LOEUF_MIN", "")), + typeExtra.get("popAllAf", ""), + typeExtra.get("popAfrAf", ""), + typeExtra.get("popAmrAf", ""), + typeExtra.get("popEasAf", ""), + typeExtra.get("popEurAf", ""), + typeExtra.get("popSasAf", ""), + typeExtra.get("regionRefInner", ""), + row.get("MERGE_SAMPLES", "") or "", + na(row.get("DISC_CLASS", "")), + na(row.get("WIN_500", "")), + na(row.get("WIN_2K", "")), + ] + outF.write("\t".join(bed) + "\n") + + +def main(): + if len(sys.argv) != 4: + print(__doc__, file=sys.stderr) + sys.exit(1) + + insdelPath, invPath, outPath = sys.argv[1], sys.argv[2], sys.argv[3] + + with open(outPath, "w") as outF: + with openTsv(insdelPath) as fIn: + for row in csv.DictReader(fIn, delimiter="\t"): + emit(outF, row, { + "popAllAf": na(row.get("POP_ALL_AF", "")), + "popAfrAf": na(row.get("POP_AFR_AF", "")), + "popAmrAf": na(row.get("POP_AMR_AF", "")), + "popEasAf": na(row.get("POP_EAS_AF", "")), + "popEurAf": na(row.get("POP_EUR_AF", "")), + "popSasAf": na(row.get("POP_SAS_AF", "")), + }) + + with openTsv(invPath) as fIn: + for row in csv.DictReader(fIn, delimiter="\t"): + emit(outF, row, { + "regionRefInner": na(row.get("RGN_REF_INNER", "")), + }) + + +if __name__ == "__main__": + main()