#!/usr/bin/env python3
"""Convert 1KG 3,202-sample GATK-SV short-read VCF to BED9+.

Short-read comparator track for the lrSv collection.

Source:
  https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20210124.SV_Illumina_Integration/1KGP_3202.gatksv_svtools_novelins.freeze_V3.wAF.vcf.gz
Paper:
  Byrska-Bishop et al. 2022, Cell, PMID 36055201.

Usage:
  lrSv1kg3202SrVcfToBed.py input.vcf.gz output.bed
"""

# Provenance -- commit record this script was taken from:
#
#   9a11061ca6b40fe16bdfd09b1af53192f6c7c85b  max  Tue Apr 21 08:13:02 2026 -0700
#
#   lrSv: add HTML doc pages and conversion scripts for recent subtracks,
#   + hs1 HGSVC3
#
#   Subtrack stanzas for these SV callsets landed in earlier commits but the
#   conversion scripts and per-track HTML description pages were never added;
#   trackDb therefore had no doc to serve. This commit catches up.
#
#   Docs (new):
#   - colorsDbSv.html   CoLoRSdb 1,427-sample long-read SVs
#   - gustafsonSv.html  1KG ONT 100 (Gustafson 2024, PMID 39358015)
#   - hgsvc2Sv.html     HGSVC2 (Ebert 2021, PMID 33632895)
#   - hprc2Sv.html      HPRC release-2 pangenome SVs (no PMID yet; see
#                       humanpangenome.org/hprc-data-release-2/)
#   - onekg3202Sr.html  1KG 3202 Illumina SHORT-READ GATK-SV
#                       (Byrska-Bishop 2022, PMID 36055201)
#
#   Scripts (new):
#   - lrSvGustafson.as / lrSvGustafsonVcfToBed.py
#   - lrSvHgsvc2.as / lrSvHgsvc2TsvToBed.py (merges insdel + inv tables)
#   - lrSvHprc2.as / lrSvHprc2VcfToBed.py (streams wave-decomposed VCF,
#     explodes multi-allelic rows, filters to SV-sized or INV)
#   - lrSv1kg3202Sr.as / lrSv1kg3202SrVcfToBed.py
#
#   HGSVC3 also on hs1:
#   - hgsvc3Sv.html: note that the hs1 build is native (not lifted): HGSVC3
#     aligned all assemblies to both GRCh38 and T2T-CHM13 and released
#     separate annotation tables per reference. Added the T2T-CHM13 source
#     URL to the Methods section and the hs1 hgsvc3.bb download link to
#     Data Access.
#   - doc/hs1/lrSv.txt (new): hs1-specific wget + build steps; refers back
#     to doc/hg38/lrSv.txt for the full process.
#
#   refs #36258
#   Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
#
#   diff --git: src/hg/makeDb/scripts/lrSv/lrSv1kg3202SrVcfToBed.py
#   (new file mode 100644, index 00000000000..71f0f061da7, @@ -0,0 +1,126 @@)

import gzip
import math
import sys

SV_COLORS = {
    "DEL": "200,0,0",      # red
    "INS": "0,0,200",      # blue
    "DUP": "0,160,0",      # green
    "INV": "230,140,0",    # orange
    "CPX": "140,0,200",    # purple
    "CTX": "100,100,100",  # grey
    "CNV": "150,80,0",     # brown
}

# Colour used for any SVTYPE without an entry in SV_COLORS (same grey as CTX).
_DEFAULT_COLOR = "100,100,100"

# The converter reads the fixed VCF columns up to and including INFO.
_MIN_VCF_FIELDS = 8

# INFO keys written as six-decimal floats, in output column order.
_AF_KEYS = (
    "AF",
    "POPMAX_AF",
    "AFR_AF",
    "AMR_AF",
    "ASN_AF",
    "EUR_AF",
    "SAN_AF",
)


def parseInfo(infoStr):
    """Parse a VCF INFO column into a dict.

    ``KEY=VALUE`` items map KEY to the VALUE string (split on the first
    ``=`` only, so values may themselves contain ``=``).  Items without
    ``=`` are flags and map to ``True``.  Empty items (e.g. from a trailing
    ``;``) and the ``.`` missing-value placeholder are skipped so they do
    not show up as bogus flag keys.
    """
    info = {}
    for item in infoStr.split(";"):
        if not item or item == ".":
            continue
        key, sep, value = item.partition("=")
        info[key] = value if sep else True
    return info


def toInt(s):
    """Return *s* as an int (truncating any fraction), or 0 if unusable.

    Empty/None values, valueless INFO flags (``True``), unparsable text
    and non-finite numbers such as ``inf``/``nan`` all yield 0.
    """
    # `s is True` is a valueless INFO flag, not a count of one.
    if not s or s is True:
        return 0
    try:
        return int(float(s))
    except (TypeError, ValueError, OverflowError):
        # ValueError: bad text or nan; OverflowError: int(inf).
        return 0


def toFloat(s):
    """Return *s* as a finite float, or 0.0 if unusable.

    Empty/None values, valueless INFO flags (``True``), unparsable text
    and non-finite results (``nan``, ``inf``) all yield 0.0, so the
    formatted output columns always hold plain decimal numbers.
    """
    if not s or s is True:
        return 0.0
    try:
        value = float(s)
    except (TypeError, ValueError):
        return 0.0
    return value if math.isfinite(value) else 0.0


def _infoText(info, key):
    """Return INFO[key] if it is a string value, else "" (missing or flag)."""
    value = info.get(key, "")
    return value if isinstance(value, str) else ""


def _vcfLineToBedRow(line):
    """Convert one VCF data line into the list of output column strings.

    Raises ValueError if the line has fewer than 8 tab-separated columns
    or if POS / END is not an integer.
    """
    # Bounded split: only CHROM..INFO are used, so leave FORMAT and any
    # per-sample genotype columns unsplit in the final element.
    fields = line.rstrip("\r\n").split("\t", _MIN_VCF_FIELDS)
    if len(fields) < _MIN_VCF_FIELDS:
        raise ValueError(
            f"expected at least {_MIN_VCF_FIELDS} tab-separated columns, "
            f"got {len(fields)}"
        )

    chrom = fields[0]
    pos = int(fields[1])
    name = fields[2]
    filt = fields[6]
    info = parseInfo(fields[7])

    svType = _infoText(info, "SVTYPE") or "."
    svLen = abs(toInt(info.get("SVLEN", "0")))

    # END missing, empty, or present only as a flag -> fall back to POS.
    endText = _infoText(info, "END")
    end = int(endText) if endText else pos

    # VCF POS is 1-based; BED start is 0-based, end is exclusive.
    chromStart = pos - 1
    chromEnd = end
    if chromEnd <= chromStart:
        chromEnd = chromStart + 1

    # Translocations: the END is on chr2; cap the item width to 1 bp
    # on the chromosome-1 side.
    chr2 = _infoText(info, "CHR2")
    if svType == "CTX" and chr2 and chr2 != chrom:
        chromEnd = chromStart + 1

    row = [
        chrom,
        str(chromStart),
        str(chromEnd),
        name,
        "0",              # score
        ".",              # strand
        str(chromStart),  # thickStart
        str(chromEnd),    # thickEnd
        SV_COLORS.get(svType, _DEFAULT_COLOR),
        svType,
        str(svLen),
        str(toInt(info.get("AC", "0"))),
        str(toInt(info.get("AN", "0"))),
    ]
    row.extend(f"{toFloat(info.get(key, '0')):.6f}" for key in _AF_KEYS)
    row.extend(
        [
            str(toInt(info.get("N_HET", "0"))),
            str(toInt(info.get("N_HOMALT", "0"))),
            _infoText(info, "ALGORITHMS"),
            _infoText(info, "SOURCE"),
            filt,
            chr2 if svType == "CTX" else "",
        ]
    )
    return row


def main():
    """Command-line entry point: ``input.vcf[.gz] output.bed``.

    Reads the VCF (gzip-compressed when the path ends in ``.gz``), skips
    header lines starting with ``#`` and blank lines, and writes one
    tab-separated row per record: the nine BED fields followed by SVTYPE,
    |SVLEN|, AC, AN, the allele-frequency fields, N_HET, N_HOMALT,
    ALGORITHMS, SOURCE, FILTER and (for CTX only) CHR2.

    Prints the module usage text and exits with status 1 when the argument
    count is wrong.  Raises ValueError, prefixed with the input path and
    line number, for a malformed record.
    """
    if len(sys.argv) != 3:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    inPath, outPath = sys.argv[1], sys.argv[2]
    opener = gzip.open if inPath.endswith(".gz") else open

    with opener(inPath, "rt") as fIn, open(outPath, "w") as fOut:
        for lineNum, line in enumerate(fIn, 1):
            if line.startswith("#") or not line.strip():
                continue
            try:
                row = _vcfLineToBedRow(line)
            except ValueError as err:
                raise ValueError(f"{inPath}, line {lineNum}: {err}") from err
            fOut.write("\t".join(row) + "\n")


if __name__ == "__main__":
    main()