9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py index f2879004f6b..8aa12b9e016 100644 --- src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py @@ -1,130 +1,135 @@ #!/usr/bin/env python3 """Convert the Gustafson 2024 1000G ONT Jasmine-merged SV VCF to BED9+. Usage: lrSvGustafsonVcfToBed.py input.vcf.gz output.bed Source: https://s3.amazonaws.com/1000g-ont/Gustafson_etal_2024_preprint_SUPPLEMENTAL/ 20240423_jasmine_intrasample_noBND_custom_suppvec_alphanumeric_header_JASMINE.vcf.gz Paper: Gustafson et al. 2024, bioRxiv / Genome Res, PMID 39358015. """ import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName, normalizeSvType - -SV_COLORS = { - "DEL": "200,0,0", # red - "INS": "0,0,200", # blue - "DUP": "0,160,0", # green - "INV": "230,140,0", # orange -} +from lrSvCommon import svName, normalizeSvType, svColor # Jasmine END on chrM can overshoot by one base; clip to chrM length. CHRM_LEN = 16569 def openVcf(path): """Open a local .vcf.gz via gzip; everything else as plain text.""" return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "rt") def parseInfo(infoStr): d = {} for item in infoStr.split(";"): if "=" in item: k, v = item.split("=", 1) d[k] = v else: d[item] = True return d def main(): if len(sys.argv) != 3: print(__doc__, file=sys.stderr) sys.exit(1) inPath, outPath = sys.argv[1], sys.argv[2] + seen = set() + nIn = 0 + nDup = 0 with openVcf(inPath) as fIn, open(outPath, "w") as fOut: for line in fIn: if line.startswith("#"): continue + nIn += 1 fields = line.rstrip("\n").split("\t") chrom = fields[0] pos = int(fields[1]) rowName = fields[2] info = parseInfo(fields[7]) svTypeRaw = info.get("SVTYPE", ".") svType = normalizeSvType(svTypeRaw) end = int(info.get("END", pos)) try: svLenRaw = int(float(info.get("SVLEN", "0"))) except ValueError: svLenRaw = 0 try: supp = int(info.get("SUPP", "0")) except ValueError: supp = 0 try: varCalls = int(info.get("VARCALLS", "0")) except ValueError: varCalls = 0 precise = 1 if "PRECISE" in info else 0 strands = info.get("STRANDS", "") if strands == "??": strands = "" chromStart = pos - 1 chromEnd = end if chromEnd <= chromStart: chromEnd = chromStart + 1 if chrom == "chrM" and chromEnd > CHRM_LEN: chromEnd = CHRM_LEN svLen = chromEnd - chromStart if svType in ("INS", "MEI"): insLen = abs(svLenRaw) else: insLen = 0 # Gustafson callset is site-level without AC; 2 * SUPP # is the diploid carrier upper bound. ac = supp * 2 - color = SV_COLORS.get(svType, "100,100,100") + color = svColor(svType) featLen = insLen if svType in ("INS", "MEI") else svLen name = svName(svType, featLen, ac) row = [ chrom, str(chromStart), str(chromEnd), name, "0", ".", str(chromStart), str(chromEnd), color, svType, str(svLen), str(insLen), str(ac), str(supp), str(varCalls), str(precise), strands, ] - fOut.write("\t".join(row) + "\n") + line_out = "\t".join(row) + if line_out in seen: + nDup += 1 + continue + seen.add(line_out) + fOut.write(line_out + "\n") + + print(f"Gustafson: {nIn:,} input records, {nDup:,} duplicate rows dropped, " + f"{nIn - nDup:,} written", file=sys.stderr) if __name__ == "__main__": main()