9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py index 7859c2cedb7..cb2cfe65186 100644 --- src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py @@ -1,246 +1,239 @@ #!/usr/bin/env python3 """Convert 1KG ONT SVAN VCF to BED9+ for bigBed, adding allele counts from phased VCF. Usage: lrSv1kgOntVcfToBed.py svan.vcf.gz output.bed chrom.sizes [phased.vcf.gz] The optional phased VCF provides AC, AN, and AF per variant (matched by ID). SVs without a match get AC=-1 (written to AC column), alleleNumber=-1, alleleFreq=-1. The canonical `name` column omits the :AC suffix when AC is -1. """ import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName, normalizeSvType - -# Colors by SV class (R,G,B) -SV_COLORS = { - "INS": "0,0,200", # blue - "DEL": "200,0,0", # red - "CPX": "230,140,0", # orange -} +from lrSvCommon import svName, normalizeSvType, svColor def parseInfo(infoStr): """Parse INFO field into a dict.""" d = {} for item in infoStr.split(";"): if "=" in item: k, v = item.split("=", 1) d[k] = v else: d[item] = True return d def getSvClass(varId): """Extract raw SV class from the variant ID. Returns raw string; will be normalized by normalizeSvType (COMPLEX -> CPX).""" if "-INS-" in varId: return "INS" elif "-DEL-" in varId: return "DEL" elif "-COMPLEX-" in varId: return "COMPLEX" return "OTHER" def loadPhasedAC(phasedFile): """Load AC, AN, AF from phased VCF, keyed by variant ID.""" acMap = {} opener = gzip.open if phasedFile.endswith(".gz") else open with opener(phasedFile, "rt") as f: for line in f: if line.startswith("#"): continue fields = line.split("\t", 9) varId = fields[2] info = parseInfo(fields[7]) ac = int(info.get("AC", "0")) an = int(info.get("AN", "0")) af = float(info.get("AF", "0")) acMap[varId] = (ac, an, af) return acMap def main(): if len(sys.argv) < 4: print(__doc__, file=sys.stderr) sys.exit(1) inFile = sys.argv[1] outFile = sys.argv[2] chromSizesFile = sys.argv[3] phasedFile = sys.argv[4] if len(sys.argv) > 4 else None opener = gzip.open if inFile.endswith(".gz") else open # Load chrom sizes for clipping chromSizes = {} with open(chromSizesFile) as f: for line in f: c, s = line.strip().split("\t") chromSizes[c] = int(s) # Load allele counts from phased VCF acMap = {} if phasedFile: print(f"Loading allele counts from {phasedFile}...", file=sys.stderr) acMap = loadPhasedAC(phasedFile) print(f"Loaded {len(acMap)} variants from phased VCF", file=sys.stderr) skipped = 0 matched = 0 unmatched = 0 with opener(inFile, "rt") as fIn, open(outFile, "w") as fOut: for line in fIn: if line.startswith("#"): continue fields = line.rstrip("\n").split("\t") chrom = fields[0] pos = int(fields[1]) varId = fields[2] ref = fields[3] info = parseInfo(fields[7]) svTypeRaw = getSvClass(varId) svType = normalizeSvType(svTypeRaw) # BED coordinates: 0-based half-open chromStart = pos - 1 chromEnd = chromStart + len(ref) # Source lengths: INS_LEN for insertion size, DEL_LEN for deletion size insLenSrc = int(info.get("INS_LEN", "0")) delLenSrc = int(info.get("DEL_LEN", "0")) # For INS, the item spans only the anchor base; expand by 1 for visibility if svType == "INS" and chromEnd <= chromStart + 1: chromEnd = chromStart + 1 # svLen = span on reference svLen = chromEnd - chromStart # insLen = length of inserted sequence (INS only); 0 otherwise if svType == "INS": insLen = insLenSrc elif svType == "DEL": # DEL_LEN matches reference span insLen = 0 else: insLen = 0 # Insertion/deletion type insType = info.get("ITYPE_N", info.get("DTYPE_N", "")) # Transposon family family = info.get("FAM_N", "") # Percent resolved try: percResolved = float(info.get("PERC_RESOLVED", "0")) except ValueError: percResolved = 0.0 # TSD length try: tsdLen = int(info.get("TSD_LEN", "0")) except ValueError: tsdLen = 0 # Poly-A length try: polyaLen = int(info.get("POLYA_LEN", "0")) except ValueError: polyaLen = 0 # Conformation conformation = info.get("CONFORMATION", "") # Retrotransposon length try: rtLen = int(info.get("RT_LEN", "0")) except ValueError: rtLen = 0 # VNTR motif count try: nbMotifs = int(info.get("NB_MOTIFS", "0")) except ValueError: nbMotifs = 0 # Source gene (for processed pseudogenes) srcGene = info.get("SRC_GENE", "") # Number of exons retrotransposed try: nbExons = int(info.get("NB_EXONS", "0")) except ValueError: nbExons = 0 # Non-canonical MEI flag notCanonical = "Yes" if info.get("NOT_CANONICAL") else "" # Strand strand = info.get("STRAND", ".") if strand not in ("+", "-"): strand = "." - color = SV_COLORS.get(svType, "100,100,100") + color = svColor(svType) # Allele counts from phased VCF if varId in acMap: ac, an, af = acMap[varId] matched += 1 acForName = ac else: ac, an, af = -1, -1, -1.0 unmatched += 1 acForName = None # drop :AC suffix for SVAN-only rows # Clip to chrom sizes; skip records on unknown chroms if chrom not in chromSizes: skipped += 1 continue if chromEnd > chromSizes[chrom]: skipped += 1 continue featLen = insLen if svType in ("INS", "MEI") else svLen name = svName(svType, featLen, acForName) row = [ chrom, str(chromStart), str(chromEnd), name, "0", strand, str(chromStart), # thickStart str(chromEnd), # thickEnd color, svType, str(svLen), str(insLen), str(ac), insType, family, f"{percResolved:.2f}", str(tsdLen), str(polyaLen), conformation, str(rtLen), str(nbMotifs), srcGene, str(nbExons), notCanonical, str(an), f"{af:.4f}" if af >= 0 else "-1.0000", ] fOut.write("\t".join(row) + "\n") print(f"Skipped {skipped} records outside chrom sizes", file=sys.stderr) if acMap: print(f"Allele counts: {matched} matched, {unmatched} unmatched " f"({100*matched/(matched+unmatched):.1f}% matched)", file=sys.stderr) if __name__ == "__main__": main()