9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py index 7859c2cedb7..cb2cfe65186 100644 --- src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py @@ -2,38 +2,31 @@ """Convert 1KG ONT SVAN VCF to BED9+ for bigBed, adding allele counts from phased VCF. Usage: lrSv1kgOntVcfToBed.py svan.vcf.gz output.bed chrom.sizes [phased.vcf.gz] The optional phased VCF provides AC, AN, and AF per variant (matched by ID). SVs without a match get AC=-1 (written to AC column), alleleNumber=-1, alleleFreq=-1. The canonical `name` column omits the :AC suffix when AC is -1. """ import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName, normalizeSvType - -# Colors by SV class (R,G,B) -SV_COLORS = { - "INS": "0,0,200", # blue - "DEL": "200,0,0", # red - "CPX": "230,140,0", # orange -} +from lrSvCommon import svName, normalizeSvType, svColor def parseInfo(infoStr): """Parse INFO field into a dict.""" d = {} for item in infoStr.split(";"): if "=" in item: k, v = item.split("=", 1) d[k] = v else: d[item] = True return d def getSvClass(varId): """Extract raw SV class from the variant ID. Returns raw string; will be normalized by normalizeSvType (COMPLEX -> CPX).""" @@ -172,31 +165,31 @@ # Number of exons retrotransposed try: nbExons = int(info.get("NB_EXONS", "0")) except ValueError: nbExons = 0 # Non-canonical MEI flag notCanonical = "Yes" if info.get("NOT_CANONICAL") else "" # Strand strand = info.get("STRAND", ".") if strand not in ("+", "-"): strand = "." - color = SV_COLORS.get(svType, "100,100,100") + color = svColor(svType) # Allele counts from phased VCF if varId in acMap: ac, an, af = acMap[varId] matched += 1 acForName = ac else: ac, an, af = -1, -1, -1.0 unmatched += 1 acForName = None # drop :AC suffix for SVAN-only rows # Clip to chrom sizes; skip records on unknown chroms if chrom not in chromSizes: skipped += 1 continue