9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvVcfToBed.py index 754dec4aa9b..6c48315c4d2 100644 --- src/hg/makeDb/scripts/lrSv/lrSvVcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvVcfToBed.py @@ -1,37 +1,28 @@ #!/usr/bin/env python3 """Convert a SURVIVOR-merged SV VCF (site-only) to BED9+ for bigBed. Usage: lrSvVcfToBed.py input.vcf.gz output.bed """ import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName, normalizeSvType, insLenFor - -# Colors by SV type (R,G,B) -SV_COLORS = { - "DEL": "200,0,0", # red - "INS": "0,0,200", # blue - "DUP": "0,160,0", # green - "INV": "230,140,0", # orange - "TRA": "140,0,200", # purple -} +from lrSvCommon import svName, normalizeSvType, insLenFor, svColor def parseInfo(infoStr): """Parse INFO field into a dict.""" d = {} for item in infoStr.split(";"): if "=" in item: k, v = item.split("=", 1) d[k] = v else: d[item] = True return d def suppVecToList(suppVec): """Convert binary support vector to comma-separated 1-based sample indices.""" indices = [] @@ -76,31 +67,31 @@ # For INS, END == POS so the item has zero width; expand by 1 bp chromEnd = end if chromEnd <= chromStart: chromEnd = chromStart + 1 # Score: map QUAL to 0-1000 try: score = min(int(round(float(qual) * 2)), 1000) except ValueError: score = 0 # Strand from first character of STRANDS field strand = strands[0] if strands and strands[0] in "+-" else "." - color = SV_COLORS.get(svType, "100,100,100") + color = svColor(svType) # sampleList from SUPP_VEC sampleList = suppVecToList(suppVec) # end2 for TRA; empty for non-TRA so skipEmptyFields hides them end2 = str(end) if svType == "TRA" else "" chr2Out = chr2 if svType == "TRA" else "" # For TRA, chromEnd is the position on chr1 side, not chr2 if svType == "TRA": chromEnd = chromStart + 1 # svLen: length on reference svLen = chromEnd - chromStart # insLen: for INS use abs(SVLEN); else 0 (except TRA which is 0)