9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published, which also removes its >5 Mb breakend artifacts
from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py
index f2879004f6b..8aa12b9e016 100644
--- src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvGustafsonVcfToBed.py
@@ -4,70 +4,67 @@
 Usage:
     lrSvGustafsonVcfToBed.py input.vcf.gz output.bed
 
 Source:
     https://s3.amazonaws.com/1000g-ont/Gustafson_etal_2024_preprint_SUPPLEMENTAL/
     20240423_jasmine_intrasample_noBND_custom_suppvec_alphanumeric_header_JASMINE.vcf.gz
 Paper:
     Gustafson et al. 2024, bioRxiv / Genome Res, PMID 39358015.
 """
 
 import gzip
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from lrSvCommon import svName, normalizeSvType
-
-SV_COLORS = {
-    "DEL": "200,0,0",      # red
-    "INS": "0,0,200",      # blue
-    "DUP": "0,160,0",      # green
-    "INV": "230,140,0",    # orange
-}
+from lrSvCommon import svName, normalizeSvType, svColor
 
 # Jasmine END on chrM can overshoot by one base; clip to chrM length.
 CHRM_LEN = 16569
 
 
 def openVcf(path):
     """Open a local .vcf.gz via gzip; everything else as plain text."""
     return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "rt")
 
 
 def parseInfo(infoStr):
     d = {}
     for item in infoStr.split(";"):
         if "=" in item:
             k, v = item.split("=", 1)
             d[k] = v
         else:
             d[item] = True
     return d
 
 
 def main():
     if len(sys.argv) != 3:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
 
     inPath, outPath = sys.argv[1], sys.argv[2]
 
+    seen = set()
+    nIn = 0
+    nDup = 0
     with openVcf(inPath) as fIn, open(outPath, "w") as fOut:
         for line in fIn:
             if line.startswith("#"):
                 continue
+            nIn += 1
             fields = line.rstrip("\n").split("\t")
             chrom = fields[0]
             pos = int(fields[1])
             rowName = fields[2]
             info = parseInfo(fields[7])
 
             svTypeRaw = info.get("SVTYPE", ".")
             svType = normalizeSvType(svTypeRaw)
             end = int(info.get("END", pos))
             try:
                 svLenRaw = int(float(info.get("SVLEN", "0")))
             except ValueError:
                 svLenRaw = 0
             try:
                 supp = int(info.get("SUPP", "0"))
@@ -87,44 +84,52 @@
             if chromEnd <= chromStart:
                 chromEnd = chromStart + 1
             if chrom == "chrM" and chromEnd > CHRM_LEN:
                 chromEnd = CHRM_LEN
 
             svLen = chromEnd - chromStart
             if svType in ("INS", "MEI"):
                 insLen = abs(svLenRaw)
             else:
                 insLen = 0
 
             # Gustafson callset is site-level without AC; 2 * SUPP
             # is the diploid carrier upper bound.
             ac = supp * 2
 
-            color = SV_COLORS.get(svType, "100,100,100")
+            color = svColor(svType)
 
             featLen = insLen if svType in ("INS", "MEI") else svLen
             name = svName(svType, featLen, ac)
 
             row = [
                 chrom,
                 str(chromStart),
                 str(chromEnd),
                 name,
                 "0",
                 ".",
                 str(chromStart),
                 str(chromEnd),
                 color,
                 svType,
                 str(svLen),
                 str(insLen),
                 str(ac),
                 str(supp),
                 str(varCalls),
                 str(precise),
                 strands,
             ]
-            fOut.write("\t".join(row) + "\n")
+            line_out = "\t".join(row)
+            if line_out in seen:
+                nDup += 1
+                continue
+            seen.add(line_out)
+            fOut.write(line_out + "\n")
+
+    print(f"Gustafson: {nIn:,} input records, {nDup:,} duplicate rows dropped, "
+          f"{nIn - nDup:,} written", file=sys.stderr)
 
 
 if __name__ == "__main__":
     main()