src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py 9eb4e0937782954c19d664e7d384d210bffb3b25

9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published. Dropping it also removes its >5 Mb breakend
artifacts from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py
index 550f491c33e..e0c36c99526 100644
--- src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py
@@ -7,86 +7,95 @@
   Cancer genes, ACMG genes, OMIM CDS, Disease CDS, Cancer CDS, ACMG CDS,
   Regulatory element, SegDUP, Tandem repeats, Other LR datasets,
   Detected in AoU SR, eQTLs, GWAS, SV-trait associations,
   SR validation, LR assembly-supported, Locityper validation
 
 Usage:
     lrSvAou1kCsvToBed.py input.csv.gz output.bed
 """
 
 import csv
 import gzip
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from lrSvCommon import svName, normalizeSvType
+from lrSvCommon import svName, normalizeSvType, svColor
 
 # AoU Phase-I (2025) SV panel: 1,027 samples => 2,054 alleles (diploid)
 # No per-variant AC in the site-level release; approximate AC as
 # round(max_popAF * N_alleles) for naming purposes only.
 AOU_N_ALLELES = 2054
 
-SV_COLORS = {
-    "DEL": "200,0,0",
-    "INS": "0,0,200",
-}
+
+def encodeNonAscii(s):
+    """Replace non-ASCII characters with numeric HTML entities so detail
+    pages render them correctly instead of as mojibake. The source CSV has
+    gene/trait text with accented names, Greek letters, curly quotes and
+    en-dashes (e.g. ö, β, ’, –)."""
+    return "".join(c if ord(c) < 128 else f"&#{ord(c)};" for c in s)
+
 
 def na(val):
-    """Return empty string for NA values."""
+    """Return empty string for NA values, else the value with any non-ASCII
+    characters numeric-entity encoded."""
     if val == "NA" or val == "No" or val == "":
         return ""
-    return val
+    return encodeNonAscii(val)
 
 def main():
     if len(sys.argv) != 3:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
 
     inFile, outFile = sys.argv[1], sys.argv[2]
 
+    seen = set()
+    nIn = 0
+    nDup = 0
     with gzip.open(inFile, "rt") as fIn, open(outFile, "w") as fOut:
         reader = csv.reader(fIn)
         header = next(reader)
 
         for row in reader:
+            nIn += 1
             coord = row[0]       # chr1:10627
             svTypeRaw = row[2]
             svType = normalizeSvType(svTypeRaw)
             svLenSrc = int(row[3])
 
             # Parse coordinate (1-based position)
             chrom, posStr = coord.split(":")
             pos = int(posStr)
 
             # BED is 0-based half-open
             chromStart = pos - 1
             if svType == "DEL":
                 chromEnd = chromStart + svLenSrc
             else:
                 # INS: place at insertion site, 1 bp wide
                 chromEnd = chromStart + 1
 
             svLen = chromEnd - chromStart
             # insLen: the source "SV length" represents the INS payload for INS
             # and 0 for DEL (where it equals reference span)
             if svType in ("INS", "MEI"):
                 insLen = svLenSrc
             else:
                 insLen = 0
 
-            color = SV_COLORS.get(svType, "100,100,100")
+            color = svColor(svType)
 
             # Parse population AFs (column 5): "0.001,0.002,0.003,0.004,0.005"
             afStr = row[5]
             afParts = afStr.split(",")
             try:
                 afAfr = float(afParts[0])
                 afAmr = float(afParts[1])
                 afEas = float(afParts[2])
                 afEur = float(afParts[3])
                 afSas = float(afParts[4])
             except (ValueError, IndexError):
                 afAfr = afAmr = afEas = afEur = afSas = 0.0
 
             fst = na(row[6])
 
@@ -138,19 +147,27 @@
                 f"{afSas:.6f}",
                 fst,
                 omimGenes,
                 diseaseGenes,
                 cancerGenes,
                 acmgGenes,
                 regElement,
                 segDup,
                 tandemRepeat,
                 otherLr,
                 detectedSr,
                 eqtls,
                 gwas,
                 traitAssoc,
             ]
-            fOut.write("\t".join(bedRow) + "\n")
+            line_out = "\t".join(bedRow)
+            if line_out in seen:
+                nDup += 1
+                continue
+            seen.add(line_out)
+            fOut.write(line_out + "\n")
+
+    print(f"AoU 1K: {nIn:,} input records, {nDup:,} duplicate rows dropped, "
+          f"{nIn - nDup:,} written", file=sys.stderr)
 
 if __name__ == "__main__":
     main()