src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py 9eb4e0937782954c19d664e7d384d210bffb3b25

9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published. Dropping it also removes its >5 Mb breakend
artifacts from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py
index c240ab5414c..025c4dd2e0e 100644
--- src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py
@@ -1,137 +1,141 @@
 #!/usr/bin/env python3
 """Convert the Chirmade 2026 SVatalog sv_annotations.tsv to BED9+.
 
 Source: https://zenodo.org/records/13367574 (sv_annotations.tsv)
 Paper:  Chirmade et al. 2026, Heredity (Edinb), PMID 41203876
 
 Coordinates in the source TSV are 1-based closed (End-Start+1 == Length).
 Translate to BED-style 0-based half-open (chromStart = Start - 1,
 chromEnd = End).
 
 Usage:
     lrSvChirmade101TsvToBed.py sv_annotations.tsv output.bed
 """
 
 import csv
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from lrSvCommon import svName, normalizeSvType
-
-SV_COLORS = {
-    "DEL": "200,0,0",      # red
-    "INS": "0,0,200",      # blue
-    "DUP": "0,160,0",      # green
-    "INV": "230,140,0",    # orange
-    "CPX": "140,0,200",    # purple
-}
+from lrSvCommon import svName, normalizeSvType, svColor
 
 
 def na(val):
     """Return '' for missing ('NA' or empty) source values."""
     if val is None:
         return ""
     v = val.strip()
     if v == "" or v == "NA":
         return ""
     return v
 
 
 def toInt(s):
     if not s:
         return 0
     try:
         return int(float(s))
     except ValueError:
         return 0
 
 
 def main():
     if len(sys.argv) != 3:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
 
     inPath, outPath = sys.argv[1], sys.argv[2]
 
+    seen = set()
+    nIn = 0
+    nDup = 0
     with open(inPath, newline="") as fIn, open(outPath, "w") as fOut:
         reader = csv.DictReader(fIn, delimiter="\t")
         for row in reader:
+            nIn += 1
             chrom = row["Chromosome"]
             if not chrom.startswith("chr"):
                 chrom = "chr" + chrom
 
             # 1-based closed -> 0-based half-open
             chromStart = toInt(row["Start"]) - 1
             chromEnd = toInt(row["End"])
             if chromEnd <= chromStart:
                 chromEnd = chromStart + 1
 
             svTypeRaw = row["Type"]  # lowercase del/ins/dup/inv/complex
             svType = normalizeSvType(svTypeRaw)
             srcLen = abs(toInt(row["Length"]))
             svLen = chromEnd - chromStart
             if svType in ("INS", "MEI"):
                 insLen = srcLen
             else:
                 insLen = 0
-            color = SV_COLORS.get(svType, "100,100,100")
+            color = svColor(svType)
 
             # Chirmade catalog is site-level without AC. Use -1 as placeholder
             # so svName drops the :AC suffix.
             ac = -1
 
             featLen = insLen if svType in ("INS", "MEI") else svLen
             name = svName(svType, featLen, ac)
 
             bedRow = [
                 chrom,
                 str(chromStart),
                 str(chromEnd),
                 name,
                 "0",
                 ".",
                 str(chromStart),
                 str(chromEnd),
                 color,
                 svType,
                 str(svLen),
                 str(insLen),
                 str(ac),
                 str(toInt(row.get("GC (%)", "0"))),
                 na(row.get("Cytoband", "")),
                 str(toInt(row.get("Gene Count", "0"))),
                 na(row.get("Gene Name(s)", "")),
                 na(row.get("Gene at Start", "")),
                 na(row.get("Gene at End", "")),
                 na(row.get("Exon Name", "")),
                 na(row.get("CDS Name", "")),
                 na(row.get("Dark Genes % Overlap", "")),
                 na(row.get("ClinGen Haploinsufficient", "")),
                 na(row.get("ClinGen Triplosensitive", "")),
                 na(row.get("gnomAD O/E LoF Upper", "")),
                 na(row.get("gnomAD O/E Mis Upper", "")),
                 na(row.get("gnomAD pLI", "")),
                 na(row.get("gnomAD pRec", "")),
                 na(row.get("Repeat % Overlap", "")),
                 na(row.get("Dirty Region % Overlap", "")),
                 na(row.get("Chromosome Region", "")),
                 na(row.get("CGD", "")),
                 na(row.get("OMIM Pheno", "")),
                 na(row.get("OMIM Inh", "")),
                 na(row.get("ClinGen Region", "")),
                 na(row.get("Decipher Region", "")),
                 na(row.get("ClinVar VarID", "")),
                 na(row.get("gnomAD AF Max 90% RO", "")),
                 na(row.get("gnomAD Population AF Max 90% RO", "")),
                 na(row.get("gnomAD Hom/Ref Frequency 90% RO", "")),
                 na(row.get("gnomAD Het Frequency 90% RO", "")),
                 na(row.get("gnomAD Hom/Alt Frequency 90% RO", "")),
                 na(row.get("DGV % Overlap", "")),
                 na(row.get("DGV 50% RO", "")),
             ]
-            fOut.write("\t".join(bedRow) + "\n")
+            line_out = "\t".join(bedRow)
+            if line_out in seen:
+                nDup += 1
+                continue
+            seen.add(line_out)
+            fOut.write(line_out + "\n")
+
+    print(f"Chirmade101: {nIn:,} input records, {nDup:,} duplicate rows dropped, "
+          f"{nIn - nDup:,} written", file=sys.stderr)
 
 
 if __name__ == "__main__":
     main()