bac95a147f49cd331052e597006e04b3deee40fc
max
  Wed Apr 22 10:43:20 2026 -0700
lrSv/srSv: human-readable SV type filter labels, script cleanups

Add human-readable labels to the supertrack-level svType filter on
both the lrSv and srSv supertracks using the "CODE|CODE (Long name)"
filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)",
etc. Labels keep the short code up front so users can match what
hgTracks shows next to each feature.

Also sweep in the in-progress converter/as-file cleanups under
scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py
helpers, consistent insLen / svLen / AC column naming, tightened
field-description text) that had been piling up as unstaged changes
in the working tree.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py
index b790c617518..c240ab5414c 100644
--- src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py
@@ -1,118 +1,137 @@
 #!/usr/bin/env python3
 """Convert the Chirmade 2026 SVatalog sv_annotations.tsv to BED9+.
 
 Source: https://zenodo.org/records/13367574 (sv_annotations.tsv)
 Paper:  Chirmade et al. 2026, Heredity (Edinb), PMID 41203876
 
 Coordinates in the source TSV are 1-based closed (End-Start+1 == Length).
 Translate to BED-style 0-based half-open (chromStart = Start - 1,
 chromEnd = End).
 
 Usage:
     lrSvChirmade101TsvToBed.py sv_annotations.tsv output.bed
 """
 
 import csv
+import os
 import sys
 
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from lrSvCommon import svName, normalizeSvType
+
 SV_COLORS = {
-    "del":     "200,0,0",      # red
-    "ins":     "0,0,200",      # blue
-    "dup":     "0,160,0",      # green
-    "inv":     "230,140,0",    # orange
-    "complex": "140,0,200",    # purple
+    "DEL": "200,0,0",      # red
+    "INS": "0,0,200",      # blue
+    "DUP": "0,160,0",      # green
+    "INV": "230,140,0",    # orange
+    "CPX": "140,0,200",    # purple
 }
 
 
 def na(val):
     """Return '' for missing ('NA' or empty) source values."""
     if val is None:
         return ""
     v = val.strip()
     if v == "" or v == "NA":
         return ""
     return v
 
 
 def toInt(s):
     if not s:
         return 0
     try:
         return int(float(s))
     except ValueError:
         return 0
 
 
 def main():
     if len(sys.argv) != 3:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
 
     inPath, outPath = sys.argv[1], sys.argv[2]
 
     with open(inPath, newline="") as fIn, open(outPath, "w") as fOut:
         reader = csv.DictReader(fIn, delimiter="\t")
         for row in reader:
             chrom = row["Chromosome"]
             if not chrom.startswith("chr"):
                 chrom = "chr" + chrom
 
             # 1-based closed -> 0-based half-open
             chromStart = toInt(row["Start"]) - 1
             chromEnd = toInt(row["End"])
             if chromEnd <= chromStart:
                 chromEnd = chromStart + 1
 
-            svType = row["Type"]
-            svLen = abs(toInt(row["Length"]))
+            svTypeRaw = row["Type"]  # lowercase del/ins/dup/inv/complex
+            svType = normalizeSvType(svTypeRaw)
+            srcLen = abs(toInt(row["Length"]))
+            svLen = chromEnd - chromStart
+            if svType in ("INS", "MEI"):
+                insLen = srcLen
+            else:
+                insLen = 0
             color = SV_COLORS.get(svType, "100,100,100")
 
+            # Chirmade catalog is site-level without AC. Use -1 as placeholder
+            # so svName drops the :AC suffix.
+            ac = -1
+
+            featLen = insLen if svType in ("INS", "MEI") else svLen
+            name = svName(svType, featLen, ac)
+
             bedRow = [
                 chrom,
                 str(chromStart),
                 str(chromEnd),
-                row["ID"],
+                name,
                 "0",
                 ".",
                 str(chromStart),
                 str(chromEnd),
                 color,
                 svType,
                 str(svLen),
+                str(insLen),
+                str(ac),
                 str(toInt(row.get("GC (%)", "0"))),
                 na(row.get("Cytoband", "")),
                 str(toInt(row.get("Gene Count", "0"))),
                 na(row.get("Gene Name(s)", "")),
                 na(row.get("Gene at Start", "")),
                 na(row.get("Gene at End", "")),
                 na(row.get("Exon Name", "")),
                 na(row.get("CDS Name", "")),
                 na(row.get("Dark Genes % Overlap", "")),
                 na(row.get("ClinGen Haploinsufficient", "")),
                 na(row.get("ClinGen Triplosensitive", "")),
                 na(row.get("gnomAD O/E LoF Upper", "")),
                 na(row.get("gnomAD O/E Mis Upper", "")),
                 na(row.get("gnomAD pLI", "")),
                 na(row.get("gnomAD pRec", "")),
                 na(row.get("Repeat % Overlap", "")),
                 na(row.get("Dirty Region % Overlap", "")),
                 na(row.get("Chromosome Region", "")),
                 na(row.get("CGD", "")),
                 na(row.get("OMIM Pheno", "")),
                 na(row.get("OMIM Inh", "")),
                 na(row.get("ClinGen Region", "")),
                 na(row.get("Decipher Region", "")),
                 na(row.get("ClinVar VarID", "")),
                 na(row.get("gnomAD AF Max 90% RO", "")),
                 na(row.get("gnomAD Population AF Max 90% RO", "")),
                 na(row.get("gnomAD Hom/Ref Frequency 90% RO", "")),
                 na(row.get("gnomAD Het Frequency 90% RO", "")),
                 na(row.get("gnomAD Hom/Alt Frequency 90% RO", "")),
                 na(row.get("DGV % Overlap", "")),
                 na(row.get("DGV 50% RO", "")),
             ]
             fOut.write("\t".join(bedRow) + "\n")
 
 
 if __name__ == "__main__":
     main()