src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py bac95a147f49cd331052e597006e04b3deee40fc

bac95a147f49cd331052e597006e04b3deee40fc
max
  Wed Apr 22 10:43:20 2026 -0700
lrSv/srSv: human-readable SV type filter labels, script cleanups

Add human-readable labels to the supertrack-level svType filter on
both the lrSv and srSv supertracks using the "CODE|CODE (Long name)"
filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)",
etc. Labels keep the short code up front so users can match what
hgTracks shows next to each feature.

Also sweep in the in-progress converter/as-file cleanups under
scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py
helpers, consistent insLen / svLen / AC column naming, tightened
field-description text) that had been piling up as an unstaged
working tree.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py
index 2d5008bef83..ba9cb38fb65 100644
--- src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py
@@ -1,40 +1,60 @@
 #!/usr/bin/env python3
 """Convert the Kim 2026 PD long-read SV catalog (media-13.txt) to BED9+.
 
 Usage:
     lrSvKwanhoTsvToBed.py media-13.txt output.bed
 
 The source TSV has thousands-separator commas inside quoted numeric fields
 (e.g. "10,889"), so we parse it with the csv module.
 """
 
 import csv
+import os
 import re
 import sys
 
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from lrSvCommon import svName, normalizeSvType
+
 SV_COLORS = {
     "DEL": "200,0,0",      # red
     "INS": "0,0,200",      # blue
     "DUP": "0,160,0",      # green
     "INV": "230,140,0",    # orange
 }
 
 
 def toInt(s):
+    # Kim et al. media-13.txt stores many integer fields as a Python tuple
+    # repr, e.g. "(4,)" for single-sample or "(4, 2)" for multi-sample. We
+    # return the sum across the tuple, or 0 if any element is non-numeric.
     if s is None or s == "":
         return 0
+    s = s.strip()
+    if s.startswith("(") and s.endswith(")"):
+        inner = s[1:-1]
+        total = 0
+        for p in inner.split(","):
+            p = p.strip()
+            if not p:
+                continue
+            try:
+                total += int(float(p))
+            except ValueError:
+                return 0
+        return total
     s = s.replace(",", "")
     try:
         return int(float(s))
     except ValueError:
         return 0
 
 
 def toFloat(s):
     if s is None or s == "":
         return 0.0
     s = s.strip().rstrip("%")
     s = s.replace(",", "")
     try:
         return float(s)
     except ValueError:
@@ -75,65 +95,81 @@
 
     inPath, outPath = sys.argv[1], sys.argv[2]
 
     with open(inPath, newline="") as fIn, open(outPath, "w") as fOut:
         reader = csv.DictReader(fIn, delimiter="\t")
         for row in reader:
             chrom = row["Chromosome"]
             if not chrom.startswith("chr"):
                 chrom = "chr" + chrom
 
             chromStart = toInt(row["Start"])
             chromEnd = toInt(row["End"])
             if chromEnd <= chromStart:
                 chromEnd = chromStart + 1
 
-            svType = row["SV type"]
-            svLen = abs(toInt(row["SV length"]))
+            svTypeRaw = row["SV type"]
+            svType = normalizeSvType(svTypeRaw)
+            srcSvLen = abs(toInt(row["SV length"]))
+            svLen = chromEnd - chromStart
+            if svType in ("INS", "MEI"):
+                insLen = srcSvLen
+            else:
+                insLen = 0
 
             color = SV_COLORS.get(svType, "100,100,100")
 
             pdStr, nPd = carrierList(row.get("PD CARRIERS", ""))
             hcStr, nHc = carrierList(row.get("HC CARRIERS", ""))
             ilbdStr, nIlbd = carrierList(row.get("ILBD CARRIERS", ""))
 
+            acPd = toInt(row.get("AC PD", "0"))
+            acHc = toInt(row.get("AC HC", "0"))
+            acIlbd = toInt(row.get("AC ILBD", "0"))
+            ac = acPd + acHc + acIlbd
+
+            featLen = insLen if svType in ("INS", "MEI") else svLen
+            name = svName(svType, featLen, ac)
+
             bedRow = [
                 chrom,
                 str(chromStart),
                 str(chromEnd),
-                row["ID"],
+                name,
                 "0",
                 ".",
                 str(chromStart),
                 str(chromEnd),
                 color,
                 svType,
                 str(svLen),
+                str(insLen),
+                str(ac),
                 row.get("Size bin", ""),
                 str(toInt(row.get("qual", "0"))),
                 str(toInt(row.get("SUPP", "0"))),
                 row.get("SUPP VEC", ""),
                 f"{toFloat(row.get('MISSING RATE', '0')):.6f}",
                 f"{pctToFrac(row.get('CASE RATE', '0')):.6f}",
                 f"{pctToFrac(row.get('CONTROL RATE', '0')):.6f}",
                 f"{toFloat(row.get('DIFFERENTIAL RATE', '0')):.6f}",
                 f"{toFloat(row.get('AF PD', '0')):.6f}",
                 f"{toFloat(row.get('AF HC', '0')):.6f}",
                 f"{toFloat(row.get('AF ILBD', '0')):.6f}",
-                str(toInt(row.get("AC PD", "0"))),
-                str(toInt(row.get("AC HC", "0"))),
-                str(toInt(row.get("AC ILBD", "0"))),
+                str(acPd),
+                str(acHc),
+                str(acIlbd),
                 str(toInt(row.get("AN PD", "0"))),
                 str(toInt(row.get("AN HC", "0"))),
                 str(toInt(row.get("AN ILBD", "0"))),
                 str(nPd),
                 str(nHc),
                 str(nIlbd),
                 str(toInt(row.get("LD SNPS COUNT", "0"))),
                 str(toInt(row.get("TOTAL SNPS NEARBY", "0"))),
                 f"{toFloat(row.get('AVG MAP QUALITY', '0')):.3f}",
                 f"{toFloat(row.get('AVG READS PER SAMPLE', '0')):.3f}",
                 pdStr,
                 hcStr,
                 ilbdStr,
             ]
             fOut.write("\t".join(bedRow) + "\n")