src/hg/makeDb/scripts/lrSv/lrSvHgsvc3TsvToBed.py bac95a147f49cd331052e597006e04b3deee40fc

bac95a147f49cd331052e597006e04b3deee40fc
max
  Wed Apr 22 10:43:20 2026 -0700
lrSv/srSv: human-readable SV type filter labels, script cleanups

Add human-readable labels to the supertrack-level svType filter on
both the lrSv and srSv supertracks using the "CODE|CODE (Long name)"
filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)",
etc. Labels keep the short code up front so users can match what
hgTracks shows next to each feature.

Also sweep in the in-progress converter/as-file cleanups under
scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py
helpers, consistent insLen / svLen / AC column naming, tightened
field-description text) that had been piling up as an unstaged
working tree.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvHgsvc3TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvHgsvc3TsvToBed.py
index dc976967aeb..92ce799b02b 100644
--- src/hg/makeDb/scripts/lrSv/lrSvHgsvc3TsvToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvHgsvc3TsvToBed.py
@@ -1,132 +1,147 @@
 #!/usr/bin/env python3
 """Convert HGSVC3 final annotation TSVs (insdel + inv) to a merged BED9+ file.
 
 Usage:
     lrSvHgsvc3TsvToBed.py insdel.tsv.gz inv.tsv.gz output.bed
 """
 
 import csv
 import gzip
+import os
 import sys
 
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from lrSvCommon import svName, normalizeSvType
+
 # "R,G,B" color strings keyed by SV type code:
 # DEL is red, INS is blue, INV is orange.
 SV_COLORS = dict(
     DEL="200,0,0",
     INS="0,0,200",
     INV="230,140,0",
 )
 
 
 def openTsv(path):
     """Open path for text reading, via gzip when the name ends in .gz."""
     if path.endswith(".gz"):
         return gzip.open(path, "rt")
     return open(path, "rt")
 
 
 def countSamples(mergeSamples):
     """Return (haplotypeCount, uniqSampleCount) for a MERGE_SAMPLES value.
 
     MERGE_SAMPLES is a comma-separated list of haplotype IDs like
     HG00096-h1,HG00096-h2,HG00514-un. Haplotype count is the number of
     non-blank entries in that list; blank entries (from stray or trailing
     commas) are ignored so they never inflate the count. Unique sample
     count strips one trailing -h1/-h2/-un suffix from each entry before
     de-duplicating.
 
     An empty or None value yields (0, 0).
     """
     if not mergeSamples:
         return 0, 0
     # Drop blank entries up front so both counts see the same list.
     haplotypes = [p.strip() for p in mergeSamples.split(",")]
     haplotypes = [p for p in haplotypes if p]
     uniq = set()
     for hap in haplotypes:
         for suf in ("-h1", "-h2", "-un"):
             if hap.endswith(suf):
                 hap = hap[: -len(suf)]
                 break
         uniq.add(hap)
     return len(haplotypes), len(uniq)
 
 
 def emit(outF, row, typeExtra):
     """Write one BED row built from a single source-TSV record.
 
     outF is a writable text file; row is a dict keyed by the TSV header
     names (as yielded by csv.DictReader in main()); typeExtra is a dict of
     the fields that differ between insdel and inv (homRef, homTig,
     regionRefInner, te). Keys missing from typeExtra are written as empty
     strings.
 
     Raises KeyError if a required column (#CHROM, POS, END, SVTYPE, SVLEN)
     is absent and ValueError if POS or END is not an integer.
     """
     chrom = row["#CHROM"]
     pos = int(row["POS"])  # 0-based in source TSV based on sample inspection
     end = int(row["END"])
-    svType = row["SVTYPE"]
+    # normalizeSvType is imported from lrSvCommon (not visible here);
+    # presumably it maps source labels onto the shared type codes -- confirm.
+    svType = normalizeSvType(row["SVTYPE"])
     # A non-integer SVLEN is treated as 0 instead of aborting the run.
     try:
-        svLen = int(row["SVLEN"])
+        svLenSrc = int(row["SVLEN"])
     except ValueError:
-        svLen = 0
-    absSvLen = abs(svLen)
+        svLenSrc = 0
+    absSvLen = abs(svLenSrc)
 
     # Source coords are already 0-based half-open (chr1 POS=10669 END=10901
     # matches a 232-bp deletion starting at 10670 in 1-based). Use as-is.
     chromStart = pos
     chromEnd = end
     # Force at least a 1-base interval when END does not exceed POS.
     if chromEnd <= chromStart:
         chromEnd = chromStart + 1
 
+    # svLen is the reference span of the emitted interval; insLen carries
+    # the source SVLEN magnitude for INS/MEI only and is 0 for other types.
+    svLen = chromEnd - chromStart
+    if svType in ("INS", "MEI"):
+        insLen = absSvLen
+    else:
+        insLen = 0
+
     haploCount, sampleCount = countSamples(row.get("MERGE_SAMPLES", ""))
+    # The AC column is the haplotype count derived from MERGE_SAMPLES.
+    ac = haploCount
 
     # Types without an SV_COLORS entry fall back to gray.
     color = SV_COLORS.get(svType, "100,100,100")
 
     refTrf = row.get("REF_TRF", "")
     # Blank or unparsable REF_SD becomes 0.0.
     try:
         refSd = float(row.get("REF_SD", "0") or 0)
     except ValueError:
         refSd = 0.0
 
+    # The name is built from the inserted length for INS/MEI and from the
+    # reference span otherwise; svName is imported from lrSvCommon.
+    featLen = insLen if svType in ("INS", "MEI") else svLen
+    name = svName(svType, featLen, ac)
+
     # Column order must match the track's field definitions; the second
     # chromStart/chromEnd pair fills the BED thickStart/thickEnd slots.
     bedRow = [
         chrom,
         str(chromStart),
         str(chromEnd),
-        row["ID"],
+        name,
         "0",
         ".",
         str(chromStart),
         str(chromEnd),
         color,
         svType,
-        str(absSvLen),
-        str(haploCount),
+        str(svLen),
+        str(insLen),
+        str(ac),
         str(sampleCount),
         typeExtra.get("homRef", ""),
         typeExtra.get("homTig", ""),
         typeExtra.get("regionRefInner", ""),
         typeExtra.get("te", ""),
         refTrf,
         f"{refSd:.6f}",
         row.get("MERGE_SAMPLES", ""),
         row.get("WIN_500", ""),
         row.get("WIN_2K", ""),
     ]
     outF.write("\t".join(bedRow) + "\n")
 
 
 def main():
     """Convert the insdel and inv TSVs named on the command line to one BED.
 
     Expects exactly three arguments (insdel TSV, inv TSV, output BED);
     otherwise prints the module usage text to stderr and exits with
     status 1. Rows are written in input order: all insdel rows first,
     then all inv rows.
     """
     args = sys.argv[1:]
     if len(args) != 3:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
     insdelPath, invPath, outPath = args
 
     # (input path, {typeExtra key: source column}) in output order.
     inputs = (
         (insdelPath, {"homRef": "HOM_REF", "homTig": "HOM_TIG", "te": "TE"}),
         (invPath, {"regionRefInner": "RGN_REF_INNER"}),
     )
 
     with open(outPath, "w") as outF:
         for tsvPath, extraCols in inputs:
             with openTsv(tsvPath) as fIn:
                 for row in csv.DictReader(fIn, delimiter="\t"):
                     extras = {
                         key: row.get(col, "")
                         for key, col in extraCols.items()
                     }
                     emit(outF, row, extras)
 
 
 # Run the conversion only when executed as a script, not when imported.
 if __name__ == "__main__":
     main()