src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py bac95a147f49cd331052e597006e04b3deee40fc

bac95a147f49cd331052e597006e04b3deee40fc
max
  Wed Apr 22 10:43:20 2026 -0700
lrSv/srSv: human-readable SV type filter labels, script cleanups

Add human-readable labels to the supertrack-level svType filter on
both the lrSv and srSv supertracks using the "CODE|CODE (Long name)"
filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)",
etc. Labels keep the short code up front so users can match what
hgTracks shows next to each feature.

Also sweep in the in-progress converter/as-file cleanups under
scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py
helpers, consistent insLen / svLen / AC column naming, tightened
field-description text) that had been piling up as unstaged changes in
the working tree.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py
index 7492432ddac..1612378c987 100644
--- src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py
@@ -6,32 +6,36 @@
 
 The two TSVs share an envelope (chrom, pos, end, svtype, svlen, MERGE_AC /
 MERGE_SAMPLES, cytoband, REF_SD, REF_TRF, REFSEQ_*, PLI/LOEUF, WIN_500/2K)
 but diverge on type-specific fields: insdel has POP_*_AF population allele
 frequencies, inv has RGN_REF_INNER. Both columns are emitted; type-specific
 ones are empty where not applicable.
 
 Source:
     https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v2.0/integrated_callset/
 Paper:
     Ebert et al. 2021, Science, PMID 33632895.
 """
 
 import csv
 import gzip
+import os
 import sys
 
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from lrSvCommon import svName, normalizeSvType
+
 SV_COLORS = {
     "DEL": "200,0,0",      # red
     "INS": "0,0,200",      # blue
     "INV": "230,140,0",    # orange
 }
 
 
 def openTsv(path):
     return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "rt")
 
 
 def na(val):
     if val is None:
         return ""
     v = val.strip()
@@ -73,50 +77,59 @@
                 break
         uniq.add(p)
     return len(uniq)
 
 
 def emit(outF, row, typeExtra):
     chrom = row["#CHROM"]
     if not chrom.startswith("chr"):
         chrom = "chr" + chrom
     # HGSVC2 TSV coords are 0-based half-open (matches HGSVC3).
     chromStart = toInt(row["POS"])
     chromEnd = toInt(row["END"])
     if chromEnd <= chromStart:
         chromEnd = chromStart + 1
 
-    svType = row["SVTYPE"]
-    svLen = abs(toInt(row["SVLEN"]))
+    svType = normalizeSvType(row["SVTYPE"])
+    svLenSrc = abs(toInt(row["SVLEN"]))
+    svLen = chromEnd - chromStart
+    if svType in ("INS", "MEI"):
+        insLen = svLenSrc
+    else:
+        insLen = 0
     color = SV_COLORS.get(svType, "100,100,100")
 
-    alleleCount = toInt(row.get("MERGE_AC", "0"))
+    ac = toInt(row.get("MERGE_AC", "0"))
     sampleCount = countSamples(row.get("MERGE_SAMPLES", ""))
 
+    featLen = insLen if svType in ("INS", "MEI") else svLen
+    name = svName(svType, featLen, ac)
+
     bed = [
         chrom,
         str(chromStart),
         str(chromEnd),
-        row["ID"],
+        name,
         "0",
         ".",
         str(chromStart),
         str(chromEnd),
         color,
         svType,
         str(svLen),
-        str(alleleCount),
+        str(insLen),
+        str(ac),
         str(sampleCount),
         na(row.get("BAND", "")),
         f"{toFloat(row.get('REF_SD', '0')):.6f}",
         na(row.get("REF_TRF", "")),
         str(toInt(row.get("REFSEQ_CDS", "0"))),
         str(toInt(row.get("REFSEQ_UTR3", "0"))),
         str(toInt(row.get("REFSEQ_UTR5", "0"))),
         str(toInt(row.get("REFSEQ_INTRON", "0"))),
         str(toInt(row.get("REFSEQ_NCRNA", "0"))),
         str(toInt(row.get("REFSEQ_UP_5K", "0"))),
         str(toInt(row.get("REFSEQ_DN_5K", "0"))),
         na(row.get("PLI_MAX", "")),
         na(row.get("LOEUF_MIN", "")),
         typeExtra.get("popAllAf", ""),
         typeExtra.get("popAfrAf", ""),