bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py index 2d5008bef83..ba9cb38fb65 100644 --- src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvKwanhoTsvToBed.py @@ -1,40 +1,60 @@ #!/usr/bin/env python3 """Convert the Kim 2026 PD long-read SV catalog (media-13.txt) to BED9+. Usage: lrSvKwanhoTsvToBed.py media-13.txt output.bed The source TSV has thousands-separator commas inside quoted numeric fields (e.g. "10,889"), so we parse it with the csv module. """ import csv +import os import re import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from lrSvCommon import svName, normalizeSvType + SV_COLORS = { "DEL": "200,0,0", # red "INS": "0,0,200", # blue "DUP": "0,160,0", # green "INV": "230,140,0", # orange } def toInt(s): + # Kim et al. media-13.txt stores many integer fields as Python tuple + # repr, e.g. "(4,)" for single-sample or "(4, 2)" for multi. We take + # the sum across the tuple. if s is None or s == "": return 0 + s = s.strip() + if s.startswith("(") and s.endswith(")"): + inner = s[1:-1] + total = 0 + for p in inner.split(","): + p = p.strip() + if not p: + continue + try: + total += int(float(p)) + except ValueError: + return 0 + return total s = s.replace(",", "") try: return int(float(s)) except ValueError: return 0 def toFloat(s): if s is None or s == "": return 0.0 s = s.strip().rstrip("%") s = s.replace(",", "") try: return float(s) except ValueError: @@ -75,65 +95,81 @@ inPath, outPath = sys.argv[1], sys.argv[2] with open(inPath, newline="") as fIn, open(outPath, "w") as fOut: reader = csv.DictReader(fIn, delimiter="\t") for row in reader: chrom = row["Chromosome"] if not chrom.startswith("chr"): chrom = "chr" + chrom chromStart = toInt(row["Start"]) chromEnd = toInt(row["End"]) if chromEnd <= chromStart: chromEnd = chromStart + 1 - svType = row["SV type"] - svLen = abs(toInt(row["SV length"])) + svTypeRaw = row["SV type"] + svType = normalizeSvType(svTypeRaw) + srcSvLen = abs(toInt(row["SV length"])) + svLen = chromEnd - chromStart + if svType in ("INS", "MEI"): + insLen = srcSvLen + else: + insLen = 0 color = SV_COLORS.get(svType, "100,100,100") pdStr, nPd = carrierList(row.get("PD CARRIERS", "")) hcStr, nHc = carrierList(row.get("HC CARRIERS", "")) ilbdStr, nIlbd = carrierList(row.get("ILBD CARRIERS", "")) + acPd = toInt(row.get("AC PD", "0")) + acHc = toInt(row.get("AC HC", "0")) + acIlbd = toInt(row.get("AC ILBD", "0")) + ac = acPd + acHc + acIlbd + + featLen = insLen if svType in ("INS", "MEI") else svLen + name = svName(svType, featLen, ac) + bedRow = [ chrom, str(chromStart), str(chromEnd), - row["ID"], + name, "0", ".", str(chromStart), str(chromEnd), color, svType, str(svLen), + str(insLen), + str(ac), row.get("Size bin", ""), str(toInt(row.get("qual", "0"))), str(toInt(row.get("SUPP", "0"))), row.get("SUPP VEC", ""), f"{toFloat(row.get('MISSING RATE', '0')):.6f}", f"{pctToFrac(row.get('CASE RATE', '0')):.6f}", f"{pctToFrac(row.get('CONTROL RATE', '0')):.6f}", f"{toFloat(row.get('DIFFERENTIAL RATE', '0')):.6f}", f"{toFloat(row.get('AF PD', '0')):.6f}", f"{toFloat(row.get('AF HC', '0')):.6f}", f"{toFloat(row.get('AF ILBD', '0')):.6f}", - str(toInt(row.get("AC PD", "0"))), - str(toInt(row.get("AC HC", "0"))), - str(toInt(row.get("AC ILBD", "0"))), + str(acPd), + str(acHc), + str(acIlbd), str(toInt(row.get("AN PD", "0"))), str(toInt(row.get("AN HC", "0"))), str(toInt(row.get("AN ILBD", "0"))), str(nPd), str(nHc), str(nIlbd), str(toInt(row.get("LD SNPS COUNT", "0"))), str(toInt(row.get("TOTAL SNPS NEARBY", "0"))), f"{toFloat(row.get('AVG MAP QUALITY', '0')):.3f}", f"{toFloat(row.get('AVG READS PER SAMPLE', '0')):.3f}", pdStr, hcStr, ilbdStr, ] fOut.write("\t".join(bedRow) + "\n")