bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py index b790c617518..c240ab5414c 100644 --- src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py @@ -1,118 +1,137 @@ #!/usr/bin/env python3 """Convert the Chirmade 2026 SVatalog sv_annotations.tsv to BED9+. Source: https://zenodo.org/records/13367574 (sv_annotations.tsv) Paper: Chirmade et al. 2026, Heredity (Edinb), PMID 41203876 Coordinates in the source TSV are 1-based closed (End-Start+1 == Length). Translate to BED-style 0-based half-open (chromStart = Start - 1, chromEnd = End). Usage: lrSvChirmade101TsvToBed.py sv_annotations.tsv output.bed """ import csv +import os import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from lrSvCommon import svName, normalizeSvType + SV_COLORS = { - "del": "200,0,0", # red - "ins": "0,0,200", # blue - "dup": "0,160,0", # green - "inv": "230,140,0", # orange - "complex": "140,0,200", # purple + "DEL": "200,0,0", # red + "INS": "0,0,200", # blue + "DUP": "0,160,0", # green + "INV": "230,140,0", # orange + "CPX": "140,0,200", # purple } def na(val): """Return '' for missing ('NA' or empty) source values.""" if val is None: return "" v = val.strip() if v == "" or v == "NA": return "" return v def toInt(s): if not s: return 0 try: return int(float(s)) except ValueError: return 0 def main(): if len(sys.argv) != 3: print(__doc__, file=sys.stderr) sys.exit(1) inPath, outPath = sys.argv[1], sys.argv[2] with open(inPath, newline="") as fIn, open(outPath, "w") as fOut: reader = csv.DictReader(fIn, delimiter="\t") for row in reader: chrom = row["Chromosome"] if not chrom.startswith("chr"): chrom = "chr" + chrom # 1-based closed -> 0-based half-open chromStart = toInt(row["Start"]) - 1 chromEnd = toInt(row["End"]) if chromEnd <= chromStart: chromEnd = chromStart + 1 - svType = row["Type"] - svLen = abs(toInt(row["Length"])) + svTypeRaw = row["Type"] # lowercase del/ins/dup/inv/complex + svType = normalizeSvType(svTypeRaw) + srcLen = abs(toInt(row["Length"])) + svLen = chromEnd - chromStart + if svType in ("INS", "MEI"): + insLen = srcLen + else: + insLen = 0 color = SV_COLORS.get(svType, "100,100,100") + # Chirmade catalog is site-level without AC. Use -1 as placeholder + # so svName drops the :AC suffix. + ac = -1 + + featLen = insLen if svType in ("INS", "MEI") else svLen + name = svName(svType, featLen, ac) + bedRow = [ chrom, str(chromStart), str(chromEnd), - row["ID"], + name, "0", ".", str(chromStart), str(chromEnd), color, svType, str(svLen), + str(insLen), + str(ac), str(toInt(row.get("GC (%)", "0"))), na(row.get("Cytoband", "")), str(toInt(row.get("Gene Count", "0"))), na(row.get("Gene Name(s)", "")), na(row.get("Gene at Start", "")), na(row.get("Gene at End", "")), na(row.get("Exon Name", "")), na(row.get("CDS Name", "")), na(row.get("Dark Genes % Overlap", "")), na(row.get("ClinGen Haploinsufficient", "")), na(row.get("ClinGen Triplosensitive", "")), na(row.get("gnomAD O/E LoF Upper", "")), na(row.get("gnomAD O/E Mis Upper", "")), na(row.get("gnomAD pLI", "")), na(row.get("gnomAD pRec", "")), na(row.get("Repeat % Overlap", "")), na(row.get("Dirty Region % Overlap", "")), na(row.get("Chromosome Region", "")), na(row.get("CGD", "")), na(row.get("OMIM Pheno", "")), na(row.get("OMIM Inh", "")), na(row.get("ClinGen Region", "")), na(row.get("Decipher Region", "")), na(row.get("ClinVar VarID", "")), na(row.get("gnomAD AF Max 90% RO", "")), na(row.get("gnomAD Population AF Max 90% RO", "")), na(row.get("gnomAD Hom/Ref Frequency 90% RO", "")), na(row.get("gnomAD Het Frequency 90% RO", "")), na(row.get("gnomAD Hom/Alt Frequency 90% RO", "")), na(row.get("DGV % Overlap", "")), na(row.get("DGV 50% RO", "")), ] fOut.write("\t".join(bedRow) + "\n") if __name__ == "__main__": main()