bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py index 7492432ddac..1612378c987 100644 --- src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py @@ -6,32 +6,36 @@ The two TSVs share an envelope (chrom, pos, end, svtype, svlen, MERGE_AC / MERGE_SAMPLES, cytoband, REF_SD, REF_TRF, REFSEQ_*, PLI/LOEUF, WIN_500/2K) but diverge on type-specific fields: insdel has POP_*_AF population allele frequencies, inv has RGN_REF_INNER. Both columns are emitted; type-specific ones are empty where not applicable. Source: https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v2.0/integrated_callset/ Paper: Ebert et al. 2021, Science, PMID 33632895. """ import csv import gzip +import os import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from lrSvCommon import svName, normalizeSvType + SV_COLORS = { "DEL": "200,0,0", # red "INS": "0,0,200", # blue "INV": "230,140,0", # orange } def openTsv(path): return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "rt") def na(val): if val is None: return "" v = val.strip() @@ -73,50 +77,59 @@ break uniq.add(p) return len(uniq) def emit(outF, row, typeExtra): chrom = row["#CHROM"] if not chrom.startswith("chr"): chrom = "chr" + chrom # HGSVC2 TSV coords are 0-based half-open (matches HGSVC3). chromStart = toInt(row["POS"]) chromEnd = toInt(row["END"]) if chromEnd <= chromStart: chromEnd = chromStart + 1 - svType = row["SVTYPE"] - svLen = abs(toInt(row["SVLEN"])) + svType = normalizeSvType(row["SVTYPE"]) + svLenSrc = abs(toInt(row["SVLEN"])) + svLen = chromEnd - chromStart + if svType in ("INS", "MEI"): + insLen = svLenSrc + else: + insLen = 0 color = SV_COLORS.get(svType, "100,100,100") - alleleCount = toInt(row.get("MERGE_AC", "0")) + ac = toInt(row.get("MERGE_AC", "0")) sampleCount = countSamples(row.get("MERGE_SAMPLES", "")) + featLen = insLen if svType in ("INS", "MEI") else svLen + name = svName(svType, featLen, ac) + bed = [ chrom, str(chromStart), str(chromEnd), - row["ID"], + name, "0", ".", str(chromStart), str(chromEnd), color, svType, str(svLen), - str(alleleCount), + str(insLen), + str(ac), str(sampleCount), na(row.get("BAND", "")), f"{toFloat(row.get('REF_SD', '0')):.6f}", na(row.get("REF_TRF", "")), str(toInt(row.get("REFSEQ_CDS", "0"))), str(toInt(row.get("REFSEQ_UTR3", "0"))), str(toInt(row.get("REFSEQ_UTR5", "0"))), str(toInt(row.get("REFSEQ_INTRON", "0"))), str(toInt(row.get("REFSEQ_NCRNA", "0"))), str(toInt(row.get("REFSEQ_UP_5K", "0"))), str(toInt(row.get("REFSEQ_DN_5K", "0"))), na(row.get("PLI_MAX", "")), na(row.get("LOEUF_MIN", "")), typeExtra.get("popAllAf", ""), typeExtra.get("popAfrAf", ""),