bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py index 6e698057a2e..7859c2cedb7 100644 --- src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py @@ -1,48 +1,54 @@ #!/usr/bin/env python3 """Convert 1KG ONT SVAN VCF to BED9+ for bigBed, adding allele counts from phased VCF. Usage: lrSv1kgOntVcfToBed.py svan.vcf.gz output.bed chrom.sizes [phased.vcf.gz] The optional phased VCF provides AC, AN, and AF per variant (matched by ID). -SVs without a match get alleleCount=-1, alleleNumber=-1, alleleFreq=-1. +SVs without a match get AC=-1 (written to AC column), alleleNumber=-1, alleleFreq=-1. +The canonical `name` column omits the :AC suffix when AC is -1. """ import gzip +import os import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from lrSvCommon import svName, normalizeSvType + # Colors by SV class (R,G,B) SV_COLORS = { "INS": "0,0,200", # blue "DEL": "200,0,0", # red - "COMPLEX": "230,140,0", # orange + "CPX": "230,140,0", # orange } def parseInfo(infoStr): """Parse INFO field into a dict.""" d = {} for item in infoStr.split(";"): if "=" in item: k, v = item.split("=", 1) d[k] = v else: d[item] = True return d def getSvClass(varId): - """Extract SV class (INS, DEL, COMPLEX) from the variant ID.""" + """Extract raw SV class from the variant ID. Returns raw string; + will be normalized by normalizeSvType (COMPLEX -> CPX).""" if "-INS-" in varId: return "INS" elif "-DEL-" in varId: return "DEL" elif "-COMPLEX-" in varId: return "COMPLEX" return "OTHER" def loadPhasedAC(phasedFile): """Load AC, AN, AF from phased VCF, keyed by variant ID.""" acMap = {} opener = gzip.open if phasedFile.endswith(".gz") else open with opener(phasedFile, "rt") as f: for line in f: if line.startswith("#"): @@ -84,50 +90,56 @@ skipped = 0 matched = 0 unmatched = 0 with opener(inFile, "rt") as fIn, open(outFile, "w") as fOut: for line in fIn: if line.startswith("#"): continue fields = line.rstrip("\n").split("\t") chrom = fields[0] pos = int(fields[1]) varId = fields[2] ref = fields[3] info = parseInfo(fields[7]) - svClass = getSvClass(varId) + svTypeRaw = getSvClass(varId) + svType = normalizeSvType(svTypeRaw) # BED coordinates: 0-based half-open chromStart = pos - 1 chromEnd = chromStart + len(ref) - # SV length - svLen = int(info.get("INS_LEN", info.get("DEL_LEN", "0"))) - - # Readable name: e.g. "DEL 258bp" or "INS Alu 330bp" - famName = info.get("FAM_N", "") - if famName: - name = f"{svClass} {famName} {svLen}bp" - else: - name = f"{svClass} {svLen}bp" + # Source lengths: INS_LEN for insertion size, DEL_LEN for deletion size + insLenSrc = int(info.get("INS_LEN", "0")) + delLenSrc = int(info.get("DEL_LEN", "0")) # For INS, the item spans only the anchor base; expand by 1 for visibility - if svClass == "INS" and chromEnd <= chromStart + 1: + if svType == "INS" and chromEnd <= chromStart + 1: chromEnd = chromStart + 1 + # svLen = span on reference + svLen = chromEnd - chromStart + # insLen = length of inserted sequence (INS only); 0 otherwise + if svType == "INS": + insLen = insLenSrc + elif svType == "DEL": + # DEL_LEN matches reference span + insLen = 0 + else: + insLen = 0 + # Insertion/deletion type insType = info.get("ITYPE_N", info.get("DTYPE_N", "")) # Transposon family family = info.get("FAM_N", "") # Percent resolved try: percResolved = float(info.get("PERC_RESOLVED", "0")) except ValueError: percResolved = 0.0 # TSD length try: tsdLen = int(info.get("TSD_LEN", "0")) @@ -160,69 +172,75 @@ # Number of exons retrotransposed try: nbExons = int(info.get("NB_EXONS", "0")) except ValueError: nbExons = 0 # Non-canonical MEI flag notCanonical = "Yes" if info.get("NOT_CANONICAL") else "" # Strand strand = info.get("STRAND", ".") if strand not in ("+", "-"): strand = "." - color = SV_COLORS.get(svClass, "100,100,100") + color = SV_COLORS.get(svType, "100,100,100") # Allele counts from phased VCF if varId in acMap: ac, an, af = acMap[varId] matched += 1 + acForName = ac else: ac, an, af = -1, -1, -1.0 unmatched += 1 + acForName = None # drop :AC suffix for SVAN-only rows # Clip to chrom sizes; skip records on unknown chroms if chrom not in chromSizes: skipped += 1 continue if chromEnd > chromSizes[chrom]: skipped += 1 continue + featLen = insLen if svType in ("INS", "MEI") else svLen + name = svName(svType, featLen, acForName) + row = [ chrom, str(chromStart), str(chromEnd), name, "0", strand, str(chromStart), # thickStart str(chromEnd), # thickEnd color, - svClass, + svType, str(svLen), + str(insLen), + str(ac), insType, family, f"{percResolved:.2f}", str(tsdLen), str(polyaLen), conformation, str(rtLen), str(nbMotifs), srcGene, str(nbExons), notCanonical, - str(ac), str(an), f"{af:.4f}" if af >= 0 else "-1.0000", ] fOut.write("\t".join(row) + "\n") print(f"Skipped {skipped} records outside chrom sizes", file=sys.stderr) if acMap: print(f"Allele counts: {matched} matched, {unmatched} unmatched " f"({100*matched/(matched+unmatched):.1f}% matched)", file=sys.stderr) if __name__ == "__main__": main()