bac95a147f49cd331052e597006e04b3deee40fc max Wed Apr 22 10:43:20 2026 -0700 lrSv/srSv: human-readable SV type filter labels, script cleanups Add human-readable labels to the supertrack-level svType filter on both the lrSv and srSv supertracks using the "CODE|CODE (Long name)" filterValues syntax: DEL -> "DEL (Deletion)", INS -> "INS (Insertion)", etc. Labels keep the short code up front so users can match what hgTracks shows next to each feature. Also sweep in the in-progress converter/as-file cleanups under scripts/lrSv/ and scripts/srSv/ (introduction of lrSvCommon.py helpers, consistent insLen / svLen / AC column naming, tightened field-description text) that had been piling up as an unstaged working tree. refs #36258 diff --git src/hg/makeDb/scripts/srSv/onekg3202SrVcfToBed.py src/hg/makeDb/scripts/srSv/onekg3202SrVcfToBed.py index 71f0f061da7..b4857eabd8b 100644 --- src/hg/makeDb/scripts/srSv/onekg3202SrVcfToBed.py +++ src/hg/makeDb/scripts/srSv/onekg3202SrVcfToBed.py @@ -1,32 +1,36 @@ #!/usr/bin/env python3 """Convert 1KG 3,202-sample GATK-SV short-read VCF to BED9+. Short-read comparator track for the lrSv collection. Source: https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20210124.SV_Illumina_Integration/1KGP_3202.gatksv_svtools_novelins.freeze_V3.wAF.vcf.gz Paper: Byrska-Bishop et al. 2022, Cell, PMID 36055201. Usage: lrSv1kg3202SrVcfToBed.py input.vcf.gz output.bed """ import gzip +import os import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from lrSvCommon import svName, normalizeSvType + SV_COLORS = { "DEL": "200,0,0", # red "INS": "0,0,200", # blue "DUP": "0,160,0", # green "INV": "230,140,0", # orange "CPX": "140,0,200", # purple "CTX": "100,100,100", # grey "CNV": "150,80,0", # brown } def parseInfo(infoStr): d = {} for item in infoStr.split(";"): if "=" in item: @@ -58,65 +62,79 @@ def main(): if len(sys.argv) != 3: print(__doc__, file=sys.stderr) sys.exit(1) inPath, outPath = sys.argv[1], sys.argv[2] opener = gzip.open if inPath.endswith(".gz") else open with opener(inPath, "rt") as fIn, open(outPath, "w") as fOut: for line in fIn: if line.startswith("#"): continue fields = line.rstrip("\n").split("\t") chrom = fields[0] pos = int(fields[1]) - name = fields[2] + rowName = fields[2] filt = fields[6] info = parseInfo(fields[7]) - svType = info.get("SVTYPE", ".") + svTypeRaw = info.get("SVTYPE", ".") + svType = normalizeSvType(svTypeRaw) end = int(info.get("END", pos)) - svLen = abs(toInt(info.get("SVLEN", "0"))) + srcSvLen = abs(toInt(info.get("SVLEN", "0"))) chromStart = pos - 1 chromEnd = end if chromEnd <= chromStart: chromEnd = chromStart + 1 # Translocations: the END is on chr2; cap the item width to 1 bp # on the chromosome-1 side. chr2 = info.get("CHR2", "") if svType == "CTX" and chr2 and chr2 != chrom: chromEnd = chromStart + 1 + svLen = chromEnd - chromStart + if svType in ("INS", "MEI"): + insLen = srcSvLen + else: + insLen = 0 + color = SV_COLORS.get(svType, "100,100,100") + ac = toInt(info.get("AC", "0")) + an = toInt(info.get("AN", "0")) + + featLen = insLen if svType in ("INS", "MEI") else svLen + name = svName(svType, featLen, ac) + row = [ chrom, str(chromStart), str(chromEnd), name, "0", ".", str(chromStart), str(chromEnd), color, svType, str(svLen), - str(toInt(info.get("AC", "0"))), - str(toInt(info.get("AN", "0"))), + str(insLen), + str(ac), + str(an), f"{toFloat(info.get('AF', '0')):.6f}", f"{toFloat(info.get('POPMAX_AF', '0')):.6f}", f"{toFloat(info.get('AFR_AF', '0')):.6f}", f"{toFloat(info.get('AMR_AF', '0')):.6f}", f"{toFloat(info.get('ASN_AF', '0')):.6f}", f"{toFloat(info.get('EUR_AF', '0')):.6f}", f"{toFloat(info.get('SAN_AF', '0')):.6f}", str(toInt(info.get("N_HET", "0"))), str(toInt(info.get("N_HOMALT", "0"))), info.get("ALGORITHMS", ""), info.get("SOURCE", ""), filt, chr2 if svType == "CTX" else "", ] fOut.write("\t".join(row) + "\n")