06a482a2120d4d85c7c34fb5038213e07f595554 max Tue Apr 21 15:00:21 2026 -0700 lrSv: add tommoJpCnv short-read CNV comparator (multiWig) ToMMo 48KJPN-CNV Frequency Panel: copy-number variation frequencies from short-read whole-genome sequencing of 48,874 Japanese individuals (jMorp 20230828 release, GATK CNV germline workflow at 1 kb resolution). Published as a companion short-read comparator to the long-read tommoJpSv track. Rendered as a multiWig container with two bigWig subtracks (transparent overlay): tommoJpCnvLoss.bw counts samples at CN<2 per bin (red) and tommoJpCnvGain.bw counts samples at CN>2 per bin (green). Values are absolute carrier counts out of 48,874. 2,006,905 bins with at least one CNV carrier; bins that are wholly CN=2 are omitted. Files: - trackDb/human/lrSv.ra: new tommoJpCnv multiWig container - trackDb/human/tommoJpCnv.html: new doc page - trackDb/human/lrSv.html: summary-table row + per-track blurb - scripts/lrSv/lrSvTommoJpCnvVcfToBedGraph.py: VCF -> two bedGraphs - doc/hg38/lrSv.txt: wget, converter invocation, bigWig build steps refs #36258 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/lrSv/lrSvTommoJpCnvVcfToBedGraph.py src/hg/makeDb/scripts/lrSv/lrSvTommoJpCnvVcfToBedGraph.py new file mode 100644 index 00000000000..f9a684929ce --- /dev/null +++ src/hg/makeDb/scripts/lrSv/lrSvTommoJpCnvVcfToBedGraph.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +"""Convert the ToMMo 48KJPN CNV Frequency VCF to two bedGraph files. + +Emits per-1 kb-bin absolute carrier counts: + - Loss track: number of samples with CN < 2 (CN0, CN1) + - Gain track: number of samples with CN > 2 (CN3, CN4, CN5) +Bins where every sample is CN=2 (no CNV observed) are omitted. + +Usage: + lrSvTommoJpCnvVcfToBedGraph.py input.vcf.gz outLoss.bg outGain.bg +""" + +import gzip +import sys + + +def parseInfo(infoStr): + d = {} + for item in infoStr.split(";"): + if "=" in item: + k, v = item.split("=", 1) + d[k] = v + else: + d[item] = True + return d + + +def parseAltList(altStr): + out = [] + for a in altStr.split(","): + a = a.strip().lstrip("<").rstrip(">") + if a.startswith("CN"): + try: + out.append(int(a[2:])) + continue + except ValueError: + pass + out.append(-1) + return out + + +def main(): + if len(sys.argv) != 4: + print(__doc__, file=sys.stderr) + sys.exit(1) + + inPath, outLoss, outGain = sys.argv[1], sys.argv[2], sys.argv[3] + opener = gzip.open if inPath.endswith(".gz") else open + + kept = 0 + with opener(inPath, "rt") as fIn, \ + open(outLoss, "w") as fLoss, open(outGain, "w") as fGain: + for line in fIn: + if line.startswith("#"): + continue + fields = line.rstrip("\n").split("\t") + chrom = fields[0] + pos = int(fields[1]) + altList = parseAltList(fields[4]) + info = parseInfo(fields[7]) + + end = int(info.get("END", pos)) + scParts = info.get("SC", "").split(",") + + lossCount = 0 + gainCount = 0 + for i, cn in enumerate(altList): + try: + sc = int(scParts[i]) + except (ValueError, IndexError): + sc = 0 + if 0 <= cn < 2: + lossCount += sc + elif cn > 2: + gainCount += sc + + if lossCount == 0 and gainCount == 0: + continue + + chromStart = pos - 1 + chromEnd = end + if chromEnd <= chromStart: + chromEnd = chromStart + 1 + + if lossCount > 0: + fLoss.write(f"{chrom}\t{chromStart}\t{chromEnd}\t{lossCount}\n") + if gainCount > 0: + fGain.write(f"{chrom}\t{chromStart}\t{chromEnd}\t{gainCount}\n") + kept += 1 + + print(f"Wrote {kept} bins with any CNV carrier", file=sys.stderr) + + +if __name__ == "__main__": + main()