#!/usr/bin/env python3
# Origin: src/hg/makeDb/scripts/varFreqs/wbbcFix.py, added in commit
# 3d9187d264d00ee8e681521bc2c942ee2527d4f1 (max, Wed May 13 07:33:38 2026 -0700):
#   varFreqs: add WBBC (Westlake BioBank for Chinese) subtrack from the
#   Phase I v20210103 release: 4,480 WGS samples, 78.6M variants, per-region
#   frequencies for the 4 Han Chinese geographic subgroups
#   (North/Central/South/Lingnan). databases.tsv + populations.tsv updated
#   for the next varFreqsAll rebuild. refs #36642
#   Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
"""Stream-filter and re-header WBBC concat VCF: add proper INFO/contig
lines, drop AC=0 rows. Reads stdin, writes stdout.

A progress line and a final ``written=... filtered_AC0=...`` summary are
written to stderr.
"""
import sys

# (contig name, length) pairs emitted as ##contig header lines; only the
# autosomes chr1-chr22 are listed.
HG38_CONTIGS = [
    ("chr1", 248956422), ("chr2", 242193529), ("chr3", 198295559),
    ("chr4", 190214555), ("chr5", 181538259), ("chr6", 170805979),
    ("chr7", 159345973), ("chr8", 145138636), ("chr9", 138394717),
    ("chr10", 133797422), ("chr11", 135086622), ("chr12", 133275309),
    ("chr13", 114364328), ("chr14", 107043718), ("chr15", 101991189),
    ("chr16", 90338345), ("chr17", 83257441), ("chr18", 80373285),
    ("chr19", 58617616), ("chr20", 64444167), ("chr21", 46709983),
    ("chr22", 50818468),
]

# (ID, Number, Type, Description) tuples emitted as ##INFO header lines.
INFO_DEFS = [
    ("AC", "A", "Integer", "Alt allele count"),
    ("AF", "A", "Float", "Alt allele frequency"),
    ("AN", "1", "Integer", "Total alleles called"),
    ("NS", "1", "Integer", "Number of samples"),
    ("North_AF", "A", "Float", "Allele frequency in North Han subgroup"),
    ("North_AN", "1", "Integer", "Allele number in North Han subgroup"),
    ("Central_AF", "A", "Float", "Allele frequency in Central Han subgroup"),
    ("Central_AN", "1", "Integer", "Allele number in Central Han subgroup"),
    ("South_AF", "A", "Float", "Allele frequency in South Han subgroup"),
    ("South_AN", "1", "Integer", "Allele number in South Han subgroup"),
    ("Lingnan_AF", "A", "Float", "Allele frequency in Lingnan Han subgroup"),
    ("Lingnan_AN", "1", "Integer", "Allele number in Lingnan Han subgroup"),
    ("RR", "1", "String", "Pipe-separated hom-ref|het|hom-alt counts (RR|RA|AA)"),
    ("DP", "1", "Integer", "Total depth across samples"),
    ("VQSLOD", "1", "Float", "GATK VQSR log-odds score"),
]


def _is_ac_zero(info):
    """Return True when the INFO column's AC entry holds only zero counts.

    *info* is the raw INFO column; it may still carry the line terminator
    when INFO is the last column of the row.
    """
    # Fast path: AC is the leading key and is followed by more keys.
    if info.startswith("AC=0;"):
        return True
    # Cheap rejection so ordinary rows never pay for the split below.
    if "AC=0" not in info:
        return False
    # Slow path: AC is not the first key, is the only key, or has several
    # comma-separated counts. Matching on the whole entry keeps keys that
    # merely end in "AC" (e.g. "MAC=0") from being mistaken for AC.
    for entry in info.rstrip("\r\n").split(";"):
        if entry.startswith("AC="):
            return all(count == "0" for count in entry[3:].split(","))
    return False


def filter_vcf(lines, write, log=None, progress_every=5000000):
    """Re-header a VCF line stream and drop rows whose AC is zero.

    Header lines are passed through, except ``##FORMAT`` lines, which are
    dropped. The ``##contig`` lines from HG38_CONTIGS and the ``##INFO``
    lines from INFO_DEFS are inserted immediately before the ``#CHROM``
    column line. After the column line, rows whose INFO ``AC`` entry is all
    zeros are skipped and every other row is written unchanged.

    NOTE(review): ``##contig``/``##INFO`` lines already present in the input
    are passed through as well, so this presumably expects an input without
    them -- confirm against the upstream concat step.

    Args:
        lines: iterable of VCF lines, each including its line terminator.
        write: callable taking one string; receives every output line.
        log: optional callable taking one string; receives progress lines
            and the missing-column-line warning. Nothing is logged if None.
        progress_every: log a progress line each time this many rows have
            been written; 0 disables progress lines.

    Returns:
        ``(written, filtered_ac0)``: rows written and rows dropped.

    Raises:
        ValueError: a data line has fewer than 8 tab-separated columns.
    """
    numbered = enumerate(lines, 1)

    # Header phase: runs until the #CHROM column line has been emitted.
    for _, line in numbered:
        if line.startswith("#CHROM"):
            # write our INFO and contig lines before the column line
            for chrom, length in HG38_CONTIGS:
                write(f"##contig=<ID={chrom},length={length}>\n")
            for info_id, num, vtype, desc in INFO_DEFS:
                write(f'##INFO=<ID={info_id},Number={num},Type={vtype},Description="{desc}">\n')
            write(line)
            break
        if line.startswith("##FORMAT"):
            # drop the leftover FORMAT line from SHAPEIT (sites-only file)
            continue
        write(line)
    else:
        # Input ended without a column line: everything seen so far was
        # passed through as header and nothing was filtered.
        if log is not None:
            log("warning: no #CHROM line found; input passed through unfiltered\n")
        return 0, 0

    written = 0
    filtered_ac0 = 0
    # Data phase. INFO is column 8 (1-based), so 7 (0-based).
    for line_no, line in numbered:
        fields = line.split("\t", 8)
        if len(fields) < 8:
            raise ValueError(
                f"line {line_no}: expected at least 8 tab-separated columns, "
                f"got {len(fields)}"
            )
        if _is_ac_zero(fields[7]):
            filtered_ac0 += 1
            continue
        write(line)
        written += 1
        if log is not None and progress_every and written % progress_every == 0:
            log(f" written {written:,}\n")

    return written, filtered_ac0


def main():
    """Filter stdin to stdout and report the row counts on stderr."""
    written, filtered_ac0 = filter_vcf(sys.stdin, sys.stdout.write, sys.stderr.write)
    sys.stderr.write(f"written={written} filtered_AC0={filtered_ac0}\n")


if __name__ == "__main__":
    main()