9a49afb16653363a70d8e4d205513008b7b08df5 max Wed May 13 06:18:42 2026 -0700 varFreqs: add UK Biobank subtrack from Neale Lab Round 2 imputed-v3 variant manifest (13.7M variants, 361k white British samples). TSV → VCF conversion + CrossMap hg19→hg38, refs #36642 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/varFreqs/ukbbToVcf.py src/hg/makeDb/scripts/varFreqs/ukbbToVcf.py new file mode 100644 index 00000000000..365c37f43fc --- /dev/null +++ src/hg/makeDb/scripts/varFreqs/ukbbToVcf.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +""" +Convert the Neale Lab UK Biobank Round 2 variants.tsv.bgz manifest to a +sites-only VCF (GRCh37 / hg19 coordinates, with chr prefix). The source +file ships post-QC allele counts and imputation INFO scores from 361,194 +imputed UK Biobank samples, restricted to white British ancestry, and is +documented at https://www.nealelab.is/uk-biobank. + +Rows with AC=0 are dropped (variant not seen in the cohort). +AN is computed as 2 * n_called (diploid). For chrX this slightly +over-estimates the number of alleles in males but matches the convention +the rest of the Neale Lab pipeline uses. +""" +import sys, gzip + +INPUT = "variants.tsv.bgz" +OUTPUT = "ukbb.hg19.vcf" + +# (column-name-in-source, vcf-info-id, Number, Type, Description) +INFO_FIELDS = [ + ("AC", "AC", "A", "Integer", "Allele count (alt)"), + ("AN", "AN", "1", "Integer", "Total alleles called (2 * n_called)"), + ("AF", "AF", "A", "Float", "Alt allele frequency"), + ("info", "INFO_S", "A", "Float", "Imputation INFO score"), + ("p_hwe", "HWE", "A", "Float", "Hardy-Weinberg equilibrium p-value"), + ("n_called", "NS", "1", "Integer", "Number of samples called"), + ("n_hom_ref", "n_hom_ref", "1", "Integer", "Number of hom-ref samples"), + ("n_het", "n_het", "1", "Integer", "Number of heterozygous samples"), + ("n_hom_var", "n_hom_var", "1", "Integer", "Number of hom-alt samples"), + ("consequence", "CSQ", "1", "String", "Most severe VEP consequence (Neale Lab annotation)"), + ("consequence_category", "CSQC", "1", "String", "VEP consequence category (non_coding/missense/synonymous/ptv)"), +] + +# hg19 contig sizes (used in the VCF header so downstream tools accept the +# file). CrossMap rewrites them on lift. +HG19_CONTIGS = [ + ("chr1", 249250621), ("chr2", 243199373), ("chr3", 198022430), + ("chr4", 191154276), ("chr5", 180915260), ("chr6", 171115067), + ("chr7", 159138663), ("chr8", 146364022), ("chr9", 141213431), + ("chr10", 135534747), ("chr11", 135006516), ("chr12", 133851895), + ("chr13", 115169878), ("chr14", 107349540), ("chr15", 102531392), + ("chr16", 90354753), ("chr17", 81195210), ("chr18", 78077248), + ("chr19", 59128983), ("chr20", 63025520), ("chr21", 48129895), + ("chr22", 51304566), ("chrX", 155270560), +] + + +def is_missing(v): + return v == "" or v == "NA" or v == "nan" or v == "." + + +def main(): + inf = gzip.open(INPUT, "rt") + out = open(OUTPUT, "w") + + header = inf.readline().rstrip("\n").split("\t") + idx = {c: i for i, c in enumerate(header)} + + out.write("##fileformat=VCFv4.2\n") + out.write("##source=NealeLab_UKB_Round2_variants\n") + out.write("##reference=GRCh37\n") + for name, length in HG19_CONTIGS: + out.write(f"##contig=<ID={name},length={length}>\n") + for _, info_id, num, vtype, desc in INFO_FIELDS: + out.write(f'##INFO=<ID={info_id},Number={num},Type={vtype},Description="{desc}">\n') + out.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n") + + total = 0 + skipped_ac0 = 0 + skipped_bad = 0 + written = 0 + for line in inf: + total += 1 + f = line.rstrip("\n").split("\t") + if len(f) < len(header): + skipped_bad += 1 + continue + + chrom = f[idx["chr"]] + pos = f[idx["pos"]] + ref = f[idx["ref"]] + alt = f[idx["alt"]] + rsid = f[idx["rsid"]] + ac_str = f[idx["AC"]] + n_called_str = f[idx["n_called"]] + + try: + ac = int(ac_str) + except ValueError: + skipped_bad += 1 + continue + if ac == 0: + skipped_ac0 += 1 + continue + + try: + n_called = int(n_called_str) + except ValueError: + skipped_bad += 1 + continue + an = 2 * n_called + + if not chrom.startswith("chr"): + chrom = "chr" + chrom + + if is_missing(rsid): + rsid = "." + + info_parts = [] + for src_name, info_id, _, vtype, _ in INFO_FIELDS: + if src_name == "AN": + info_parts.append(f"AN={an}") + continue + v = f[idx[src_name]] if src_name in idx else "" + if is_missing(v): + continue + if vtype == "Integer": + try: + info_parts.append(f"{info_id}={int(float(v))}") + except ValueError: + pass + elif vtype == "Float": + try: + info_parts.append(f"{info_id}={float(v):.6g}") + except ValueError: + pass + else: + vv = v.replace(";", "%3B").replace("=", "%3D").replace(" ", "_") + info_parts.append(f"{info_id}={vv}") + + info_str = ";".join(info_parts) if info_parts else "." + out.write(f"{chrom}\t{pos}\t{rsid}\t{ref}\t{alt}\t.\tPASS\t{info_str}\n") + written += 1 + + if written % 1000000 == 0: + sys.stderr.write(f" written {written:,}\n") + + inf.close() + out.close() + sys.stderr.write( + f"total={total} written={written} skipped_AC0={skipped_ac0} skipped_bad={skipped_bad}\n" + ) + + +if __name__ == "__main__": + main()