0a5389e8b81efa99e932f5e9e2424b27ce1af095
max
  Sun May 17 14:53:57 2026 -0700
varFreqs: add GenomeIndia subtrack (9,768 WGS, 83 endogamous Indian populations, Bhattacharyya 2025) and wire into databases.tsv for the next varFreqsAll rebuild. TSV->VCF conversion synthesizes AC=round(AF*AN) with AN=19536 since the upstream release only ships AF, refs #36642

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/varFreqs/genomeindiaToVcf.py src/hg/makeDb/scripts/varFreqs/genomeindiaToVcf.py
new file mode 100644
index 00000000000..8dd07fa087c
--- /dev/null
+++ src/hg/makeDb/scripts/varFreqs/genomeindiaToVcf.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""Convert GenomeIndia (9,768 individuals) per-chromosome AF TSV files into
+a single bgzipped, tabix-indexed VCF.
+
+Source format (one file per autosome, no header):
+    chrom  pos  id  ref  alt  AF
+
+The release ships only an alternate allele frequency. AC and AN are
+synthesized so the varFreqsAll combined-track pipeline can pick them
+up via INFO/AC and INFO/AF.
+
+    AN = 2 * 9768 = 19536  (autosomal, biallelic; constant)
+    AC = round(AF * AN)
+
+Per the README the underlying calls were filtered to NS_GT >= 98% (>=98%
+of samples called at the site), so AN slightly overstates the called
+allele count for some sites. The combined track only displays AC/AF for
+context so this approximation is acceptable; the methods section notes it.
+
+Usage:
+    python3 genomeindiaToVcf.py <input_dir> <output.vcf.gz>
+"""
+
+import argparse
+import gzip
+import os
+import subprocess
+import sys
+
+N_SAMPLES = 9768
+AN = 2 * N_SAMPLES  # 19536, autosomal biallelic
+
+AUTOSOMES = [f"chr{i}" for i in range(1, 23)]
+
+
+def convert(input_dir, output_vcf):
+    body_path = output_vcf + ".unsorted.body"
+    total = 0
+    skipped = 0
+
+    with open(body_path, "w") as out:
+        for chrom in AUTOSOMES:
+            tsv = os.path.join(
+                input_dir,
+                f"GI_9768_CBR-NIBMG_JointCall_AF_{chrom}.tsv")
+            if not os.path.exists(tsv):
+                print(f"ERROR: missing {tsv}", file=sys.stderr)
+                sys.exit(1)
+
+            n_chrom = 0
+            with open(tsv) as f:
+                for line in f:
+                    parts = line.rstrip("\n").split("\t")
+                    if len(parts) < 6:
+                        skipped += 1
+                        continue
+                    c, pos, vid, ref, alt, af_str = parts[:6]
+
+                    if c != chrom:
+                        # Source files are per-chromosome; cross-chrom rows
+                        # would indicate corruption.
+                        print(f"ERROR: {tsv} has row with chrom={c}",
+                              file=sys.stderr)
+                        sys.exit(1)
+
+                    try:
+                        af = float(af_str)
+                    except ValueError:
+                        skipped += 1
+                        continue
+
+                    if af <= 0 or af > 1:
+                        # AF outside (0,1] is meaningless; skip
+                        skipped += 1
+                        continue
+
+                    ac = int(round(af * AN))
+                    if ac < 1:
+                        ac = 1  # rounding floor: an observed alt allele
+                    if ac > AN:
+                        ac = AN
+
+                    info = f"AN={AN};AC={ac};AF={af:.6g}"
+
+                    if vid == "" or vid == ".":
+                        vid = "."
+
+                    out.write(f"{chrom}\t{pos}\t{vid}\t{ref}\t{alt}"
+                              f"\t.\tPASS\t{info}\n")
+                    n_chrom += 1
+                    total += 1
+            print(f"  {chrom}: {n_chrom:,} variants",
+                  file=sys.stderr)
+
+    print(f"Body written: {total:,} variants ({skipped} skipped)",
+          file=sys.stderr)
+
+    # Write header to a separate file
+    hdr_path = output_vcf + ".unsorted.hdr"
+    with open(hdr_path, "w") as h:
+        h.write("##fileformat=VCFv4.2\n")
+        h.write("##source=GenomeIndia 9768 (Bhattacharyya et al. 2025)\n")
+        h.write('##INFO=<ID=AN,Number=1,Type=Integer,'
+                'Description="Total allele number (2 * 9768 = 19536; '
+                'autosomal, biallelic; assumes 100% call rate)">\n')
+        h.write('##INFO=<ID=AC,Number=A,Type=Integer,'
+                'Description="Synthesized alternate allele count '
+                '(round(AF * AN); AN=19536)">\n')
+        h.write('##INFO=<ID=AF,Number=A,Type=Float,'
+                'Description="Alternate allele frequency from GenomeIndia '
+                'summary stats">\n')
+        for chrom in AUTOSOMES:
+            h.write(f"##contig=<ID={chrom}>\n")
+        h.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
+
+    # Concatenate header + body, then bgzip-sort with bcftools
+    unsorted_vcf = output_vcf + ".unsorted.vcf"
+    with open(unsorted_vcf, "w") as out:
+        for p in (hdr_path, body_path):
+            with open(p) as f:
+                for line in f:
+                    out.write(line)
+
+    print("Sorting + bgzipping...", file=sys.stderr)
+    # Files are already in chrom order; just sort by position within each
+    # chrom. Easiest: bcftools sort handles both.
+    subprocess.run(
+        ["bcftools", "sort", "-Oz", "-T", "/data/tmp", "-m", "8G",
+         "-o", output_vcf, unsorted_vcf],
+        check=True)
+    subprocess.run(["tabix", "-p", "vcf", output_vcf], check=True)
+
+    os.remove(hdr_path)
+    os.remove(body_path)
+    os.remove(unsorted_vcf)
+
+    print(f"Done: {output_vcf}", file=sys.stderr)
+
+
+def main():
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("input_dir",
+                   help="Dir with GI_9768_CBR-NIBMG_JointCall_AF_chr*.tsv")
+    p.add_argument("output_vcf", help="Output .vcf.gz")
+    args = p.parse_args()
+    convert(args.input_dir, args.output_vcf)
+
+
+if __name__ == "__main__":
+    main()