src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py 65da29c9d74d4dd832ab7f16899ad3b209b92da4

65da29c9d74d4dd832ab7f16899ad3b209b92da4
max
  Wed May 6 08:43:57 2026 -0700
varFreqs: 5 vcfToBigBed.py fixes + add NPM Singapore to combined track

vcfToBigBed.py and mergeAndAnnotate.sh moved into kent (they were
hive-only); the build is now reproducible from a fresh kent checkout.

Five vcfToBigBed.py fixes (all caught by Lou's QA pass on #36642):

- normalize_consequence(): bcftools csq emits "&"-joined compound terms
like "stop_gained&frameshift" which exact-match-failed the old 8-bucket
consequence filter and orphaned ~8.5M records. Rewrites "&" to "," so a
single record can match multiple buckets, and appends ",others" to any
token list with no named-filter token. Trackdb gains 4 buckets (3' UTR,
5' UTR, Non-coding, Other) and switches to filterType.consequence
multipleListOr.

- Source-attribution bug: the old check only inspected the unified AC/AF
slot. AllOfUs ships only per-population fields ("." in the unified
slot), so all 67M+ AllOfUs variants got no source attribution -- ~43M
rows in the previous bigBed had an empty "sources" column. Fix scans
per-population slots before declaring "no data".

- parse_bcsq() returns "" instead of "." for aaChange/dnaChange on
non-coding variants, so the mouseOver and detail page render a clean
blank line.

- maxAF format: "{:.6g}" -> "{:.6f}" so very small AFs render as
"0.000003" instead of "3.31347e-06".

- autoSql `table varFreqs` -> `table varFreqsAll` (matches the bigBed
filename; required for hgIntegrator wiring).

NPM Singapore (SG10K_Health, 9.7k WGS) added to databases.tsv,
files.txt, populations.tsv (SgChinese / SgMalay / SgIndian) and the
trackDb filter UI. NPM individual subtrack stays tableBrowser off
(license); folded into varFreqsAll same as finngen / kova / mgrb /
swefreq / tishkoff180.

varFreqsAll bigBed rebuild is in progress at /hive/data/genomes/hg38/
bed/varFreqs/all/; will land in /gbdb when the bedToBigBed step
completes.

refs #36642

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py
new file mode 100755
index 00000000000..84034dd4b5e
--- /dev/null
+++ src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py
@@ -0,0 +1,652 @@
+#!/usr/bin/env python3
+"""
+Convert merged/annotated VCF to bigBed with variant frequencies.
+
+Reads database config from TSV files in the kent source tree.
+Two-phase parallel processing:
+  Phase 1: Pre-extract frequency data from each VCF (parallel by database)
+  Phase 2: Build BED from annotated VCF + pre-extracted data (parallel by chromosome)
+
+Usage:
+  python3 vcfToBigBed.py                          # full genome
+  python3 vcfToBigBed.py --region chrM             # quick test on chrM
+  python3 vcfToBigBed.py --region chr22 --threads 4
+"""
+
+import sys
+import os
+import subprocess
+import argparse
+import shutil
+from collections import OrderedDict
+from typing import List, Tuple, Dict, Optional
+from multiprocessing import Pool
+
+# ============================================================================
+# Constants
+# ============================================================================
+
+SCRIPTS_DIR = os.path.join(os.path.expanduser("~"), "kent", "src", "hg",
+                           "makeDb", "scripts", "varFreqs")
+
+ALL_CHROMOSOMES = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
+
+CHR_RENAME = {str(i): f"chr{i}" for i in range(1, 23)}
+CHR_RENAME.update({"X": "chrX", "Y": "chrY", "M": "chrM", "MT": "chrM",
+                    "23": "chrX", "24": "chrY"})
+
+# ============================================================================
+# Consequence Handling
+# ============================================================================
+
+LOF_CSQ = {"stop_gained", "stop_lost", "frameshift", "splice_donor",
+           "splice_acceptor", "start_lost", "transcript_ablation",
+           "feature_truncation"}
+MISSENSE_CSQ = {"missense", "inframe_insertion", "inframe_deletion",
+                "protein_altering", "coding_sequence"}
+SYN_CSQ = {"synonymous", "stop_retained", "start_retained"}
+
+
+def get_color(bcsq):
+    """Map BCSQ to RGB color based on most severe consequence."""
+    if not bcsq or bcsq == ".":
+        return (128, 128, 128)
+    best_priority = -1
+    best_color = (128, 128, 128)
+    for entry in bcsq.split(","):
+        csq = entry.split("|")[0].lower()
+        for part in csq.split("&"):
+            if part in LOF_CSQ and best_priority < 3:
+                best_priority, best_color = 3, (255, 0, 0)
+            elif part in MISSENSE_CSQ and best_priority < 2:
+                best_priority, best_color = 2, (31, 119, 180)
+            elif part in SYN_CSQ and best_priority < 1:
+                best_priority, best_color = 1, (0, 128, 0)
+    return best_color
+
+
+def parse_bcsq(bcsq):
+    """Parse BCSQ → (consequence, gene, transcript, aa_change, dna_change).
+    aa_change/dna_change return "" rather than "." for non-coding so the
+    mouseOver and detail page render a clean blank line."""
+    if not bcsq or bcsq == ".":
+        return (".", ".", ".", "", "")
+    best, best_pri = None, -1
+    for entry in bcsq.split(","):
+        parts = entry.split("|")
+        if len(parts) < 4:
+            continue
+        csq = parts[0].lower().split("&")[0]
+        pri = 3 if csq in LOF_CSQ else 2 if csq in MISSENSE_CSQ \
+              else 1 if csq in SYN_CSQ else 0
+        if pri > best_pri:
+            best_pri, best = pri, parts
+    if best is None:
+        best = bcsq.split(",")[0].split("|")
+    return (
+        best[0] if len(best) > 0 else ".",
+        best[1] if len(best) > 1 else ".",
+        best[2] if len(best) > 2 else ".",
+        best[5] if len(best) > 5 else "",
+        best[6] if len(best) > 6 else "",
+    )
+
+
+# Filter buckets exposed in the trackdb consequence multipleListOr filter.
+# Anything that produces no token in this set gets ",others" appended so it
+# is still selectable via the "Other" bucket.
+NAMED_CONSEQUENCES = {
+    "missense", "synonymous", "stop_gained", "frameshift",
+    "splice_donor", "splice_acceptor", "intron", ".",
+    "3_prime_utr", "5_prime_utr", "non_coding",
+}
+
+
+def normalize_consequence(raw):
+    """Convert bcftools csq consequence (& or , joined) to a comma-joined
+    token list. Append "others" if no token matches a named filter bucket."""
+    if not raw or raw == ".":
+        return "."
+    tokens = raw.replace("&", ",").split(",")
+    if not any(tok in NAMED_CONSEQUENCES for tok in tokens):
+        tokens.append("others")
+    return ",".join(tokens)
+
+
+def get_vartype(ref, alt):
+    if len(ref) == 1 and len(alt) == 1:
+        return "SNV"
+    elif len(ref) == 1:
+        return "INS"
+    elif len(alt) == 1:
+        return "DEL"
+    return "MNV"
+
+
+# ============================================================================
+# Configuration Loading
+# ============================================================================
+
+def load_config(scripts_dir):
+    """Load databases.tsv and populations.tsv."""
+    databases = OrderedDict()
+
+    db_path = os.path.join(scripts_dir, "databases.tsv")
+    with open(db_path) as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            parts = line.split('\t')
+            if len(parts) < 5:
+                print(f"WARNING: skipping malformed line: {line}",
+                      file=sys.stderr)
+                continue
+            key, name, vcf, ac_field, af_field = (
+                parts[0], parts[1], parts[2], parts[3], parts[4])
+            databases[key] = {
+                "name": name, "vcf": vcf,
+                "ac_field": ac_field, "af_field": af_field,
+                "pops": [],
+            }
+
+    pop_path = os.path.join(scripts_dir, "populations.tsv")
+    with open(pop_path) as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            parts = line.split('\t')
+            if len(parts) < 5:
+                continue
+            db_key = parts[0]
+            if db_key in databases:
+                databases[db_key]["pops"].append({
+                    "key": parts[1], "name": parts[2],
+                    "ac_field": parts[3], "af_field": parts[4],
+                })
+            else:
+                print(f"WARNING: population references unknown db '{db_key}'",
+                      file=sys.stderr)
+
+    return databases
+
+
+# ============================================================================
+# Phase 1: Pre-extract frequency data (parallel by database)
+# ============================================================================
+
+def pre_extract_one(args):
+    """Extract frequency data from one VCF, split output by chromosome."""
+    db_key, db_info, region, extract_dir = args
+
+    vcf = db_info["vcf"]
+    if not os.path.exists(vcf):
+        print(f"  {db_key}: VCF not found ({vcf}), skipping", file=sys.stderr)
+        return db_key, 0
+
+    out_dir = os.path.join(extract_dir, db_key)
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Build bcftools query format: CHROM POS REF ALT [AC AF] [pop_ac pop_af ...]
+    fmt_parts = ["%CHROM", "%POS", "%REF", "%ALT"]
+    if db_info["ac_field"] != ".":
+        fmt_parts.append(f"%INFO/{db_info['ac_field']}")
+    else:
+        fmt_parts.append(".")
+    if db_info["af_field"] != ".":
+        fmt_parts.append(f"%INFO/{db_info['af_field']}")
+    else:
+        fmt_parts.append(".")
+    for pop in db_info["pops"]:
+        if pop["ac_field"] != ".":
+            fmt_parts.append(f"%INFO/{pop['ac_field']}")
+        else:
+            fmt_parts.append(".")
+        if pop["af_field"] != ".":
+            fmt_parts.append(f"%INFO/{pop['af_field']}")
+        else:
+            fmt_parts.append(".")
+    fmt = "\t".join(fmt_parts) + "\n"
+
+    cmd = ["bcftools", "query", "-f", fmt]
+    if region:
+        cmd.extend(["-r", region])
+    cmd.append(vcf)
+
+    chrom_files = {}
+    total = 0
+
+    try:
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE, text=True)
+        for line in proc.stdout:
+            parts = line.rstrip('\n').split('\t')
+            if len(parts) < 4:
+                continue
+            chrom = parts[0]
+            if chrom in CHR_RENAME:
+                chrom = CHR_RENAME[chrom]
+            if chrom not in chrom_files:
+                chrom_files[chrom] = open(
+                    os.path.join(out_dir, f"{chrom}.tsv"), "w")
+            # Write: POS\tREF\tALT\tAC\tAF\t[pop fields...]
+            chrom_files[chrom].write("\t".join(parts[1:]) + "\n")
+            total += 1
+        proc.wait()
+    except Exception as e:
+        print(f"  {db_key}: ERROR: {e}", file=sys.stderr)
+    finally:
+        for fh in chrom_files.values():
+            fh.close()
+
+    print(f"  {db_key}: {total:,} variants extracted", file=sys.stderr)
+    return db_key, total
+
+
+def pre_extract_all(databases, region, extract_dir, threads):
+    """Phase 1: Pre-extract all VCFs in parallel."""
+    print(f"\n=== Phase 1: Pre-extracting frequency data "
+          f"({threads} parallel) ===", file=sys.stderr)
+
+    tasks = [(db_key, db_info, region, extract_dir)
+             for db_key, db_info in databases.items()]
+
+    with Pool(min(threads, len(tasks))) as pool:
+        pool.map(pre_extract_one, tasks)
+
+
+# ============================================================================
+# Phase 2: Process chromosomes (parallel by chromosome)
+# ============================================================================
+
+def load_extracted(db_key, chrom, extract_dir):
+    """Load pre-extracted TSV into dict keyed by pos:ref:alt."""
+    path = os.path.join(extract_dir, db_key, f"{chrom}.tsv")
+    data = {}
+    if not os.path.exists(path):
+        return data
+    with open(path) as f:
+        for line in f:
+            parts = line.rstrip('\n').split('\t')
+            if len(parts) < 3:
+                continue
+            key = f"{parts[0]}:{parts[1]}:{parts[2]}"
+            data[key] = parts[3:]
+    return data
+
+
+def process_chromosome(args):
+    """Phase 2: Build BED for one chromosome from annotated VCF + extracted data."""
+    chrom, annotated_vcf, databases, extract_dir, output_dir = args
+
+    # Read annotated variants (try BCSQ first, fall back to plain variants)
+    cmd = ["bcftools", "query", "-r", chrom, "-f",
+           "%CHROM\t%POS\t%REF\t%ALT\t%INFO/BCSQ\n", annotated_vcf]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    if result.returncode != 0 or not result.stdout.strip():
+        # Fall back: read without BCSQ (use "." as consequence)
+        cmd = ["bcftools", "query", "-r", chrom, "-f",
+               "%CHROM\t%POS\t%REF\t%ALT\t.\n", annotated_vcf]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+
+    if result.returncode != 0 or not result.stdout.strip():
+        print(f"  {chrom}: no variants", file=sys.stderr)
+        return None
+
+    ann_lines = result.stdout.strip().split("\n")
+    print(f"  {chrom}: {len(ann_lines):,} variants, "
+          f"loading frequencies...", file=sys.stderr)
+
+    # Load pre-extracted frequency data
+    freq_data = {}
+    for db_key in databases:
+        freq_data[db_key] = load_extracted(db_key, chrom, extract_dir)
+
+    output_path = os.path.join(output_dir, f"{chrom}.bed")
+    count = 0
+
+    with open(output_path, "w") as out:
+        for line in ann_lines:
+            parts = line.split("\t")
+            if len(parts) < 5:
+                continue
+
+            chrom_name, pos, ref, alt, bcsq = parts[0:5]
+            key = f"{pos}:{ref}:{alt}"
+
+            consequence, gene, transcript, aa_change, dna_change = \
+                parse_bcsq(bcsq)
+            r, g, b = get_color(bcsq)
+
+            pos_int = int(pos)
+            start = pos_int - 1
+            end = start + len(ref)
+
+            ref_d = ref[:17] + "..." if len(ref) > 20 else ref
+            alt_d = alt[:17] + "..." if len(alt) > 20 else alt
+            name = f"{ref_d}>{alt_d}"
+            var_type = get_vartype(ref, alt)
+
+            max_af = 0.0
+            sources = []
+            db_ac_af = []    # per-database AC, AF
+            pop_ac_af = []   # per-population AC, AF (written AFTER all db fields)
+            total_ac = 0
+
+            for db_key, db_info in databases.items():
+                values = freq_data.get(db_key, {}).get(key, [])
+
+                ac = values[0] if len(values) > 0 and values[0] not in \
+                     (".", "") else ""
+                af = values[1] if len(values) > 1 and values[1] not in \
+                     (".", "") else ""
+
+                # A source contributes if the unified AC/AF slot has data OR
+                # any per-population slot has data. AllOfUs ships only
+                # per-population fields ("." in the unified slot) so without
+                # this fallback its 67M+ variants get no source attribution.
+                has_data = bool(ac) or bool(af)
+                if not has_data:
+                    for i in range(len(db_info["pops"])):
+                        idx = 2 + i * 2
+                        pop_ac = values[idx] if len(values) > idx and \
+                            values[idx] not in (".", "") else ""
+                        pop_af = values[idx + 1] if len(values) > idx + 1 \
+                            and values[idx + 1] not in (".", "") else ""
+                        if pop_ac or pop_af:
+                            has_data = True
+                            break
+
+                if has_data:
+                    sources.append(db_key)
+
+                if ac:
+                    try:
+                        total_ac += int(ac)
+                    except ValueError:
+                        pass
+
+                db_ac_af.extend([ac, af])
+
+                if af:
+                    try:
+                        max_af = max(max_af, float(af))
+                    except ValueError:
+                        pass
+
+                for i, pop in enumerate(db_info["pops"]):
+                    idx = 2 + i * 2
+                    pop_ac = values[idx] if len(values) > idx and \
+                        values[idx] not in (".", "") else ""
+                    pop_af = values[idx + 1] if len(values) > idx + 1 and \
+                        values[idx + 1] not in (".", "") else ""
+                    pop_ac_af.extend([pop_ac, pop_af])
+
+                    if pop_af:
+                        try:
+                            max_af = max(max_af, float(pop_af))
+                        except ValueError:
+                            pass
+
+            score = min(1000, int(max_af * 1000))
+
+            fields = [
+                chrom_name, str(start), str(end), name, str(score), "+",
+                str(start), str(end), f"{r},{g},{b}",
+                ref, alt, str(len(ref)), str(len(alt)),
+                str(len(alt) - len(ref)), var_type,
+                normalize_consequence(consequence),
+                gene, transcript, aa_change, dna_change,
+                f"{max_af:.6f}" if max_af > 0 else "0",
+                str(total_ac) if total_ac > 0 else "0",
+                ",".join(sources),
+            ]
+            # Database AC/AF first, then population AC/AF — must match autoSql order
+            fields.extend(db_ac_af)
+            fields.extend(pop_ac_af)
+
+            out.write("\t".join(fields) + "\n")
+            count += 1
+
+    print(f"  {chrom}: wrote {count:,} BED lines", file=sys.stderr)
+    return output_path
+
+
+# ============================================================================
+# AutoSql Generation
+# ============================================================================
+
+def generate_autosql(output_path, databases):
+    """Generate autoSql file from database config."""
+    with open(output_path, "w") as f:
+        f.write('table varFreqsAll\n')
+        f.write('"Variant frequencies across population databases"\n')
+        f.write('(\n')
+        # BED9
+        f.write('    string chrom;        "Chromosome"\n')
+        f.write('    uint chromStart;     "Start position (0-based)"\n')
+        f.write('    uint chromEnd;       "End position"\n')
+        f.write('    string name;         "Variant (REF>ALT)"\n')
+        f.write('    uint score;          "Score (maxAF * 1000)"\n')
+        f.write('    char[1] strand;      "Strand"\n')
+        f.write('    uint thickStart;     "Thick start"\n')
+        f.write('    uint thickEnd;       "Thick end"\n')
+        f.write('    uint reserved;       "Color by consequence"\n')
+        # Variant info
+        f.write('    lstring ref;         "Reference allele"\n')
+        f.write('    lstring alt;         "Alternate allele"\n')
+        f.write('    int refLen;          "Reference length"\n')
+        f.write('    int altLen;          "Alternate length"\n')
+        f.write('    int varLen;          "Length change (alt-ref)"\n')
+        f.write('    string varType;      "Type (SNV/INS/DEL/MNV)"\n')
+        # Consequence
+        f.write('    string consequence;  "Consequence"\n')
+        f.write('    string gene;         "Gene"\n')
+        f.write('    string transcript;   "Transcript"\n')
+        f.write('    lstring aaChange;    "AA change"\n')
+        f.write('    lstring dnaChange;   "DNA change"\n')
+        # Frequencies
+        f.write('    float maxAF;         "Max allele frequency"\n')
+        f.write('    uint totalAC;        "Total allele count across all databases"\n')
+        f.write('    string sources;      "Source databases"\n')
+        # Per-database AC/AF
+        for db_key, db_info in databases.items():
+            f.write(f'    string {db_key}AC;'
+                    f'      "{db_info["name"]} AC"\n')
+            f.write(f'    string {db_key}AF;'
+                    f'      "{db_info["name"]} AF"\n')
+        # Per-population AC/AF
+        for db_key, db_info in databases.items():
+            for pop in db_info["pops"]:
+                f.write(f'    string {db_key}AC_{pop["key"]};'
+                        f'  "{db_info["name"]} {pop["name"]} AC"\n')
+                f.write(f'    string {db_key}AF_{pop["key"]};'
+                        f'  "{db_info["name"]} {pop["name"]} AF"\n')
+        f.write(')\n')
+    print(f"Wrote autoSql: {output_path}", file=sys.stderr)
+
+
+def generate_trackdb_fragment(output_path, databases):
+    """Generate trackDb .ra fragment for the varFreqsAll track filters."""
+    with open(output_path, "w") as f:
+        f.write("# Auto-generated trackDb fragment for varFreqsAll filters\n")
+        f.write("# Paste this into the varFreqsAll track stanza in varFreqs.ra\n\n")
+
+        # Source database filter
+        src_parts = []
+        for db_key, db_info in databases.items():
+            src_parts.append(f"{db_key}|{db_info['name']}")
+        f.write("        filterValues.sources "
+                + ",".join(src_parts) + "\n")
+        f.write("        filterType.sources multipleListOr\n")
+        f.write("        filterLabel.sources Source Database\n")
+
+        # Variant type and consequence (static)
+        f.write("        # Variant type and consequence filters\n")
+        f.write("        filterValues.varType "
+                "SNV|SNV,INS|Insertion,DEL|Deletion,MNV|MNV\n")
+        f.write("        filterLabel.varType Variant Type\n")
+        f.write("        filterValues.consequence "
+                "missense|Missense,synonymous|Synonymous,"
+                "stop_gained|Stop Gained,frameshift|Frameshift,"
+                "splice_donor|Splice Donor,splice_acceptor|Splice Acceptor,"
+                "intron|Intron,.|Intergenic\n")
+        f.write("        filterLabel.consequence Consequence\n")
+
+        # Length filters
+        f.write("        # Length filters\n")
+        for fld in ["refLen", "altLen", "varLen"]:
+            label = {"refLen": "Reference Length", "altLen": "Alternate Length",
+                     "varLen": "Length Change"}[fld]
+            f.write(f"        filterByRange.{fld} on\n")
+            f.write(f"        filterLabel.{fld} {label}\n")
+
+        # Max AF filter
+        f.write("        # Max AF filter\n")
+        f.write("        filterByRange.maxAF on\n")
+        f.write("        filterLabel.maxAF Max Allele Frequency\n")
+        f.write("        filterLimits.maxAF 0:1\n")
+
+        # Total AC filter
+        f.write("        # Total AC filter\n")
+        f.write("        filterByRange.totalAC on\n")
+        f.write("        filterLabel.totalAC Total Allele Count (all databases)\n")
+
+        # Per-database AF and AC filters
+        f.write("        # Per-database AF filters\n")
+        for db_key, db_info in databases.items():
+            f.write(f"        filterByRange.{db_key}AF on\n")
+            f.write(f"        filterLabel.{db_key}AF "
+                    f"{db_info['name']} AF\n")
+
+        f.write("        # Per-database AC filters\n")
+        for db_key, db_info in databases.items():
+            f.write(f"        filterByRange.{db_key}AC on\n")
+            f.write(f"        filterLabel.{db_key}AC "
+                    f"{db_info['name']} AC\n")
+
+        # Population-specific filters
+        f.write("        # Population-specific AF filters\n")
+        for db_key, db_info in databases.items():
+            if not db_info["pops"]:
+                continue
+            f.write(f"        # {db_info['name']} populations\n")
+            for pop in db_info["pops"]:
+                f.write(f"        filterByRange.{db_key}AF_{pop['key']} on\n")
+                f.write(f"        filterLabel.{db_key}AF_{pop['key']} "
+                        f"{db_info['name']} {pop['name']} AF\n")
+            for pop in db_info["pops"]:
+                f.write(f"        filterByRange.{db_key}AC_{pop['key']} on\n")
+                f.write(f"        filterLabel.{db_key}AC_{pop['key']} "
+                        f"{db_info['name']} {pop['name']} AC\n")
+
+    print(f"Wrote trackDb fragment: {output_path}", file=sys.stderr)
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert annotated VCF to bigBed (two-phase parallel)")
+    parser.add_argument("--annotated-vcf", default="merged.annotated.vcf.gz")
+    parser.add_argument("--output-prefix", default="varFreqs")
+    parser.add_argument("--chrom-sizes",
+                        default="/hive/data/genomes/hg38/chrom.sizes")
+    parser.add_argument("--threads", type=int, default=8)
+    parser.add_argument("--work-dir", default=".")
+    parser.add_argument("--region", default=None,
+                        help="Region to process, e.g. chrM or chr22")
+    parser.add_argument("--scripts-dir", default=SCRIPTS_DIR)
+    parser.add_argument("--keep-temp", action="store_true")
+    args = parser.parse_args()
+
+    databases = load_config(args.scripts_dir)
+    print(f"Loaded {len(databases)} databases:", file=sys.stderr)
+    for key, info in databases.items():
+        pops = ", ".join(p["key"] for p in info["pops"])
+        extra = f" [{pops}]" if pops else ""
+        exists = "OK" if os.path.exists(info["vcf"]) else "MISSING"
+        print(f"  {key}: {info['name']}{extra} ({exists})", file=sys.stderr)
+
+    if args.region:
+        chromosomes = [args.region.split(":")[0]]
+    else:
+        chromosomes = ALL_CHROMOSOMES
+
+    as_path = os.path.join(args.work_dir, f"{args.output_prefix}.as")
+    bb_path = os.path.join(args.work_dir, f"{args.output_prefix}.bb")
+    key_path = os.path.join(args.work_dir, f"{args.output_prefix}.keys.txt")
+    extract_dir = os.path.join(args.work_dir, "extracted")
+    bed_dir = os.path.join(args.work_dir, "beds")
+    os.makedirs(extract_dir, exist_ok=True)
+    os.makedirs(bed_dir, exist_ok=True)
+
+    # Step 1: AutoSql and trackDb fragment
+    generate_autosql(as_path, databases)
+    ra_path = os.path.join(args.work_dir, f"{args.output_prefix}.trackDb.ra")
+    generate_trackdb_fragment(ra_path, databases)
+
+    # Step 2: Phase 1 — pre-extract
+    pre_extract_all(databases, args.region, extract_dir, args.threads)
+
+    # Step 3: Phase 2 — process chromosomes
+    print(f"\n=== Phase 2: Processing {len(chromosomes)} chromosome(s) ===",
+          file=sys.stderr)
+
+    task_args = [(chrom, args.annotated_vcf, databases, extract_dir, bed_dir)
+                 for chrom in chromosomes]
+
+    with Pool(min(args.threads, len(chromosomes))) as pool:
+        chrom_beds = pool.map(process_chromosome, task_args)
+    chrom_beds = [b for b in chrom_beds if b is not None]
+
+    if not chrom_beds:
+        print("ERROR: No BED output produced", file=sys.stderr)
+        sys.exit(1)
+
+    # Step 4: Concatenate and sort
+    print(f"\n=== Concatenating {len(chrom_beds)} chromosome files ===",
+          file=sys.stderr)
+    bed_path = os.path.join(args.work_dir, f"{args.output_prefix}.bed")
+    with open(bed_path, "w") as out:
+        for chrom_bed in chrom_beds:
+            result = subprocess.run(
+                ["sort", "-k2,2n", chrom_bed],
+                capture_output=True, text=True)
+            out.write(result.stdout)
+
+    # Step 5: bedToBigBed
+    print(f"\n=== Running bedToBigBed ===", file=sys.stderr)
+    cmd = ["bedToBigBed", "-type=bed9+", f"-as={as_path}", "-tab",
+           bed_path, args.chrom_sizes, bb_path]
+    print(f"  {' '.join(cmd)}", file=sys.stderr)
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        print(f"  Error: {result.stderr}", file=sys.stderr)
+        sys.exit(1)
+
+    # Step 6: Key mapping
+    with open(key_path, "w") as f:
+        for key, info in databases.items():
+            f.write(f"{key}|{info['name']}\n")
+
+    # Cleanup
+    if not args.keep_temp:
+        for chrom_bed in chrom_beds:
+            if os.path.exists(chrom_bed):
+                os.remove(chrom_bed)
+        if os.path.exists(bed_path):
+            os.remove(bed_path)
+        shutil.rmtree(extract_dir, ignore_errors=True)
+        shutil.rmtree(bed_dir, ignore_errors=True)
+
+    bb_size = os.path.getsize(bb_path) / (1024**3)
+    print(f"\nDone! {bb_path} ({bb_size:.1f} GB)", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()