3180d71425ab40bc022712bb95868bfe80747375 max Fri May 29 08:52:38 2026 -0700 [Claude] varFreqs: split SPARK+SCHEMA by phenotype, add disease + array combined tracks, drop array cohorts from varFreqsAll #Preview2 week - bugs introduced now will need a build patch to fix Split SFARI SPARK WES and WGS by autism status using fill-tags -S with the SPARK individuals_registration TSV (AC_AUT / AN_AUT / AF_AUT plus AC_NON_AUT / AN_NON_AUT / AF_NON_AUT). Added matching SCHEMA case/control sums (AC_CASE etc.). Two new combined bigBed tracks: varFreqsDisease (SPARK, SFARI WGS, TOPMed, SCHEMA, GREGoR, GA4K) and varFreqsArray (TPMI, MexBB, UKBB). TPMI and MexBB are removed from varFreqsAll so the main combined track is purely WGS/WES. Build scripts parameterized so the same code drives all three combined builds: mergeAndAnnotate.sh gains --databases / --tag, vcfToBigBed.py gains --databases-file / --populations-file and a per-track autoSql table name. mergeAndAnnotate.sh now pins /cluster/software/src/bcftools-1.22 in PATH (--unify-chr-names is a 1.22 feature; conda's 1.14 silently fails). refs #36642 diff --git src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py index 5bd7a00de19..cdab8f57360 100755 --- src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py +++ src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py @@ -115,54 +115,60 @@ def get_vartype(ref, alt): if len(ref) == 1 and len(alt) == 1: return "SNV" elif len(ref) == 1: return "INS" elif len(alt) == 1: return "DEL" return "MNV" # ============================================================================ # Configuration Loading # ============================================================================ -def load_config(scripts_dir): - """Load databases.tsv and populations.tsv.""" +def load_config(scripts_dir, db_file="databases.tsv", + pop_file="populations.tsv"): + """Load the database and population config TSVs. + + db_file/pop_file may be bare names (resolved against scripts_dir) or + absolute paths, so disease/array variants can use their own configs.""" databases = OrderedDict() - db_path = os.path.join(scripts_dir, "databases.tsv") + db_path = db_file if os.path.isabs(db_file) \ + else os.path.join(scripts_dir, db_file) with open(db_path) as f: for line in f: line = line.strip() if not line or line.startswith('#'): continue parts = line.split('\t') if len(parts) < 5: print(f"WARNING: skipping malformed line: {line}", file=sys.stderr) continue key, name, vcf, ac_field, af_field = ( parts[0], parts[1], parts[2], parts[3], parts[4]) databases[key] = { "name": name, "vcf": vcf, "ac_field": ac_field, "af_field": af_field, "pops": [], } - pop_path = os.path.join(scripts_dir, "populations.tsv") + pop_path = pop_file if os.path.isabs(pop_file) \ + else os.path.join(scripts_dir, pop_file) with open(pop_path) as f: for line in f: line = line.strip() if not line or line.startswith('#'): continue parts = line.split('\t') if len(parts) < 5: continue db_key = parts[0] if db_key in databases: databases[db_key]["pops"].append({ "key": parts[1], "name": parts[2], "ac_field": parts[3], "af_field": parts[4], }) else: @@ -406,34 +412,34 @@ # Database AC/AF first, then population AC/AF — must match autoSql order fields.extend(db_ac_af) fields.extend(pop_ac_af) out.write("\t".join(fields) + "\n") count += 1 print(f" {chrom}: wrote {count:,} BED lines", file=sys.stderr) return output_path # ============================================================================ # AutoSql Generation # ============================================================================ -def generate_autosql(output_path, databases): +def generate_autosql(output_path, databases, table_name="varFreqsAll"): """Generate autoSql file from database config.""" with open(output_path, "w") as f: - f.write('table varFreqsAll\n') + f.write(f'table {table_name}\n') f.write('"Variant frequencies across population databases"\n') f.write('(\n') # BED9 f.write(' string chrom; "Chromosome"\n') f.write(' uint chromStart; "Start position (0-based)"\n') f.write(' uint chromEnd; "End position"\n') f.write(' string name; "Variant (REF>ALT)"\n') f.write(' uint score; "Score (maxAF * 1000)"\n') f.write(' char[1] strand; "Strand"\n') f.write(' uint thickStart; "Thick start"\n') f.write(' uint thickEnd; "Thick end"\n') f.write(' uint reserved; "Color by consequence"\n') # Variant info f.write(' lstring ref; "Reference allele"\n') f.write(' lstring alt; "Alternate allele"\n') @@ -456,35 +462,35 @@ f.write(f' string {db_key}AC;' f' "{db_info["name"]} AC"\n') f.write(f' string {db_key}AF;' f' "{db_info["name"]} AF"\n') # Per-population AC/AF for db_key, db_info in databases.items(): for pop in db_info["pops"]: f.write(f' string {db_key}AC_{pop["key"]};' f' "{db_info["name"]} {pop["name"]} AC"\n') f.write(f' string {db_key}AF_{pop["key"]};' f' "{db_info["name"]} {pop["name"]} AF"\n') f.write(')\n') print(f"Wrote autoSql: {output_path}", file=sys.stderr) -def generate_trackdb_fragment(output_path, databases): - """Generate trackDb .ra fragment for the varFreqsAll track filters.""" +def generate_trackdb_fragment(output_path, databases, track_name="varFreqsAll"): + """Generate trackDb .ra fragment for the combined track filters.""" with open(output_path, "w") as f: - f.write("# Auto-generated trackDb fragment for varFreqsAll filters\n") - f.write("# Paste this into the varFreqsAll track stanza in varFreqs.ra\n\n") + f.write(f"# Auto-generated trackDb fragment for {track_name} filters\n") + f.write(f"# Paste this into the {track_name} track stanza in varFreqs.ra\n\n") # Source database filter src_parts = [] for db_key, db_info in databases.items(): src_parts.append(f"{db_key}|{db_info['name']}") f.write(" filterValues.sources " + ",".join(src_parts) + "\n") f.write(" filterType.sources multipleListOr\n") f.write(" filterLabel.sources Source Database\n") # Variant type and consequence (static) f.write(" # Variant type and consequence filters\n") f.write(" filterValues.varType " "SNV|SNV,INS|Insertion,DEL|Deletion,MNV|MNV\n") f.write(" filterLabel.varType Variant Type\n") @@ -553,58 +559,65 @@ # Main # ============================================================================ def main(): parser = argparse.ArgumentParser( description="Convert annotated VCF to bigBed (two-phase parallel)") parser.add_argument("--annotated-vcf", default="merged.annotated.vcf.gz") parser.add_argument("--output-prefix", default="varFreqs") parser.add_argument("--chrom-sizes", default="/hive/data/genomes/hg38/chrom.sizes") parser.add_argument("--threads", type=int, default=8) parser.add_argument("--work-dir", default=".") parser.add_argument("--region", default=None, help="Region to process, e.g. chrM or chr22") parser.add_argument("--scripts-dir", default=SCRIPTS_DIR) + parser.add_argument("--databases-file", default="databases.tsv", + help="DB config TSV (name under scripts-dir or abs path)") + parser.add_argument("--populations-file", default="populations.tsv", + help="Population config TSV (name under scripts-dir or abs path)") parser.add_argument("--keep-temp", action="store_true") args = parser.parse_args() - databases = load_config(args.scripts_dir) + databases = load_config(args.scripts_dir, args.databases_file, + args.populations_file) print(f"Loaded {len(databases)} databases:", file=sys.stderr) for key, info in databases.items(): pops = ", ".join(p["key"] for p in info["pops"]) extra = f" [{pops}]" if pops else "" exists = "OK" if os.path.exists(info["vcf"]) else "MISSING" print(f" {key}: {info['name']}{extra} ({exists})", file=sys.stderr) if args.region: chromosomes = [args.region.split(":")[0]] else: chromosomes = ALL_CHROMOSOMES as_path = os.path.join(args.work_dir, f"{args.output_prefix}.as") bb_path = os.path.join(args.work_dir, f"{args.output_prefix}.bb") key_path = os.path.join(args.work_dir, f"{args.output_prefix}.keys.txt") extract_dir = os.path.join(args.work_dir, "extracted") bed_dir = os.path.join(args.work_dir, "beds") os.makedirs(extract_dir, exist_ok=True) os.makedirs(bed_dir, exist_ok=True) - # Step 1: AutoSql and trackDb fragment - generate_autosql(as_path, databases) + # Step 1: AutoSql and trackDb fragment. The autoSql table name matches the + # output prefix so each combined track (varFreqsAll/Disease/Array) gets a + # distinct table definition. + generate_autosql(as_path, databases, table_name=args.output_prefix) ra_path = os.path.join(args.work_dir, f"{args.output_prefix}.trackDb.ra") - generate_trackdb_fragment(ra_path, databases) + generate_trackdb_fragment(ra_path, databases, track_name=args.output_prefix) # Step 2: Phase 1 — pre-extract pre_extract_all(databases, args.region, extract_dir, args.threads) # Step 3: Phase 2 — process chromosomes print(f"\n=== Phase 2: Processing {len(chromosomes)} chromosome(s) ===", file=sys.stderr) task_args = [(chrom, args.annotated_vcf, databases, extract_dir, bed_dir) for chrom in chromosomes] with Pool(min(args.threads, len(chromosomes))) as pool: chrom_beds = pool.map(process_chromosome, task_args) chrom_beds = [b for b in chrom_beds if b is not None]