af9d8fe39e88f7b7cec3792ea797dab44f1416b0 max Tue May 19 04:36:02 2026 -0700 varFreqs: rebuild varFreqsAll with WBBC/TPMI/ChinaMAP/GenomeIndia, drop IndiGen, harden build pipeline Rebuilds /gbdb/hg38/varFreqs/_all/varFreqsAll.bb to fold in the four new subtracks registered earlier in May (WBBC 78.6M, TPMI 672k, ChinaMAP 147M, GenomeIndia 130M) and to drop IndiGen, which ships only a VRT bit and contributed an always-empty AC/AF column. New bb is 47 GB / 147 fields / 1.34 billion items (was 44 GB / 133 / 1.22B). Two pipeline fixes were necessary mid-rebuild: - bcftools 1.22 csq is stricter than earlier versions. Added --unify-chr-names chr,-,chr (Ensembl GFF3 uses bare "1" while merged VCF + FASTA use "chr1") and --force (5 SCHEMA alt contigs end up in the merge but aren't annotated in the GFF3) to the csq invocation in mergeAndAnnotate.sh. Four follow-up cleanups to the build scripts (no track change, just safer next rebuild): - mergeAndAnnotate.sh now reads VCF paths directly from databases.tsv in both the per-VCF strip+norm step and the merge step. The previous "files.txt + find normalized/" model could silently re-merge stale norm cache entries after a database was dropped from databases.tsv. - vcfToBigBed.py concat step streams sort stdout straight to disk instead of capture_output=True, which buffered the whole sorted chromosome (~24 GB for chr1) in Python RAM. - vcfToBigBed.py generate_trackdb_fragment() now emits the three customizations that used to have to be added on top of the auto-fragment by hand: filterType.consequence multipleListOr, the expanded consequence buckets (3_prime_utr, 5_prime_utr, non_coding, others), and skipEmptyFields on. - trackDb/human/varFreqs.ra updated to match the new bb columns (WBBC/TPMI/ChinaMAP/GenomeIndia AC+AF filters, WBBC 4-region population filters, IndiGen filter removed). refs #36642 Co-Authored-By: Claude Opus 4.7 (1M context) diff --git src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py index 84034dd4b5e..5bd7a00de19 100755 --- src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py +++ src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py @@ -480,31 +480,33 @@ src_parts.append(f"{db_key}|{db_info['name']}") f.write(" filterValues.sources " + ",".join(src_parts) + "\n") f.write(" filterType.sources multipleListOr\n") f.write(" filterLabel.sources Source Database\n") # Variant type and consequence (static) f.write(" # Variant type and consequence filters\n") f.write(" filterValues.varType " "SNV|SNV,INS|Insertion,DEL|Deletion,MNV|MNV\n") f.write(" filterLabel.varType Variant Type\n") f.write(" filterValues.consequence " "missense|Missense,synonymous|Synonymous," "stop_gained|Stop Gained,frameshift|Frameshift," "splice_donor|Splice Donor,splice_acceptor|Splice Acceptor," - "intron|Intron,.|Intergenic\n") + "intron|Intron,3_prime_utr|3' UTR,5_prime_utr|5' UTR," + "non_coding|Non-coding,.|Intergenic,others|Other\n") + f.write(" filterType.consequence multipleListOr\n") f.write(" filterLabel.consequence Consequence\n") # Length filters f.write(" # Length filters\n") for fld in ["refLen", "altLen", "varLen"]: label = {"refLen": "Reference Length", "altLen": "Alternate Length", "varLen": "Length Change"}[fld] f.write(f" filterByRange.{fld} on\n") f.write(f" filterLabel.{fld} {label}\n") # Max AF filter f.write(" # Max AF filter\n") f.write(" filterByRange.maxAF on\n") f.write(" filterLabel.maxAF Max Allele Frequency\n") f.write(" filterLimits.maxAF 0:1\n") @@ -530,30 +532,32 @@ # Population-specific filters f.write(" # Population-specific AF filters\n") for db_key, db_info in databases.items(): if not db_info["pops"]: continue f.write(f" # {db_info['name']} populations\n") for pop in db_info["pops"]: f.write(f" filterByRange.{db_key}AF_{pop['key']} on\n") f.write(f" filterLabel.{db_key}AF_{pop['key']} " f"{db_info['name']} {pop['name']} AF\n") for pop in db_info["pops"]: f.write(f" filterByRange.{db_key}AC_{pop['key']} on\n") f.write(f" filterLabel.{db_key}AC_{pop['key']} " f"{db_info['name']} {pop['name']} AC\n") + f.write(" skipEmptyFields on\n") + print(f"Wrote trackDb fragment: {output_path}", file=sys.stderr) # ============================================================================ # Main # ============================================================================ def main(): parser = argparse.ArgumentParser( description="Convert annotated VCF to bigBed (two-phase parallel)") parser.add_argument("--annotated-vcf", default="merged.annotated.vcf.gz") parser.add_argument("--output-prefix", default="varFreqs") parser.add_argument("--chrom-sizes", default="/hive/data/genomes/hg38/chrom.sizes") parser.add_argument("--threads", type=int, default=8) @@ -597,39 +601,39 @@ print(f"\n=== Phase 2: Processing {len(chromosomes)} chromosome(s) ===", file=sys.stderr) task_args = [(chrom, args.annotated_vcf, databases, extract_dir, bed_dir) for chrom in chromosomes] with Pool(min(args.threads, len(chromosomes))) as pool: chrom_beds = pool.map(process_chromosome, task_args) chrom_beds = [b for b in chrom_beds if b is not None] if not chrom_beds: print("ERROR: No BED output produced", file=sys.stderr) sys.exit(1) # Step 4: Concatenate and sort + # Stream sort output straight to disk instead of capture_output=True, which + # would buffer the full sorted chrom (~24 GB for chr1) in Python's RAM. print(f"\n=== Concatenating {len(chrom_beds)} chromosome files ===", file=sys.stderr) bed_path = os.path.join(args.work_dir, f"{args.output_prefix}.bed") - with open(bed_path, "w") as out: + with open(bed_path, "wb") as out: for chrom_bed in chrom_beds: - result = subprocess.run( - ["sort", "-k2,2n", chrom_bed], - capture_output=True, text=True) - out.write(result.stdout) + subprocess.run(["sort", "-k2,2n", chrom_bed], + stdout=out, check=True) # Step 5: bedToBigBed print(f"\n=== Running bedToBigBed ===", file=sys.stderr) cmd = ["bedToBigBed", "-type=bed9+", f"-as={as_path}", "-tab", bed_path, args.chrom_sizes, bb_path] print(f" {' '.join(cmd)}", file=sys.stderr) result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: print(f" Error: {result.stderr}", file=sys.stderr) sys.exit(1) # Step 6: Key mapping with open(key_path, "w") as f: for key, info in databases.items(): f.write(f"{key}|{info['name']}\n")