af9d8fe39e88f7b7cec3792ea797dab44f1416b0 max Tue May 19 04:36:02 2026 -0700 varFreqs: rebuild varFreqsAll with WBBC/TPMI/ChinaMAP/GenomeIndia, drop IndiGen, harden build pipeline Rebuilds /gbdb/hg38/varFreqs/_all/varFreqsAll.bb to fold in the four new subtracks registered earlier in May (WBBC 78.6M, TPMI 672k, ChinaMAP 147M, GenomeIndia 130M) and to drop IndiGen, which ships only a VRT bit and contributed an always-empty AC/AF column. New bb is 47 GB / 147 fields / 1.34 billion items (was 44 GB / 133 / 1.22B). Two pipeline fixes were necessary mid-rebuild: - bcftools 1.22 csq is stricter than earlier versions. Added --unify-chr-names chr,-,chr (Ensembl GFF3 uses bare "1" while merged VCF + FASTA use "chr1") and --force (5 SCHEMA alt contigs end up in the merge but aren't annotated in the GFF3) to the csq invocation in mergeAndAnnotate.sh. Four follow-up cleanups to the build scripts (no track change, just safer next rebuild): - mergeAndAnnotate.sh now reads VCF paths directly from databases.tsv in both the per-VCF strip+norm step and the merge step. The previous "files.txt + find normalized/" model could silently re-merge stale norm cache entries after a database was dropped from databases.tsv. - vcfToBigBed.py concat step streams sort stdout straight to disk instead of capture_output=True, which buffered the whole sorted chromosome (~24 GB for chr1) in Python RAM. - vcfToBigBed.py generate_trackdb_fragment() now emits the three customizations that used to have to be added on top of the auto-fragment by hand: filterType.consequence multipleListOr, the expanded consequence buckets (3_prime_utr, 5_prime_utr, non_coding, others), and skipEmptyFields on. - trackDb/human/varFreqs.ra updated to match the new bb columns (WBBC/TPMI/ChinaMAP/GenomeIndia AC+AF filters, WBBC 4-region population filters, IndiGen filter removed). refs #36642 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh index 47a875afbe0..b1ae0b37988 100755 --- src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh +++ src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh @@ -1,256 +1,273 @@ #!/bin/bash set -eu # Note: not using pipefail due to bcftools pipeline exit codes # Merge multiple VCF files and annotate with protein consequences # Uses bcftools for merging and consequence annotation # Step 4 (strip+normalize) runs in parallel for speed. # # Usage: # ./mergeAndAnnotate.sh # full genome # ./mergeAndAnnotate.sh --region chrM # quick test on chrM WORKDIR="/hive/data/genomes/hg38/bed/varFreqs/all" -FILELIST="$WORKDIR/files.txt" +# Source of truth for which VCFs go into the combined track. The same TSV +# drives vcfToBigBed.py; reading both pipeline stages off it keeps the two +# lists from drifting (the old files.txt could miss new entries silently) +# and keeps stale entries in normalized/ out of the merge. +DBTSV="$HOME/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv" REF2BIT="/hive/data/genomes/hg38/hg38.2bit" REFFASTA="$WORKDIR/hg38.fa" GFF3="$WORKDIR/Homo_sapiens.GRCh38.115.chr.gff3.gz" THREADS=8 PARALLEL_JOBS=8 # Parse --region flag REGION="" while [[ $# -gt 0 ]]; do case $1 in --region) REGION="$2"; shift 2 ;; *) echo "Unknown option: $1"; exit 1 ;; esac done REGION_ARGS="" SUFFIX="" VIEW_REGION="" if [[ -n "$REGION" ]]; then REGION_ARGS="-r $REGION" SUFFIX=".${REGION%%:*}" # e.g. ".chrM" VIEW_REGION="$REGION" echo "*** REGION MODE: $REGION ***" else VIEW_REGION="chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM" fi # Output files MERGED="$WORKDIR/merged${SUFFIX}.vcf.gz" ANNOTATED="$WORKDIR/merged${SUFFIX}.annotated.vcf.gz" echo "=== VCF Merge and Annotate Pipeline ===" echo "Working directory: $WORKDIR" echo "" # Step 1: Check GFF3 annotation file if [[ ! -f "$GFF3" ]]; then echo "Step 1: ERROR: GFF3 file not found: $GFF3" echo " Download from: https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.chr.gff3.gz" exit 1 fi if [[ ! -f "${GFF3}.tbi" ]]; then echo "Step 1: Indexing GFF3..." tabix -p gff "$GFF3" else echo "Step 1: GFF3 OK" fi # Step 2: Create FASTA from 2bit if not present if [[ ! -f "$REFFASTA" ]]; then echo "Step 2: Converting 2bit to FASTA..." twoBitToFa "$REF2BIT" "$REFFASTA" samtools faidx "$REFFASTA" else echo "Step 2: FASTA already exists, skipping" fi # Step 3: Create chromosome rename map CHRMAP="$WORKDIR/chr_rename.txt" if [[ ! -f "$CHRMAP" ]]; then echo "Step 3: Creating chromosome rename map..." for i in {1..22} X Y M; do echo "$i chr$i"; done > "$CHRMAP" echo "MT chrM" >> "$CHRMAP" echo "23 chrX" >> "$CHRMAP" echo "24 chrY" >> "$CHRMAP" else echo "Step 3: Chromosome rename map already exists, skipping" fi # ============================================================================ # Step 4: Strip and normalize VCFs (PARALLEL) # ============================================================================ # Export variables needed by the worker function export WORKDIR SUFFIX VIEW_REGION REFFASTA CHRMAP # Worker function: process one VCF (strip + normalize) process_one_vcf() { local vcf="$1" local bname=$(basename "$vcf" .vcf.gz) local stripped="$WORKDIR/stripped/${bname}${SUFFIX}.stripped.vcf.gz" local outfile="$WORKDIR/normalized/${bname}${SUFFIX}.norm.vcf.gz" if [[ -f "$outfile" ]]; then echo " $bname: already done, skipping" return 0 fi echo " Processing: $bname" # Check chromosome naming local first_chr first_chr=$(bcftools view -H "$vcf" 2>/dev/null | head -1 | cut -f1) if [[ -z "$first_chr" ]]; then echo " $bname: WARNING: Empty VCF or cannot read, skipping" return 0 fi # Check if header uses chr prefix local header_chr header_chr=$(bcftools view -h "$vcf" 2>/dev/null | grep '##contig' | head -1 | grep -o 'ID=chr' || true) # Strip fields and fix chromosome naming if [[ "$first_chr" =~ ^chr ]]; then # Data already has chr prefix bcftools view -r "$VIEW_REGION" "$vcf" 2>/dev/null | \ bcftools annotate --force -x ID,QUAL,FILTER,INFO -Oz -o "$stripped" 2>/dev/null || { echo " $bname: ERROR: strip failed"; return 1 } elif [[ -n "$header_chr" ]]; then # Header has chr but data doesn't (e.g. AllOfUs) echo " $bname: fixing mismatched chr prefix" bcftools annotate --force -x ID,QUAL,FILTER,INFO "$vcf" 2>/dev/null | \ awk 'BEGIN{OFS="\t"} /^#/{print;next} {$1="chr"$1; print}' | \ bgzip -c > "${stripped}.tmp.vcf.gz" 2>/dev/null && \ bcftools index -t "${stripped}.tmp.vcf.gz" 2>/dev/null && \ bcftools view -r "$VIEW_REGION" "${stripped}.tmp.vcf.gz" -Oz -o "$stripped" 2>/dev/null && \ rm -f "${stripped}.tmp.vcf.gz" "${stripped}.tmp.vcf.gz.tbi" || { echo " $bname: ERROR: strip/fix failed"; rm -f "${stripped}.tmp.vcf.gz"*; return 1 } else # No chr prefix anywhere echo " $bname: adding chr prefix" bcftools annotate --force -x ID,QUAL,FILTER,INFO "$vcf" 2>/dev/null | \ awk 'BEGIN{OFS="\t"; r["23"]="chrX"; r["24"]="chrY"; r["MT"]="chrM" } /^##contig=<ID=23>/{sub(/ID=23/,"ID=chrX"); print; next} /^##contig=<ID=24>/{sub(/ID=24/,"ID=chrY"); print; next} /^##contig=<ID=MT>/{sub(/ID=MT/,"ID=chrM"); print; next} /^##contig=<ID=([0-9]|[XYM])/{sub(/ID=/,"ID=chr")} /^#/{print;next} {c=$1; if(c in r) $1=r[c]; else $1="chr"c; print}' | \ bgzip -c > "${stripped}.tmp.vcf.gz" 2>/dev/null && \ bcftools index -t "${stripped}.tmp.vcf.gz" 2>/dev/null && \ bcftools view -r "$VIEW_REGION" "${stripped}.tmp.vcf.gz" -Oz -o "$stripped" 2>/dev/null && \ rm -f "${stripped}.tmp.vcf.gz" "${stripped}.tmp.vcf.gz.tbi" || { echo " $bname: ERROR: strip/rename failed"; rm -f "${stripped}.tmp.vcf.gz"*; return 1 } fi bcftools index -t "$stripped" 2>/dev/null # Normalize (split multi-allelic sites) bcftools norm -m -any --check-ref x -f "$REFFASTA" "$stripped" -Oz -o "$outfile" 2>/dev/null || { echo " $bname: ERROR: normalization failed" rm -f "$stripped" "$stripped.tbi" return 1 } bcftools index -t "$outfile" 2>/dev/null rm -f "$stripped" "$stripped.tbi" local nvars nvars=$(bcftools index -n "$outfile" 2>/dev/null || echo "?") echo " $bname: Done: $nvars variants" } export -f process_one_vcf echo "" echo "Step 4: Processing VCFs in parallel ($PARALLEL_JOBS jobs)..." mkdir -p "$WORKDIR/normalized" "$WORKDIR/stripped" -grep -v '^#' "$FILELIST" | grep -v '^\s*$' | \ +# Pull the VCF column out of databases.tsv (skip comments + blank lines). +awk -F'\t' '!/^#/ && NF >= 3 && $3 != "" {print $3}' "$DBTSV" | \ parallel -j "$PARALLEL_JOBS" --line-buffer process_one_vcf {} echo "Step 4: Done." # ============================================================================ # Step 5: Merge all normalized VCFs # ============================================================================ echo "" echo "Step 5: Merging all VCFs..." -find "$WORKDIR/normalized" -name "*${SUFFIX}.norm.vcf.gz" | sort > "$WORKDIR/normalized_files${SUFFIX}.txt" +# Build the merge list from databases.tsv so it tracks exactly what Step 4 +# processed. Previously we used `find normalized/ -name '*.norm.vcf.gz'`, +# which silently re-merged stale entries from prior builds (e.g. +# IndiGenomes_Variants.norm.vcf.gz after IndiGen was dropped from the +# config). Basename rule mirrors process_one_vcf: strip .vcf.gz only +# (.vcf.bgz like SG10K_Health is left intact, matching the cache layout). NORM_LIST="$WORKDIR/normalized_files${SUFFIX}.txt" +awk -F'\t' '!/^#/ && NF >= 3 && $3 != "" {print $3}' "$DBTSV" | \ +while read -r vcf; do + bn=$(basename "$vcf" .vcf.gz) + echo "$WORKDIR/normalized/${bn}${SUFFIX}.norm.vcf.gz" +done | sort > "$NORM_LIST" nfiles=$(wc -l < "$NORM_LIST") echo " Found $nfiles normalized VCF files" if [[ ! -f "$MERGED" ]]; then bcftools merge \ --threads "$THREADS" \ -l "$NORM_LIST" \ -m none \ --force-samples \ -Oz -o "$MERGED" bcftools index -t "$MERGED" echo " Merged VCF created: $MERGED" else echo " Merged VCF already exists, skipping" fi # ============================================================================ # Step 6: Annotate with consequences # ============================================================================ echo "" echo "Step 6: Annotating with protein consequences..." if [[ ! -f "$ANNOTATED" ]]; then echo " Running bcftools csq..." bcftools csq \ --threads "$THREADS" \ $REGION_ARGS \ -p a \ -l \ -n 64 \ + --unify-chr-names chr,-,chr \ + --force \ -f "$REFFASTA" \ -g "$GFF3" \ "$MERGED" \ -Oz -o "$ANNOTATED" 2>"$WORKDIR/csq.log" || { echo " WARNING: bcftools csq had errors. Check csq.log" cat "$WORKDIR/csq.log" } if [[ -f "$ANNOTATED" ]]; then bcftools index -t "$ANNOTATED" echo " Annotated VCF created: $ANNOTATED" fi else echo " Annotated VCF already exists, skipping" fi # ============================================================================ # Step 7: Summary # ============================================================================ echo "" echo "=== Summary ===" if [[ -f "$MERGED" ]]; then echo "Merged variants: $(bcftools view -H "$MERGED" | wc -l)" fi if [[ -f "$ANNOTATED" ]]; then echo "" echo "Consequence summary (top 20):" bcftools query -f '%INFO/BCSQ\n' "$ANNOTATED" 2>/dev/null | \ cut -d'|' -f1 | sort | uniq -c | sort -rn | head -20 fi echo "" echo "Done! Output files:" echo " - Merged VCF: $MERGED" echo " - Annotated VCF: $ANNOTATED"