af9d8fe39e88f7b7cec3792ea797dab44f1416b0 max Tue May 19 04:36:02 2026 -0700 varFreqs: rebuild varFreqsAll with WBBC/TPMI/ChinaMAP/GenomeIndia, drop IndiGen, harden build pipeline Rebuilds /gbdb/hg38/varFreqs/_all/varFreqsAll.bb to fold in the four new subtracks registered earlier in May (WBBC 78.6M, TPMI 672k, ChinaMAP 147M, GenomeIndia 130M) and to drop IndiGen, which ships only a VRT bit and contributed an always-empty AC/AF column. New bb is 47 GB / 147 fields / 1.34 billion items (was 44 GB / 133 / 1.22B). Two pipeline fixes were necessary mid-rebuild: - bcftools 1.22 csq is stricter than earlier versions. Added --unify-chr-names chr,-,chr (Ensembl GFF3 uses bare "1" while merged VCF + FASTA use "chr1") and --force (5 SCHEMA alt contigs end up in the merge but aren't annotated in the GFF3) to the csq invocation in mergeAndAnnotate.sh. Four follow-up cleanups to the build scripts (no track change, just safer next rebuild): - mergeAndAnnotate.sh now reads VCF paths directly from databases.tsv in both the per-VCF strip+norm step and the merge step. The previous "files.txt + find normalized/" model could silently re-merge stale norm cache entries after a database was dropped from databases.tsv. - vcfToBigBed.py concat step streams sort stdout straight to disk instead of capture_output=True, which buffered the whole sorted chromosome (~24 GB for chr1) in Python RAM. - vcfToBigBed.py generate_trackdb_fragment() now emits the three customizations that used to have to be added on top of the auto-fragment by hand: filterType.consequence multipleListOr, the expanded consequence buckets (3_prime_utr, 5_prime_utr, non_coding, others), and skipEmptyFields on. - trackDb/human/varFreqs.ra updated to match the new bb columns (WBBC/TPMI/ChinaMAP/GenomeIndia AC+AF filters, WBBC 4-region population filters, IndiGen filter removed). refs #36642 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh index 47a875afbe0..b1ae0b37988 100755 --- src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh +++ src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh @@ -1,29 +1,33 @@ #!/bin/bash set -eu # Note: not using pipefail due to bcftools pipeline exit codes # Merge multiple VCF files and annotate with protein consequences # Uses bcftools for merging and consequence annotation # Step 4 (strip+normalize) runs in parallel for speed. # # Usage: # ./mergeAndAnnotate.sh # full genome # ./mergeAndAnnotate.sh --region chrM # quick test on chrM WORKDIR="/hive/data/genomes/hg38/bed/varFreqs/all" -FILELIST="$WORKDIR/files.txt" +# Source of truth for which VCFs go into the combined track. The same TSV +# drives vcfToBigBed.py; reading both pipeline stages off it keeps the two +# lists from drifting (the old files.txt could miss new entries silently) +# and keeps stale entries in normalized/ out of the merge. +DBTSV="$HOME/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv" REF2BIT="/hive/data/genomes/hg38/hg38.2bit" REFFASTA="$WORKDIR/hg38.fa" GFF3="$WORKDIR/Homo_sapiens.GRCh38.115.chr.gff3.gz" THREADS=8 PARALLEL_JOBS=8 # Parse --region flag REGION="" while [[ $# -gt 0 ]]; do case $1 in --region) REGION="$2"; shift 2 ;; *) echo "Unknown option: $1"; exit 1 ;; esac done @@ -163,73 +167,86 @@ } bcftools index -t "$outfile" 2>/dev/null rm -f "$stripped" "$stripped.tbi" local nvars nvars=$(bcftools index -n "$outfile" 2>/dev/null || echo "?") echo " $bname: Done: $nvars variants" } export -f process_one_vcf echo "" echo "Step 4: Processing VCFs in parallel ($PARALLEL_JOBS jobs)..." mkdir -p "$WORKDIR/normalized" "$WORKDIR/stripped" -grep -v '^#' "$FILELIST" | grep -v '^\s*$' | \ +# Pull the VCF column out of databases.tsv (skip comments + blank lines). +awk -F'\t' '!/^#/ && NF >= 3 && $3 != "" {print $3}' "$DBTSV" | \ parallel -j "$PARALLEL_JOBS" --line-buffer process_one_vcf {} echo "Step 4: Done." # ============================================================================ # Step 5: Merge all normalized VCFs # ============================================================================ echo "" echo "Step 5: Merging all VCFs..." -find "$WORKDIR/normalized" -name "*${SUFFIX}.norm.vcf.gz" | sort > "$WORKDIR/normalized_files${SUFFIX}.txt" +# Build the merge list from databases.tsv so it tracks exactly what Step 4 +# processed. Previously we used `find normalized/ -name '*.norm.vcf.gz'`, +# which silently re-merged stale entries from prior builds (e.g. +# IndiGenomes_Variants.norm.vcf.gz after IndiGen was dropped from the +# config). Basename rule mirrors process_one_vcf: strip .vcf.gz only +# (.vcf.bgz like SG10K_Health is left intact, matching the cache layout). NORM_LIST="$WORKDIR/normalized_files${SUFFIX}.txt" +awk -F'\t' '!/^#/ && NF >= 3 && $3 != "" {print $3}' "$DBTSV" | \ +while read -r vcf; do + bn=$(basename "$vcf" .vcf.gz) + echo "$WORKDIR/normalized/${bn}${SUFFIX}.norm.vcf.gz" +done | sort > "$NORM_LIST" nfiles=$(wc -l < "$NORM_LIST") echo " Found $nfiles normalized VCF files" if [[ ! -f "$MERGED" ]]; then bcftools merge \ --threads "$THREADS" \ -l "$NORM_LIST" \ -m none \ --force-samples \ -Oz -o "$MERGED" bcftools index -t "$MERGED" echo " Merged VCF created: $MERGED" else echo " Merged VCF already exists, skipping" fi # ============================================================================ # Step 6: Annotate with consequences # ============================================================================ echo "" echo "Step 6: Annotating with protein consequences..." if [[ ! -f "$ANNOTATED" ]]; then echo " Running bcftools csq..." bcftools csq \ --threads "$THREADS" \ $REGION_ARGS \ -p a \ -l \ -n 64 \ + --unify-chr-names chr,-,chr \ + --force \ -f "$REFFASTA" \ -g "$GFF3" \ "$MERGED" \ -Oz -o "$ANNOTATED" 2>"$WORKDIR/csq.log" || { echo " WARNING: bcftools csq had errors. Check csq.log" cat "$WORKDIR/csq.log" } if [[ -f "$ANNOTATED" ]]; then bcftools index -t "$ANNOTATED" echo " Annotated VCF created: $ANNOTATED" fi else echo " Annotated VCF already exists, skipping" fi