src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh af9d8fe39e88f7b7cec3792ea797dab44f1416b0

af9d8fe39e88f7b7cec3792ea797dab44f1416b0
max
  Tue May 19 04:36:02 2026 -0700
varFreqs: rebuild varFreqsAll with WBBC/TPMI/ChinaMAP/GenomeIndia, drop IndiGen, harden build pipeline

Rebuilds /gbdb/hg38/varFreqs/_all/varFreqsAll.bb to fold in the four new
subtracks registered earlier in May (WBBC 78.6M, TPMI 672k, ChinaMAP
147M, GenomeIndia 130M) and to drop IndiGen, which ships only a VRT bit
and contributed an always-empty AC/AF column. New bb is 47 GB / 147
fields / 1.34 billion items (was 44 GB / 133 / 1.22B).

Two pipeline fixes were necessary mid-rebuild:

- bcftools 1.22 csq is stricter than earlier versions. Added
--unify-chr-names chr,-,chr (Ensembl GFF3 uses bare "1" while merged
VCF + FASTA use "chr1") and --force (5 SCHEMA alt contigs end up in
the merge but aren't annotated in the GFF3) to the csq invocation in
mergeAndAnnotate.sh.

Four follow-up cleanups to the build scripts (no track change, just
safer next rebuild):

- mergeAndAnnotate.sh now reads VCF paths directly from databases.tsv
in both the per-VCF strip+norm step and the merge step. The previous
"files.txt + find normalized/" model could silently re-merge stale
norm cache entries after a database was dropped from databases.tsv.

- vcfToBigBed.py concat step streams sort stdout straight to disk
instead of capture_output=True, which buffered the whole sorted
chromosome (~24 GB for chr1) in Python RAM.

- vcfToBigBed.py generate_trackdb_fragment() now emits the three
customizations that used to have to be added on top of the
auto-fragment by hand: filterType.consequence multipleListOr, the
expanded consequence buckets (3_prime_utr, 5_prime_utr, non_coding,
others), and skipEmptyFields on.

- trackDb/human/varFreqs.ra updated to match the new bb columns
(WBBC/TPMI/ChinaMAP/GenomeIndia AC+AF filters, WBBC 4-region
population filters, IndiGen filter removed).

refs #36642

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh
index 47a875afbe0..b1ae0b37988 100755
--- src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh
+++ src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh
@@ -1,29 +1,33 @@
 #!/bin/bash
 set -eu
 # Note: not using pipefail due to bcftools pipeline exit codes
 
 # Merge multiple VCF files and annotate with protein consequences
 # Uses bcftools for merging and consequence annotation
 # Step 4 (strip+normalize) runs in parallel for speed.
 #
 # Usage:
 #   ./mergeAndAnnotate.sh              # full genome
 #   ./mergeAndAnnotate.sh --region chrM # quick test on chrM
 
 WORKDIR="/hive/data/genomes/hg38/bed/varFreqs/all"
-FILELIST="$WORKDIR/files.txt"
+# Source of truth for which VCFs go into the combined track. The same TSV
+# drives vcfToBigBed.py; reading both pipeline stages off it keeps the two
+# lists from drifting (the old files.txt could miss new entries silently)
+# and keeps stale entries in normalized/ out of the merge.
+DBTSV="$HOME/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv"
 REF2BIT="/hive/data/genomes/hg38/hg38.2bit"
 REFFASTA="$WORKDIR/hg38.fa"
 GFF3="$WORKDIR/Homo_sapiens.GRCh38.115.chr.gff3.gz"
 THREADS=8
 PARALLEL_JOBS=8
 
 # Parse --region flag
 REGION=""
 while [[ $# -gt 0 ]]; do
     case $1 in
         --region) REGION="$2"; shift 2 ;;
         *) echo "Unknown option: $1"; exit 1 ;;
     esac
 done
 
@@ -163,73 +167,86 @@
     }
 
     bcftools index -t "$outfile" 2>/dev/null
     rm -f "$stripped" "$stripped.tbi"
 
     local nvars
     nvars=$(bcftools index -n "$outfile" 2>/dev/null || echo "?")
     echo "    $bname: Done: $nvars variants"
 }
 export -f process_one_vcf
 
 echo ""
 echo "Step 4: Processing VCFs in parallel ($PARALLEL_JOBS jobs)..."
 mkdir -p "$WORKDIR/normalized" "$WORKDIR/stripped"
 
-grep -v '^#' "$FILELIST" | grep -v '^\s*$' | \
+# Pull the VCF column out of databases.tsv (skip comments + blank lines).
+awk -F'\t' '!/^#/ && NF >= 3 && $3 != "" {print $3}' "$DBTSV" | \
     parallel -j "$PARALLEL_JOBS" --line-buffer process_one_vcf {}
 
 echo "Step 4: Done."
 
 # ============================================================================
 # Step 5: Merge all normalized VCFs
 # ============================================================================
 echo ""
 echo "Step 5: Merging all VCFs..."
 
-find "$WORKDIR/normalized" -name "*${SUFFIX}.norm.vcf.gz" | sort > "$WORKDIR/normalized_files${SUFFIX}.txt"
+# Build the merge list from databases.tsv so it tracks exactly what Step 4
+# processed. Previously we used `find normalized/ -name '*.norm.vcf.gz'`,
+# which silently re-merged stale entries from prior builds (e.g.
+# IndiGenomes_Variants.norm.vcf.gz after IndiGen was dropped from the
+# config). Basename rule mirrors process_one_vcf: strip .vcf.gz only
+# (.vcf.bgz like SG10K_Health is left intact, matching the cache layout).
 NORM_LIST="$WORKDIR/normalized_files${SUFFIX}.txt"
+awk -F'\t' '!/^#/ && NF >= 3 && $3 != "" {print $3}' "$DBTSV" | \
+while read -r vcf; do
+    bn=$(basename "$vcf" .vcf.gz)
+    echo "$WORKDIR/normalized/${bn}${SUFFIX}.norm.vcf.gz"
+done | sort > "$NORM_LIST"
 nfiles=$(wc -l < "$NORM_LIST")
 echo "  Found $nfiles normalized VCF files"
 
 if [[ ! -f "$MERGED" ]]; then
     bcftools merge \
         --threads "$THREADS" \
         -l "$NORM_LIST" \
         -m none \
         --force-samples \
         -Oz -o "$MERGED"
     bcftools index -t "$MERGED"
     echo "  Merged VCF created: $MERGED"
 else
     echo "  Merged VCF already exists, skipping"
 fi
 
 # ============================================================================
 # Step 6: Annotate with consequences
 # ============================================================================
 echo ""
 echo "Step 6: Annotating with protein consequences..."
 
 if [[ ! -f "$ANNOTATED" ]]; then
     echo "  Running bcftools csq..."
     bcftools csq \
         --threads "$THREADS" \
         $REGION_ARGS \
         -p a \
         -l \
         -n 64 \
+        --unify-chr-names chr,-,chr \
+        --force \
         -f "$REFFASTA" \
         -g "$GFF3" \
         "$MERGED" \
         -Oz -o "$ANNOTATED" 2>"$WORKDIR/csq.log" || {
             echo "  WARNING: bcftools csq had errors. Check csq.log"
             cat "$WORKDIR/csq.log"
         }
 
     if [[ -f "$ANNOTATED" ]]; then
         bcftools index -t "$ANNOTATED"
         echo "  Annotated VCF created: $ANNOTATED"
     fi
 else
     echo "  Annotated VCF already exists, skipping"
 fi