3180d71425ab40bc022712bb95868bfe80747375
max
  Fri May 29 08:52:38 2026 -0700
[Claude] varFreqs: split SPARK+SCHEMA by phenotype, add disease + array combined tracks, drop array cohorts from varFreqsAll

#Preview2 week - bugs introduced now will need a build patch to fix
Split SFARI SPARK WES and WGS by autism status using fill-tags -S with the
SPARK individuals_registration TSV (AC_AUT / AN_AUT / AF_AUT plus
AC_NON_AUT / AN_NON_AUT / AF_NON_AUT). Added matching SCHEMA case/control
sums (AC_CASE etc.). Two new combined bigBed tracks: varFreqsDisease
(SPARK, SFARI WGS, TOPMed, SCHEMA, GREGoR, GA4K) and varFreqsArray (TPMI,
MexBB, UKBB). TPMI and MexBB are removed from varFreqsAll so the main
combined track is purely WGS/WES.

Build scripts parameterized so the same code drives all three combined
builds: mergeAndAnnotate.sh gains --databases / --tag, vcfToBigBed.py
gains --databases-file / --populations-file and a per-track autoSql table
name. mergeAndAnnotate.sh now pins /cluster/software/src/bcftools-1.22 in
PATH (--unify-chr-names is a 1.22 feature; conda's 1.14 silently fails).

refs #36642

diff --git src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh
index b1ae0b37988..91fdaea0c52 100755
--- src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh
+++ src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh
@@ -1,63 +1,76 @@
 #!/bin/bash
 set -eu
 # Note: not using pipefail due to bcftools pipeline exit codes
+#
+# bcftools csq's --unify-chr-names was added in 1.22; conda installs 1.14 by
+# default. Pin the 1.22 build at the front of PATH so every bcftools call in
+# this script (including the parallel-spawned ones) finds the right binary.
+export PATH=/cluster/software/src/bcftools-1.22:$PATH
 
 # Merge multiple VCF files and annotate with protein consequences
 # Uses bcftools for merging and consequence annotation
 # Step 4 (strip+normalize) runs in parallel for speed.
 #
 # Usage:
 #   ./mergeAndAnnotate.sh              # full genome
 #   ./mergeAndAnnotate.sh --region chrM # quick test on chrM
 
 WORKDIR="/hive/data/genomes/hg38/bed/varFreqs/all"
 # Source of truth for which VCFs go into the combined track. The same TSV
 # drives vcfToBigBed.py; reading both pipeline stages off it keeps the two
 # lists from drifting (the old files.txt could miss new entries silently)
 # and keeps stale entries in normalized/ out of the merge.
 DBTSV="$HOME/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv"
 REF2BIT="/hive/data/genomes/hg38/hg38.2bit"
 REFFASTA="$WORKDIR/hg38.fa"
 GFF3="$WORKDIR/Homo_sapiens.GRCh38.115.chr.gff3.gz"
 THREADS=8
 PARALLEL_JOBS=8
 
-# Parse --region flag
+# Parse flags
+#   --region chrM        process a single region (quick test)
+#   --databases <tsv>    DB config (default databases.tsv); subsets reuse the
+#                        shared normalized/ cache built by the full run
+#   --tag <name>         suffix for merged/annotated outputs so disease/array
+#                        builds (e.g. --tag .disease) don't clobber the all run
 REGION=""
+TAG=""
 while [[ $# -gt 0 ]]; do
     case $1 in
         --region) REGION="$2"; shift 2 ;;
+        --databases) DBTSV="$2"; shift 2 ;;
+        --tag) TAG="$2"; shift 2 ;;
         *) echo "Unknown option: $1"; exit 1 ;;
     esac
 done
 
 REGION_ARGS=""
 SUFFIX=""
 VIEW_REGION=""
 if [[ -n "$REGION" ]]; then
     REGION_ARGS="-r $REGION"
     SUFFIX=".${REGION%%:*}"  # e.g. ".chrM"
     VIEW_REGION="$REGION"
     echo "*** REGION MODE: $REGION ***"
 else
     VIEW_REGION="chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM"
 fi
 
-# Output files
-MERGED="$WORKDIR/merged${SUFFIX}.vcf.gz"
-ANNOTATED="$WORKDIR/merged${SUFFIX}.annotated.vcf.gz"
+# Output files (TAG keeps disease/array merges separate from the all build)
+MERGED="$WORKDIR/merged${TAG}${SUFFIX}.vcf.gz"
+ANNOTATED="$WORKDIR/merged${TAG}${SUFFIX}.annotated.vcf.gz"
 
 echo "=== VCF Merge and Annotate Pipeline ==="
 echo "Working directory: $WORKDIR"
 echo ""
 
 # Step 1: Check GFF3 annotation file
 if [[ ! -f "$GFF3" ]]; then
     echo "Step 1: ERROR: GFF3 file not found: $GFF3"
     echo "  Download from: https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.chr.gff3.gz"
     exit 1
 fi
 if [[ ! -f "${GFF3}.tbi" ]]; then
     echo "Step 1: Indexing GFF3..."
     tabix -p gff "$GFF3"
 else
@@ -185,31 +198,31 @@
 
 echo "Step 4: Done."
 
 # ============================================================================
 # Step 5: Merge all normalized VCFs
 # ============================================================================
 echo ""
 echo "Step 5: Merging all VCFs..."
 
 # Build the merge list from databases.tsv so it tracks exactly what Step 4
 # processed. Previously we used `find normalized/ -name '*.norm.vcf.gz'`,
 # which silently re-merged stale entries from prior builds (e.g.
 # IndiGenomes_Variants.norm.vcf.gz after IndiGen was dropped from the
 # config). Basename rule mirrors process_one_vcf: strip .vcf.gz only
 # (.vcf.bgz like SG10K_Health is left intact, matching the cache layout).
-NORM_LIST="$WORKDIR/normalized_files${SUFFIX}.txt"
+NORM_LIST="$WORKDIR/normalized_files${TAG}${SUFFIX}.txt"
 awk -F'\t' '!/^#/ && NF >= 3 && $3 != "" {print $3}' "$DBTSV" | \
 while read -r vcf; do
     bn=$(basename "$vcf" .vcf.gz)
     echo "$WORKDIR/normalized/${bn}${SUFFIX}.norm.vcf.gz"
 done | sort > "$NORM_LIST"
 nfiles=$(wc -l < "$NORM_LIST")
 echo "  Found $nfiles normalized VCF files"
 
 if [[ ! -f "$MERGED" ]]; then
     bcftools merge \
         --threads "$THREADS" \
         -l "$NORM_LIST" \
         -m none \
         --force-samples \
         -Oz -o "$MERGED"