3180d71425ab40bc022712bb95868bfe80747375 max Fri May 29 08:52:38 2026 -0700 [Claude] varFreqs: split SPARK+SCHEMA by phenotype, add disease + array combined tracks, drop array cohorts from varFreqsAll #Preview2 week - bugs introduced now will need a build patch to fix Split SFARI SPARK WES and WGS by autism status using fill-tags -S with the SPARK individuals_registration TSV (AC_AUT / AN_AUT / AF_AUT plus AC_NON_AUT / AN_NON_AUT / AF_NON_AUT). Added matching SCHEMA case/control sums (AC_CASE etc.). Two new combined bigBed tracks: varFreqsDisease (SPARK, SFARI WGS, TOPMed, SCHEMA, GREGoR, GA4K) and varFreqsArray (TPMI, MexBB, UKBB). TPMI and MexBB are removed from varFreqsAll so the main combined track is purely WGS/WES. Build scripts parameterized so the same code drives all three combined builds: mergeAndAnnotate.sh gains --databases / --tag, vcfToBigBed.py gains --databases-file / --populations-file and a per-track autoSql table name. mergeAndAnnotate.sh now pins /cluster/software/src/bcftools-1.22 in PATH (--unify-chr-names is a 1.22 feature; conda's 1.14 silently fails). refs #36642 diff --git src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh index b1ae0b37988..91fdaea0c52 100755 --- src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh +++ src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh @@ -1,63 +1,76 @@ #!/bin/bash set -eu # Note: not using pipefail due to bcftools pipeline exit codes +# +# bcftools csq's --unify-chr-names was added in 1.22; conda installs 1.14 by +# default. Pin the 1.22 build at the front of PATH so every bcftools call in +# this script (including the parallel-spawned ones) finds the right binary. +export PATH=/cluster/software/src/bcftools-1.22:$PATH # Merge multiple VCF files and annotate with protein consequences # Uses bcftools for merging and consequence annotation # Step 4 (strip+normalize) runs in parallel for speed. # # Usage: # ./mergeAndAnnotate.sh # full genome # ./mergeAndAnnotate.sh --region chrM # quick test on chrM WORKDIR="/hive/data/genomes/hg38/bed/varFreqs/all" # Source of truth for which VCFs go into the combined track. The same TSV # drives vcfToBigBed.py; reading both pipeline stages off it keeps the two # lists from drifting (the old files.txt could miss new entries silently) # and keeps stale entries in normalized/ out of the merge. DBTSV="$HOME/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv" REF2BIT="/hive/data/genomes/hg38/hg38.2bit" REFFASTA="$WORKDIR/hg38.fa" GFF3="$WORKDIR/Homo_sapiens.GRCh38.115.chr.gff3.gz" THREADS=8 PARALLEL_JOBS=8 -# Parse --region flag +# Parse flags +# --region chrM process a single region (quick test) +# --databases DB config (default databases.tsv); subsets reuse the +# shared normalized/ cache built by the full run +# --tag suffix for merged/annotated outputs so disease/array +# builds (e.g. --tag .disease) don't clobber the all run REGION="" +TAG="" while [[ $# -gt 0 ]]; do case $1 in --region) REGION="$2"; shift 2 ;; + --databases) DBTSV="$2"; shift 2 ;; + --tag) TAG="$2"; shift 2 ;; *) echo "Unknown option: $1"; exit 1 ;; esac done REGION_ARGS="" SUFFIX="" VIEW_REGION="" if [[ -n "$REGION" ]]; then REGION_ARGS="-r $REGION" SUFFIX=".${REGION%%:*}" # e.g. ".chrM" VIEW_REGION="$REGION" echo "*** REGION MODE: $REGION ***" else VIEW_REGION="chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM" fi -# Output files -MERGED="$WORKDIR/merged${SUFFIX}.vcf.gz" -ANNOTATED="$WORKDIR/merged${SUFFIX}.annotated.vcf.gz" +# Output files (TAG keeps disease/array merges separate from the all build) +MERGED="$WORKDIR/merged${TAG}${SUFFIX}.vcf.gz" +ANNOTATED="$WORKDIR/merged${TAG}${SUFFIX}.annotated.vcf.gz" echo "=== VCF Merge and Annotate Pipeline ===" echo "Working directory: $WORKDIR" echo "" # Step 1: Check GFF3 annotation file if [[ ! -f "$GFF3" ]]; then echo "Step 1: ERROR: GFF3 file not found: $GFF3" echo " Download from: https://ftp.ensembl.org/pub/release-115/gff3/homo_sapiens/Homo_sapiens.GRCh38.115.chr.gff3.gz" exit 1 fi if [[ ! -f "${GFF3}.tbi" ]]; then echo "Step 1: Indexing GFF3..." tabix -p gff "$GFF3" else @@ -185,31 +198,31 @@ echo "Step 4: Done." # ============================================================================ # Step 5: Merge all normalized VCFs # ============================================================================ echo "" echo "Step 5: Merging all VCFs..." # Build the merge list from databases.tsv so it tracks exactly what Step 4 # processed. Previously we used `find normalized/ -name '*.norm.vcf.gz'`, # which silently re-merged stale entries from prior builds (e.g. # IndiGenomes_Variants.norm.vcf.gz after IndiGen was dropped from the # config). Basename rule mirrors process_one_vcf: strip .vcf.gz only # (.vcf.bgz like SG10K_Health is left intact, matching the cache layout). -NORM_LIST="$WORKDIR/normalized_files${SUFFIX}.txt" +NORM_LIST="$WORKDIR/normalized_files${TAG}${SUFFIX}.txt" awk -F'\t' '!/^#/ && NF >= 3 && $3 != "" {print $3}' "$DBTSV" | \ while read -r vcf; do bn=$(basename "$vcf" .vcf.gz) echo "$WORKDIR/normalized/${bn}${SUFFIX}.norm.vcf.gz" done | sort > "$NORM_LIST" nfiles=$(wc -l < "$NORM_LIST") echo " Found $nfiles normalized VCF files" if [[ ! -f "$MERGED" ]]; then bcftools merge \ --threads "$THREADS" \ -l "$NORM_LIST" \ -m none \ --force-samples \ -Oz -o "$MERGED"