src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh 3180d71425ab40bc022712bb95868bfe80747375

3180d71425ab40bc022712bb95868bfe80747375
max
  Fri May 29 08:52:38 2026 -0700
[Claude] varFreqs: split SPARK+SCHEMA by phenotype, add disease + array combined tracks, drop array cohorts from varFreqsAll

#Preview2 week - bugs introduced now will need a build patch to fix
Split SFARI SPARK WES and WGS by autism status using fill-tags -S with the
SPARK individuals_registration TSV (AC_AUT / AN_AUT / AF_AUT plus
AC_NON_AUT / AN_NON_AUT / AF_NON_AUT). Added matching SCHEMA case/control
sums (AC_CASE etc.). Two new combined bigBed tracks: varFreqsDisease
(SPARK, SFARI WGS, TOPMed, SCHEMA, GREGoR, GA4K) and varFreqsArray (TPMI,
MexBB, UKBB). TPMI and MexBB are removed from varFreqsAll so the main
combined track is purely WGS/WES.

Build scripts parameterized so the same code drives all three combined
builds: mergeAndAnnotate.sh gains --databases / --tag, vcfToBigBed.py
gains --databases-file / --populations-file and a per-track autoSql table
name. mergeAndAnnotate.sh now pins /cluster/software/src/bcftools-1.22 in
PATH (--unify-chr-names is a 1.22 feature; conda's 1.14 silently fails).

refs #36642

diff --git src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh
index b983f8e8c0d..493314af2d0 100755
--- src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh
+++ src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh
@@ -1,74 +1,119 @@
 #!/bin/bash
 # Merge all per-chromosome VCFs into a single sites-only VCF
 # Uses bcftools fill-tags plugin to calculate AC/AN/AF from genotypes
 # Output INFO fields: AC, AF, AN, AQ
+#
+# Optional 3rd arg: a SPARK individuals_registration TSV. When given, samples
+# are split by the "asd" column (TRUE -> AUT, FALSE -> NON_AUT) and fill-tags
+# also emits per-group AC_AUT/AN_AUT/AF_AUT and AC_NON_AUT/AN_NON_AUT/AF_NON_AUT.
+# Samples missing from the registration file (or with a blank asd value) still
+# count toward the overall AC/AN/AF but contribute to neither group.
 
 set -euo pipefail
 
 #INBASE="vcf/SPARK.iWES_v3.2024_08.deepvariant"
 INBASE=$1
 OUTFILE=`basename $INBASE.sites.vcf.gz`
 TMPDIR="tmp_merge_$$"
 BCFTOOLS=~/software/bcftools/bin/bcftools
 export BCFTOOLS_PLUGINS=~/software/bcftools/libexec/bcftools
 export INBASE
 
 # Number of parallel jobs (adjust based on available CPUs/memory)
 NJOBS=${2:-4}
 
+# Optional ASD phenotype registration TSV (column 1 = subject_sp_id, column 8 = asd)
+REGFILE=${3:-}
+GROUPFILE=""
+
 mkdir -p "$TMPDIR"
 
+# Build a fill-tags sample-group file (sample<TAB>group) if a registration
+# TSV was supplied. Restrict to samples actually present in the VCF so the
+# plugin never sees an unknown sample ID.
+if [[ -n "$REGFILE" ]]; then
+    echo "Building ASD/non-ASD sample groups from $REGFILE ..."
+    # Find the first per-chromosome VCF that exists to read the sample list.
+    FIRSTVCF=""
+    for chr in chr{1..22} chrX chrY; do
+        if [[ -f "${INBASE}.${chr}.vcf.gz" ]]; then FIRSTVCF="${INBASE}.${chr}.vcf.gz"; break; fi
+    done
+    if [[ -z "$FIRSTVCF" ]]; then echo "ERROR: no per-chromosome VCF found for $INBASE" >&2; exit 1; fi
+
+    GROUPFILE="$TMPDIR/asd_groups.txt"
+    $BCFTOOLS query -l "$FIRSTVCF" > "$TMPDIR/vcf_samples.txt"
+    # asd column: TRUE -> AUT, FALSE -> NON_AUT, anything else -> unassigned
+    awk -F'\t' 'NR==FNR{a[$1]=$8; next}
+                {st=a[$1];
+                 if(st=="TRUE") print $1"\tAUT";
+                 else if(st=="FALSE") print $1"\tNON_AUT"}' \
+        "$REGFILE" "$TMPDIR/vcf_samples.txt" > "$GROUPFILE"
+    nAut=$(awk -F'\t' '$2=="AUT"' "$GROUPFILE" | wc -l)
+    nNon=$(awk -F'\t' '$2=="NON_AUT"' "$GROUPFILE" | wc -l)
+    nTot=$(wc -l < "$TMPDIR/vcf_samples.txt")
+    echo "  $nTot VCF samples: $nAut ASD (AUT), $nNon non-ASD (NON_AUT), $((nTot-nAut-nNon)) unassigned"
+fi
+export GROUPFILE
+
 echo "Processing chromosomes with fill-tags plugin (parallel=$NJOBS)..."
 echo "Started at $(date)"
 
 # Function to process one chromosome
 process_chr() {
     local chr=$1
     local infile="${INBASE}.${chr}.vcf.gz"
     local outfile="$TMPDIR/${chr}.sites.vcf.gz"
 
     if [[ ! -f "$infile" ]]; then
         echo "WARNING: $infile not found, skipping" >&2
         return
     fi
 
     echo "Processing $chr..."
 
-    # Fill AC/AN/AF tags, then drop samples
+    # Fill AC/AN/AF tags (per-group too if a group file was built), then drop samples
+    if [[ -n "$GROUPFILE" ]]; then
+        $BCFTOOLS +fill-tags "$infile" -- -t AC,AN,AF -S "$GROUPFILE" 2>/dev/null | \
+            $BCFTOOLS view -G -Oz -o "$outfile" - 2>/dev/null
+    else
         $BCFTOOLS +fill-tags "$infile" -- -t AC,AN,AF 2>/dev/null | \
             $BCFTOOLS view -G -Oz -o "$outfile" - 2>/dev/null
+    fi
     $BCFTOOLS index -t "$outfile"
 
     echo "  Done: $chr"
 }
 
 export -f process_chr
 export TMPDIR BCFTOOLS BCFTOOLS_PLUGINS
 
 # Run chromosomes in parallel
 printf '%s\n' chr{1..22} chrX chrY | xargs -P "$NJOBS" -I {} bash -c 'process_chr "$@"' _ {}
 
 echo ""
 echo "Concatenating all chromosomes..."
 
 # Create file list in chromosome order
 for chr in chr{1..22} chrX chrY; do
     echo "$TMPDIR/${chr}.sites.vcf.gz"
 done > "$TMPDIR/files.txt"
 
 # Concatenate
 $BCFTOOLS concat -f "$TMPDIR/files.txt" -Oz -o "$OUTFILE"
 $BCFTOOLS index -t "$OUTFILE"
 
 echo "Cleaning up..."
 rm -rf "$TMPDIR"
 
 echo ""
 echo "Done! Output: $OUTFILE"
 echo "Finished at $(date)"
 echo ""
 echo "Verifying output..."
-$BCFTOOLS view -h "$OUTFILE" | grep "^##INFO"
+$BCFTOOLS view -h "$OUTFILE" | grep "^##INFO" || true
 echo ""
 echo "Sample variants:"
-$BCFTOOLS view "$OUTFILE" 2>/dev/null | grep -v "^#" | head -5
+# `| head -5` closes the pipe early, so the upstream bcftools exits via
+# SIGPIPE (141). With set -o pipefail that would fail the whole script
+# after a successful build, breaking any && chain in the caller.
+{ $BCFTOOLS view "$OUTFILE" 2>/dev/null || true; } | grep -v "^#" | head -5 || true