af9a5b388259e680dd34bc47b2cad4ff6e3d162f
lrnassar
  Sat Jun 13 03:00:51 2026 -0700
varFreqs: pre-release polish from comprehensive sanity check.

* Sync the new combined-track shortLabels into the four description pages:
"Affected/Case Individuals" -> "Disease cohorts" and "Population + Unaffected"
-> "Population reference" (matches the trackdb shortLabels users now see).
* Add a paragraph in the supertrack Methods section describing the pooled
affectedAF / backgroundAF formulation (sum AC / sum AN) and the default_an
configuration that handles AF-only cohorts.
* Update the in-track Methods paragraphs on varFreqsAffected.html and
varFreqsBackground.html: replace "summed/maximized" with "pooled".
* Fix supertrack table downloadability column to match the underscore-prefix
convention: allofus "Yes" -> "No" (description page already says license
restricted); gregor "No" -> "Yes" (description page says VCF is on our
download server, and the gbdb path is not underscore-prefixed).
* Add a 2026-06-12 makedoc section documenting the pooled-AF rebuild, the
default_an mechanism, the new affectedAN/backgroundAN columns, the
before/after spot-check at APOE rs429358, and the build commands.

refs #36642

diff --git src/hg/makeDb/doc/hg38/varFreqs.txt src/hg/makeDb/doc/hg38/varFreqs.txt
index bd9095d7f48..52c925634a8 100644
--- src/hg/makeDb/doc/hg38/varFreqs.txt
+++ src/hg/makeDb/doc/hg38/varFreqs.txt
@@ -1,1044 +1,1089 @@
 # Genomic Answers for Kids (GA4K), Children's Mercy - 2026-04-16 Claude max
 # GA4K is a pediatric rare-disease PacBio HiFi long-read cohort (Cohen et al.
 # 2022, Genet Med, PMID 35305867). The release ships 24 per-chromosome VCFs of
 # site-only small variants (SNVs and short indels), filtered to variants
 # replicated in >=2 unrelated GA4K individuals or matched to an HPRC variant.
 # Upstream data lives under /hive/data/genomes/hg38/bed/lrSv/GA4K (co-located
 # with the matched GA4K structural-variant release; see the lrSv makedoc).
 cd /hive/data/genomes/hg38/bed/lrSv/GA4K
 bcftools concat -Oz -o ga4kSnv.vcf.gz \
     pacbio_snv_vcf/pb_joint_merged.snv.chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y}.vcf.gz
 tabix -p vcf ga4kSnv.vcf.gz
 # Symlinks placed under /gbdb/hg38/varFreqs/ga4k/ for the ga4kSnv stanza in
 # trackDb/human/varFreqs.ra.
 
 # Mexico Biobank, Max, Nov 8 2025
 CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz /hive
 /data/genomes/hg19/bed/varFreqs/mexbb/MXBv2.vcf.gz /hive/data/genomes/hg38/p14Clean/hg38.p14.fa MXBv2.lift.hg19ToHg38.vcf && bgzip MXBv2.lift.hg19ToHg38.vcf && bcftools sort MXBv2.lift.hg19ToHg38.vcf -Oz -m 200G -T /data/tmp/ -o MXBv2.lift.hg19ToHg38.vcf.gz && tabix -p vcf MXBv2.lift.hg19ToHg38.vcf.gz
 
 # Mexico City Prospective study, Max Oct 28 2025
 cd /hive/data/genomes/hg38/bed/varFreqs/mcps/
 for i in `seq 1 22` X; do wget https://rgc-mcps.regeneron.com/downloads/20230130/chr$i.freq.vcf.gz; done
 for i in `seq 1 22` X; do wget https://rgc-mcps.regeneron.com/downloads/20230130/chr$i.freq.vcf.gz.tbi; done
 mv *vcf* vcf/
 bcftools concat  --threads 16  -Oz -o mcps.freq.vcf.gz vcf/chr{1..22}.freq.vcf.gz vcf/chrX.freq.vcf.gz
 # make normal AC and AF and AN fields for mouseovers
 zcat mcps.freq.vcf.gz | sed -e 's/_RAW//g' > mcps.fix.freq.vcf
 mv -f mcps.fix.freq.vcf mcps.freq.vcf
 bgzip mcps.freq.vcf
 tabix -p vcf mcps.freq.vcf.gz 
 
 # Regeneron million exomes, Max, Nov 3 2025
 cd /hive/data/genomes/hg38/bed/varFreqs/me
 for i in `seq 1 22` X Y; do wget https://rgc-research.regeneron.com/me/downloads/20231004/rgc_me_variant_frequencies_chr${i}_20231004.vcf.gz.tbi; done
 bcftools concat  --threads 10  -Oz -o rgc_me_freqs_20231004.vcf.gz rgc_me_variant_frequencies_chr{1..22}_20231004.vcf.gz  rgc_me_variant_frequencies_chrX_20231004.vcf.gz rgc_me_variant_frequencies_chrY_20231004.vcf.gz 
 zcat rgc_me_freqs_20231004.vcf.gz | sed -e 's/ALL_//g' > rgc_me_freqs_20231004.fix.vcf
 tabix -p vcf rgc_me_freqs_20231004.vcf.gz
 
 # GA south asia 100k pilot
 cd /hive/data/genomes/hg38/bed/varFreqs/ga100k/
 parallel -j 8 wget -q --no-check-certificate https://browser.genomeasia100k.org/service/web/download_files/{}.substitutions.annot.cont_withmaf.vcf.gz ::: {1..22} X Y
 # fix the header line, remove "FORMAT"
 for i in *.vcf.gz; do echo "zcat $i |   awk 'BEGIN{OFS=\"\\t\"} /^#CHROM/{NF=8; print; next} /^#/ {print; next} {NF=8; print}' |   bgzip -c > fixed/$i" >> cmds.txt; done
 parallel -j 8 < cmds.txt
 bcftools concat  --threads 16  -Oz -o ../ga100k.subst.vcf.gz fixed/{1..22}.substitutions.annot.cont_withmaf.vcf.gz
 # add indels
 wget -q --no-check-certificate https://browser.genomeasia100k.org/service/web/download_files/All.indels.annot.cont_withmaf.vcf.gz
 # index
 tabix -p vcf ../ga100k*.vcf.gz
 tabix -p vcf All*.vcf.gz
 
 # TOPMED Freeze 10
 cd /hive/data/genomes/hg38/bed/varFreqs/topmed/
 # need to download the VCFs manually, 22 VCFs, with one time links from https://bravo.sph.umich.edu/vcfs.html
 # grrrr...
 bcftools concat  --threads 10  -Oz -o topmed10.vcf.gz {1..22}.vcf.gz X.vcf.gz 
 tabix -p vcf topmed10.vcf.gz
 
 # Abraom brazil
 # get unique download link from https://abraom.ib.usp.br/download/index.php
 cd /hive/data/genomes/hg38/bed/varFreqs/abraom/
 wget 'https://abraom.ib.usp.br/download/download-files.php?fid=RklEMTIzNDU2&key=1762266466-key690a0d62348de0.22872232' -O abraom.tar
 tar xvfz abraom.tar
 ln -s  /hive/data/genomes/hg38/p14Clean/hg38.p14.fa
 samtools faidx hg38.p14.fa 
 python ~/kent/src/hg/makeDb/scripts/varFreqs/abraomToVcf.py SABE1171.Abraom.clean.tsv abraom.vcf hg38.p14.fa
 tabix -p vcf abraom.vcf.gz 
 
 # SGDP
 cd /hive/data/genomes/hg38/bed/varFreqs/sgp/
 CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz /hive/data/genomes/hg19/bed/varFreqs/sgdp/SGDP.nh2.vcf.gz hg38.p14.fa sgdp.hg38.nh2.vcf
 bgzip sgdp.hg38.nh2.vcf
 bcftools sort sgdp.hg38.nh2.vcf.gz -Oz -m 200G -T /data/tmp/ -o sgdp.hg38.nh2.sort.vcf.gz 
 mv sgdp.hg38.nh2.sort.vcf.gz SGDP.nh2.vcf.gz
 tabix -p vcf SGDP.nh2.vcf.gz
 
 # KOVA
 cd /hive/data/genomes/hg38/bed/varFreqs/sgp/
 # got tsv file via google drive link from 장인수 <insoo078@kribb.re.kr> 
 # VCF converter, written by Claude Opus 4.1 using 2 lines of example input
 python ~/kent/src/hg/makeDb/scripts/varFreqs/kovaToVcf.py 1_KOVA.v7.tsv.gz kova.v7.vcf
 bgzip kova.v7.vcf
 tabix -p vcf kova.v7.vcf.gz
 
 # NPM Singapore
 cd /hive/data/genomes/hg38/bed/varFreqs/npm/
 # downloaded data manually from chorus website, https://chorus.grids-platform.io/vcfdl
 bcftools concat  --threads 10  -Oz -o SG10K_Health_r5.3.2.sites.vcf.bgz  SG10K_Health_r5.3.2.sites.chr{1..22}.vcf.bgz SG10K_Health_r5.3.2.sites.chrX.vcf.bgz SG10K_Health_r5.3.2.sites.chrY.vcf.bgz 
 tabiv -p vcf SG10K_Health_r5.3.2.sites.vcf.bgz
 
 # Saudi 300 genomes
 cd /hive/data/genomes/hg38/bed/varFreqs/saudi
 wget https://figshare.com/ndownloader/files/51297884 -O 51297884.tsv.gz
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/saudiToVcf.py
 bgzip saudi.vcf
 tabix -p vcf saudi.vcf.gz
 
 # SFARI SPARK
 cd /hive/data/genomes/hg38/bed/varFreqs/sparkExomes/
 # used globus to download into vcf/
 sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh vcf/SPARK.iWES_v3.2024_08.deepvariant 8
 bcftools norm -m-  SPARK.iWES_v3.2024_08.deepvariant.sites.vcf.gz -Oz > SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz && tabix -p vcf SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz
 
 cd /hive/data/genomes/hg38/bed/varFreqs/sparkWgs/
 # used globus to download into vcf/
 sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh vcf/wgs_12519_genome.deepvariant 8
 bcftools norm -m-  wgs_12519_genome.deepvariant.sites.vcf.gz -Oz > wgs_12519_genome.deepvariant.norm.vcf.gz
 tabix -p vcf wgs_12519_genome.deepvariant.norm.vcf.gz
 
 # NCBI ALFA bigBed to VCF, Max Jan 26 2026
 # Source: ALFA R4 bigBed files, 904M variants, output 163M with non-zero AF
 cd /hive/data/genomes/hg38/bed/varFreqs/alfa
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/alfa_to_vcf.py --out ALFA.vcf --zero-af-file ALFA_zero.txt
 # Compress and index
 bgzip ALFA.vcf
 tabix -p vcf ALFA.vcf.gz
 # Final: 2.7GB, 163M variants (146M SNPs, 17M indels), ALFA_zero.txt has 26GB of zero-AF variants
 
 # HRC (Haplotype Reference Consortium), Claude max, Mar 17 2026
 # Source: HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz
 # 40M variants from 32,488 WGS samples, originally on GRCh37
 cd /hive/data/genomes/hg38/bed/varFreqs/hrc/
 # download HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz from http://www.haplotype-reference-consortium.org/site
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/hrcToVcf.py
 # 40,405,505 variants read, 8,052 unmapped, 40,397,453 lifted to hg38
 # sort, compress, index
 bcftools sort hrc.vcf -Oz -o hrc.vcf.gz
 tabix -p vcf hrc.vcf.gz
 rm hrc.vcf
 ln -s /hive/data/genomes/hg38/bed/varFreqs/hrc/hrc.vcf.gz /gbdb/hg38/varFreqs/hrc/hrc.vcf.gz
 ln -s /hive/data/genomes/hg38/bed/varFreqs/hrc/hrc.vcf.gz.tbi /gbdb/hg38/varFreqs/hrc/hrc.vcf.gz.tbi
 
 # Australia, Max, Jan 2026
 # received files from m.hobbs@garvan.org.au
 cd /hive/data/genomes/hg38/bed/varFreqs/mgrb/
 bcftools norm -f hg38.fa -m-any MGRB.phase3.GRCh38.vcf.gz -o MGRB.phase3.GRCh38.norm.vcf.gz
 tabix MGRB.phase3.GRCh38.norm.vcf.gz
 
 # SCHEMA Schizophrenia Exome Meta-Analysis track for hg38, Max, Jan 22 2026
 # source: https://schema.broadinstitute.org/
 # Original is in hg19/GRCh37 coordinates
 cd /hive/data/genomes/hg38/bed/varFreqs/schema
 # SCHEMA_variant_results.vcf.bgz (384M, hg19 coordinates)
 # Step 1: Add AC, AN, AF fields by summing case+control counts
 ~/kent/src/hg/makeDb/scripts/varFreqs/schema_addAcAnAf.py
 bgzip SCHEMA_variant_results_withAF.vcf
 tabix -p vcf SCHEMA_variant_results_withAF.vcf.gz
 # Step 2: Liftover from hg19 to hg38
 # prep hg38 reference FASTA
 zcat /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.fa.gz > hg38.fa
 samtools faidx hg38.fa
 CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
     SCHEMA_variant_results_withAF.vcf.gz \
     hg38.fa \
     SCHEMA_variant_results_hg38.vcf
 # Output stats: Total entries: 8865268, Failed to map: 780
 # Sort
 grep "^#" SCHEMA_variant_results_hg38.vcf > SCHEMA_variant_results_hg38_sorted.vcf
 grep -v "^#" SCHEMA_variant_results_hg38.vcf | sort -k1,1V -k2,2n >> SCHEMA_variant_results_hg38_sorted.vcf
 # Compress and index
 bgzip SCHEMA_variant_results_hg38_sorted.vcf
 tabix -p vcf SCHEMA_variant_results_hg38_sorted.vcf.gz
 # Clean up temporary files
 rm -f SCHEMA_variant_results_hg38.vcf SCHEMA_variant_results_hg38.vcf.unmap hg38.fa hg38.fa.fai
 
 # Gregor rare disease project, Max, Mar 2026
 cd /hive/data/genomes/hg38/bed/varFreqs/gregor/
 # Downloaded from G Drive, pointed to by Jon Bernstein, Stanford
 # https://drive.google.com/drive/folders/1v-BnW7nKcEjF-NyLqU1Up3YJuP5KJJAg
 # created symlink into my UCSC G Drive, then used rclone
 rclone copy mhaeussldrive:RO4 ./
 bcftools concat --threads 16 -Oz -o gregor.vcf.gz chr{1..22}.vcf.gz chrX.vcf.gz chrY.vcf.gz
 tabix -p vcf gregor.vcf.gz
 # output ~20 GB, took 10 minutes.
 
 # HGDP1k data from the phased Vars track, Max/Claude, Mar 18 2026
 # Just flattening what we have and reducing details
 # Source: 3.2TB VCF with 4094 genomes and per-population INFO fields for 80 populations
 # Strip genotypes and keep only overall + continental group fields (drop per-population-per-sex)
 # Already has chr prefix, no rename needed
 # Note: first attempt kept all fields -> 169GB, too large. This version keeps only continental groups.
 cd /hive/data/genomes/hg38/bed/varFreqs/hgdp1kFreq/
 KEEP="INFO/AC,INFO/AF,INFO/AN,INFO/nhomalt,INFO/gnomad_AC,INFO/gnomad_AF,INFO/gnomad_AN,INFO/gnomad_AC_afr,INFO/gnomad_AF_a
 fr,INFO/gnomad_AN_afr,INFO/gnomad_AC_ami,INFO/gnomad_AF_ami,INFO/gnomad_AN_ami,INFO/gnomad_AC_amr,INFO/gnomad_AF_amr,INFO/g
 nomad_AN_amr,INFO/gnomad_AC_asj,INFO/gnomad_AF_asj,INFO/gnomad_AN_asj,INFO/gnomad_AC_eas,INFO/gnomad_AF_eas,INFO/gnomad_AN_
 eas,INFO/gnomad_AC_fin,INFO/gnomad_AF_fin,INFO/gnomad_AN_fin,INFO/gnomad_AC_mid,INFO/gnomad_AF_mid,INFO/gnomad_AN_mid,INFO/
 gnomad_AC_nfe,INFO/gnomad_AF_nfe,INFO/gnomad_AN_nfe,INFO/gnomad_AC_oth,INFO/gnomad_AF_oth,INFO/gnomad_AN_oth,INFO/gnomad_AC
 _sas,INFO/gnomad_AF_sas,INFO/gnomad_AN_sas,INFO/gnomad_popmax,INFO/gnomad_faf95_popmax"
 # This took days to complete, so asked Claude to make it parallel
 #bcftools view -G /gbdb/hg38/phasedVars/hgdp1k/gnomad.genomes.v3.1.2.hgdp_tgp.vcf.gz --threads 8 \
 #| bcftools annotate -x "^${KEEP}" -Oz --threads 4 -o hgdp1k.freq.vcf.gz
 # use 30 threads, and chunks of 50 Mbp
 sh ~/kent/src/hg/makeDb/scripts/varFreqs/vcfFilterParallel.sh /gbdb/hg38/phasedVars/hgdp1k/gnomad.genomes.v3.1.2.hgdp_tgp.vcf.gz hgdp1k.freq.parallel.vcf.gz "$KEEP" 30 50 &
 tabix -p vcf hgdp1k.freq.vcf.gz
 
 # Swefreq, Max, Feb 2026
 # downloaded files from https://swefreq.nbis.se/dataset/SweGen/download
 # Access was approved through the website, but I emailed swefreq@scilifelab.se, it needed a reminder email
 # Also got email from adam.ameur@igp.uu.se with followup info and do-no-allow-downloads instruction
 cd /hive/data/genomes/hg38/bed/varFreqs/swefreq
 
 # Indigenomes, Max Jan 2026
 # downloaded from https://clingen.igib.res.in/indigen/, used as-is
 cd /hive/data/genomes/hg38/bed/varFreqs/indigenomes/
 
 # Japan Tommo 60k, Max Jan 2026
 # downloaded from https://jmorp.megabank.tohoku.ac.jp/downloads
 cd /hive/data/genomes/hg38/bed/varFreqs/tommo61kjpn/
 # copied urls from website
 wget -i urls.txt 
 bcftools concat --threads 16 -Oz -o tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome.vcf.gz \
     tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome-chr{1..22}.vcf.gz
 tabix -p vcf tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome.vcf.gz
 
 # FinnGen, Max/Claude, Jan 2026
 cd /hive/data/genomes/hg38/bed/varFreqs/finngen/                                                                           
 # Source TSV was downloaded from FinnGen (via email link from Google Cloud bucket)                                         
 # finnge_R12_annotated_variants_v1.gz (32 GB TSV)                                                                          
 # Convert TSV to VCF using custom Python script (written by Claude Opus 4.5)                                               
 python ~/kent/src/hg/makeDb/scripts/varFreqs/finngen_to_vcf.py \                                                                    
     finnge_R12_annotated_variants_v1.gz \                                                                                    
     finnge_R12_annotated_variants_v1.vcf                                                                                     
 # Compress and index                                                                                                       
 bgzip finnge_R12_annotated_variants_v1.vcf -@8                                                                             
 tabix -p vcf finnge_R12_annotated_variants_v1.vcf.gz                                                                       
 
 # All of Us, Max Feb 2026
 # Received from Qudsi at UCSC in the Ioannidis group via phoenix
 # only concated and ran tabix on it
 cd /hive/data/genomes/hg38/bed/varFreqs/allofus/
 bcftools concat --threads 16 -Oz -o allOfUs.locAncFreq.vcf.gz clean/allele_freq_chr{1..22}.NW.clean.conf90.oneline.vcf.gz
 tabix allOfUs.locAncFreq.vcf.gz
 
 ##########
 # 2026-03-27 Claude max
 
 # Two phased SV VCF tracks moved into phasedVars superTrack from lrSv:
 # - han945SvVcf: Per-sample genotypes for 945 Han Chinese SVs
 # - lrSv1kgOntPhased: Phased SVs from 1,019 diverse humans (1KG ONT)
 # Data files remain in /hive/data/genomes/hg38/bed/lrSv/
 # Symlinks moved from /gbdb/{hg38,hs1}/lrSv/ to /gbdb/{hg38,hs1}/phasedVars/
 # Build documentation for these tracks is in lrSv.txt
 
 ##########
 # 2026-04-20 Claude max
 
 # CoLoRSdb v1.2.0 long-read SNV/indel population frequencies added as
 # the colorsDbSnv subtrack of varFreqs, for both hg38 and hs1.
 #
 # Upstream VCFs (GRCh38 and CHM13 releases) are already present in
 # /hive/data/genomes/hg38/bed/lrSv/colorsDb/ (placed there when the
 # CoLoRSdb SV track was first built under lrSv). We just add VCF
 # symlinks under each assembly's varFreqs directory using a consistent
 # filename so the shared trackDb stanza can use $D.
 
 mkdir -p /gbdb/hg38/varFreqs/colorsDb /gbdb/hs1/varFreqs/colorsDb
 ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.GRCh38.v1.2.0.deepvariant.glnexus.vcf.gz     /gbdb/hg38/varFreqs/colorsDb/colorsDbSnv.vcf.gz
 ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.GRCh38.v1.2.0.deepvariant.glnexus.vcf.gz.tbi /gbdb/hg38/varFreqs/colorsDb/colorsDbSnv.vcf.gz.tbi
 ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.CHM13.v1.2.0.deepvariant.glnexus.vcf.gz      /gbdb/hs1/varFreqs/colorsDb/colorsDbSnv.vcf.gz
 ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.CHM13.v1.2.0.deepvariant.glnexus.vcf.gz.tbi  /gbdb/hs1/varFreqs/colorsDb/colorsDbSnv.vcf.gz.tbi
 
 # The varFreqs.ra trackDb file is already in human/ (shared for both
 # hg38 and hs1 via the human/trackDb.ra include), so no move was needed.
 # Only colorsDbSnv is expected to render on hs1 - the other varFreqs
 # subtracks have hg38-only data and will silently show nothing there.
 
 ##########
 # 2026-04-20 Claude max
 #
 # Rebuilt varFreqsAll combined bigBed to include GA4K and CoLoRSdb
 # long-read PacBio subtracks that were added to varFreqs since the
 # last build (Mar 20).
 #
 # Steps (in /hive/data/genomes/hg38/bed/varFreqs/all):
 # 1. Added GA4K and CoLoRSdb rows to
 #      ~/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv
 #    and appended their /gbdb paths to files.txt.
 # 2. Deleted merged.vcf.gz and merged.annotated.vcf.gz to force a full
 #    merge + bcftools csq re-annotation (per-sample normalized VCFs
 #    from the previous run were kept; only the two new VCFs were
 #    normalized in Step 4).
 # 3. Ran ./mergeAndAnnotate.sh (~55 min: 5 min per-file, ~15 min merge,
 #    ~35 min csq).
 # 4. Ran ./vcfToBigBed.py --output-prefix varFreqsAll --threads 8
 #    (Phase 1 pre-extract ~90 min, Phase 2 chrom BED build ~30 min).
 # 5. bedToBigBed on 275 GB sorted BED (~2 h) to produce 37.7 GB
 #    varFreqsAll.bb with 1,165,666,478 records and 113 fields.
 # 6. Updated varFreqs.ra filterValues.sources and added
 #    filterByRange.GA4KAF/AC and filterByRange.CoLoRSdbAF/AC.
 # Existing /gbdb/hg38/varFreqs/varFreqsAll.bb symlink was preserved.
 
 # 2026-04-22 Claude max
 # GWAS SVatalog small-variant (SNV/indel) allele frequencies from the 101
 # SVatalog samples, sibling of the lrSv chirmade101Sv structural-variant
 # track. Paper: Chirmade et al. 2026, Heredity, PMID 41203876.
 # SNPs were called from 10X Genomics linked short-read WGS of the 101
 # samples with GATK HaplotypeCaller v4.0.0.0 and phased with SHAPEIT v4.2.0.
 # Data: the per-chromosome allele-frequency text files were downloaded into
 # /hive/data/genomes/hg38/bed/varFreqs/svCatalog/ alongside the LD-stats
 # files (see the companion Zenodo deposit 13367574).
 cd /hive/data/genomes/hg38/bed/varFreqs/svCatalog/
 # Convert the 23 per-chrom *_allele_freq.txt files to a single sites-only
 # VCF with AF/AC/AN plus gnomAD v3.1 NFE AF and dbSNP RSID as INFO fields.
 # AC is synthesized as round(AF * 202) and AN is fixed at 202 since the
 # source release does not ship AC/AN.
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/svatalogFreqToVcf.py \
     svatalog.vcf chr{1..22}_allele_freq.txt chrX_allele_freq.txt
 bcftools sort svatalog.vcf -Oz -m 16G -T /tmp/ -o svatalog.vcf.gz
 tabix -p vcf svatalog.vcf.gz
 rm -f svatalog.vcf
 # 8,814,835 variants -> 172 MB bgzipped + 1.6 MB tabix index.
 # Symlinks placed under /gbdb/hg38/varFreqs/svatalog/ for the svatalogSnv
 # stanza in trackDb/human/varFreqs.ra.
 
 # varFreqsAll rebuild, 2026-04-22 Claude max
 # Regenerate the All Databases Combined track to include SVatalog. Source
 # count rises from 23 to 24 databases; final bigBed is 35.3 GB with
 # 1,166,005,346 records and 115 fields (up from 113). Pipeline run in
 # /hive/data/genomes/hg38/bed/varFreqs/all/:
 # 1. Added /gbdb/hg38/varFreqs/svatalog/svatalog.vcf.gz to files.txt and
 #    the SVatalog row to ~/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv.
 # 2. Cleared stale merged.vcf.gz / merged.annotated.vcf.gz to force re-merge.
 # 3. Ran ./mergeAndAnnotate.sh (~60 min: pre-extract skip+svatalog, ~20 min
 #    merge, ~35 min csq).
 # 4. Ran python3 vcfToBigBed.py --output-prefix varFreqsAll --threads 8
 #    (Phase 1 ~90 min re-extract, Phase 2 chrom BED build ~30 min,
 #    concat+sort ~45 min).
 # 5. bedToBigBed on 260 GB sorted BED (~2h 15m) -> 35.3 GB varFreqsAll.bb.
 # 6. Updated varFreqs.ra filterValues.sources and added
 #    filterByRange.SVatalogAF / filterByRange.SVatalogAC.
 # Existing /gbdb/hg38/varFreqs/varFreqsAll.bb symlink (pointing at
 # /hive/data/genomes/hg38/bed/varFreqs/all/varFreqsAll.bb) was preserved
 # and now resolves to the new 35.3 GB build.
 
 ##########
 # 2026-04-29 Claude max
 # Indigenous African 180 WGS allele frequencies (Tishkoff lab,
 # Fan et al. 2023 Cell, PMID 36868214). 180 individuals (15 per
 # population) from 12 indigenous African populations sequenced at >30x
 # on Illumina HiSeq X Ten. Sites-only SNP VCF with aggregate AC/AF/AN
 # (no per-population frequencies) was supplied directly by Matthew
 # Hansen (mhansen@upenn.edu, Tishkoff lab) via Box. Original calls on
 # hs37d5 (GRCh37) so we lift to hg38.
 cd /hive/data/genomes/hg38/bed/varFreqs/tishkoff/
 # Source: 180wgs.SNPs.sites.AF.vcf.gz (~316 MB, 33,618,897 SNPs,
 # autosomes only, bare chromosome names "1"-"22").
 # Step 1: rename bare chromosome names to chr-prefixed UCSC names.
 bcftools annotate --rename-chrs chrRename.txt -Oz \
     -o tishkoff.hg19.chr.vcf.gz 180wgs.SNPs.sites.AF.vcf.gz
 # Step 2: lift to hg38 with CrossMap (~5 min). hg38.fa is the same
 # /hive/data/genomes/hg38/bed/varFreqs/all/hg38.fa used elsewhere.
 CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
     tishkoff.hg19.chr.vcf.gz \
     /hive/data/genomes/hg38/bed/varFreqs/all/hg38.fa \
     tishkoff.hg38.unsorted.vcf
 # CrossMap reports: 33,618,897 input -> 9,066 unmapped, 33,609,831 lifted.
 # Step 3: drop variants that landed on alt/random/Un/fix contigs and
 # fix the contig= header lines (CrossMap dropped the chr prefix from
 # the contig IDs even though data lines have it).
 awk 'BEGIN{OFS="\t"} /^#/ {next} $1 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y|M)$/ {print}' \
     tishkoff.hg38.unsorted.vcf > tishkoff.hg38.canon.body
 awk 'BEGIN{OFS="\t"} /^##contig=/ {sub(/<ID=/,"<ID=chr"); print; next} /^#/ {print}' \
     tishkoff.hg38.unsorted.vcf > tishkoff.hg38.canon.header
 cat tishkoff.hg38.canon.header tishkoff.hg38.canon.body > tishkoff.hg38.canon.fixed.vcf
 # 33,600,472 variants survive (9,359 of the lifted variants landed on
 # alt/random/Un contigs and were dropped).
 # Step 4: sort + bgzip + tabix.
 bcftools sort tishkoff.hg38.canon.fixed.vcf -Oz -m 16G -T /data/tmp/ \
     -o tishkoff180.vcf.gz
 tabix -p vcf tishkoff180.vcf.gz
 rm tishkoff.hg38.unsorted.vcf tishkoff.hg38.canon.* tishkoff.hg19.chr.vcf.gz
 # Final: 346 MB bgzip + 1.6 MB tabix index, 33,600,472 SNPs (autosomes
 # + a handful of chrX entries from the PAR regions). 18,425 of the
 # 33,618,897 source variants (0.055%) are not represented after lift
 # (9,066 failed liftOver, 9,359 mapped to non-canonical contigs).
 # Symlinks placed at /gbdb/hg38/varFreqs/tishkoff/ for the tishkoff180
 # stanza in trackDb/human/varFreqs.ra.
 
 # varFreqsAll rebuild, 2026-04-30 Claude max
 # Regenerate the All Databases Combined track to include Tishkoff180.
 # Source count rises from 24 to 25 databases; final bigBed is 35.6 GB
 # (37.9 GB on disk before zoom indexes) with 1,169,063,801 records and
 # 117 fields (up from 115). Pipeline run in
 # /hive/data/genomes/hg38/bed/varFreqs/all/:
 # 1. Added /gbdb/hg38/varFreqs/tishkoff/tishkoff180.vcf.gz to files.txt
 #    and the Tishkoff180 row to
 #    ~/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv.
 # 2. Cleared stale merged.vcf.gz / merged.annotated.vcf.gz to force re-merge.
 # 3. Ran ./mergeAndAnnotate.sh: per-VCF strip+norm reused cached files for
 #    the 24 unchanged DBs; only tishkoff180 was normalized (~5 min).
 #    bcftools merge ~25 min; bcftools csq ~35 min.
 #    Output: 1,169,064,168 merged variants (up from 1,166,005,346).
 #    Tishkoff added ~3.06M sites that were not present in any other DB
 #    (most of its 33.6M SNPs were already covered by the bigger DBs).
 # 4. Ran python3 vcfToBigBed.py --output-prefix varFreqsAll --threads 8
 #    (Phase 1 ~90 min re-extract of all 25 DBs, Phase 2 ~25 min chrom BED
 #    build, concat+sort ~45 min).
 # 5. bedToBigBed on the sorted BED -> 35.6 GB varFreqsAll.bb (1.169B
 #    records, 117 fields).
 # 6. Updated varFreqs.ra filterValues.sources and added
 #    filterByRange.Tishkoff180AF / filterByRange.Tishkoff180AC.
 # Existing /gbdb/hg38/varFreqs/varFreqsAll.bb symlink (pointing at
 # /hive/data/genomes/hg38/bed/varFreqs/all/varFreqsAll.bb) was preserved
 # and now resolves to the new 35.6 GB build.
 
 # Performance follow-up note (Claude max, 2026-04-30):
 # An incremental add currently takes ~10 hours. The biggest wins for a
 # future incremental-add workflow are:
 #   - Phase 1 of vcfToBigBed.py re-extracts per-DB TSVs for ALL databases
 #     every run (~90 min). Skip extraction when the per-DB output dir is
 #     already present and source mtime is unchanged. Estimated saving:
 #     ~85 min when adding one DB.
 #   - bcftools merge re-merges all 25 normalized VCFs (~25 min). For a
 #     pure additive change, `bcftools merge old_merged.vcf.gz new.vcf.gz`
 #     is much cheaper than re-merging from scratch. Estimated saving:
 #     ~20 min.
 # We tested a BCSQ cache (csqWithCache.sh, prototype in this directory's
 # git history), splitting merged variants into "cached" vs "uncached" via
 # bcftools isec and only running csq on the uncached subset. It is
 # correct but NOT a win: bcftools csq runs at ~30M records/min and is
 # already well-threaded; bcftools isec / annotate over the same volume
 # is slower than just running csq again. Don't pursue. The script was
 # removed after benchmarking; see this commit's history if you want to
 # re-test on different hardware.
 
 ##########
 # 2026-05-05 Claude max
 # GenomeAsia Pilot (gasp + gaspIndel) GRCh37 -> hg38 lift, refs #36642
 #
 # Lou's QA (note 2026-05-04) showed both subtracks were GRCh37 sites
 # served at /gbdb/hg38/, so positions were off by tens of kb to many
 # Mb (e.g. rs1050171 at hg19 chr7:55,249,063 instead of hg38
 # chr7:55,174,014). Three confirmations: contig lengths in the VCF
 # header are GRCh37, every contig record declares assembly=b37, and
 # the GATK reference FASTA is human_g1k_v37_decoy.fasta.
 #
 # Source files (sites-only, bare chromosome names "1"-"22", "X", "Y"):
 #   /hive/data/genomes/hg38/bed/varFreqs/ga100k/ga100k.subst.vcf.gz
 #       (66,236,516 SNVs, autosomes only)
 #   /hive/data/genomes/hg38/bed/varFreqs/ga100k/All.indels.annot.cont_withmaf.vcf.gz
 #       (4,415,156 indels, 1-22 + X/Y/MT plus GL*/hs37d5 alt contigs)
 # The indel VCF has a malformed #CHROM line (declares FORMAT but has
 # no sample columns and no FORMAT field on data lines); the lift
 # script strips it on the fly.
 #
 # Lift script: ~/kent/src/hg/makeDb/scripts/varFreqs/gaspLift.sh
 # Recipe is the same as tishkoff180: chr-prefix rename via
 # bcftools annotate --rename-chrs, CrossMap.py vcf with
 # hg19ToHg38.over.chain.gz, post-filter to canonical chr1-22/X/Y/M,
 # fix contig= header (CrossMap drops the chr prefix from contig IDs),
 # bcftools sort, bgzip, tabix.
 mkdir -p /hive/data/genomes/hg38/bed/varFreqs/ga100k/lift
 cd /hive/data/genomes/hg38/bed/varFreqs/ga100k/lift
 # chrRename.txt is in this directory (1 -> chr1 ... MT -> chrM).
 ~/kent/src/hg/makeDb/scripts/varFreqs/gaspLift.sh \
     /hive/data/genomes/hg38/bed/varFreqs/ga100k/ga100k.subst.vcf.gz \
     ./ga100k.subst.hg38
 ~/kent/src/hg/makeDb/scripts/varFreqs/gaspLift.sh \
     /hive/data/genomes/hg38/bed/varFreqs/ga100k/All.indels.annot.cont_withmaf.vcf.gz \
     ./ga100k.indels.hg38
 # Lift accounting is logged in subst.lift.log and indel.lift.log.
 # After lift, /gbdb symlinks point at the lifted .vcf.gz / .tbi pair.
 # trackDb bigDataUrl for gaspIndel was renamed to ga100k.indels.vcf.gz
 # (the old All.indels.annot.cont_withmaf.vcf.gz file name was a
 # verbatim copy of the GenomeAsia download name).
 
 ##########
 # 2026-05-06 Claude max
 # varFreqsAll rebuild: 5 vcfToBigBed.py fixes + add NPM (Singapore SG10K_Health),
 # refs #36642 notes 49, 51, 53, 55.
 
 # 1. Moved scripts from /hive/.../all/ into kent so the build is reproducible
 #    from a fresh checkout (Lou's note 49):
 #      ~/kent/src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py   (driver)
 #      ~/kent/src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh (merge+csq)
 #    Original /hive/.../all/ paths are now symlinks into kent.
 
 # 2. vcfToBigBed.py fixes in this round:
 #    - normalize_consequence(): bcftools csq emits "&"-joined compounds
 #      (e.g. "stop_gained&frameshift") which exact-match-failed the old
 #      8-bucket consequence filter and orphaned ~8.5 M records. The
 #      normalizer rewrites "&" to "," so a single record can match multiple
 #      filter buckets, and appends ",others" to any token list with no named
 #      filter present so nothing is orphaned.  trackdb gains 4 new buckets
 #      (3_prime_utr, 5_prime_utr, non_coding, others) and switches to
 #      filterType.consequence multipleListOr.
 #    - Source-attribution bug: the old check only inspected the unified
 #      AC/AF slot, so AllOfUs (which ships only per-population fields)
 #      attributed to ZERO of its 67.5 M variants — ~43 M phantom rows in
 #      the previous bigBed had an empty "sources" column. The fix scans
 #      per-population slots before declaring "no data".
 #    - parse_bcsq() now returns "" instead of "." for aaChange/dnaChange on
 #      non-coding variants, so the mouseOver/detail page renders a clean
 #      blank line instead of a stray dot.
 #    - maxAF format: "{:.6g}" -> "{:.6f}" so very small AFs print as
 #      "0.000003" instead of "3.31347e-06".
 #    - autoSql `table varFreqs` -> `table varFreqsAll` (matches the bigBed
 #      filename; required for hgIntegrator to wire up correctly).
 
 # 3. NPM Singapore (SG10K_Health) added to databases.tsv + files.txt +
 #    populations.tsv (SgChinese / SgMalay / SgIndian ancestry groups) +
 #    trackDb filter UI.  The individual `npm` subtrack still has
 #    tableBrowser off (license), but its data is folded into varFreqsAll
 #    same as for finngen / kova / mgrb / swefreq / tishkoff180 (precedent).
 
 cd /hive/data/genomes/hg38/bed/varFreqs/all/
 # Force re-merge + re-csq (existing per-VCF normalize cache is reused).
 rm -f merged.vcf.gz merged.vcf.gz.tbi merged.annotated.vcf.gz \
     merged.annotated.vcf.gz.tbi
 rm -rf extracted beds varFreqsAll.bed
 ./mergeAndAnnotate.sh
 python3 vcfToBigBed.py --output-prefix varFreqsAll --threads 8
 # Build is in-place: /gbdb/hg38/varFreqs/varFreqsAll.bb stays symlinked
 # at /hive/.../all/varFreqsAll.bb, which is overwritten at the end of
 # the bedToBigBed step.
 
 ##########
 # 2026-05-12 Claude max
 # UK Biobank Neale Lab Round 2 variant manifest, refs #36642 (varFreqs).
 #
 # Source: variants.tsv.bgz (~916 MB, 13,791,467 rows), the Neale Lab
 # Round 2 UKB GWAS variant table with imputation INFO score, post-QC
 # allele counts and minor-allele frequencies from 361,194 imputed UK
 # Biobank samples restricted to white British ancestry. The TSV ships
 # bare chromosome names ("1".."22","X") in GRCh37 coordinates, so we
 # convert to a sites-only VCF, then lift to hg38.
 cd /hive/data/genomes/hg38/bed/varFreqs/ukbb
 # Step 1: TSV -> VCF (hg19, chr-prefixed). Drops AC=0 rows (variants
 # present in the imputation panel but not observed in the cohort), sets
 # AN = 2 * n_called, and carries the imputation INFO score, HWE p-value,
 # hom-ref / het / hom-alt sample counts, and the VEP consequence /
 # Neale Lab consequence_category as INFO fields.
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/ukbbToVcf.py
 # total=13,791,467 written=13,751,808 skipped_AC0=39,659
 # Step 2: lift to hg38 with CrossMap.
 CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
     ukbb.hg19.vcf \
     /hive/data/genomes/hg38/bed/varFreqs/all/hg38.fa \
     ukbb.hg38.unsorted.vcf
 # Total entries: 13,751,808, Failed to map: 6,889 -> 13,744,919 lifted.
 # Step 3: filter to canonical chromosomes (chr1-22/X/Y/M) in both the
 # data lines and the contig header (CrossMap emits every hg38 contig).
 awk '
     /^##contig=/ {
       if (match($0, /ID=([^,>]+)/, m) && m[1] ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y|M)$/) print
       next
     }
     /^#/ {print}
 ' ukbb.hg38.unsorted.vcf > header.vcf
 awk -F'\t' 'BEGIN{OFS="\t"} !/^#/ && $1 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y|M)$/ {print}' \
     ukbb.hg38.unsorted.vcf > body.vcf
 cat header.vcf body.vcf > ukbb.hg38.canon.vcf
 # 13,743,085 variants survive (1,834 of the lifted variants landed on
 # alt/random/Un/fix contigs and were dropped).
 # Step 4: sort, bgzip, tabix.
 bcftools sort ukbb.hg38.canon.vcf -Oz -m 16G -T /data/tmp/ -o ukbb.vcf.gz
 tabix -p vcf ukbb.vcf.gz
 rm -f ukbb.hg19.vcf ukbb.hg38.unsorted.vcf ukbb.hg38.unsorted.vcf.unmap \
     ukbb.hg38.canon.vcf header.vcf body.vcf
 # Final: 551 MB bgzip + 1.7 MB tabix index, 13,743,085 variants.
 # 48,382 of the 13,791,467 source rows (0.35%) are not represented in
 # hg38: 39,659 had AC=0 (panel-only), 6,889 failed CrossMap, 1,834
 # lifted to non-canonical contigs.
 
 ##########
 # 2026-05-13 Claude max
 # Westlake BioBank for Chinese (WBBC) WGS allele frequencies, refs #36642.
 #
 # 4,480 WGS samples sequenced to ~13.9x at Westlake University. The pilot
 # project has been folded into the larger China Precision BioBank (CPBB);
 # the variant file we host is unchanged from the WBBC Phase I v20210103
 # release. Paper: Cong et al. 2022 Nat Commun, PMID 35618720.
 #
 # Source: 22 per-chromosome sites-only VCFs already on GRCh38 with
 # chr-prefixed chromosomes (autosomes only - no chrX/Y in the download).
 # INFO fields ship cohort AC/AF/AN/NS plus per-region AC/AN for the four
 # Han Chinese geographic subgroups (North, Central, South, Lingnan) plus
 # per-genotype RR|RA|AA counts, total DP and the GATK VQSLOD score.
 cd /hive/data/genomes/hg38/bed/varFreqs/westlakeChina/download
 for i in $(seq 1 22); do wget -nv https://wbbc.westlake.edu.cn/data/WBBC.chr$i.GRCh38.vcf.gz; done
 # bcftools concat needs tabix indices when the per-chrom VCFs lack
 # ##contig header lines (they do here).
 parallel -j 8 tabix -p vcf {} ::: WBBC.chr{1..22}.GRCh38.vcf.gz
 cd /hive/data/genomes/hg38/bed/varFreqs/westlakeChina
 bcftools concat --threads 8 -Oz -o wbbc.allChrom.vcf.gz \
     download/WBBC.chr{1..22}.GRCh38.vcf.gz
 # Stream re-header + filter AC=0. The Python helper inserts proper
 # ##contig (with lengths) and ##INFO lines, drops the dangling
 # SHAPEIT FORMAT line, and skips any row whose INFO starts AC=0;.
 zcat wbbc.allChrom.vcf.gz \
     | python3 ~/kent/src/hg/makeDb/scripts/varFreqs/wbbcFix.py \
     | bgzip -@ 8 -c > wbbc.vcf.gz
 tabix -p vcf wbbc.vcf.gz
 rm -f wbbc.allChrom.vcf.gz
 # Final: 2.2 GB bgzip + 2.2 MB tabix index, 78,594,274 variants.
 # 1,495,967 rows (~1.86%) with AC=0 were dropped.
 # Symlinks placed under /gbdb/hg38/varFreqs/wbbc/ for the wbbc stanza
 # in trackDb/human/varFreqs.ra.
 #
 # databases.tsv and populations.tsv now list WBBC + the four Han
 # regional groups, so the next varFreqsAll rebuild will pick the cohort
 # up. The trackDb filter UI (filterByRange.WBBCAF etc.) was deliberately
 # NOT added yet - those will be added together with the next rebuild
 # of varFreqsAll, otherwise the filters would appear in the UI before
 # the underlying bigBed has the columns.
 
 # Taiwan Precision Medicine Initiative (TPMI) - 2026-05-15 Claude max
 # TPMI is a Han-Chinese cohort of 565,390 participants (Yang et al. 2025,
 # Nature, PMID 41092961). Genotyping was done with two custom Axiom SNP
 # arrays (TPMv1 on 165,596 individuals; TPMv2 on 321,360 individuals).
 # Thermo Fisher publishes a chip annotation CSV for each array; the TPM1
 # annotation embeds the TPMI cohort allele frequency in a column named
 # "Allele Frequency". The TPM2 annotation does not (it only carries the
 # Affymetrix-bundled CEU/CHB/JPT/YRI HapMap freqs), so this track is
 # TPM1-only. The annotation CSVs declare hg38 coordinates, so no lift.
 cd /hive/data/genomes/hg38/bed/varFreqs/tpmi
 # TPM1_Array_Annotation.csv (~926 MB) and TPM2_Array_Annotation.csv
 # (~1.0 GB) were already in this directory at the start. Only the TPM1
 # file is used.
 # CSV -> VCF: filters out rows with no AF / blacklist / alt-or-random
 # chrom, anchors indels (Affymetrix uses "-" for the empty allele) by
 # fetching the anchor base from hg38.2bit via twoBitToFa, and derives
 # AC = round(AF * 100000). AN=100000 is the implied precision of the
 # AF values (every reported AF rounds to an exact integer at 1/100000;
 # AF=0 is also kept, AC=0). The true TPMv1 cohort AN is ~330,000 for
 # autosomes, so AC values are proportional to but smaller than the
 # real counts; documented in the VCF header and tpmi.html caveat.
 python3 ~/kent/src/hg/makeDb/scripts/tpmi/tpmiToVcf.py \
     TPM1_Array_Annotation.csv /hive/data/genomes/hg38/hg38.2bit tpmi.vcf
 bcftools sort -Oz -o tpmi.vcf.gz tpmi.vcf
 tabix -f -p vcf tpmi.vcf.gz
 rm -f tpmi.vcf
 # 752,921 source rows -> 672,843 VCF records. Skipped: 80,034 with no
 # reported AF (includes the entire chrY content); 36 on alt/random
 # contigs; 8 with no defined REF. Blacklist rows (~61k) all have no AF
 # so they are filtered by the no-AF rule. Distribution: 623,203 SNVs,
 # 34,954 deletions, 13,537 insertions, 1,149 MNVs.
 # Symlinks placed under /gbdb/hg38/varFreqs/tpmi/ for the tpmi stanza
 # in trackDb/human/varFreqs.ra (priority 5.6, next to wbbc/tommo).
 
 ##########
 # 2026-05-15 Claude max
 # Strip three unused FILTER definitions from the tishkoff180 VCF header.
 # Matt Hansen (mhansen@upenn.edu, 2026-04-29 email): the source header
 # declared LowQual, VQSRTrancheSNP99.90to100.00, and
 # VQSRTrancheSNP99.90to100.00+ but he had already removed the
 # corresponding INFO values and only kept PASS variants. The hgTracks
 # VCF code auto-generates filter checkboxes from ##FILTER lines, so
 # these spurious entries showed up on hgTrackUi. Confirmed every record
 # has FILTER=PASS, then rewrote only the header (body untouched) with
 # bcftools reheader and re-indexed.
 cd /hive/data/genomes/hg38/bed/varFreqs/tishkoff/
 bcftools view -h tishkoff180.vcf.gz \
     | grep -vE '^##FILTER=<ID=(LowQual|VQSRTrancheSNP99\.90to100\.00\+?),' \
     > header.new.txt
 bcftools reheader -h header.new.txt -o tishkoff180.new.vcf.gz tishkoff180.vcf.gz
 tabix -p vcf tishkoff180.new.vcf.gz
 # Sanity check: same 33,600,472 records, only PASS remains.
 mv tishkoff180.vcf.gz     tishkoff180.vcf.gz.bak
 mv tishkoff180.vcf.gz.tbi tishkoff180.vcf.gz.tbi.bak
 mv tishkoff180.new.vcf.gz     tishkoff180.vcf.gz
 mv tishkoff180.new.vcf.gz.tbi tishkoff180.vcf.gz.tbi
 rm header.new.txt
 # /gbdb/hg38/varFreqs/tishkoff/ symlinks unchanged. The .bak originals
 # can be removed once the change has been verified on hgwdev.
 
 ##########
 # 2026-05-15 Claude max
 # Hide all non-downloadable varFreqs subdirs from hgdownload by prefixing
 # the gbdb subdirectory with "_". hgdownload rsync excludes underscore-
 # prefixed directories, so this is the standard way to keep a track
 # usable in the browser (vcfTabix random access via /gbdb still works)
 # while preventing bulk download. Applies to every track that has
 # `tableBrowser off` in trackDb/human/varFreqs.ra, plus the combined
 # varFreqsAll bigBed (mixed licenses, never downloadable).
 cd /gbdb/hg38/varFreqs
 mv allofus  _allofus
 mv topmed   _topmed
 mv sfari    _sfari
 mv finngen  _finngen
 mv swefreq  _swefreq
 mv mgrb     _mgrb
 mv kova     _kova
 mv npm      _npm
 mv mxb      _mxb
 mv tishkoff _tishkoff
 mkdir _all && mv varFreqsAll.bb _all/varFreqsAll.bb
 # Symlink targets under /hive/data/genomes/... unchanged; only the gbdb
 # directory names change. Updated bigDataUrl paths for the matching 12
 # stanzas in trackDb/human/varFreqs.ra (allofus, topmed, sfariSparkExomes,
 # sfariSparkWgs, finngen, swefreq, mgrb, kova, npm, mxbFreq, tishkoff180,
 # varFreqsAll).
 # Also fixed Data Access wording in description pages that still claimed
 # Table Browser / download server availability: topmed.html, allofus.html,
 # sfariSparkExomes.html (shared by sfariSparkWgs), mxbFreq.html. The
 # standard restricted-track disclaimer (already in finngen.html, kova.html,
 # mgrb.html, npm.html, swefreq.html, tishkoff180.html, varFreqsAll.html)
 # is now uniform across all 11 restricted tracks plus the combined track.
 
 ##########
 # 2026-05-15 Claude max
 # Hide tpmi from bulk download (same _-prefix pattern as the other
 # license-restricted varFreqs subdirs).
 cd /gbdb/hg38/varFreqs
 mv tpmi _tpmi
 # Updated trackDb/human/varFreqs.ra tpmi stanza: bigDataUrl now points to
 # _tpmi/, and `tableBrowser off` was added. tpmi.html Data Access section
 # rewritten to the standard restricted-track disclaimer.
 # Registered tpmi in scripts/varFreqs/databases.tsv so the next
 # varFreqsAll combined-track rebuild picks it up:
 #   TPMI    TPMI Taiwan    /gbdb/hg38/varFreqs/_tpmi/tpmi.vcf.gz    AC    AF
 
 ##########
 # 2026-05-17 Claude max
 # ChinaMAP phase 1 - 10,588 deep-WGS Chinese individuals (Cao et al. 2020,
 # Cell Res, PMID 32355288). 136.75 M SNPs + 10.70 M short indels on chr1-22.
 # License: data may not be redistributed (see ChinaMAP Limitations on Use,
 # http://chinamapwgs.mbiobank.com/download/), so the subdirectory under
 # /gbdb is _-prefixed (hides from hgdownload) and the trackDb stanza has
 # `tableBrowser off`.
 cd /hive/data/genomes/hg38/bed/varFreqs/chinamap
 # mbiobank_ChinaMAP.phase1.vcf.gz (~2.0 GB) was already in this directory at
 # the start, fetched manually from http://chinamapwgs.mbiobank.com/download/
 # (one-time link, registration required). The file is already on GRCh38 with
 # chr-prefixed chromosome names, autosomes only, sorted, and ships AC/AF/AN
 # plus matched 1KGP_* INFO fields. No conversion/lift/normalisation needed -
 # just index.
 ln -s mbiobank_ChinaMAP.phase1.vcf.gz chinamap.vcf.gz
 tabix -p vcf chinamap.vcf.gz
 # Final: 147,448,941 variants (matches the 136.75 M SNPs + 10.70 M indels
 # reported in Cao et al.). 2.0 GB bgzip + 2.1 MB tabix index.
 # Registered ChinaMAP in scripts/varFreqs/databases.tsv so the next
 # varFreqsAll combined-track rebuild picks it up:
 #   ChinaMAP    China ChinaMAP    /gbdb/hg38/varFreqs/_chinamap/chinamap.vcf.gz    AC    AF
 # The varFreqs.ra filter UI fragment (filterByRange.ChinaMAPAF/AC, sources
 # filterValues) is deliberately NOT added yet; it will be added together
 # with the next rebuild of varFreqsAll (WBBC/TPMI precedent), otherwise
 # the filters would appear in the UI before the columns exist in the bb.
 
 ##########
 # 2026-05-17 Claude max
 # GenomeIndia: 9,768 WGS, 83 endogamous Indian populations
 # (Bhattacharyya et al. 2025 Nat Genet, PMID 40200122,
 # DOI 10.1038/s41588-025-02153-x).
 #
 # Source: 22 per-chromosome TSV files (autosomes only, no header) with
 # CHROM/POS/ID/REF/ALT/AF columns, distributed by the Indian Biological
 # Data Centre as 9768GI_SummaryStats.tar.gz. The release ships only the
 # alternate allele frequency: no AC, no AN, no per-population breakdown.
 # 129,938,889 high-confidence biallelic variants total.
 cd /hive/data/genomes/hg38/bed/varFreqs/genomeindia
 # Download (already done; raw TSVs live under
 # /gbdb/hg38/varFreqs/genomeindia/9768GI_SummaryStats/ from the original
 # wget by Max).
 # Convert TSV -> sorted, bgzipped, tabix-indexed VCF. Script synthesizes
 # AN = 2 * 9768 = 19536 and AC = round(AF*AN). See script header for the
 # caveat re: NS_GT >= 98% upstream filter.
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/genomeindiaToVcf.py \
     /gbdb/hg38/varFreqs/genomeindia/9768GI_SummaryStats \
     genomeindia.vcf.gz
 # Output: 129,938,889 variants, 0 skipped; matches the paper exactly.
 # Final size: 791 MB bgzip + 1.9 MB tabix index.
 # Symlinks placed at /gbdb/hg38/varFreqs/_genomeindia/{genomeindia.vcf.gz,
 # .tbi} for the genomeindia stanza in trackDb/human/varFreqs.ra.
 #
 # Combined-track wiring (filter UI fragment is added now, in the same
 # style as the GenomeAsia rebuild precedent, because GenomeIndia/AC and
 # /AF will be present in the next varFreqsAll rebuild):
 #   1. Added GenomeIndia row to
 #      ~/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv
 #      (key=GenomeIndia, ac_field=AC, af_field=AF, no populations).
 #   2. Added GenomeIndia|GenomeIndia 9.7k WGS to filterValues.sources
 #      and filterByRange.GenomeIndiaAF/AC stanzas in varFreqs.ra.
 # A full varFreqsAll rebuild has NOT been re-run yet; do that after QA
 # clears the standalone subtrack.
 
 ##########
 # 2026-05-17 -> 2026-05-18 Claude max
 # varFreqsAll combined-track rebuild incorporating the four new subtracks
 # registered earlier in May: WBBC (~78.6M), TPMI (~672k), ChinaMAP
 # (~147M), GenomeIndia (~130M). Also drops IndiGen (no AC/AF in source)
 # from the merged output - the standalone indigenomes subtrack is
 # unchanged but no longer contributes a column to varFreqsAll.
 cd /hive/data/genomes/hg38/bed/varFreqs/all
 # files.txt regenerated from databases.tsv to point at the new
 # _-prefixed /gbdb paths (allofus, sfari, npm, kova, finngen, swefreq,
 # topmed, mgrb, mxb, tishkoff, tpmi, chinamap, genomeindia) and to add
 # wbbc/tpmi/chinamap/genomeindia.
 # Forced merge+csq re-run; existing per-VCF normalize cache reused for
 # the 25 unchanged databases. Removed stale IndiGenomes_Variants.norm
 # cache so it would not sneak into the find-based merge input list.
 rm -f merged.vcf.gz merged.vcf.gz.tbi merged.annotated.vcf.gz \
     merged.annotated.vcf.gz.tbi normalized_files.txt
 rm -f normalized/IndiGenomes_Variants.norm.vcf.gz*
 ./mergeAndAnnotate.sh
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py \
     --output-prefix varFreqsAll --threads 8
 # Final varFreqsAll.bb: 46 GB, 1,335,789,391 items, 147 fields, 25
 # chromosomes. Symlinked through /gbdb/hg38/varFreqs/_all/varFreqsAll.bb
 # (unchanged - the script writes the bb in place; symlink already
 # points at it).
 #
 # mergeAndAnnotate.sh fix: bcftools 1.22 (newer than the previous
 # build) is stricter than older versions about chromosome-naming
 # mismatches between the merged VCF (chr1..) and the Ensembl GFF3
 # (1..). Added --unify-chr-names chr,-,chr to the bcftools csq
 # invocation; also added --force so it tolerates the 5 alt contigs
 # (chr15_KI270850v1_alt etc.) that SCHEMA contributes to the merged
 # VCF but that the GFF3 does not annotate - earlier bcftools just
 # emitted them without csq calls; 1.22 aborts.
 #
 # trackDb fragment updated in human/varFreqs.ra: WBBC, TPMI, ChinaMAP,
 # GenomeIndia rows added to filterValues.sources and as per-database
 # filterByRange.<DB>AF/AC pairs; IndiGen pair removed (matches the new
 # bb columns). WBBC 4-region populations (North/Central/South/Lingnan)
 # added at the end of the per-population block. Auto-generated raw
 # fragment lives at /hive/data/genomes/hg38/bed/varFreqs/all/varFreqsAll.trackDb.ra
 # for reference; the manual filterType.consequence multipleListOr line
 # and the expanded consequence buckets (3_prime_utr, 5_prime_utr,
 # non_coding, others) are NOT in the auto-fragment - they have to be
 # kept on top of it.
 
 ##########
 # 2026-05-19 Claude max
 # Post-rebuild cleanups to scripts/varFreqs (no track change, just safer
 # next rebuild):
 #  - mergeAndAnnotate.sh now reads VCF paths directly from databases.tsv
 #    in both Step 4 (per-VCF strip+norm) and Step 5 (merge list). The old
 #    files.txt is no longer consulted; that file and databases.tsv used to
 #    drift, and Step 5's `find normalized/` would silently re-merge stale
 #    entries (e.g. IndiGenomes_Variants.norm.vcf.gz after IndiGen was
 #    dropped from databases.tsv).
 #  - vcfToBigBed.py concat step now streams `sort -k2,2n chr.bed >> out`
 #    via subprocess stdout= rather than capture_output=True. The old form
 #    buffered the entire sorted chrom (~24 GB for chr1) in Python RAM;
 #    fine on hgwdev's 4 TB box, OOMable elsewhere.
 #  - vcfToBigBed.py generate_trackdb_fragment() now emits the three
 #    customizations that previously had to be added on top of the
 #    auto-fragment by hand: filterType.consequence multipleListOr, the
 #    expanded consequence buckets (3_prime_utr, 5_prime_utr, non_coding,
 #    others), and skipEmptyFields on. The fragment is now paste-and-go.
 
 ##########
 # 2026-05-20 Claude max
 # Genome of the Netherlands (GoNL), GRCh38 re-analysis (v1.0).
 # Site-only VCF with INFO AC/AF/AN for the 498 unrelated parents
 # (250 fathers + 248 mothers, 2 mothers failed QC). Original paper
 # Genome of the Netherlands Consortium, Nat Genet 2014, PMID 24974849;
 # the GRCh38 re-analysis pipeline is described in the README that ships
 # with the download.
 cd /hive/data/genomes/hg38/bed/varFreqs/gonl/
 wget https://download.molgeniscloud.org/downloads/gonl_public/variants/GoNL_GRCh38_1.0/README.txt
 wget https://download.molgeniscloud.org/downloads/gonl_public/variants/GoNL_GRCh38_1.0/multisample.parents_only.info_only.vcf.gz
 wget https://download.molgeniscloud.org/downloads/gonl_public/variants/GoNL_GRCh38_1.0/multisample.parents_only.info_only.vcf.gz.tbi
 # Source already uses UCSC chr-prefixed names, so no chromosome renaming
 # needed. Drop the GRCh38 decoy contigs (chrUn_JTFH01*, chrUn_KN707* etc.)
 # and chrEBV that ship with the analysis-set FASTA but are not part of the
 # UCSC hg38 assembly.
 cut -f1 /hive/data/genomes/hg38/chrom.sizes | sort > hg38.chroms.txt
 zcat multisample.parents_only.info_only.vcf.gz | grep -v '^#' | cut -f1 \
     | sort -u > gonl.chroms.txt
 comm -12 gonl.chroms.txt hg38.chroms.txt > keep.chroms.txt
 ( bcftools view -h multisample.parents_only.info_only.vcf.gz; \
   bcftools view -H multisample.parents_only.info_only.vcf.gz \
     | awk -F'\t' 'BEGIN{while((getline k < "keep.chroms.txt") > 0) ok[k]=1} ok[$1]' \
 ) | bgzip -@ 8 -c > gonl.preNorm.vcf.gz
 tabix -p vcf gonl.preNorm.vcf.gz
 # 31,114,481 input records -> 30,904,161 kept (0.68% dropped, all on
 # decoy / EBV contigs not present in hg38). Max AN = 996, matching 498
 # diploid parents.
 # Split multiallelic sites and left-align indels against the hg38 reference.
 ln -sf /hive/data/genomes/hg38/bed/varFreqs/all/hg38.fa hg38.fa
 ln -sf /hive/data/genomes/hg38/bed/varFreqs/all/hg38.fa.fai hg38.fa.fai
 bcftools norm -f hg38.fa -m-any --threads 8 gonl.preNorm.vcf.gz -Oz -o gonl.vcf.gz
 tabix -p vcf gonl.vcf.gz
 # norm stats: 30,904,161 -> 36,363,474 records (2,629,361 multiallelic
 # sites split, 3,559,402 indels left-realigned, 0 dup/mismatch).
 # GoNL added to databases.tsv as well, so the next mergeAndAnnotate.sh
 # run will pick it up into varFreqsAll.bb. The current /gbdb _all
 # bigBed predates GoNL.
 
 ##########
 # 2026-05-28 Claude max
 # varFreqs reorg: split SFARI SPARK into ASD/non-ASD counts, add SCHEMA
 # case/control counts, drop genotyping-array cohorts (TPMI + MexBB) from the
 # WGS/WES varFreqsAll track, and add two new combined tracks:
 #   varFreqsDisease  (SPARK, SFARI WGS, TOPMed, SCHEMA, GREGoR, GA4K)
 #   varFreqsArray    (TPMI, MexBB, UKBB)
 # refs #36642
 
 # --- 1. Phenotype-stratified SPARK counts -----------------------------------
 # sparkMergeVcfAddCounts.sh now takes an optional 3rd arg: the SPARK
 # individuals_registration TSV. With it, bcftools +fill-tags is given a
 # sample-group file (col1=sample_sp_id, col2=AUT|NON_AUT) so the output
 # sites VCF carries AC_AUT/AN_AUT/AF_AUT and AC_NON_AUT/AN_NON_AUT/AF_NON_AUT
 # in addition to the overall AC/AN/AF. The same registration file (col 8
 # "asd": TRUE/FALSE) covers both WES (142,357 samples) and WGS (12,519
 # samples); samples with a blank asd value are not assigned to either group.
 REG=/hive/data/genomes/hg38/bed/varFreqs/sparkExomes/SPARKDataRelease_2025-12-15/individuals_registration-2025-12-15.tsv
 
 cd /hive/data/genomes/hg38/bed/varFreqs/sparkExomes/
 sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh \
     vcf/SPARK.iWES_v3.2024_08.deepvariant 8 "$REG"
 bcftools norm -m- SPARK.iWES_v3.2024_08.deepvariant.sites.vcf.gz \
     -Oz -o SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz
 tabix -p vcf SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz
 # 13,299,149 variants out, identical position list to the pre-pheno build.
 
 cd /hive/data/genomes/hg38/bed/varFreqs/sparkWgs/
 sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh \
     vcf/wgs_12519_genome.deepvariant 8 "$REG"
 bcftools norm -m- wgs_12519_genome.deepvariant.sites.vcf.gz \
     -Oz -o wgs_12519_genome.deepvariant.norm.vcf.gz
 tabix -p vcf wgs_12519_genome.deepvariant.norm.vcf.gz
 # 177,384,905 variants out. WGS coverage by phenotype: 3,565 AUT + 8,784
 # NON_AUT + 170 unassigned of 12,519 samples.
 
 # --- 2. SCHEMA case/control counts ------------------------------------------
 # schema_addAcAnAf.py now also emits AC_CASE/AN_CASE/AF_CASE and
 # AC_CTRL/AN_CTRL/AF_CTRL by summing the per-cohort ac_case/an_case and
 # ac_ctrl/an_ctrl arrays. The script is idempotent (it strips and re-adds
 # the fields it manages) so it can be re-run on the already-lifted
 # hg38_sorted file without re-running CrossMap.
 cd /hive/data/genomes/hg38/bed/varFreqs/schema
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/schema_addAcAnAf.py \
     SCHEMA_variant_results_hg38_sorted.vcf.gz \
     SCHEMA_variant_results_hg38_sorted.withCC.vcf
 bgzip -f SCHEMA_variant_results_hg38_sorted.withCC.vcf
 tabix -f -p vcf SCHEMA_variant_results_hg38_sorted.withCC.vcf.gz
 mv SCHEMA_variant_results_hg38_sorted.vcf.gz     SCHEMA_variant_results_hg38_sorted.vcf.gz.preCC.bak
 mv SCHEMA_variant_results_hg38_sorted.vcf.gz.tbi SCHEMA_variant_results_hg38_sorted.vcf.gz.tbi.preCC.bak
 mv SCHEMA_variant_results_hg38_sorted.withCC.vcf.gz     SCHEMA_variant_results_hg38_sorted.vcf.gz
 mv SCHEMA_variant_results_hg38_sorted.withCC.vcf.gz.tbi SCHEMA_variant_results_hg38_sorted.vcf.gz.tbi
 # 8,864,488 variants, AC_CASE+AC_CTRL = AC, sums verified.
 
 # --- 3. Three combined-track builds -----------------------------------------
 # mergeAndAnnotate.sh and vcfToBigBed.py now accept --databases /
 # --databases-file flags, so the same code drives three builds from three
 # config TSVs: databases.tsv, databases_disease.tsv, databases_array.tsv.
 # The normalized/ cache under all/normalized is shared across builds
 # (positions are identical regardless of which combined track they feed).
 # mergeAndAnnotate.sh pins /cluster/software/src/bcftools-1.22/ in front of
 # PATH because bcftools csq's --unify-chr-names was added in 1.22; the conda
 # default (1.14) silently fails the annotate step otherwise.
 
 # Reorg removed TPMI and MexBB from databases.tsv (moved to
 # databases_array.tsv) and added UKBB to databases_array.tsv. Both
 # databases_disease.tsv and databases_array.tsv reuse populations.tsv:
 # load_config only attaches a population row to a database that is in the
 # active config, so the same populations file works for all three builds.
 
 # varFreqsAll (28 cohorts, all WGS/WES; ~1.34 billion variants, 48 GB):
 bash ~/kent/src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py \
     --annotated-vcf /hive/data/genomes/hg38/bed/varFreqs/all/merged.annotated.vcf.gz \
     --output-prefix varFreqsAll \
     --threads 6 \
     --work-dir /hive/data/genomes/hg38/bed/varFreqs/all
 ln -sfn /hive/data/genomes/hg38/bed/varFreqs/all/varFreqsAll.bb \
         /gbdb/hg38/varFreqs/_all/varFreqsAll.bb
 
 # varFreqsDisease (6 cohorts; ~932 M variants, 23 GB):
 SCR=~/kent/src/hg/makeDb/scripts/varFreqs
 bash $SCR/mergeAndAnnotate.sh --databases $SCR/databases_disease.tsv --tag .disease
 python3 $SCR/vcfToBigBed.py \
     --databases-file $SCR/databases_disease.tsv \
     --annotated-vcf /hive/data/genomes/hg38/bed/varFreqs/all/merged.disease.annotated.vcf.gz \
     --output-prefix varFreqsDisease \
     --threads 6 \
     --work-dir /hive/data/genomes/hg38/bed/varFreqs/disease
 ln -sfn /hive/data/genomes/hg38/bed/varFreqs/disease/varFreqsDisease.bb \
         /gbdb/hg38/varFreqs/_disease/varFreqsDisease.bb
 
 ##########
 # 2026-06-02 -> 2026-06-03 Claude max
 # Replaced the single varFreqsAll combined track (and the older varFreqsDisease
 # track) with TWO matched comparison tracks so the affected-vs-background
 # difference can be seen visually (e.g. autism-gene pLoF in cases vs the general
 # population). refs #36642
 #   varFreqsAffected   - variants seen in the affected/case arm of the disease
 #                        cohorts
 #   varFreqsBackground - "all other variants": population reference cohorts +
 #                        the unaffected/control/unknown arms of disease cohorts
 # A variant present in both groups appears in both tracks (overlap is intended).
 # Genotyping-array cohorts are in neither (they stay in varFreqsArray).
 #
 # Cohort classification (databases.tsv columns is_disease, disease_role):
 #   - TOPMed is a population cohort (is_disease=0): an NHLBI population/biobank
 #     reference, not an affected-disease case cohort, no affected/unaffected
 #     label. It feeds the background.
 #   - Disease-study cohorts (is_disease=1): SPARK, SFARI_WGS, SCHEMA, GREGoR
 #     (per-arm phenotype split) and GA4K (disease_role=affected: rare-disease
 #     cohort with no split, counted as affected; caveat - GA4K enrolls trios so
 #     a minority of carriers are unaffected parents).
 #   - populations.tsv has a 6th column "phenotype" (affected|unaffected|unknown)
 #     tagging the AUT/NON_AUT, CASE/CTRL, AFF/UNA/UNK arms. affected arms feed
 #     the affected track; unaffected/unknown arms feed the background track.
 #
 # Both tracks share one autoSql schema. Summary fields after dnaChange:
 #   affectedAF/AC      (string)  max AF / summed AC in affected/case arms
 #   affectedCohorts    (string)  disease cohorts contributing affected carriers
 #   backgroundAF/AC    (string)  max AF / summed AC in population + unaffected
 #   backgroundSources  (string)  cohorts contributing to the background
 #   inAffected         (uint)    1 if seen in an affected/case arm, else 0
 # followed by the per-database and per-population AC/AF columns. Each track
 # carries BOTH summaries (so the Affected track can be filtered to variants rare
 # in the background). Row inclusion: affected track = affectedAF/AC present;
 # background track = backgroundAF/AC present. score = that track's AF * 1000.
 # vcfToBigBed.py derives the length-filter ranges (filter.refLen/altLen/varLen)
 # from the observed data and emits them in the auto trackDb fragment.
 #
 # vcfToBigBed.py --split-affected emits the two tracks in one pass (shared
 # Phase 1 extract). Without the flag it writes a single track (used by the
 # array build).
 #
 # Naming: SPARK WGS display name -> "SFARI SPARK WGS" and SPARK WES -> "SFARI
 # SPARK WES" to match the standalone sfariSparkWgs/sfariSparkExomes subtracks
 # (internal key SFARI_WGS kept). mouseOver fields are wrapped in ${} to avoid
 # the trackDb subMultiField prefix-substitution bug (hui.old.c).
 #
 # Rebuild uses existing merged.annotated.vcf.gz (no re-merge needed).
 cd /hive/data/genomes/hg38/bed/varFreqs/all
 python3 ~/kent/src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py \
     --annotated-vcf merged.annotated.vcf.gz \
     --output-prefix varFreqs \
     --split-affected \
     --threads 8 \
     --work-dir /hive/data/genomes/hg38/bed/varFreqs/all
 # Produces varFreqsAffected.bb and varFreqsBackground.bb. Symlinked under
 # /gbdb/hg38/varFreqs/_affected/ and /gbdb/hg38/varFreqs/_background/. The
 # varFreqsAffected and varFreqsBackground filter blocks in
 # trackDb/human/varFreqs.ra come from the paste-and-go auto fragment
 # varFreqs.trackDb.ra produced by this run (both stanzas use the same fragment;
 # only the mouseOver differs).
 
 # varFreqsArray (TPMI + MexBB + UKBB; ~14.7 M variants, 750 MB):
 bash $SCR/mergeAndAnnotate.sh --databases $SCR/databases_array.tsv --tag .array
 python3 $SCR/vcfToBigBed.py \
     --databases-file $SCR/databases_array.tsv \
     --annotated-vcf /hive/data/genomes/hg38/bed/varFreqs/all/merged.array.annotated.vcf.gz \
     --output-prefix varFreqsArray \
     --threads 6 \
     --work-dir /hive/data/genomes/hg38/bed/varFreqs/array
 ln -sfn /hive/data/genomes/hg38/bed/varFreqs/array/varFreqsArray.bb \
         /gbdb/hg38/varFreqs/_array/varFreqsArray.bb
+
+##########
+# 2026-06-12 Lou (Claude)
+# Switched the varFreqsAffected and varFreqsBackground combined tracks to a
+# pooled-AF formulation: affectedAF = sum(AC)/sum(AN) across the contributing
+# affected arms (and backgroundAF the same for the background arms), replacing
+# the prior max-across-cohorts AF. The change fixes a known artifact where a
+# small per-population subgroup with AC=AN=1 (e.g. AllOfUs Oceanian at the
+# APOE ε4 site) drove backgroundAF to 1.0 even though the genuine pooled rate
+# is ~0.12.
+#
+# vcfToBigBed.py now infers per-arm AN as round(AC/AF) when both are reported.
+# databases.tsv grew an 8th column "default_an": for cohorts that publish only
+# AF (ABraOM, ALFA), set this to the diploid cohort size so AC can be
+# synthesized (AC = round(AF * default_an)) and default_an feeds the pool
+# denominator. ABraOM=2342, ALFA=816000. Cohorts that publish only AC with no
+# default_an (MGRB, GREGoR per-arm AC_AFFECTED/UNAFFECTED/UNKNOWN, AllOfUs
+# per-population) appear in affectedCohorts/backgroundSources but contribute
+# 0 to the pool, preserving the invariant that pooled AF cannot exceed 1.
+#
+# Two new bigBed columns: affectedAN and backgroundAN (pool denominators).
+# mouseOver was updated to show "Affected AC/AN: 33238 / 213150" so the ratio
+# is visible. The combined-track filter UI exposes affectedAN and backgroundAN
+# as numeric range filters. AS schema field count: 161 -> 163.
+#
+# Rebuild used the existing merged.annotated.vcf.gz (no re-merge needed);
+# Phase 1 re-extracted per-cohort AC/AF, Phase 2 wrote the new BEDs, then
+# concat + bedToBigBed. Total wall-clock ~8 hours.
+cd /hive/data/genomes/hg38/bed/varFreqs/all
+# Back up first (the script overwrites):
+cp varFreqsAffected.bb   varFreqsAffected.bb.preAfPool.bak
+cp varFreqsBackground.bb varFreqsBackground.bb.preAfPool.bak
+# Build:
+python3 ~/kent/src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py \
+    --annotated-vcf merged.annotated.vcf.gz \
+    --output-prefix varFreqs \
+    --split-affected \
+    --threads 8 \
+    --work-dir /hive/data/genomes/hg38/bed/varFreqs/all
+# Spot-check (APOE rs429358, chr19:44908683-44908684 T>C):
+#   pre  : affectedAC=33238 affectedAF=0.181730 (max, GA4K-dominated)
+#                            backgroundAF=1.000000 (max, AllOfUs_OCE artifact)
+#   post : affectedAC=32782 affectedAN=213150 affectedAF=0.153798
+#                            backgroundAC=341065 backgroundAN=2751112
+#                            backgroundAF=0.123974