3180d71425ab40bc022712bb95868bfe80747375
max
  Fri May 29 08:52:38 2026 -0700
[Claude] varFreqs: split SPARK+SCHEMA by phenotype, add disease + array combined tracks, drop array cohorts from varFreqsAll

#Preview2 week - bugs introduced now will need a build patch to fix
Split SFARI SPARK WES and WGS by autism status using fill-tags -S with the
SPARK individuals_registration TSV (AC_AUT / AN_AUT / AF_AUT plus
AC_NON_AUT / AN_NON_AUT / AF_NON_AUT). Added matching SCHEMA case/control
sums (AC_CASE, AC_CTRL, etc.). Two new combined bigBed tracks: varFreqsDisease
(SPARK, SFARI WGS, TOPMed, SCHEMA, GREGoR, GA4K) and varFreqsArray (TPMI,
MexBB, UKBB). TPMI and MexBB are removed from varFreqsAll so the main
combined track is purely WGS/WES.

Build scripts parameterized so the same code drives all three combined
builds: mergeAndAnnotate.sh gains --databases / --tag, vcfToBigBed.py
gains --databases-file / --populations-file and a per-track autoSql table
name. mergeAndAnnotate.sh now pins /cluster/software/src/bcftools-1.22 in
PATH (--unify-chr-names is a 1.22 feature; conda's 1.14 silently fails).

refs #36642

diff --git src/hg/makeDb/trackDb/human/varFreqs.ra src/hg/makeDb/trackDb/human/varFreqs.ra
index 1c43df781e0..fe9150102f5 100644
--- src/hg/makeDb/trackDb/human/varFreqs.ra
+++ src/hg/makeDb/trackDb/human/varFreqs.ra
@@ -6,31 +6,31 @@
 visibility hide
 superTrack on
 
         track varFreqsAll
         shortLabel All Databases Combined
         longLabel Variant Frequencies: All Databases Combined with Consequence Annotations
         type bigBed 9 +
         parent varFreqs on
         bigDataUrl /gbdb/$D/varFreqs/_all/varFreqsAll.bb
         visibility pack
         itemRgb on
         maxWindowToDraw 5000000
         priority 0.1
         mouseOver <b>Var:</b> $name<br><b>AA change:</b> $aaChange<br><b>Var type:</b> $varType<br><b>Conseq:</b> $consequence<br><b>Max AF:</b> $maxAF<br><b>Total AC:</b> $totalAC<br><b>Sources:</b> $sources
         # Source database filter
-        filterValues.sources AllOfUs|AllOfUs,SPARK|SPARK WES,SFARI_WGS|SFARI WGS,GenomeAsia|GenomeAsia SNVs,GenomeAsiaIndel|GenomeAsia Indels,NPM|NPM Singapore,KOVA|KOVA Korea,ToMMo|ToMMo Japan,FinnGen|FinnGen Finland,Saudi|Saudi,SweGen|SweGen Sweden,TOPMed|TOPMed,ABraOM|ABraOM Brazil,ALFA|ALFA,MGRB|MGRB Australia,HRC|HRC,MexBB|Mexico Biobank,SGDP|SGDP,HGDP1kG|gnomAD HGDP+1kG,GREGoR|GREGoR,SCHEMA|SCHEMA,GA4K|GA4K PacBio LR,CoLoRSdb|CoLoRSdb PacBio LR,SVatalog|SVatalog 101 10XG SR,Tishkoff180|Tishkoff 180 African WGS,WBBC|WBBC China,TPMI|TPMI Taiwan,ChinaMAP|China ChinaMAP,GenomeIndia|GenomeIndia 9.7k WGS,GoNL|GoNL Netherlands ~13x SR
+        filterValues.sources AllOfUs|AllOfUs,SPARK|SPARK WES,SFARI_WGS|SFARI WGS,GenomeAsia|GenomeAsia SNVs,GenomeAsiaIndel|GenomeAsia Indels,NPM|NPM Singapore,KOVA|KOVA Korea,ToMMo|ToMMo Japan,FinnGen|FinnGen Finland,Saudi|Saudi,SweGen|SweGen Sweden,TOPMed|TOPMed,ABraOM|ABraOM Brazil,ALFA|ALFA,MGRB|MGRB Australia,HRC|HRC,SGDP|SGDP,HGDP1kG|gnomAD HGDP+1kG,GREGoR|GREGoR,SCHEMA|SCHEMA,GA4K|GA4K PacBio LR,CoLoRSdb|CoLoRSdb PacBio LR,SVatalog|SVatalog 101 10XG SR,Tishkoff180|Tishkoff 180 African WGS,WBBC|WBBC China,ChinaMAP|China ChinaMAP,GenomeIndia|GenomeIndia 9.7k WGS,GoNL|GoNL Netherlands ~13x SR
         filterType.sources multipleListOr
         filterLabel.sources Source Database
         # Variant type and consequence filters
         filterValues.varType SNV|SNV,INS|Insertion,DEL|Deletion,MNV|MNV
         filterLabel.varType Variant Type
         filterValues.consequence missense|Missense,synonymous|Synonymous,stop_gained|Stop Gained,frameshift|Frameshift,splice_donor|Splice Donor,splice_acceptor|Splice Acceptor,intron|Intron,3_prime_utr|3' UTR,5_prime_utr|5' UTR,non_coding|Non-coding,.|Intergenic,others|Other
         filterType.consequence multipleListOr
         filterLabel.consequence Consequence
         # Length filters
         filterByRange.refLen on
         filterLabel.refLen Reference Length
         filterByRange.altLen on
         filterLabel.altLen Alternate Length
         filterByRange.varLen on
         filterLabel.varLen Length Change
@@ -60,54 +60,50 @@
         filterLabel.FinnGenAF FinnGen Finland AF
         filterByRange.SaudiAF on
         filterLabel.SaudiAF Saudi AF
         filterByRange.SweGenAF on
         filterLabel.SweGenAF SweGen Sweden AF
         filterByRange.TOPMedAF on
         filterLabel.TOPMedAF TOPMed AF
         filterByRange.ABraOMAF on
         filterLabel.ABraOMAF ABraOM Brazil AF
         filterByRange.ALFAAF on
         filterLabel.ALFAAF ALFA AF
         filterByRange.MGRBAF on
         filterLabel.MGRBAF MGRB Australia AF
         filterByRange.HRCAF on
         filterLabel.HRCAF HRC AF
-        filterByRange.MexBBAF on
-        filterLabel.MexBBAF Mexico Biobank AF
         filterByRange.SGDPAF on
         filterLabel.SGDPAF SGDP AF
         filterByRange.HGDP1kGAF on
         filterLabel.HGDP1kGAF gnomAD HGDP+1kG AF (4k cohort)
         filterByRange.GREGoRAF on
         filterLabel.GREGoRAF GREGoR AF
         filterByRange.SCHEMAAF on
         filterLabel.SCHEMAAF SCHEMA AF
         filterByRange.GA4KAF on
         filterLabel.GA4KAF GA4K PacBio LR AF
         filterByRange.CoLoRSdbAF on
         filterLabel.CoLoRSdbAF CoLoRSdb PacBio LR AF
         filterByRange.SVatalogAF on
         filterLabel.SVatalogAF SVatalog 101 10XG SR AF
         filterByRange.Tishkoff180AF on
         filterLabel.Tishkoff180AF Tishkoff 180 African WGS AF
         filterByRange.NPMAF on
         filterLabel.NPMAF NPM Singapore AF
         filterByRange.WBBCAF on
         filterLabel.WBBCAF WBBC China AF
-        filterByRange.TPMIAF on
-        filterLabel.TPMIAF TPMI Taiwan AF
         filterByRange.ChinaMAPAF on
         filterLabel.ChinaMAPAF China ChinaMAP AF
         filterByRange.GenomeIndiaAF on
         filterLabel.GenomeIndiaAF GenomeIndia 9.7k WGS AF
         filterByRange.GoNLAF on
         filterLabel.GoNLAF GoNL Netherlands ~13x SR AF
         # Per-database AC filters
         filterByRange.AllOfUsAC on
         filterLabel.AllOfUsAC AllOfUs AC
         filterByRange.SPARKAC on
         filterLabel.SPARKAC SPARK WES AC
         filterByRange.SFARI_WGSAC on
         filterLabel.SFARI_WGSAC SFARI WGS AC
         filterByRange.GenomeAsiaAC on
         filterLabel.GenomeAsiaAC GenomeAsia SNVs AC
@@ -121,54 +117,50 @@
         filterLabel.FinnGenAC FinnGen Finland AC
         filterByRange.SaudiAC on
         filterLabel.SaudiAC Saudi AC
         filterByRange.SweGenAC on
         filterLabel.SweGenAC SweGen Sweden AC
         filterByRange.TOPMedAC on
         filterLabel.TOPMedAC TOPMed AC
         filterByRange.ABraOMAC on
         filterLabel.ABraOMAC ABraOM Brazil AC
         filterByRange.ALFAAC on
         filterLabel.ALFAAC ALFA AC
         filterByRange.MGRBAC on
         filterLabel.MGRBAC MGRB Australia AC
         filterByRange.HRCAC on
         filterLabel.HRCAC HRC AC
-        filterByRange.MexBBAC on
-        filterLabel.MexBBAC Mexico Biobank AC
         filterByRange.SGDPAC on
         filterLabel.SGDPAC SGDP AC
         filterByRange.HGDP1kGAC on
         filterLabel.HGDP1kGAC gnomAD HGDP+1kG AC (4k cohort)
         filterByRange.GREGoRAC on
         filterLabel.GREGoRAC GREGoR AC
         filterByRange.SCHEMAAC on
         filterLabel.SCHEMAAC SCHEMA AC
         filterByRange.GA4KAC on
         filterLabel.GA4KAC GA4K PacBio LR AC
         filterByRange.CoLoRSdbAC on
         filterLabel.CoLoRSdbAC CoLoRSdb PacBio LR AC
         filterByRange.SVatalogAC on
         filterLabel.SVatalogAC SVatalog 101 10XG SR AC
         filterByRange.Tishkoff180AC on
         filterLabel.Tishkoff180AC Tishkoff 180 African WGS AC
         filterByRange.NPMAC on
         filterLabel.NPMAC NPM Singapore AC
         filterByRange.WBBCAC on
         filterLabel.WBBCAC WBBC China AC
-        filterByRange.TPMIAC on
-        filterLabel.TPMIAC TPMI Taiwan AC
         filterByRange.ChinaMAPAC on
         filterLabel.ChinaMAPAC China ChinaMAP AC
         filterByRange.GenomeIndiaAC on
         filterLabel.GenomeIndiaAC GenomeIndia 9.7k WGS AC
         filterByRange.GoNLAC on
         filterLabel.GoNLAC GoNL Netherlands ~13x SR AC
         # Population-specific AF filters
         # AllOfUs local-ancestry populations
         # NB: these are local-ancestry-stratified frequencies (per-position, per-haplotype-class),
         # NOT the AllOfUs paper's global Rye ancestry categories. See varFreqs.html for details.
         filterByRange.AllOfUsAF_AFR on
         filterLabel.AllOfUsAF_AFR AllOfUs African AF (local ancestry)
         filterByRange.AllOfUsAF_AMR on
         filterLabel.AllOfUsAF_AMR AllOfUs Indigenous American AF (local ancestry)
         filterByRange.AllOfUsAF_EAS on
@@ -294,30 +286,199 @@
         filterLabel.WBBCAF_North WBBC North Han AF
         filterByRange.WBBCAF_Central on
         filterLabel.WBBCAF_Central WBBC Central Han AF
         filterByRange.WBBCAF_South on
         filterLabel.WBBCAF_South WBBC South Han AF
         filterByRange.WBBCAF_Lingnan on
         filterLabel.WBBCAF_Lingnan WBBC Lingnan Han AF
         filterByRange.WBBCAC_North on
         filterLabel.WBBCAC_North WBBC North Han AC
         filterByRange.WBBCAC_Central on
         filterLabel.WBBCAC_Central WBBC Central Han AC
         filterByRange.WBBCAC_South on
         filterLabel.WBBCAC_South WBBC South Han AC
         filterByRange.WBBCAC_Lingnan on
         filterLabel.WBBCAC_Lingnan WBBC Lingnan Han AC
+        # SFARI SPARK WES autism phenotype split (asd column of individuals_registration)
+        filterByRange.SPARKAF_AUT on
+        filterLabel.SPARKAF_AUT SPARK WES ASD proband AF
+        filterByRange.SPARKAF_NON_AUT on
+        filterLabel.SPARKAF_NON_AUT SPARK WES Non-ASD family AF
+        filterByRange.SPARKAC_AUT on
+        filterLabel.SPARKAC_AUT SPARK WES ASD proband AC
+        filterByRange.SPARKAC_NON_AUT on
+        filterLabel.SPARKAC_NON_AUT SPARK WES Non-ASD family AC
+        # SFARI SPARK WGS autism phenotype split
+        filterByRange.SFARI_WGSAF_AUT on
+        filterLabel.SFARI_WGSAF_AUT SFARI WGS ASD proband AF
+        filterByRange.SFARI_WGSAF_NON_AUT on
+        filterLabel.SFARI_WGSAF_NON_AUT SFARI WGS Non-ASD family AF
+        filterByRange.SFARI_WGSAC_AUT on
+        filterLabel.SFARI_WGSAC_AUT SFARI WGS ASD proband AC
+        filterByRange.SFARI_WGSAC_NON_AUT on
+        filterLabel.SFARI_WGSAC_NON_AUT SFARI WGS Non-ASD family AC
+        # SCHEMA schizophrenia case/control (summed across analysis groups)
+        filterByRange.SCHEMAAF_CASE on
+        filterLabel.SCHEMAAF_CASE SCHEMA Schizophrenia case AF
+        filterByRange.SCHEMAAF_CTRL on
+        filterLabel.SCHEMAAF_CTRL SCHEMA Control AF
+        filterByRange.SCHEMAAC_CASE on
+        filterLabel.SCHEMAAC_CASE SCHEMA Schizophrenia case AC
+        filterByRange.SCHEMAAC_CTRL on
+        filterLabel.SCHEMAAC_CTRL SCHEMA Control AC
+        skipEmptyFields on
+
+        track varFreqsDisease
+        shortLabel Disease-related Databases Combined
+        longLabel Variant Frequencies: Disease-related cohorts combined (autism, schizophrenia, rare disease, NHLBI heart/lung/blood)
+        type bigBed 9 +
+        parent varFreqs off
+        bigDataUrl /gbdb/$D/varFreqs/_disease/varFreqsDisease.bb
+        visibility hide
+        itemRgb on
+        maxWindowToDraw 5000000
+        priority 0.15
+        mouseOver <b>Var:</b> $name<br><b>AA change:</b> $aaChange<br><b>Var type:</b> $varType<br><b>Conseq:</b> $consequence<br><b>Max AF:</b> $maxAF<br><b>Total AC:</b> $totalAC<br><b>Sources:</b> $sources
+        filterValues.sources SPARK|SPARK WES,SFARI_WGS|SFARI WGS,TOPMed|TOPMed,SCHEMA|SCHEMA,GREGoR|GREGoR,GA4K|GA4K PacBio LR
+        filterType.sources multipleListOr
+        filterLabel.sources Source Database
+        filterValues.varType SNV|SNV,INS|Insertion,DEL|Deletion,MNV|MNV
+        filterLabel.varType Variant Type
+        filterValues.consequence missense|Missense,synonymous|Synonymous,stop_gained|Stop Gained,frameshift|Frameshift,splice_donor|Splice Donor,splice_acceptor|Splice Acceptor,intron|Intron,3_prime_utr|3' UTR,5_prime_utr|5' UTR,non_coding|Non-coding,.|Intergenic,others|Other
+        filterType.consequence multipleListOr
+        filterLabel.consequence Consequence
+        filterByRange.refLen on
+        filterLabel.refLen Reference Length
+        filterByRange.altLen on
+        filterLabel.altLen Alternate Length
+        filterByRange.varLen on
+        filterLabel.varLen Length Change
+        filterByRange.maxAF on
+        filterLabel.maxAF Max Allele Frequency
+        filterLimits.maxAF 0:1
+        filterByRange.totalAC on
+        filterLabel.totalAC Total Allele Count (all databases in this track)
+        # Per-database AF filters
+        filterByRange.SPARKAF on
+        filterLabel.SPARKAF SPARK WES AF
+        filterByRange.SFARI_WGSAF on
+        filterLabel.SFARI_WGSAF SFARI WGS AF
+        filterByRange.TOPMedAF on
+        filterLabel.TOPMedAF TOPMed AF
+        filterByRange.SCHEMAAF on
+        filterLabel.SCHEMAAF SCHEMA AF
+        filterByRange.GREGoRAF on
+        filterLabel.GREGoRAF GREGoR AF
+        filterByRange.GA4KAF on
+        filterLabel.GA4KAF GA4K PacBio LR AF
+        # Per-database AC filters
+        filterByRange.SPARKAC on
+        filterLabel.SPARKAC SPARK WES AC
+        filterByRange.SFARI_WGSAC on
+        filterLabel.SFARI_WGSAC SFARI WGS AC
+        filterByRange.TOPMedAC on
+        filterLabel.TOPMedAC TOPMed AC
+        filterByRange.SCHEMAAC on
+        filterLabel.SCHEMAAC SCHEMA AC
+        filterByRange.GREGoRAC on
+        filterLabel.GREGoRAC GREGoR AC
+        filterByRange.GA4KAC on
+        filterLabel.GA4KAC GA4K PacBio LR AC
+        # SPARK WES autism phenotype split
+        filterByRange.SPARKAF_AUT on
+        filterLabel.SPARKAF_AUT SPARK WES ASD proband AF
+        filterByRange.SPARKAF_NON_AUT on
+        filterLabel.SPARKAF_NON_AUT SPARK WES Non-ASD family AF
+        filterByRange.SPARKAC_AUT on
+        filterLabel.SPARKAC_AUT SPARK WES ASD proband AC
+        filterByRange.SPARKAC_NON_AUT on
+        filterLabel.SPARKAC_NON_AUT SPARK WES Non-ASD family AC
+        # SFARI WGS autism phenotype split
+        filterByRange.SFARI_WGSAF_AUT on
+        filterLabel.SFARI_WGSAF_AUT SFARI WGS ASD proband AF
+        filterByRange.SFARI_WGSAF_NON_AUT on
+        filterLabel.SFARI_WGSAF_NON_AUT SFARI WGS Non-ASD family AF
+        filterByRange.SFARI_WGSAC_AUT on
+        filterLabel.SFARI_WGSAC_AUT SFARI WGS ASD proband AC
+        filterByRange.SFARI_WGSAC_NON_AUT on
+        filterLabel.SFARI_WGSAC_NON_AUT SFARI WGS Non-ASD family AC
+        # SCHEMA schizophrenia case/control
+        filterByRange.SCHEMAAF_CASE on
+        filterLabel.SCHEMAAF_CASE SCHEMA Schizophrenia case AF
+        filterByRange.SCHEMAAF_CTRL on
+        filterLabel.SCHEMAAF_CTRL SCHEMA Control AF
+        filterByRange.SCHEMAAC_CASE on
+        filterLabel.SCHEMAAC_CASE SCHEMA Schizophrenia case AC
+        filterByRange.SCHEMAAC_CTRL on
+        filterLabel.SCHEMAAC_CTRL SCHEMA Control AC
+        # GREGoR affected/unaffected/unknown
+        filterByRange.GREGoRAF_AFF on
+        filterLabel.GREGoRAF_AFF GREGoR Affected AF
+        filterByRange.GREGoRAF_UNA on
+        filterLabel.GREGoRAF_UNA GREGoR Unaffected AF
+        filterByRange.GREGoRAF_UNK on
+        filterLabel.GREGoRAF_UNK GREGoR Unknown AF
+        filterByRange.GREGoRAC_AFF on
+        filterLabel.GREGoRAC_AFF GREGoR Affected AC
+        filterByRange.GREGoRAC_UNA on
+        filterLabel.GREGoRAC_UNA GREGoR Unaffected AC
+        filterByRange.GREGoRAC_UNK on
+        filterLabel.GREGoRAC_UNK GREGoR Unknown AC
+        skipEmptyFields on
+
+        track varFreqsArray
+        shortLabel Genotyping Array Databases Combined
+        longLabel Variant Frequencies: Genotyping-array cohorts combined (TPMI, Mexico Biobank, UK Biobank imputed)
+        type bigBed 9 +
+        parent varFreqs off
+        bigDataUrl /gbdb/$D/varFreqs/_array/varFreqsArray.bb
+        visibility hide
+        itemRgb on
+        maxWindowToDraw 5000000
+        priority 0.2
+        mouseOver <b>Var:</b> $name<br><b>AA change:</b> $aaChange<br><b>Var type:</b> $varType<br><b>Conseq:</b> $consequence<br><b>Max AF:</b> $maxAF<br><b>Total AC:</b> $totalAC<br><b>Sources:</b> $sources
+        filterValues.sources TPMI|TPMI Taiwan,MexBB|Mexico Biobank,UKBB|UK Biobank imputed
+        filterType.sources multipleListOr
+        filterLabel.sources Source Database
+        filterValues.varType SNV|SNV,INS|Insertion,DEL|Deletion,MNV|MNV
+        filterLabel.varType Variant Type
+        filterValues.consequence missense|Missense,synonymous|Synonymous,stop_gained|Stop Gained,frameshift|Frameshift,splice_donor|Splice Donor,splice_acceptor|Splice Acceptor,intron|Intron,3_prime_utr|3' UTR,5_prime_utr|5' UTR,non_coding|Non-coding,.|Intergenic,others|Other
+        filterType.consequence multipleListOr
+        filterLabel.consequence Consequence
+        filterByRange.refLen on
+        filterLabel.refLen Reference Length
+        filterByRange.altLen on
+        filterLabel.altLen Alternate Length
+        filterByRange.varLen on
+        filterLabel.varLen Length Change
+        filterByRange.maxAF on
+        filterLabel.maxAF Max Allele Frequency
+        filterLimits.maxAF 0:1
+        filterByRange.totalAC on
+        filterLabel.totalAC Total Allele Count (all databases in this track)
+        filterByRange.TPMIAF on
+        filterLabel.TPMIAF TPMI Taiwan AF
+        filterByRange.MexBBAF on
+        filterLabel.MexBBAF Mexico Biobank AF
+        filterByRange.UKBBAF on
+        filterLabel.UKBBAF UK Biobank imputed AF
+        filterByRange.TPMIAC on
+        filterLabel.TPMIAC TPMI Taiwan AC
+        filterByRange.MexBBAC on
+        filterLabel.MexBBAC Mexico Biobank AC
+        filterByRange.UKBBAC on
+        filterLabel.UKBBAC UK Biobank imputed AC
         skipEmptyFields on
 
         track allofus
         shortLabel AllOfUs v7 245k WGS
         longLabel Variant Frequencies: AllOfUs v7 - 245k WGS, local-ancestry-stratified, AC>=20
         type vcfTabix
         parent varFreqs on
         bigDataUrl /gbdb/$D/varFreqs/_allofus/allOfUs.locAncFreq.vcf.gz
         dataVersion V7
         visibility hide
         tableBrowser off
         priority 0.5
 
         #track me
         #shortLabel Regeneron Million Exomes 983k WES