e89a2c6a10b38da2591e8e4e7132fe29a4b1fbd7 max Mon Apr 20 09:31:08 2026 -0700 varFreqs: add colorsDbSnv subtrack (hg38 and hs1) CoLoRSdb v1.2.0 long-read SNV/indel population-frequency VCFs from 1,027 PacBio HiFi whole genomes, joint-called with DeepVariant and GLnexus. The GRCh38 release is shown on hg38 and the CHM13 release on hs1 (both are native, not lifted). This is the first varFreqs subtrack that renders on hs1. refs #36642 diff --git src/hg/makeDb/doc/hg38/varFreqs.txt src/hg/makeDb/doc/hg38/varFreqs.txt index 3051e5f2e12..0dcc1232735 100644 --- src/hg/makeDb/doc/hg38/varFreqs.txt +++ src/hg/makeDb/doc/hg38/varFreqs.txt @@ -1,241 +1,264 @@ # Genomic Answers for Kids (GA4K), Children's Mercy - 2026-04-16 Claude max # GA4K is a pediatric rare-disease PacBio HiFi long-read cohort (Cohen et al. # 2022, Genet Med, PMID 35305867). The release ships 24 per-chromosome VCFs of # site-only small variants (SNVs and short indels), filtered to variants # replicated in >=2 unrelated GA4K individuals or matched to an HPRC variant. # Upstream data lives under /hive/data/genomes/hg38/bed/lrSv/GA4K (co-located # with the matched GA4K structural-variant release; see the lrSv makedoc). cd /hive/data/genomes/hg38/bed/lrSv/GA4K bcftools concat -Oz -o ga4kSnv.vcf.gz \ pacbio_snv_vcf/pb_joint_merged.snv.chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y}.vcf.gz tabix -p vcf ga4kSnv.vcf.gz # Symlinks placed under /gbdb/hg38/varFreqs/ga4k/ for the ga4kSnv stanza in # trackDb/human/varFreqs.ra. 
# Mexico Biobank, Max, Nov 8 2025 CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz /hive/data/genomes/hg19/bed/varFreqs/mexbb/MXBv2.vcf.gz /hive/data/genomes/hg38/p14Clean/hg38.p14.fa MXBv2.lift.hg19ToHg38.vcf && bgzip MXBv2.lift.hg19ToHg38.vcf && bcftools sort MXBv2.lift.hg19ToHg38.vcf -Oz -m 200G -T /data/tmp/ -o MXBv2.lift.hg19ToHg38.vcf.gz && tabix -p vcf MXBv2.lift.hg19ToHg38.vcf.gz # Mexico City Prospective study, Max Oct 28 2025 cd /hive/data/genomes/hg38/bed/varFreqs/mcps/ for i in `seq 1 22` X; do wget https://rgc-mcps.regeneron.com/downloads/20230130/chr$i.freq.vcf.gz; done for i in `seq 1 22` X; do wget https://rgc-mcps.regeneron.com/downloads/20230130/chr$i.freq.vcf.gz.tbi; done mv *vcf* vcf/ bcftools concat --threads 16 -Oz -o mcps.freq.vcf.gz vcf/chr{1..22}.freq.vcf.gz vcf/chrX.freq.vcf.gz # make normal AC and AF and AN fields for mouseovers zcat mcps.freq.vcf.gz | sed -e 's/_RAW//g' > mcps.fix.freq.vcf mv -f mcps.fix.freq.vcf mcps.freq.vcf bgzip mcps.freq.vcf tabix -p vcf mcps.freq.vcf.gz # Regeneron million exomes, Max, Nov 3 2025 cd /hive/data/genomes/hg38/bed/varFreqs/me for i in `seq 1 22` X Y; do wget https://rgc-research.regeneron.com/me/downloads/20231004/rgc_me_variant_frequencies_chr${i}_20231004.vcf.gz.tbi; done bcftools concat --threads 10 -Oz -o rgc_me_freqs_20231004.vcf.gz rgc_me_variant_frequencies_chr{1..22}_20231004.vcf.gz rgc_me_variant_frequencies_chrX_20231004.vcf.gz rgc_me_variant_frequencies_chrY_20231004.vcf.gz zcat rgc_me_freqs_20231004.vcf.gz | sed -e 's/ALL_//g' > rgc_me_freqs_20231004.fix.vcf tabix -p vcf rgc_me_freqs_20231004.vcf.gz # GA south asia 100k pilot cd /hive/data/genomes/hg38/bed/varFreqs/ga100k/ parallel -j 8 wget -q --no-check-certificate https://browser.genomeasia100k.org/service/web/download_files/{}.substitutions.annot.cont_withmaf.vcf.gz ::: {1..22} X Y # fix the header line, remove "FORMAT" for i in *.vcf.gz; do echo "zcat $i | awk 'BEGIN{OFS=\"\\t\"} /^#CHROM/{NF=8; print; next} /^#/ {print;
next} {NF=8; print}' | bgzip -c > fixed/$i" >> cmds.txt; done parallel -j 8 < cmds.txt bcftools concat --threads 16 -Oz -o ../ga100k.subst.vcf.gz fixed/{1..22}.substitutions.annot.cont_withmaf.vcf.gz # add indels wget -q --no-check-certificate https://browser.genomeasia100k.org/service/web/download_files/All.indels.annot.cont_withmaf.vcf.gz # index tabix -p vcf ../ga100k*.vcf.gz tabix -p vcf All*.vcf.gz # TOPMED Freeze 10 cd /hive/data/genomes/hg38/bed/varFreqs/topmed/ # need to download the VCFs manually, 22 VCFs, with one time links from https://bravo.sph.umich.edu/vcfs.html # grrrr... bcftools concat --threads 10 -Oz -o topmed10.vcf.gz {1..22}.vcf.gz X.vcf.gz tabix -p vcf topmed10.vcf.gz # Abraom brazil # get unique download link from https://abraom.ib.usp.br/download/index.php cd /hive/data/genomes/hg38/bed/varFreqs/abraom/ wget 'https://abraom.ib.usp.br/download/download-files.php?fid=RklEMTIzNDU2&key=1762266466-key690a0d62348de0.22872232' -O abraom.tar tar xvfz abraom.tar ln -s /hive/data/genomes/hg38/p14Clean/hg38.p14.fa samtools faidx hg38.p14.fa python ~/kent/src/hg/makeDb/scripts/varFreqs/abraomToVcf.py SABE1171.Abraom.clean.tsv abraom.vcf hg38.p14.fa tabix -p vcf abraom.vcf.gz # SGDP cd /hive/data/genomes/hg38/bed/varFreqs/sgp/ CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz /hive/data/genomes/hg19/bed/varFreqs/sgdp/SGDP.nh2.vcf.gz hg38.p14.fa sgdp.hg38.nh2.vcf bgzip sgdp.hg38.nh2.vcf bcftools sort sgdp.hg38.nh2.vcf.gz -Oz -m 200G -T /data/tmp/ -o sgdp.hg38.nh2.sort.vcf.gz mv sgdp.hg38.nh2.sort.vcf.gz SGDP.nh2.vcf.gz tabix -p vcf SGDP.nh2.vcf.gz # KOVA cd /hive/data/genomes/hg38/bed/varFreqs/sgp/ # got tsv file via google drive link from 장인수 # VCF converter, written by Claude Opus 4.1 using 2 lines of example input python ~/kent/src/hg/makeDb/scripts/varFreqs/kovaToVcf.py 1_KOVA.v7.tsv.gz kova.v7.vcf bgzip kova.v7.vcf tabix -p vcf kova.v7.vcf.gz # NPM Singapore cd /hive/data/genomes/hg38/bed/varFreqs/npm/ # downloaded data manually from 
chorus website, https://chorus.grids-platform.io/vcfdl bcftools concat --threads 10 -Oz -o SG10K_Health_r5.3.2.sites.vcf.bgz SG10K_Health_r5.3.2.sites.chr{1..22}.vcf.bgz SG10K_Health_r5.3.2.sites.chrX.vcf.bgz SG10K_Health_r5.3.2.sites.chrY.vcf.bgz tabix -p vcf SG10K_Health_r5.3.2.sites.vcf.bgz # Saudi 300 genomes cd /hive/data/genomes/hg38/bed/varFreqs/saudi wget https://figshare.com/ndownloader/files/51297884 -O 51297884.tsv.gz python3 ~/kent/src/hg/makeDb/scripts/varFreqs/saudiToVcf.py bgzip saudi.vcf tabix -p vcf saudi.vcf.gz # SFARI SPARK cd /hive/data/genomes/hg38/bed/varFreqs/sparkExomes/ # used globus to download into vcf/ sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh vcf/SPARK.iWES_v3.2024_08.deepvariant 8 bcftools norm -m- SPARK.iWES_v3.2024_08.deepvariant.sites.vcf.gz -Oz > SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz && tabix -p vcf SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz cd /hive/data/genomes/hg38/bed/varFreqs/sparkWgs/ # used globus to download into vcf/ sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh vcf/wgs_12519_genome.deepvariant 8 bcftools norm -m- wgs_12519_genome.deepvariant.sites.vcf.gz -Oz > wgs_12519_genome.deepvariant.norm.vcf.gz tabix -p vcf wgs_12519_genome.deepvariant.norm.vcf.gz # NCBI ALFA bigBed to VCF, Max Jan 26 2026 # Source: ALFA R4 bigBed files, 904M variants, output 163M with non-zero AF cd /hive/data/genomes/hg38/bed/varFreqs/alfa python3 ~/kent/src/hg/makeDb/scripts/varFreqs/alfa_to_vcf.py --out ALFA.vcf --zero-af-file ALFA_zero.txt # Compress and index bgzip ALFA.vcf tabix -p vcf ALFA.vcf.gz # Final: 2.7GB, 163M variants (146M SNPs, 17M indels), ALFA_zero.txt has 26GB of zero-AF variants # HRC (Haplotype Reference Consortium), Claude max, Mar 17 2026 # Source: HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz # 40M variants from 32,488 WGS samples, originally on GRCh37 cd /hive/data/genomes/hg38/bed/varFreqs/hrc/ # download HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz from
http://www.haplotype-reference-consortium.org/site python3 ~/kent/src/hg/makeDb/scripts/varFreqs/hrcToVcf.py # 40,405,505 variants read, 8,052 unmapped, 40,397,453 lifted to hg38 # sort, compress, index bcftools sort hrc.vcf -Oz -o hrc.vcf.gz tabix -p vcf hrc.vcf.gz rm hrc.vcf ln -s /hive/data/genomes/hg38/bed/varFreqs/hrc/hrc.vcf.gz /gbdb/hg38/varFreqs/hrc/hrc.vcf.gz ln -s /hive/data/genomes/hg38/bed/varFreqs/hrc/hrc.vcf.gz.tbi /gbdb/hg38/varFreqs/hrc/hrc.vcf.gz.tbi # Australia, Max, Jan 2026 # received files from m.hobbs@garvan.org.au cd /hive/data/genomes/hg38/bed/varFreqs/mgrb/ bcftools norm -f hg38.fa -m-any MGRB.phase3.GRCh38.vcf.gz -o MGRB.phase3.GRCh38.norm.vcf.gz tabix MGRB.phase3.GRCh38.norm.vcf.gz # SCHEMA Schizophrenia Exome Meta-Analysis track for hg38, Max, Jan 22 2026 # source: https://schema.broadinstitute.org/ # Original is in hg19/GRCh37 coordinates cd /hive/data/genomes/hg38/bed/varFreqs/schema # SCHEMA_variant_results.vcf.bgz (384M, hg19 coordinates) # Step 1: Add AC, AN, AF fields by summing case+control counts ~/kent/src/hg/makeDb/scripts/varFreqs/schema_addAcAnAf.py bgzip SCHEMA_variant_results_withAF.vcf tabix -p vcf SCHEMA_variant_results_withAF.vcf.gz # Step 2: Liftover from hg19 to hg38 # prep hg38 reference FASTA zcat /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.fa.gz > hg38.fa samtools faidx hg38.fa CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \ SCHEMA_variant_results_withAF.vcf.gz \ hg38.fa \ SCHEMA_variant_results_hg38.vcf # Output stats: Total entries: 8865268, Failed to map: 780 # Sort grep "^#" SCHEMA_variant_results_hg38.vcf > SCHEMA_variant_results_hg38_sorted.vcf grep -v "^#" SCHEMA_variant_results_hg38.vcf | sort -k1,1V -k2,2n >> SCHEMA_variant_results_hg38_sorted.vcf # Compress and index bgzip SCHEMA_variant_results_hg38_sorted.vcf tabix -p vcf SCHEMA_variant_results_hg38_sorted.vcf.gz # Clean up temporary files rm -f SCHEMA_variant_results_hg38.vcf SCHEMA_variant_results_hg38.vcf.unmap 
hg38.fa hg38.fa.fai # Gregor rare disease project, Max, Mar 2026 cd /hive/data/genomes/hg38/bed/varFreqs/gregor/ # Downloaded from G Drive, pointed to by Jon Bernstein, Stanford # https://drive.google.com/drive/folders/1v-BnW7nKcEjF-NyLqU1Up3YJuP5KJJAg # created symlink into my UCSC G Drive, then used rclone rclone copy mhaeussldrive:RO4 ./ bcftools concat --threads 16 -Oz -o gregor.vcf.gz chr{1..22}.vcf.gz chrX.vcf.gz chrY.vcf.gz tabix -p vcf gregor.vcf.gz # output ~20 GB, took 10 minutes. # HGDP1k data from the phased Vars track, Max/Claude, Mar 18 2026 # Just flattening what we have and reducing details # Source: 3.2TB VCF with 4094 genomes and per-population INFO fields for 80 populations # Strip genotypes and keep only overall + continental group fields (drop per-population-per-sex) # Already has chr prefix, no rename needed # Note: first attempt kept all fields -> 169GB, too large. This version keeps only continental groups. cd /hive/data/genomes/hg38/bed/varFreqs/hgdp1kFreq/ KEEP="INFO/AC,INFO/AF,INFO/AN,INFO/nhomalt,INFO/gnomad_AC,INFO/gnomad_AF,INFO/gnomad_AN,INFO/gnomad_AC_afr,INFO/gnomad_AF_afr,INFO/gnomad_AN_afr,INFO/gnomad_AC_ami,INFO/gnomad_AF_ami,INFO/gnomad_AN_ami,INFO/gnomad_AC_amr,INFO/gnomad_AF_amr,INFO/gnomad_AN_amr,INFO/gnomad_AC_asj,INFO/gnomad_AF_asj,INFO/gnomad_AN_asj,INFO/gnomad_AC_eas,INFO/gnomad_AF_eas,INFO/gnomad_AN_eas,INFO/gnomad_AC_fin,INFO/gnomad_AF_fin,INFO/gnomad_AN_fin,INFO/gnomad_AC_mid,INFO/gnomad_AF_mid,INFO/gnomad_AN_mid,INFO/gnomad_AC_nfe,INFO/gnomad_AF_nfe,INFO/gnomad_AN_nfe,INFO/gnomad_AC_oth,INFO/gnomad_AF_oth,INFO/gnomad_AN_oth,INFO/gnomad_AC_sas,INFO/gnomad_AF_sas,INFO/gnomad_AN_sas,INFO/gnomad_popmax,INFO/gnomad_faf95_popmax" # This took days to complete, so asked Claude to make it parallel #bcftools view -G /gbdb/hg38/phasedVars/hgdp1k/gnomad.genomes.v3.1.2.hgdp_tgp.vcf.gz --threads 8 \ #| bcftools annotate -x "^${KEEP}" -Oz --threads 4 -o hgdp1k.freq.vcf.gz # use 30 threads, and chunks of 50 Mbp sh
~/kent/src/hg/makeDb/scripts/varFreqs/vcfFilterParallel.sh /gbdb/hg38/phasedVars/hgdp1k/gnomad.genomes.v3.1.2.hgdp_tgp.vcf.gz hgdp1k.freq.parallel.vcf.gz "$KEEP" 30 50 & tabix -p vcf hgdp1k.freq.vcf.gz # Swefreq, Max, Feb 2026 # downloaded files from https://swefreq.nbis.se/dataset/SweGen/download # Access was approved through the website, but I emailed swefreq@scilifelab.se, it needed a reminder email # Also got email from adam.ameur@igp.uu.se with followup info and do-no-allow-downloads instruction cd /hive/data/genomes/hg38/bed/varFreqs/swefreq # Indigenomes, Max Jan 2026 # downloaded from https://clingen.igib.res.in/indigen/, used as-is cd /hive/data/genomes/hg38/bed/varFreqs/indigenomes/ # Japan Tommo 60k, Max Jan 2026 # downloaded from https://jmorp.megabank.tohoku.ac.jp/downloads cd /hive/data/genomes/hg38/bed/varFreqs/tommo61kjpn/ # copied urls from website wget -i urls.txt bcftools concat --threads 16 -Oz -o tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome.vcf.gz \ tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome-chr{1..22}.vcf.gz tabix -p vcf tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome.vcf.gz # FinnGen, Max/Claude, Jan 2026 cd /hive/data/genomes/hg38/bed/varFreqs/finngen/ # Source TSV was downloaded from FinnGen (via email link from Google Cloud bucket) # finnge_R12_annotated_variants_v1.gz (32 GB TSV) # Convert TSV to VCF using custom Python script (written by Claude Opus 4.5) python ~/kent/src/hg/makeDb/scripts/varFreqs/finngen_to_vcf.py \ finnge_R12_annotated_variants_v1.gz \ finnge_R12_annotated_variants_v1.vcf # Compress and index bgzip finnge_R12_annotated_variants_v1.vcf -@8 tabix -p vcf finnge_R12_annotated_variants_v1.vcf.gz # All of Us, Max Feb 2026 # Received from Qudsi at UCSC in the Ioannidis group via phoenix # only concated and ran tabix on it cd /hive/data/genomes/hg38/bed/varFreqs/allofus/ bcftools concat --threads 16 -Oz -o allOfUs.locAncFreq.vcf.gz clean/allele_freq_chr{1..22}.NW.clean.conf90.oneline.vcf.gz tabix 
allOfUs.locAncFreq.vcf.gz ########## # 2026-03-27 Claude max # Two phased SV VCF tracks moved into phasedVars superTrack from lrSv: # - han945SvVcf: Per-sample genotypes for 945 Han Chinese SVs # - lrSv1kgOntPhased: Phased SVs from 1,019 diverse humans (1KG ONT) # Data files remain in /hive/data/genomes/hg38/bed/lrSv/ # Symlinks moved from /gbdb/{hg38,hs1}/lrSv/ to /gbdb/{hg38,hs1}/phasedVars/ # Build documentation for these tracks is in lrSv.txt + +########## +# 2026-04-20 Claude max + +# CoLoRSdb v1.2.0 long-read SNV/indel population frequencies added as +# the colorsDbSnv subtrack of varFreqs, for both hg38 and hs1. +# +# Upstream VCFs (GRCh38 and CHM13 releases) are already present in +# /hive/data/genomes/hg38/bed/lrSv/colorsDb/ (placed there when the +# CoLoRSdb SV track was first built under lrSv). We just add VCF +# symlinks under each assembly's varFreqs directory using a consistent +# filename so the shared trackDb stanza can use $D. + +mkdir -p /gbdb/hg38/varFreqs/colorsDb /gbdb/hs1/varFreqs/colorsDb +ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.GRCh38.v1.2.0.deepvariant.glnexus.vcf.gz /gbdb/hg38/varFreqs/colorsDb/colorsDbSnv.vcf.gz +ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.GRCh38.v1.2.0.deepvariant.glnexus.vcf.gz.tbi /gbdb/hg38/varFreqs/colorsDb/colorsDbSnv.vcf.gz.tbi +ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.CHM13.v1.2.0.deepvariant.glnexus.vcf.gz /gbdb/hs1/varFreqs/colorsDb/colorsDbSnv.vcf.gz +ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.CHM13.v1.2.0.deepvariant.glnexus.vcf.gz.tbi /gbdb/hs1/varFreqs/colorsDb/colorsDbSnv.vcf.gz.tbi + +# The varFreqs.ra trackDb file is already in human/ (shared for both +# hg38 and hs1 via the human/trackDb.ra include), so no move was needed. +# Only colorsDbSnv is expected to render on hs1 - the other varFreqs +# subtracks have hg38-only data and will silently show nothing there.