165a15d6a94d53f8162a01e69f3912a7a23a3b50 max Mon Mar 23 06:47:55 2026 -0700 mostly done with the variant frequencies track, refs#36642 diff --git src/hg/makeDb/doc/hg38/varFreqs.txt src/hg/makeDb/doc/hg38/varFreqs.txt index 0e300cef74d..e7f35b77bcc 100644 --- src/hg/makeDb/doc/hg38/varFreqs.txt +++ src/hg/makeDb/doc/hg38/varFreqs.txt @@ -159,32 +159,35 @@ # output ~20 GB, took 10 minutes. # HGDP1k data from the phased Vars track, Max/Claude, Mar 18 2026 # Just flattening what we have and reducing details # Source: 3.2TB VCF with 4094 genomes and per-population INFO fields for 80 populations # Strip genotypes and keep only overall + continental group fields (drop per-population-per-sex) # Already has chr prefix, no rename needed # Note: first attempt kept all fields -> 169GB, too large. This version keeps only continental groups. cd /hive/data/genomes/hg38/bed/varFreqs/hgdp1kFreq/ KEEP="INFO/AC,INFO/AF,INFO/AN,INFO/nhomalt,INFO/gnomad_AC,INFO/gnomad_AF,INFO/gnomad_AN,INFO/gnomad_AC_afr,INFO/gnomad_AF_afr,INFO/gnomad_AN_afr,INFO/gnomad_AC_ami,INFO/gnomad_AF_ami,INFO/gnomad_AN_ami,INFO/gnomad_AC_amr,INFO/gnomad_AF_amr,INFO/gnomad_AN_amr,INFO/gnomad_AC_asj,INFO/gnomad_AF_asj,INFO/gnomad_AN_asj,INFO/gnomad_AC_eas,INFO/gnomad_AF_eas,INFO/gnomad_AN_eas,INFO/gnomad_AC_fin,INFO/gnomad_AF_fin,INFO/gnomad_AN_fin,INFO/gnomad_AC_mid,INFO/gnomad_AF_mid,INFO/gnomad_AN_mid,INFO/gnomad_AC_nfe,INFO/gnomad_AF_nfe,INFO/gnomad_AN_nfe,INFO/gnomad_AC_oth,INFO/gnomad_AF_oth,INFO/gnomad_AN_oth,INFO/gnomad_AC_sas,INFO/gnomad_AF_sas,INFO/gnomad_AN_sas,INFO/gnomad_popmax,INFO/gnomad_faf95_popmax" -bcftools view -G /gbdb/hg38/phasedVars/hgdp1k/gnomad.genomes.v3.1.2.hgdp_tgp.vcf.gz --threads 8 \ -| bcftools annotate -x "^${KEEP}" -Oz --threads 4 -o 
hgdp1k.freq.vcf.gz +# use 30 threads, and chunks of 50 Mbp +sh ~/kent/src/hg/makeDb/scripts/varFreqs/vcfFilterParallel.sh /gbdb/hg38/phasedVars/hgdp1k/gnomad.genomes.v3.1.2.hgdp_tgp.vcf.gz hgdp1k.freq.parallel.vcf.gz "$KEEP" 30 50 & tabix -p vcf hgdp1k.freq.vcf.gz # Swefreq, Max, Feb 2026 # downloaded files from https://swefreq.nbis.se/dataset/SweGen/download # Access was approved through the website, but I emailed swefreq@scilifelab.se, it needed a reminder email # Also got email from adam.ameur@igp.uu.se with follow-up info and do-not-allow-downloads instruction cd /hive/data/genomes/hg38/bed/varFreqs/swefreq # Indigenomes, Max Jan 2026 # downloaded from https://clingen.igib.res.in/indigen/, used as-is cd /hive/data/genomes/hg38/bed/varFreqs/indigenomes/ # Japan ToMMo 60k, Max Jan 2026 # downloaded from https://jmorp.megabank.tohoku.ac.jp/downloads cd /hive/data/genomes/hg38/bed/varFreqs/tommo61kjpn/