aa61ebc800429515f9ced7e28f669c6042219f43 max Wed Mar 18 09:09:13 2026 -0700 varFreqs supertrack: add GREGoR track, update all HTML docs, move scripts to varFreqs/, refs #36642 Add GREGoR R04 WGS track to varFreqs superTrack. Update Data Access and Methods sections for all 20+ subtrack HTML files with consistent formatting, sequencing methods from source papers, and links to makeDoc and Github scripts. Move all varFreqs conversion scripts into scripts/varFreqs/ subdirectory and update makeDoc paths accordingly. Co-Authored-By: Claude Opus 4.6 diff --git src/hg/makeDb/doc/hg38/varFreqs.txt src/hg/makeDb/doc/hg38/varFreqs.txt index 4beb612b8a9..0e300cef74d 100644 --- src/hg/makeDb/doc/hg38/varFreqs.txt +++ src/hg/makeDb/doc/hg38/varFreqs.txt @@ -36,110 +36,110 @@ # TOPMED Freeze 10 cd /hive/data/genomes/hg38/bed/varFreqs/topmed/ # need to download the VCFs manually, 22 VCFs, with one time links from https://bravo.sph.umich.edu/vcfs.html # grrrr... bcftools concat --threads 10 -Oz -o topmed10.vcf.gz {1..22}.vcf.gz X.vcf.gz tabix -p vcf topmed10.vcf.gz # Abraom brazil # get unique download link from https://abraom.ib.usp.br/download/index.php cd /hive/data/genomes/hg38/bed/varFreqs/abraom/ wget 'https://abraom.ib.usp.br/download/download-files.php?fid=RklEMTIzNDU2&key=1762266466-key690a0d62348de0.22872232' -O abraom.tar tar xvfz abraom.tar ln -s /hive/data/genomes/hg38/p14Clean/hg38.p14.fa samtools faidx hg38.p14.fa -python ~/kent/src/hg/makeDb/scripts/abraomToVcf.py SABE1171.Abraom.clean.tsv abraom.vcf hg38.p14.fa +python ~/kent/src/hg/makeDb/scripts/varFreqs/abraomToVcf.py SABE1171.Abraom.clean.tsv abraom.vcf hg38.p14.fa tabix -p vcf abraom.vcf.gz # SGDP cd /hive/data/genomes/hg38/bed/varFreqs/sgp/ CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz /hive/data/genomes/hg19/bed/varFreqs/sgdp/SGDP.nh2.vcf.gz hg38.p14.fa sgdp.hg38.nh2.vcf bgzip sgdp.hg38.nh2.vcf bcftools sort sgdp.hg38.nh2.vcf.gz -Oz -m 200G -T /data/tmp/ -o sgdp.hg38.nh2.sort.vcf.gz mv 
sgdp.hg38.nh2.sort.vcf.gz SGDP.nh2.vcf.gz tabix -p vcf SGDP.nh2.vcf.gz # KOVA cd /hive/data/genomes/hg38/bed/varFreqs/sgp/ # got tsv file via google drive link from 장인수 # VCF converter, written by Claude Opus 4.1 using 2 lines of example input -python ~/kent/src/hg/makeDb/scripts/kovaToVcf.py 1_KOVA.v7.tsv.gz kova.v7.vcf +python ~/kent/src/hg/makeDb/scripts/varFreqs/kovaToVcf.py 1_KOVA.v7.tsv.gz kova.v7.vcf bgzip kova.v7.vcf tabix -p vcf kova.v7.vcf.gz # NPM Singapore cd /hive/data/genomes/hg38/bed/varFreqs/npm/ # downloaded data manually from chorus website, https://chorus.grids-platform.io/vcfdl bcftools concat --threads 10 -Oz -o SG10K_Health_r5.3.2.sites.vcf.bgz SG10K_Health_r5.3.2.sites.chr{1..22}.vcf.bgz SG10K_Health_r5.3.2.sites.chrX.vcf.bgz SG10K_Health_r5.3.2.sites.chrY.vcf.bgz tabix -p vcf SG10K_Health_r5.3.2.sites.vcf.bgz # Saudi 300 genomes cd /hive/data/genomes/hg38/bed/varFreqs/saudi wget https://figshare.com/ndownloader/files/51297884 -O 51297884.tsv.gz -python3 ~/kent/src/hg/makeDb/scripts/saudiToVcf.py +python3 ~/kent/src/hg/makeDb/scripts/varFreqs/saudiToVcf.py bgzip saudi.vcf tabix -p vcf saudi.vcf.gz # SFARI SPARK cd /hive/data/genomes/hg38/bed/varFreqs/sparkExomes/ # used globus to download into vcf/ -sh ~/kent/src/hg/makeDb/scripts/sparkMergeVcfAddCounts.sh vcf/SPARK.iWES_v3.2024_08.deepvariant 8 +sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh vcf/SPARK.iWES_v3.2024_08.deepvariant 8 bcftools norm -m- SPARK.iWES_v3.2024_08.deepvariant.sites.vcf.gz -Oz > SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz && tabix -p vcf SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz cd /hive/data/genomes/hg38/bed/varFreqs/sparkWgs/ # used globus to download into vcf/ -sh ~/kent/src/hg/makeDb/scripts/sparkMergeVcfAddCounts.sh vcf/wgs_12519_genome.deepvariant 8 +sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh vcf/wgs_12519_genome.deepvariant 8 bcftools norm -m- wgs_12519_genome.deepvariant.sites.vcf.gz -Oz > 
wgs_12519_genome.deepvariant.norm.vcf.gz tabix -p vcf wgs_12519_genome.deepvariant.norm.vcf.gz # NCBI ALFA bigBed to VCF, Max Jan 26 2026 # Source: ALFA R4 bigBed files, 904M variants, output 163M with non-zero AF cd /hive/data/genomes/hg38/bed/varFreqs/alfa -python3 alfa_to_vcf.py --out ALFA.vcf --zero-af-file ALFA_zero.txt +python3 ~/kent/src/hg/makeDb/scripts/varFreqs/alfa_to_vcf.py --out ALFA.vcf --zero-af-file ALFA_zero.txt # Compress and index bgzip ALFA.vcf tabix -p vcf ALFA.vcf.gz # Final: 2.7GB, 163M variants (146M SNPs, 17M indels), ALFA_zero.txt has 26GB of zero-AF variants # HRC (Haplotype Reference Consortium), Claude max, Mar 17 2026 # Source: HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz # 40M variants from 32,488 WGS samples, originally on GRCh37 cd /hive/data/genomes/hg38/bed/varFreqs/hrc/ # download HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz from http://www.haplotype-reference-consortium.org/site -python3 ~/kent/src/hg/makeDb/scripts/hrcToVcf.py +python3 ~/kent/src/hg/makeDb/scripts/varFreqs/hrcToVcf.py # 40,405,505 variants read, 8,052 unmapped, 40,397,453 lifted to hg38 # sort, compress, index bcftools sort hrc.vcf -Oz -o hrc.vcf.gz tabix -p vcf hrc.vcf.gz rm hrc.vcf ln -s /hive/data/genomes/hg38/bed/varFreqs/hrc/hrc.vcf.gz /gbdb/hg38/varFreqs/hrc/hrc.vcf.gz ln -s /hive/data/genomes/hg38/bed/varFreqs/hrc/hrc.vcf.gz.tbi /gbdb/hg38/varFreqs/hrc/hrc.vcf.gz.tbi # Australia, Max, Jan 2026 # received files from m.hobbs@garvan.org.au cd /hive/data/genomes/hg38/bed/varFreqs/mgrb/ bcftools norm -f hg38.fa -m-any MGRB.phase3.GRCh38.vcf.gz -o MGRB.phase3.GRCh38.norm.vcf.gz tabix MGRB.phase3.GRCh38.norm.vcf.gz # SCHEMA Schizophrenia Exome Meta-Analysis track for hg38, Max, Jan 22 2026 # source: https://schema.broadinstitute.org/ # Original is in hg19/GRCh37 coordinates cd /hive/data/genomes/hg38/bed/varFreqs/schema # SCHEMA_variant_results.vcf.bgz (384M, hg19 coordinates) # Step 1: Add AC, AN, AF fields by summing case+control counts 
-~/kent/src/hg/makeDb/scripts/schema_addAcAnAf.py +~/kent/src/hg/makeDb/scripts/varFreqs/schema_addAcAnAf.py bgzip SCHEMA_variant_results_withAF.vcf tabix -p vcf SCHEMA_variant_results_withAF.vcf.gz # Step 2: Liftover from hg19 to hg38 # prep hg38 reference FASTA zcat /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.fa.gz > hg38.fa samtools faidx hg38.fa CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \ SCHEMA_variant_results_withAF.vcf.gz \ hg38.fa \ SCHEMA_variant_results_hg38.vcf # Output stats: Total entries: 8865268, Failed to map: 780 # Sort grep "^#" SCHEMA_variant_results_hg38.vcf > SCHEMA_variant_results_hg38_sorted.vcf grep -v "^#" SCHEMA_variant_results_hg38.vcf | sort -k1,1V -k2,2n >> SCHEMA_variant_results_hg38_sorted.vcf # Compress and index @@ -187,28 +187,28 @@ # Japan Tommo 60k, Max Jan 2026 # downloaded from https://jmorp.megabank.tohoku.ac.jp/downloads cd /hive/data/genomes/hg38/bed/varFreqs/tommo61kjpn/ # copied urls from website wget -i urls.txt bcftools concat --threads 16 -Oz -o tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome.vcf.gz \ tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome-chr{1..22}.vcf.gz tabix -p vcf tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome.vcf.gz # FinnGen, Max/Claude, Jan 2026 cd /hive/data/genomes/hg38/bed/varFreqs/finngen/ # Source TSV was downloaded from FinnGen (via email link from Google Cloud bucket) # finnge_R12_annotated_variants_v1.gz (32 GB TSV) # Convert TSV to VCF using custom Python script (written by Claude Opus 4.5) -python ~/kent/src/hg/makeDb/scripts/finngen_to_vcf.py \ +python ~/kent/src/hg/makeDb/scripts/varFreqs/finngen_to_vcf.py \ finnge_R12_annotated_variants_v1.gz \ finnge_R12_annotated_variants_v1.vcf # Compress and index bgzip finnge_R12_annotated_variants_v1.vcf -@8 tabix -p vcf finnge_R12_annotated_variants_v1.vcf.gz # All of Us, Max Feb 2026 # Received from Qudsi at UCSC in the Ioannidis group via phoenix # only concatenated and ran tabix on it cd 
/hive/data/genomes/hg38/bed/varFreqs/allofus/ bcftools concat --threads 16 -Oz -o allOfUs.locAncFreq.vcf.gz clean/allele_freq_chr{1..22}.NW.clean.conf90.oneline.vcf.gz tabix allOfUs.locAncFreq.vcf.gz