src/hg/makeDb/doc/hg38/gnomad.txt 06d7be056190c14b85e71bc12523f18ea6815b5e

06d7be056190c14b85e71bc12523f18ea6815b5e
markd
  Mon Dec 7 00:50:29 2020 -0800
BLAT mmap index support merge with master

diff --git src/hg/makeDb/doc/hg38/gnomad.txt src/hg/makeDb/doc/hg38/gnomad.txt
index 4dfaa2c..259d418 100644
--- src/hg/makeDb/doc/hg38/gnomad.txt
+++ src/hg/makeDb/doc/hg38/gnomad.txt
@@ -1,88 +1,151 @@
 # gnomaAD for hg38
 # RM 21782
 
 # Genome Aggregation Database at the Broad
 # https://gnomad.broadinstitute.org
 
 # (2019-10-07 kate)
 
 # March 6, 2019: gnomAD 2.1.1
 
 mkdir vcf; cd vcf
 
 # hg38
 
 mkdir hg38; cd hg38
 mkdir -p /gbdb/hg38/gnomAD/vcf
 
 # Download from GRCh38 liftover section of downloads page:
 https://gnomad.broadinstitute.org/downloads#variants-grch38-liftover
 
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/README.txt
 
 # Exome variants
 
 # vcf
 # 85.31 GiB, MD5: cff8d0cfed50adc9211d1feaed2d4ca7
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz >&! wget.exomes.log &
 
 # from log: 20 minutes download time
 # --2019-10-07 16:40:48--  https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 # 2019-10-07 17:00:52 (72.6 MB/s) - 'gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz' saved [91604348731/91604348731]
 
 du -sh *.bgz
 # 171G    gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 
 md5sum gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 # cff8d0cfed50adc9211d1feaed2d4ca7  gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 
 ln -s `pwd`/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.gz
 
 # index file
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi
 
 ln -s `pwd`/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.gz.tbi
 
 # Genomes Variants
 # 743.06 GiB, MD5: 83de3d5b52669f714e810d4fcf047c18
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz >&! wget.genomes.log &
 
 # from log:  3 hrs download time
 #--2019-10-07 17:03:11--  https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 # 2019-10-07 20:02:56 (70.6 MB/s) - 'gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz' saved [797851733355/797851733355]
 
 ln -s `pwd`/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.gz
 
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi
 ln -s `pwd`/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.gz.tbi
 
 #################
 Use/adapt trackDb and HTML from hg19 track
 
 #################
 # Add V3.  Native hg38 analysis (not lifted)
 # Released 10/16/2019 by MacArthur lab, announced here:
 # https://macarthurlab.org/2019/10/16/gnomad-v3-0/
 # (2019-10-21 kate)
 
 mkdir /hive/data/outside/gnomAD.3
 cd /hive/data/outside/gnomAD.3
 
 wget https://storage.googleapis.com/gnomad-public/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.vcf.bgz.tbi
 
 ln -s `pwd`/gnomad.genomes.r3.0.sites.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r3.0.sites.vcf.gz.tbi
 
 (date; wget https://storage.googleapis.com/gnomad-public/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.vcf.bgz; date) >&! wget.genomes.log &
 
 #Mon Oct 21 11:26:58 PDT 2019
 #Mon Oct 21 12:27:14 PDT 2019
 #    ~1 hr
 
 md5sum gnomad.genomes.r3.0.sites.vcf.bgz
 
 ln -s `pwd`/gnomad.genomes.r3.0.sites.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r3.0.sites.vcf.gz
 
 
+##############################################################################
+# gnomAD v3.1 Update - Nov 24, 2020 - ChrisL - DONE
+# See /hive/data/outside/gnomAD.3/v3.1/make.txt for how to download
+# the new version
 
+WORKDIR=/hive/data/inside/gnomAD/v3.1
+cd $WORKDIR
+db="hg38"
+mkdir -p $db/genomes
 
+# get the headers:
+bcftools view -h /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz > gnomad.v3.1.header
 
+# The VEP field has the important information, format:
+bcftools view -h /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.v3.1.vep.fields
+
+# the specific fields (same as v2.1.1 except '_' has been swapped to '-' EXCEPT in popmax
+# fields -___-), also add some new fields like CADD scores:
+fields="AC,AN,AF,faf95,nhomalt,vep,popmax,AC_popmax,AN_popmax,AF_popmax,AC-afr,AN-afr,AF-afr,nhomalt-afr,AC-ami,AN-ami,AF-ami,nhomalt-ami,AC-amr,AN-amr,AF-amr,nhomalt-amr,AC-asj,AN-asj,AF-asj,nhomalt-asj,AC-eas,AN-eas,AF-eas,nhomalt-eas,AC-fin,AN-fin,AF-fin,nhomalt-fin,AC-mid,AN-mid,AF-mid,nhomalt-mid,AC-nfe,AN-nfe,AF-nfe,nhomalt-nfe,AC-sas,AN-sas,AF-sas,nhomalt-sas,AC-oth,AN-oth,AF-oth,nhomalt-oth,cadd_phred,revel_score,splice_ai_max_ds,splice_ai_consequence,primate_ai_score"
+
+# Don't use parallel anymore, use parasol cause this still takes far too long and bogs down hgwdev:
+
+# make script to run each job:
+cat <<'_EOF' > run.sh
+#!/bin/bash
+
+# don't do the following, as vcfToBed will correctly exit with an error
+#set -beEu -o pipefail
+fields="AC,AN,AF,faf95,nhomalt,vep,popmax,AC_popmax,AN_popmax,AF_popmax,AC-afr,AN-afr,AF-afr,nhomalt-afr,AC-ami,AN-ami,AF-ami,nhomalt-ami,AC-amr,AN-amr,AF-amr,nhomalt-amr,AC-asj,AN-asj,AF-asj,nhomalt-asj,AC-eas,AN-eas,AF-eas,nhomalt-eas,AC-fin,AN-fin,AF-fin,nhomalt-fin,AC-mid,AN-mid,AF-mid,nhomalt-mid,AC-nfe,AN-nfe,AF-nfe,nhomalt-nfe,AC-sas,AN-sas,AF-sas,nhomalt-sas,AC-oth,AN-oth,AF-oth,nhomalt-oth,cadd_phred,revel_score,splice_ai_max_ds,splice_ai_consequence,primate_ai_score"
+infile=$1
+finalBed=$2
+
+# get the suffix name after the final '/':
+suffix=${infile#/hive/data/outside/gnomAD.3/v3.1/genomes/}
+suffix=${suffix%.vcf.bgz}
+
+outBed=/hive/data/inside/gnomAD/v3.1/hg38/genomes/${suffix}.bed
+
+vcfToBed -fields="${fields}" ${infile} ${outBed} 2>/dev/null
+/hive/data/inside/gnomAD/v3.1/gnomadVcfBedToBigBed ${outBed} stdout | sort -k1,1 -k2,2n > ${finalBed}
+_EOF
+
+# make a jobList file, here's a sample line:
+./run.sh /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz {check out exists+ /hive/data/inside/gnomAD/v3.1/hg38/genomes/chr1.bed}
+
+ssh ku "cd ${WORKDIR}; para create jobList; para try; exit"
+
+# woops errors in the gnomadVcfBedToBigBed script caused the first 10 jobs fail, fix up the script
+# and run those jobs separately:
+# the regular expression is for extracting chr*.bed out of the input file and making just that name
+# the output file
+
+# first start the rest of the jobs:
+ssh ku "cd ${WORKDIR}; para push; exit"
+# then finish off:
+parallel -j10 --joblog secondhalf.run.log --plus "./gnomadVcfBedToBigBed {} hg38/genomes/{= s/.*(chr[0-9]+\.bed$)/\$1/ =}" ::: hg38/genomes/gnomad.genomes.v3.1.sites.chr{1,10,11,12,13,14,15,16,17,18}.bed
+# See para.time and secondhalf.run.log for timing info
+
+time sort -k1,1 -k2,2n hg38/genomes/chr*.bed > gnomad.v3.1.genomes.bed
+# real    26m7.992s
+# user    31m40.053s
+# sys     14m31.118s
+
+nohup time bedToBigBed -type=bed9+64 -tab -as=genomes.as gnomad.v3.1.genomes.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb &> bigBed.log &
+mkdir -p /gbdb/hg38/gnomAD/v3.1/variants/
+ln -s `pwd`/genomes.bb /gbdb/hg38/gnomAD/v3.1/variants/