06d7be056190c14b85e71bc12523f18ea6815b5e markd Mon Dec 7 00:50:29 2020 -0800 BLAT mmap index support merge with master diff --git src/hg/makeDb/doc/hg38/gnomad.txt src/hg/makeDb/doc/hg38/gnomad.txt index 4dfaa2c..259d418 100644 --- src/hg/makeDb/doc/hg38/gnomad.txt +++ src/hg/makeDb/doc/hg38/gnomad.txt @@ -1,88 +1,151 @@ # gnomaAD for hg38 # RM 21782 # Genome Aggregation Database at the Broad # https://gnomad.broadinstitute.org # (2019-10-07 kate) # March 6, 2019: gnomAD 2.1.1 mkdir vcf; cd vcf # hg38 mkdir hg38; cd hg38 mkdir -p /gbdb/hg38/gnomAD/vcf # Download from GRCh38 liftover section of downloads page: https://gnomad.broadinstitute.org/downloads#variants-grch38-liftover wget https://storage.googleapis.com/gnomad-public/release/2.1.1/README.txt # Exome variants # vcf # 85.31 GiB, MD5: cff8d0cfed50adc9211d1feaed2d4ca7 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz >&! wget.exomes.log & # from log: 20 minutes download time # --2019-10-07 16:40:48-- https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz # 2019-10-07 17:00:52 (72.6 MB/s) - 'gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz' saved [91604348731/91604348731] du -sh *.bgz # 171G gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz md5sum gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz # cff8d0cfed50adc9211d1feaed2d4ca7 gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz ln -s `pwd`/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.gz # index file wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi ln -s `pwd`/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.gz.tbi # Genomes Variants # 743.06 GiB, MD5: 83de3d5b52669f714e810d4fcf047c18 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz >&! wget.genomes.log & # from log: 3 hrs download time #--2019-10-07 17:03:11-- https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz # 2019-10-07 20:02:56 (70.6 MB/s) - 'gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz' saved [797851733355/797851733355] ln -s `pwd`/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.gz wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi ln -s `pwd`/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.gz.tbi ################# Use/adapt trackDb and HTML from hg19 track ################# # Add V3. Native hg38 analysis (not lifted) # Released 10/16/2019 by MacArthur lab, announced here: # https://macarthurlab.org/2019/10/16/gnomad-v3-0/ # (2019-10-21 kate) mkdir /hive/data/outside/gnomAD.3 cd /hive/data/outside/gnomAD.3 wget https://storage.googleapis.com/gnomad-public/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.vcf.bgz.tbi ln -s `pwd`/gnomad.genomes.r3.0.sites.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r3.0.sites.vcf.gz.tbi (date; wget https://storage.googleapis.com/gnomad-public/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.vcf.bgz; date) >&! wget.genomes.log & #Mon Oct 21 11:26:58 PDT 2019 #Mon Oct 21 12:27:14 PDT 2019 # ~1 hr md5sum gnomad.genomes.r3.0.sites.vcf.bgz ln -s `pwd`/gnomad.genomes.r3.0.sites.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r3.0.sites.vcf.gz +############################################################################## +# gnomAD v3.1 Update - Nov 24, 2020 - ChrisL - DONE +# See /hive/data/outside/gnomAD.3/v3.1/make.txt for how to download +# the new version +WORKDIR=/hive/data/inside/gnomAD/v3.1 +cd $WORKDIR +db="hg38" +mkdir -p $db/genomes +# get the headers: +bcftools view -h /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz > gnomad.v3.1.header +# The VEP field has the important information, format: +bcftools view -h /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.v3.1.vep.fields + +# the specific fields (same as v2.1.1 except '_' has been swapped to '-' EXCEPT in popmax +# fields -___-), also add some new fields like CADD scores: +fields="AC,AN,AF,faf95,nhomalt,vep,popmax,AC_popmax,AN_popmax,AF_popmax,AC-afr,AN-afr,AF-afr,nhomalt-afr,AC-ami,AN-ami,AF-ami,nhomalt-ami,AC-amr,AN-amr,AF-amr,nhomalt-amr,AC-asj,AN-asj,AF-asj,nhomalt-asj,AC-eas,AN-eas,AF-eas,nhomalt-eas,AC-fin,AN-fin,AF-fin,nhomalt-fin,AC-mid,AN-mid,AF-mid,nhomalt-mid,AC-nfe,AN-nfe,AF-nfe,nhomalt-nfe,AC-sas,AN-sas,AF-sas,nhomalt-sas,AC-oth,AN-oth,AF-oth,nhomalt-oth,cadd_phred,revel_score,splice_ai_max_ds,splice_ai_consequence,primate_ai_score" + +# Don't use parallel anymore, use parasol cause this still takes far too long and bogs down hgwdev: + +# make script to run each job: +cat <<'_EOF' > run.sh +#!/bin/bash + +# don't do the following, as vcfToBed will correctly exit with an error +#set -beEu -o pipefail +fields="AC,AN,AF,faf95,nhomalt,vep,popmax,AC_popmax,AN_popmax,AF_popmax,AC-afr,AN-afr,AF-afr,nhomalt-afr,AC-ami,AN-ami,AF-ami,nhomalt-ami,AC-amr,AN-amr,AF-amr,nhomalt-amr,AC-asj,AN-asj,AF-asj,nhomalt-asj,AC-eas,AN-eas,AF-eas,nhomalt-eas,AC-fin,AN-fin,AF-fin,nhomalt-fin,AC-mid,AN-mid,AF-mid,nhomalt-mid,AC-nfe,AN-nfe,AF-nfe,nhomalt-nfe,AC-sas,AN-sas,AF-sas,nhomalt-sas,AC-oth,AN-oth,AF-oth,nhomalt-oth,cadd_phred,revel_score,splice_ai_max_ds,splice_ai_consequence,primate_ai_score" +infile=$1 +finalBed=$2 + +# get the suffix name after the final '/': +suffix=${infile#/hive/data/outside/gnomAD.3/v3.1/genomes/} +suffix=${suffix%.vcf.bgz} + +outBed=/hive/data/inside/gnomAD/v3.1/hg38/genomes/${suffix}.bed + +vcfToBed -fields="${fields}" ${infile} ${outBed} 2>/dev/null +/hive/data/inside/gnomAD/v3.1/gnomadVcfBedToBigBed ${outBed} stdout | sort -k1,1 -k2,2n > ${finalBed} +_EOF + +# make a jobList file, here's a sample line: +./run.sh /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz {check out exists+ /hive/data/inside/gnomAD/v3.1/hg38/genomes/chr1.bed} + +ssh ku "cd ${WORKDIR}; para create jobList; para try; exit" + +# woops errors in the gnomadVcfBedToBigBed script caused the first 10 jobs fail, fix up the script +# and run those jobs separately: +# the regular expression is for extracting chr*.bed out of the input file and making just that name +# the output file + +# first start the rest of the jobs: +ssh ku "cd ${WORKDIR}; para push; exit" +# then finish off: +parallel -j10 --joblog secondhalf.run.log --plus "./gnomadVcfBedToBigBed {} hg38/genomes/{= s/.*(chr[0-9]+\.bed$)/\$1/ =}" ::: hg38/genomes/gnomad.genomes.v3.1.sites.chr{1,10,11,12,13,14,15,16,17,18}.bed +# See para.time and secondhalf.run.log for timing info + +time sort -k1,1 -k2,2n hg38/genomes/chr*.bed > gnomad.v3.1.genomes.bed +# real 26m7.992s +# user 31m40.053s +# sys 14m31.118s + +nohup time bedToBigBed -type=bed9+64 -tab -as=genomes.as gnomad.v3.1.genomes.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb &> bigBed.log & +mkdir -p /gbdb/hg38/gnomAD/v3.1/variants/ +ln -s `pwd`/genomes.bb /gbdb/hg38/gnomAD/v3.1/variants/