6d039ce16bfa7c8c38372807bf21155c86819dc2 chmalee Tue Jun 29 09:41:09 2021 -0700 Commiting script and makedoc for gnomAD v3.1.1 track diff --git src/hg/makeDb/doc/hg38/gnomad.txt src/hg/makeDb/doc/hg38/gnomad.txt index 259d418..fd347c5 100644 --- src/hg/makeDb/doc/hg38/gnomad.txt +++ src/hg/makeDb/doc/hg38/gnomad.txt @@ -137,15 +137,79 @@ # first start the rest of the jobs: ssh ku "cd ${WORKDIR}; para push; exit" # then finish off: parallel -j10 --joblog secondhalf.run.log --plus "./gnomadVcfBedToBigBed {} hg38/genomes/{= s/.*(chr[0-9]+\.bed$)/\$1/ =}" ::: hg38/genomes/gnomad.genomes.v3.1.sites.chr{1,10,11,12,13,14,15,16,17,18}.bed # See para.time and secondhalf.run.log for timing info time sort -k1,1 -k2,2n hg38/genomes/chr*.bed > gnomad.v3.1.genomes.bed # real 26m7.992s # user 31m40.053s # sys 14m31.118s nohup time bedToBigBed -type=bed9+64 -tab -as=genomes.as gnomad.v3.1.genomes.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb &> bigBed.log & mkdir -p /gbdb/hg38/gnomAD/v3.1/variants/ ln -s `pwd`/genomes.bb /gbdb/hg38/gnomAD/v3.1/variants/ + +############################################################################## +# gnomAD v3.1.1 Update - Jun 29, 2021 - ChrisL - DONE +# See /hive/data/outside/gnomAD.3/v3.1.1/make.txt for how to download +# the new version +############################################################################## +WORKDIR=/hive/data/inside/gnomAD/v3.1.1 +cd $WORKDIR +db="hg38" +mkdir -p $db/genomes + +# get the headers: +bcftools view -h /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr1.vcf.bgz > gnomad.v3.1.1.header + +# The VEP field has most of the important information, format: +bcftools view -h /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.v3.1.1.vep.fields + +# Now make two control scripts for specifying fields, see +# /hive/data/inside/gnomAD/v3.1.1/runVcfToBed.sh and +# /hive/data/inside/gnomAD/v3.1.1/convertBeds.sh +# for more information + +# First convert the VCFs to beds: +# Need a jobList for parallel to run: +today=`date +%F` +for c in {1..22} X Y M; do + echo "./runVcfToBed.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr$c.vcf.bgz $WORKDIR/$db/genomes/gnomad.v3.1.1.chr$c.bed" +done > vcfToBed.jobList +parallel --joblog vcfToBed.run.log --jobs 10 :::: vcfToBed.jobList + +# WOOPS wrong path to chrM, run separately: +./runVcfToBed.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz /hive/data/inside/gnomAD/v3.1.1/hg38/genomes/gnomad.v3.1.chrM.bed + +# uh oh chrM went wrong again. The chrM variant calling pipeline is different than +# the standard variant calling pipeline gnomAD used for the rest of the genome. Thus +# we need to get specific fields out of the chrM file, and append empty fields +# into the other tab files (the chrM specific data will be missing for the other +# chromosomes while the regular genome data will be mostly missing for the chrM +# data). This also brings up a bug in the original where I was missing Total counts +# for the population data, so just do a full re-run which unifies the tab files +# for both chrM and the rest of the genome: +./runVcfToBed.chrM.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz /hive/data/inside/gnomAD/v3.1.1/hg38/genomes/gnomad.v3.1.chrM.bed +for c in {1..22} X Y; do + echo "./convertBeds.sh $WORKDIR/$db/genomes/gnomad.v3.1.1.chr$c.bed $WORKDIR/$db/genomes/chr$c.bed" +done > convertBed.jobList +echo "./convertBeds.chrM.sh $WORKDIR/$db/genomes/gnomad.v3.1.chrM.bed $WORKDIR/$db/genomes/chrM.bed" >> convertBed.jobList +time (parallel --joblog convertBed.run.log --jobs 25 :::: convertBed.jobList) &> convertBed.log & + +# Now we have one bed file and one external file per chromosome, use bgzip and +# bedJoinTabOffset to combine them together: +time bgzip -iI gnomad.v3.1.1.details.tab.gz.gzi -c gnomad.v3.1.1.details.pre.tab > gnomad.v3.1.1.details.tab.gz & +# real 255m53.801s +# user 238m3.982s +# sys 16m7.130s +time (ls -1S hg38/genomes/chr*.bed | + parallel --joblog join.run.log --max-procs 12 \\ + bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v3.1.1.details.pre.tab {} joined/{/}) &> bedJoinTabOffset.log & +time sort --merge joined/*.bed | grep -v "^#" > gnomad.v3.1.1.genomes.joined.bed +# real 16m4.876s +# user 12m2.699s +# sys 6m30.244s + +# and lastly turn the merged bed into a bigBed: +time (bedToBigBed -type=bed9+15 -tab -as=genomes.as -extraIndex=name,rsId,_displayName gnomad.v3.1.1.genomes.joined.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb) &> bigBed.log &