9af983b83acb335907ac9bf3da0873cb6e06c48d chmalee Wed Jun 26 10:55:48 2024 -0700 Add updates for gnomad v4.1 genomes and exomes to makedoc diff --git src/hg/makeDb/doc/hg38/gnomad.txt src/hg/makeDb/doc/hg38/gnomad.txt index 68e37f6..5acbe10 100644 --- src/hg/makeDb/doc/hg38/gnomad.txt +++ src/hg/makeDb/doc/hg38/gnomad.txt @@ -1,540 +1,526 @@ # gnomaAD for hg38 # RM 21782 # Genome Aggregation Database at the Broad # https://gnomad.broadinstitute.org # (2019-10-07 kate) # March 6, 2019: gnomAD 2.1.1 mkdir vcf; cd vcf # hg38 mkdir hg38; cd hg38 mkdir -p /gbdb/hg38/gnomAD/vcf # Download from GRCh38 liftover section of downloads page: https://gnomad.broadinstitute.org/downloads#variants-grch38-liftover wget https://storage.googleapis.com/gnomad-public/release/2.1.1/README.txt # Exome variants # vcf # 85.31 GiB, MD5: cff8d0cfed50adc9211d1feaed2d4ca7 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz >&! wget.exomes.log & # from log: 20 minutes download time # --2019-10-07 16:40:48-- https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz # 2019-10-07 17:00:52 (72.6 MB/s) - 'gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz' saved [91604348731/91604348731] du -sh *.bgz # 171G gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz md5sum gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz # cff8d0cfed50adc9211d1feaed2d4ca7 gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz ln -s `pwd`/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.gz # index file wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi ln -s `pwd`/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.gz.tbi # Genomes Variants # 743.06 GiB, MD5: 83de3d5b52669f714e810d4fcf047c18 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz >&! wget.genomes.log & # from log: 3 hrs download time #--2019-10-07 17:03:11-- https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz # 2019-10-07 20:02:56 (70.6 MB/s) - 'gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz' saved [797851733355/797851733355] ln -s `pwd`/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.gz wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi ln -s `pwd`/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.gz.tbi ################# Use/adapt trackDb and HTML from hg19 track ################# # Add V3. Native hg38 analysis (not lifted) # Released 10/16/2019 by MacArthur lab, announced here: # https://macarthurlab.org/2019/10/16/gnomad-v3-0/ # (2019-10-21 kate) mkdir /hive/data/outside/gnomAD.3 cd /hive/data/outside/gnomAD.3 wget https://storage.googleapis.com/gnomad-public/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.vcf.bgz.tbi ln -s `pwd`/gnomad.genomes.r3.0.sites.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r3.0.sites.vcf.gz.tbi (date; wget https://storage.googleapis.com/gnomad-public/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.vcf.bgz; date) >&! wget.genomes.log & #Mon Oct 21 11:26:58 PDT 2019 #Mon Oct 21 12:27:14 PDT 2019 # ~1 hr md5sum gnomad.genomes.r3.0.sites.vcf.bgz ln -s `pwd`/gnomad.genomes.r3.0.sites.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r3.0.sites.vcf.gz ############################################################################## # gnomAD v3.1 Update - Nov 24, 2020 - ChrisL - DONE # See /hive/data/outside/gnomAD.3/v3.1/make.txt for how to download # the new version WORKDIR=/hive/data/inside/gnomAD/v3.1 cd $WORKDIR db="hg38" mkdir -p $db/genomes # get the headers: bcftools view -h /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz > gnomad.v3.1.header # The VEP field has the important information, format: bcftools view -h /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.v3.1.vep.fields # the specific fields (same as v2.1.1 except '_' has been swapped to '-' EXCEPT in popmax # fields -___-), also add some new fields like CADD scores: fields="AC,AN,AF,faf95,nhomalt,vep,popmax,AC_popmax,AN_popmax,AF_popmax,AC-afr,AN-afr,AF-afr,nhomalt-afr,AC-ami,AN-ami,AF-ami,nhomalt-ami,AC-amr,AN-amr,AF-amr,nhomalt-amr,AC-asj,AN-asj,AF-asj,nhomalt-asj,AC-eas,AN-eas,AF-eas,nhomalt-eas,AC-fin,AN-fin,AF-fin,nhomalt-fin,AC-mid,AN-mid,AF-mid,nhomalt-mid,AC-nfe,AN-nfe,AF-nfe,nhomalt-nfe,AC-sas,AN-sas,AF-sas,nhomalt-sas,AC-oth,AN-oth,AF-oth,nhomalt-oth,cadd_phred,revel_score,splice_ai_max_ds,splice_ai_consequence,primate_ai_score" # Don't use parallel anymore, use parasol cause this still takes far too long and bogs down hgwdev: # make script to run each job: cat <<'_EOF' > run.sh #!/bin/bash # don't do the following, as vcfToBed will correctly exit with an error #set -beEu -o pipefail fields="AC,AN,AF,faf95,nhomalt,vep,popmax,AC_popmax,AN_popmax,AF_popmax,AC-afr,AN-afr,AF-afr,nhomalt-afr,AC-ami,AN-ami,AF-ami,nhomalt-ami,AC-amr,AN-amr,AF-amr,nhomalt-amr,AC-asj,AN-asj,AF-asj,nhomalt-asj,AC-eas,AN-eas,AF-eas,nhomalt-eas,AC-fin,AN-fin,AF-fin,nhomalt-fin,AC-mid,AN-mid,AF-mid,nhomalt-mid,AC-nfe,AN-nfe,AF-nfe,nhomalt-nfe,AC-sas,AN-sas,AF-sas,nhomalt-sas,AC-oth,AN-oth,AF-oth,nhomalt-oth,cadd_phred,revel_score,splice_ai_max_ds,splice_ai_consequence,primate_ai_score" infile=$1 finalBed=$2 # get the suffix name after the final '/': suffix=${infile#/hive/data/outside/gnomAD.3/v3.1/genomes/} suffix=${suffix%.vcf.bgz} outBed=/hive/data/inside/gnomAD/v3.1/hg38/genomes/${suffix}.bed vcfToBed -fields="${fields}" ${infile} ${outBed} 2>/dev/null /hive/data/inside/gnomAD/v3.1/gnomadVcfBedToBigBed ${outBed} stdout | sort -k1,1 -k2,2n > ${finalBed} _EOF # make a jobList file, here's a sample line: ./run.sh /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz {check out exists+ /hive/data/inside/gnomAD/v3.1/hg38/genomes/chr1.bed} ssh ku "cd ${WORKDIR}; para create jobList; para try; exit" # woops errors in the gnomadVcfBedToBigBed script caused the first 10 jobs fail, fix up the script # and run those jobs separately: # the regular expression is for extracting chr*.bed out of the input file and making just that name # the output file # first start the rest of the jobs: ssh ku "cd ${WORKDIR}; para push; exit" # then finish off: parallel -j10 --joblog secondhalf.run.log --plus "./gnomadVcfBedToBigBed {} hg38/genomes/{= s/.*(chr[0-9]+\.bed$)/\$1/ =}" ::: hg38/genomes/gnomad.genomes.v3.1.sites.chr{1,10,11,12,13,14,15,16,17,18}.bed # See para.time and secondhalf.run.log for timing info time sort -k1,1 -k2,2n hg38/genomes/chr*.bed > gnomad.v3.1.genomes.bed # real 26m7.992s # user 31m40.053s # sys 14m31.118s nohup time bedToBigBed -type=bed9+64 -tab -as=genomes.as gnomad.v3.1.genomes.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb &> bigBed.log & mkdir -p /gbdb/hg38/gnomAD/v3.1/variants/ ln -s `pwd`/genomes.bb /gbdb/hg38/gnomAD/v3.1/variants/ ############################################################################## # gnomAD v3.1.1 Update - Jun 29, 2021 - ChrisL - DONE # See /hive/data/outside/gnomAD.3/v3.1.1/make.txt for how to download # the new version ############################################################################## WORKDIR=/hive/data/inside/gnomAD/v3.1.1 cd $WORKDIR db="hg38" mkdir -p $db/genomes # get the headers: bcftools view -h /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr1.vcf.bgz > gnomad.v3.1.1.header # The VEP field has most of the important information, format: bcftools view -h /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.v3.1.1.vep.fields # Now make two control scripts for specifying fields, see # /hive/data/inside/gnomAD/v3.1.1/runVcfToBed.sh and # /hive/data/inside/gnomAD/v3.1.1/convertBeds.sh # for more information # First convert the VCFs to beds: # Need a jobList for parallel to run: today=`date +%F` for c in {1..22} X Y M; do echo "./runVcfToBed.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr$c.vcf.bgz $WORKDIR/$db/genomes/gnomad.v3.1.1.chr$c.bed" done > vcfToBed.jobList parallel --joblog vcfToBed.run.log --jobs 10 :::: vcfToBed.jobList # WOOPS wrong path to chrM, run separately: ./runVcfToBed.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz /hive/data/inside/gnomAD/v3.1.1/hg38/genomes/gnomad.v3.1.chrM.bed # uh oh chrM went wrong again. The chrM variant calling pipeline is different than # the standard variant calling pipeline gnomAD used for the rest of the genome. Thus # we need to get specific fields out of the chrM file, and append empty fields # into the other tab files (the chrM specific data will be missing for the other # chromosomes while the regular genome data will be mostly missing for the chrM # data). This also brings up a bug in the original where I was missing Total counts # for the population data, so just do a full re-run which unifies the tab files # for both chrM and the rest of the genome: ./runVcfToBed.chrM.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz /hive/data/inside/gnomAD/v3.1.1/hg38/genomes/gnomad.v3.1.chrM.bed for c in {1..22} X Y; do echo "./convertBeds.sh $WORKDIR/$db/genomes/gnomad.v3.1.1.chr$c.bed $WORKDIR/$db/genomes/chr$c.bed" done > convertBed.jobList echo "./convertBeds.chrM.sh $WORKDIR/$db/genomes/gnomad.v3.1.chrM.bed $WORKDIR/$db/genomes/chrM.bed" >> convertBed.jobList time (parallel --joblog convertBed.run.log --jobs 25 :::: convertBed.jobList) &> convertBed.log & # Now we have one bed file and one external file per chromosome, use bgzip and # bedJoinTabOffset to combine them together: time bgzip -iI gnomad.v3.1.1.details.tab.gz.gzi -c gnomad.v3.1.1.details.pre.tab > gnomad.v3.1.1.details.tab.gz & # real 255m53.801s # user 238m3.982s # sys 16m7.130s time (ls -1S hg38/genomes/chr*.bed | parallel --joblog join.run.log --max-procs 12 \\ bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v3.1.1.details.pre.tab {} joined/{/}) &> bedJoinTabOffset.log & time sort --merge joined/*.bed | grep -v "^#" > gnomad.v3.1.1.genomes.joined.bed # real 16m4.876s # user 12m2.699s # sys 6m30.244s # and lastly turn the merged bed into a bigBed: time (bedToBigBed -type=bed9+15 -tab -as=genomes.as -extraIndex=name,rsId,_displayName gnomad.v3.1.1.genomes.joined.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb) &> bigBed.log & # Max Tue Apr 5 04:14:40 PDT 2022 - adding mutation constraint subtrack cd /hive/data/genomes/hg38/bed/gnomad/constraint # Downloaded BED file from https://www.biorxiv.org/content/10.1101/2022.03.20.485034v1.supplementary-material # got chrX scores from konrad.j.karczewski@gmail.com because they feel unsure about them cat Supplementary_Data_2.bed constraint_z_genome_1kb_chrx.bed > mutConstraint.bed bedSort mutConstraint.bed mutConstraint.bed bedGraphToBigWig *.bed ../../../chrom.sizes mutConstraint.bw bedGraphToBigWig *.bed ../../../chrom.sizes mutConstraint.bw ############################################################################## # gnomAD v4 - Nov 1, 2023 - ChrisL - DONE ############################################################################## # See /hive/data/outside/gnomAD.4/make.txt for how to download # the new version # Make the work dir: mkdir -p /hive/data/inside/gnomAD/v4 cd /hive/data/inside/gnomAD/v4 # for each vcf in the download dirs, install files into /gbdb and load up a table with the pointers gbdbPath="/gbdb/hg38/gnomAD/v4/" dataPath="/hive/data/outside/gnomAD.4/" for tbl in exomes genomes do mkdir -p ${gbdbPath}${tbl} ln -s ${dataPath}${tbl}/*.vcf.bgz* ${gbdbPath}${tbl} cp /dev/null ${tbl}.txt if [ ${tbl} == "exomes" ]; then for c in {1..22} X Y do f=${gbdbPath}${tbl}/gnomad.exomes.v4.0.sites.chr${c}.vcf.bgz echo -e "${f}\tchr${c}" >> ${tbl}.txt done else for c in {1..22} X Y do f=${gbdbPath}${tbl}/gnomad.genomes.v4.0.sites.chr${c}.vcf.bgz echo -e "${f}\tchr${c}" >> ${tbl}.txt done fi tName="${tbl^}" hgLoadSqlTab hg38 gnomad${tName}V4 ~/kent/src/hg/lib/bbiChroms.sql genomes.txt done ############################################################################## # Add cancer/non-cancer filter to gnomAD v3.1.1 - Feb 16, 2024 - ChrisL - DONE ############################################################################## # see /hive/data/inside/gnomAD/v3.1.1/2024-02-12/README # for the steps ############################################################################## ############################################################################## # Update transcript contraint scores for gnomAD v4 - Mar 1, 2024 - ChrisL - DONE ############################################################################## # The data is not the same as the hg19 version, many less fields, but we mostly threw a lot of them # away before anyways. Also this file had both ENST and NM/XM transcripts # Here are the fields used in the previous version: zcat ../../gnomAD.2/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz | head -1 | tawk '{print $76,$77,$78,$65,$66,$1,$2,$4,$5,$6,$34,$13,$14,$15,$33,$18,$21,$22,$25,$26,$27,$28,$29,$30,$31}' chromosome start_position end_position gene_id transcript_level gene transcript obs_mis exp_mis oe_mis mis_z obs_syn exp_syn oe_syn syn_z obs_lof exp_lof pLI oe_lof oe_syn_lower oe_syn_upper oe_mis_lower oe_mis_upper oe_lof_lower oe_lof_upper # Right off the bat we will be missing the chrom,start and end # Get the transcripts to get the coordinates and exon-intron boundaries hgsql -Ne "select * from wgEncodeGencodeCompV39" hg38 \ | cut -f2- | genePredToBed -tab stdin stdout | sed -Ee 's/\.[0-9]+//' \ | sort -k4 > hg38.gencodeCompV39.bed12 hgsql -Ne "select * from ncbiRefSeq" hg38 \ | cut -f2- | genePredToBed -tab stdin stdout \ | sort -k4 > hg38.refSeq.bed12 cat hg38.gencodeCompV39.bed12 hg38.refSeq.bed12 | sort -k4 > transcripts.coords f=gnomad.v4.0.constraint_metrics.tsv # I don't think this command will work just copying and pasting like the above will tail -n +2 $f \ | tawk '{print $1,$2,$24,$25,$27,$32,$37,$38,$40,$45,$12,$13,$17,$15,$42,$43,$29,$30,$20,$21}' \ | sort -k2 | join -t $'\t' -1 4 -2 2 transcripts.coords - \ | tawk '{for (i=1; i<=12; i++) {printf "%s\t", $i} printf "%s\t%s\t%s\t\t\t", $2, $3, $4; for (i=13; i <= NF; i++) {printf "%s", $i; if (i != NF) {printf "\t"}}; printf "\n"} ' \ | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true bedSort pliByTranscript.tab pliByTranscript.tab bedSort missenseByTranscript.tab missenseByTranscript.tab # Copy the old autosql file: cp ../../gnomAD.2/constraint/{missense,pli}Metrics.as . # Turn into a bigBed and link sizes=/hive/data/genomes/hg38/chrom.sizes bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.tab $sizes pliByTranscript.bb bedToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByTranscript.tab $sizes missenseByTranscript.bb ############################################################################## # gnomAD v4.1 Update to bigBed - May 02, 2024 - ChrisL ############################################################################## # start a screen as this will take a while screen -S gnomADv4 WORKDIR=/hive/data/inside/gnomAD/v4/v4.1 cd $WORKDIR db="hg38" mkdir -p $db/{exomes,genomes} # get the headers: bcftools view -h /hive/data/outside/gnomAD.4/v4.1/genomes/gnomad.genomes.v4.1.sites.chr1.vcf.bgz > gnomad.v4.1.genomes.header bcftools view -h /hive/data/outside/gnomAD.4/v4.1/exomes/gnomad.exomes.v4.1.sites.chr1.vcf.bgz > gnomad.v4.1.exomes.header # Looks like there are INFO field differences between the two files, for some reason # there are different populations included for each? # The Genomes data has Amish population information, while the exomes data has UK BioBank # data and faf95_mid? # For the full list of differences, use this command: sdiff <(grep -Eo "^##.*=<[^,]+," gnomad.v4.1.genomes.header | sort) <(grep -Eo "^##.*=<[^,]+," gnomad.v4.1.exomes.header | sort ) | less # List out the INFO fields for each set: grep -Eo "^##INFO= gnomad.v4.1.genomes.infoFields # The VEP field has most of the important information, format: bcftools view -h /hive/data/outside/gnomAD.4/v4.1/exomes/gnomad.exomes.v4.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.exomes.v4.1.vep.fields bcftools view -h /hive/data/outside/gnomAD.4/v4.1/genomes/gnomad.genomes.v4.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.genomes.v4.1.vep.fields # at least the VEP fields are the same # Use the fields from v3.1.1 as a starting point: grep "^fields" ../../v3.1.1/runVcfToBed.sh | cut -d'=' -f2 | tr ',' '\n' | tr -d '"' > v3.1.1.vcfToBed.fieldList # check at least those fields are present in both sets: for line in $(cat v3.1.1.vcfToBed.fieldList); do grep $line gnomad.v4.1.exomes.infoFields &> /dev/null; if [ $? != 0 ]; then echo "$line not in v4.1 exomes"; fi; done popmax not in v4.1 exomes AC_popmax not in v4.1 exomes AN_popmax not in v4.1 exomes AF_popmax not in v4.1 exomes AC_ami not in v4.1 exomes AN_ami not in v4.1 exomes AF_ami not in v4.1 exomes nhomalt_ami not in v4.1 exomes AC_oth not in v4.1 exomes AN_oth not in v4.1 exomes AF_oth not in v4.1 exomes nhomalt_oth not in v4.1 exomes revel_score not in v4.1 exomes splice_ai_max_ds not in v4.1 exomes splice_ai_consequence not in v4.1 exomes primate_ai_score not in v4.1 exomes for line in $(cat v3.1.1.vcfToBed.fieldList); do grep $line gnomad.v4.1.genomes.infoFields &> /dev/null; if [ $? != 0 ]; then echo "$line not in v4.1 genomes"; fi; done # So they changed popmax to grpmax, and spliceAI and revel scores: grep -i "revel\|splice\|primate" gnomad.v4.1.exomes.infoFields revel_max spliceai_ds_max grep -i "revel\|splice\|primate" gnomad.v4.1.genomes.infoFields revel_max spliceai_ds_max cp v3.1.1.vcfToBed.fieldList v4.1.vcfToBed.exomes.fieldList cp v3.1.1.vcfToBed.fieldList v4.1.vcfToBed.genomes.fieldList # Make the fixups in vim, including the population information changes: remove ami from exomes, oth to remaining and the revel and splice_ai field changes, add variant_type # I used this command fixing up until there was nothing only on the left # and no '|' on the right sdiff <(sort v4.1.vcfToBed.exomes.fieldList) <(sort gnomad.v4.1.exomes.infoFields) | grep -v "ukb" | less # Now make two control scripts for specifying fields, see # /hive/data/inside/gnomAD/v4/v4.1/runVcfToBed.{exomes,genomes}.sh and # /hive/data/inside/gnomAD/v4/v4.1/convertBeds.{exomes,genomes}.sh # for more information # First convert the VCFs to beds: # Need a jobList for parallel to run: for c in {1..22} X Y; do echo "./runVcfToBed.exomes.sh /hive/data/outside/gnomAD.4/v4.1/exomes/gnomad.exomes.v4.1.sites.chr$c.vcf.bgz $WORKDIR/$db/exomes/gnomad.v4.1.chr$c.bed" done > vcfToBed.jobList for c in {1..22} X Y; do echo "./runVcfToBed.genomes.sh /hive/data/outside/gnomAD.4/v4.1/genomes/gnomad.genomes.v4.1.sites.chr$c.vcf.bgz $WORKDIR/$db/genomes/gnomad.v4.1.chr$c.bed" done >> vcfToBed.jobList parallel --joblog vcfToBed.run.log --jobs 15 :::: vcfToBed.jobList # Average of 105 minutes per job: tail -n +2 /hive/data/inside/gnomAD/v4/v4.1/vcfToBed.run.log | cut -f4 | awk '{sum += $1} END {print (sum/NR) / 60}' 54.0564 # convert the beds into a smaller bed file + a tab sep file with everything else for c in {1..22} X Y; do echo "./convertBeds.exomes.sh $WORKDIR/$db/exomes/gnomad.v4.1.chr$c.bed $WORKDIR/$db/exomes/chr$c.bed" done > convertBed.jobList for c in {1..22} X Y; do echo "./convertBeds.genomes.sh $WORKDIR/$db/genomes/gnomad.v4.1.chr$c.bed $WORKDIR/$db/genomes/chr$c.bed" done >> convertBed.jobList + +# convert the beds into a smaller bed file + a tab sep file with everything else time (parallel --joblog convertBed.run.log --jobs 25 :::: convertBed.jobList) &> convertBed.log & +# real 178m32.563s +# user 2843m21.030s +# sys 134m13.788s # Now we have one bed file and one external file per chromosome, use bgzip and # bedJoinTabOffset to combine them together: # Merge all the tab files together: -mkdir -p joined/{exomes,genomes} # First exomes: time sort --merge ${WORKDIR}/hg38/exomes/gnomad.*.tab > gnomad.v4.1.exomes.details.pre.tab -# real 14m24.203s -# user 2m17.384s -# sys 10m16.118s +# real 15m11.762s +# user 2m11.927s +# sys 11m1.171s time bgzip -iI gnomad.v4.1.exomes.details.tab.gz.gzi -c gnomad.v4.1.exomes.details.pre.tab > gnomad.v4.1.exomes.details.tab.gz -# real 62m28.397s -# user 58m29.676s -# sys 3m31.012s +# real 159m17.597s +# user 79m31.099s +# sys 5m1.504s # The parallel command changes {} into the input file (example: hg38/exomes/chr1.bed), and # {/} to the basename (example: chr1.bed) time (ls -1S hg38/exomes/chr*.bed | parallel --joblog join.run.log --max-procs 12 \\ bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v4.1.exomes.details.pre.tab {} joined/exomes/{/}) &> bedJoinTabOffset.log -# real 155m42.266s -# user 381m15.009s -# sys 50m29.407s +# real 129m2.819s +# user 379m29.001s +# sys 58m4.237s + time sort -S 40G --merge joined/exomes/*.bed | grep -v "^#" > gnomad.v4.1.exomes.joined.bed -# real 6m38.608s -# user 3m49.595s -# sys 2m43.676s +# real 7m43.575s +# user 83m46.236s +# sys 9m1.771s # Then genomes: time sort --merge ${WORKDIR}/hg38/genomes/gnomad.*.tab > gnomad.v4.1.genomes.details.pre.tab -# real 48m36.383s -# user 7m40.632s -# sys 34m7.595s +# real 55m0.135s +# user 8m46.206s +# sys 41m47.511s -# FIELD_FIX_RESTART_MARK time bgzip -iI gnomad.v4.1.genomes.details.tab.gz.gzi -c gnomad.v4.1.genomes.details.pre.tab > gnomad.v4.1.genomes.details.tab.gz -# real 230m19.470s -# user 216m16.280s -# sys 11m28.438s +# real 245m46.743s +# user 232m29.072s +# sys 11m57.409s # The parallel command changes {} into the input file (example: hg38/genomes/chr1.bed), and # {/} to the basename (example: chr1.bed) time (ls -1S hg38/genomes/chr*.bed | parallel --joblog join.run.log --max-procs 12 \\ - bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v4.1.genomes.details.pre.tab {} joined/genomes/{/}) &> bedJoinTabOffset.log -# real 430m5.112s -# user 1505m15.341s -# sys 286m35.990s + bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v4.1.genomes.details.pre.tab {} joined/genomes/{/}) &> bedJoinTabOffset.genomes.log +# real 373m33.753s +# user 1643m28.197s +# sys 275m53.429s time sort -S 40G --merge joined/genomes/*.bed | grep -v "^#" > gnomad.v4.1.genomes.joined.bed -# real 26m31.856s -# user 14m24.696s -# sys 9m57.459s +# real 27m20.072s +# user 16m46.584s +# sys 12m3.441s # and lastly turn the merged beds into bigBeds: -time (bedToBigBed -type=bed9+16 -tab -as=exomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.exomes.joined.bed /hive/data/genomes/hg38/chrom.sizes exomes.bb) &> bigBed.exomeslog & -# pass1 - making usageList (24 chroms): 239308 millis -# pass2 - checking and writing primary data (183717261 records, 25 fields): 1695174 millis -# Sorting and writing extra index 0: 258529 millis -# Sorting and writing extra index 1: 172276 millis -# Sorting and writing extra index 2: 73698 millis - -# real 43m44.216s -# user 40m4.371s -# sys 1m54.306s - -time (bedToBigBed -maxAlloc=250000000000 -type=bed9+16 -tab -as=genomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.genomes.joined.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb) &> bigBed.genomeslog & -# pass1 - making usageList (24 chroms): 720685 millis -# pass2 - checking and writing primary data (759336320 records, 25 fields): 8127679 millis -# Sorting and writing extra index 0: 1711243 millis -# Sorting and writing extra index 1: 1627587 millis -# Sorting and writing extra index 2: 436891 millis -# -# real 227m12.682s -# user 204m24.791s -# sys 20m16.421s - -# Make symlinks to /gbdb: -cd /gbdb/hg38/gnomAD/ -mkdir v4.1 -cd v4.1 -mkdir exomes -mkdir genomes -cd $WORKDIR -ln -s `pwd`/genomes.bb /gbdb/hg38/gnomAD/v4.1/genomes/ -ln -s `pwd`/gnomad.v4.1.genomes.details.tab.gz /gbdb/hg38/gnomAD/v4.1/genomes/ -ln -s `pwd`/gnomad.v4.1.genomes.details.tab.gz.gzi /gbdb/hg38/gnomAD/v4.1/genomes/ -ln -s `pwd`/exomes.bb /gbdb/hg38/gnomAD/v4.1/exomes/ -ln -s `pwd`/gnomad.v4.1.exomes.details.tab.gz /gbdb/hg38/gnomAD/v4.1/exomes/ -ln -s `pwd`/gnomad.v4.1.exomes.details.tab.gz.gzi /gbdb/hg38/gnomAD/v4.1/exomes/ - -# Woops the conversion script had the wrong header listing, change it up: -sed -i -e '1,24s/segdup/segdup\tvariant_type\tallele_type/' gnomad.v4.1.genomes.details.pre.tab -# and then re-do the genomes steps from the FIELD_FIX_RESTART_MARK mark +time (bedToBigBed -type=bed9+21 -tab -as=exomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.exomes.joined.bed /hive/data/genomes/hg38/chrom.sizes exomes.bb) &> bigBed.exomeslog & +# pass1 - making usageList (24 chroms): 172816 millis +# pass2 - checking and writing primary data (183717261 records, 31 fields): 1872918 millis +# Sorting and writing extra index 0: 248199 millis +# Sorting and writing extra index 1: 206027 millis +# Sorting and writing extra index 2: 99569 millis + +# real 46m32.224s +# user 43m8.900s +# sys 2m32.009s + +time (bedToBigBed -maxAlloc=250000000000 -type=bed9+21 -tab -as=genomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.genomes.joined.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb) &> bigBed.genomeslog & +# pass1 - making usageList (24 chroms): 776401 millis +# pass2 - checking and writing primary data (759336320 records, 31 fields): 10139390 millis +# Sorting and writing extra index 0: 1502079 millis +# Sorting and writing extra index 1: 1460242 millis +# Sorting and writing extra index 2: 433804 millis + +# real 254m58.161s +# user 230m40.491s +# sys 21m41.152s ############################################################################## # gnomAD v4.1 Missense and pli by transcript tracks- June 03, 2024 - ChrisL ############################################################################## wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/constraint/gnomad.v4.1.constraint_metrics.tsv # Get the transcripts to get the coordinates and exon-intron boundaries hgsql -Ne "select * from wgEncodeGencodeCompV39" hg38 \ | cut -f2- | genePredToBed -tab stdin stdout | sed -Ee 's/\.[0-9]+//' \ | sort -k4 > hg38.gencodeCompV39.bed12 hgsql -Ne "select * from ncbiRefSeq" hg38 \ | cut -f2- | genePredToBed -tab stdin stdout \ | sort -k4 > hg38.refSeq.bed12 cat hg38.gencodeCompV39.bed12 hg38.refSeq.bed12 | sort -k4 > transcripts.coords f=gnomad.v4.1.constraint_metrics.tsv # The order of fields is different between v4.0 and v4.1, figure out the fields we need to extract: head -1 ../gnomad.v4.0.constraint_metrics.tsv | tawk '{print $1,$2,$24,$25,$27,$32,$37,$38,$40,$45,$12,$13,$17,$15,$42,$43,$29,$30,$20,$21}' | tr '\t' '\n' > v4.0.wantedFields head -1 ../gnomad.v4.0.constraint_metrics.tsv | tr '\t' '\n' | nl > v4.0.fieldOrder head -1 $f | tr '\t' '\n' | nl > v4.1.fieldOrder grep -Fwf v4.0.wantedFields v4.1.fieldOrder > v4.1.wantedFields for field in $(head -1 ../gnomad.v4.0.constraint_metrics.tsv | tawk '{print $1,$2,$24,$25,$27,$32,$37,$38,$40,$45,$12,$13,$17,$15,$42,$43,$29,$30,$20,$21}' | tr '\t' '\n'); do grep -w $field v4.1.fieldOrder; done | cut -f1 | tr '\n' ',' | tr -d ' '; echo # I don't think this command will work just copying and pasting like the above will tail -n +2 $f \ | tawk '{print $1,$3,$28,$29,$31,$36,$41,$42,$44,$49,$14,$15,$19,$17,$46,$47,$33,$34,$22,$23}' \ | sort -k2 | join -t $'\t' -1 4 -2 2 transcripts.coords - \ | tawk '{for (i=1; i<=12; i++) {printf "%s\t", $i} printf "%s\t%s\t%s\t\t\t", $2, $3, $4; for (i=13; i <= NF; i++) {printf "%s", $i; if (i != NF) {printf "\t"}}; printf "\n"} ' \ | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true bedSort pliByTranscript.tab pliByTranscript.tab bedSort missenseByTranscript.tab missenseByTranscript.tab # Copy the old autosql file: cp ../{missense,pli}Metrics.as . # Turn into a bigBed and link sizes=/hive/data/genomes/hg38/chrom.sizes bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.tab $sizes pliByTranscript.bb pass1 - making usageList (376 chroms): 443 millis pass2 - checking and writing primary data (168326 records, 18 fields): 3529 millis Sorting and writing extra index 0: 91 millis Sorting and writing extra index 1: 83 millis bedToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByTranscript.tab $sizes missenseByTranscript.bb pass1 - making usageList (376 chroms): 505 millis pass2 - checking and writing primary data (168326 records, 17 fields): 2841 millis Sorting and writing extra index 0: 171 millis Sorting and writing extra index 1: 89 millis