edcdcb9dd80eba31d99e159e27e6db9fe360cece
gperez2
  Sat Sep 21 17:05:59 2024 -0700
Adding the gnomAD v4.1 CNV track, refs 34253

diff --git src/hg/makeDb/doc/hg38/gnomad.txt src/hg/makeDb/doc/hg38/gnomad.txt
index 93a6d59..2911d24 100644
--- src/hg/makeDb/doc/hg38/gnomad.txt
+++ src/hg/makeDb/doc/hg38/gnomad.txt
@@ -1,582 +1,645 @@
 # gnomAD for hg38
 # RM 21782
 
 # Genome Aggregation Database at the Broad
 # https://gnomad.broadinstitute.org
 
 # (2019-10-07 kate)
 
 # March 6, 2019: gnomAD 2.1.1
 
 mkdir vcf; cd vcf
 
 # hg38
 
 mkdir hg38; cd hg38
 mkdir -p /gbdb/hg38/gnomAD/vcf
 
 # Download from GRCh38 liftover section of downloads page:
 https://gnomad.broadinstitute.org/downloads#variants-grch38-liftover
 
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/README.txt
 
 # Exome variants
 
 # vcf
 # 85.31 GiB, MD5: cff8d0cfed50adc9211d1feaed2d4ca7
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz >&! wget.exomes.log &
 
 # from log: 20 minutes download time
 # --2019-10-07 16:40:48--  https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 # 2019-10-07 17:00:52 (72.6 MB/s) - 'gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz' saved [91604348731/91604348731]
 
 du -sh *.bgz
 # 171G    gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 
 md5sum gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 # cff8d0cfed50adc9211d1feaed2d4ca7  gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 
 ln -s `pwd`/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.gz
 
 # index file
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi
 
 ln -s `pwd`/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.gz.tbi
 
 # Genomes Variants
 # 743.06 GiB, MD5: 83de3d5b52669f714e810d4fcf047c18
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz >&! wget.genomes.log &
 
 # from log:  3 hrs download time
 #--2019-10-07 17:03:11--  https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 # 2019-10-07 20:02:56 (70.6 MB/s) - 'gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz' saved [797851733355/797851733355]
 
 ln -s `pwd`/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.gz
 
 wget https://storage.googleapis.com/gnomad-public/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi
 ln -s `pwd`/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.gz.tbi
 
 #################
 Use/adapt trackDb and HTML from hg19 track
 
 #################
 # Add V3.  Native hg38 analysis (not lifted)
 # Released 10/16/2019 by MacArthur lab, announced here:
 # https://macarthurlab.org/2019/10/16/gnomad-v3-0/
 # (2019-10-21 kate)
 
 mkdir /hive/data/outside/gnomAD.3
 cd /hive/data/outside/gnomAD.3
 
 wget https://storage.googleapis.com/gnomad-public/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.vcf.bgz.tbi
 
 ln -s `pwd`/gnomad.genomes.r3.0.sites.vcf.bgz.tbi /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r3.0.sites.vcf.gz.tbi
 
 (date; wget https://storage.googleapis.com/gnomad-public/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.vcf.bgz; date) >&! wget.genomes.log &
 
 #Mon Oct 21 11:26:58 PDT 2019
 #Mon Oct 21 12:27:14 PDT 2019
 #    ~1 hr
 
 md5sum gnomad.genomes.r3.0.sites.vcf.bgz
 
 ln -s `pwd`/gnomad.genomes.r3.0.sites.vcf.bgz /gbdb/hg38/gnomAD/vcf/gnomad.genomes.r3.0.sites.vcf.gz
 
 
 ##############################################################################
 # gnomAD v3.1 Update - Nov 24, 2020 - ChrisL - DONE
 # See /hive/data/outside/gnomAD.3/v3.1/make.txt for how to download
 # the new version
 
 WORKDIR=/hive/data/inside/gnomAD/v3.1
 cd $WORKDIR
 db="hg38"
 mkdir -p $db/genomes
 
 # get the headers:
 bcftools view -h /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz > gnomad.v3.1.header
 
 # The VEP field has the important information, format:
 bcftools view -h /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.v3.1.vep.fields
 
 # the specific fields (same as v2.1.1 except '_' has been swapped to '-', EXCEPT in the popmax
 # fields); also add some new fields like CADD scores:
 fields="AC,AN,AF,faf95,nhomalt,vep,popmax,AC_popmax,AN_popmax,AF_popmax,AC-afr,AN-afr,AF-afr,nhomalt-afr,AC-ami,AN-ami,AF-ami,nhomalt-ami,AC-amr,AN-amr,AF-amr,nhomalt-amr,AC-asj,AN-asj,AF-asj,nhomalt-asj,AC-eas,AN-eas,AF-eas,nhomalt-eas,AC-fin,AN-fin,AF-fin,nhomalt-fin,AC-mid,AN-mid,AF-mid,nhomalt-mid,AC-nfe,AN-nfe,AF-nfe,nhomalt-nfe,AC-sas,AN-sas,AF-sas,nhomalt-sas,AC-oth,AN-oth,AF-oth,nhomalt-oth,cadd_phred,revel_score,splice_ai_max_ds,splice_ai_consequence,primate_ai_score"
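 
 # (Not from the original notes, just an illustration of the renaming described above: for the
 # per-population AC/AN/AF/nhomalt fields the v2.1.1 names map to the v3.1 names by swapping
 # '_' for '-', while the popmax fields keep their underscores.)
 printf 'AC_afr\nAC_popmax\n' | sed '/popmax/!s/_/-/g'
 # AC-afr
 # AC_popmax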
 
 # Don't use parallel anymore, use parasol because this still takes far too long and bogs down hgwdev:
 
 # make script to run each job:
 cat <<'_EOF' > run.sh
 #!/bin/bash
 
 # don't do the following, as vcfToBed will correctly exit with an error
 #set -beEu -o pipefail
 fields="AC,AN,AF,faf95,nhomalt,vep,popmax,AC_popmax,AN_popmax,AF_popmax,AC-afr,AN-afr,AF-afr,nhomalt-afr,AC-ami,AN-ami,AF-ami,nhomalt-ami,AC-amr,AN-amr,AF-amr,nhomalt-amr,AC-asj,AN-asj,AF-asj,nhomalt-asj,AC-eas,AN-eas,AF-eas,nhomalt-eas,AC-fin,AN-fin,AF-fin,nhomalt-fin,AC-mid,AN-mid,AF-mid,nhomalt-mid,AC-nfe,AN-nfe,AF-nfe,nhomalt-nfe,AC-sas,AN-sas,AF-sas,nhomalt-sas,AC-oth,AN-oth,AF-oth,nhomalt-oth,cadd_phred,revel_score,splice_ai_max_ds,splice_ai_consequence,primate_ai_score"
 infile=$1
 finalBed=$2
 
 # get the suffix name after the final '/':
 suffix=${infile#/hive/data/outside/gnomAD.3/v3.1/genomes/}
 suffix=${suffix%.vcf.bgz}
 
 outBed=/hive/data/inside/gnomAD/v3.1/hg38/genomes/${suffix}.bed
 
 vcfToBed -fields="${fields}" ${infile} ${outBed} 2>/dev/null
 /hive/data/inside/gnomAD/v3.1/gnomadVcfBedToBigBed ${outBed} stdout | sort -k1,1 -k2,2n > ${finalBed}
 _EOF
 
 # make a jobList file, here's a sample line:
 ./run.sh /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr1.vcf.bgz {check out exists+ /hive/data/inside/gnomAD/v3.1/hg38/genomes/chr1.bed}
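 
 # (Sketch, not recorded in the original notes: the full jobList can be generated with a loop
 # like this, one line per chromosome, following the sample line above; add chrM if a chrM VCF
 # is present.)
 for c in {1..22} X Y; do
     echo "./run.sh /hive/data/outside/gnomAD.3/v3.1/genomes/gnomad.genomes.v3.1.sites.chr$c.vcf.bgz {check out exists+ /hive/data/inside/gnomAD/v3.1/hg38/genomes/chr$c.bed}"
 done > jobList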
 
 ssh ku "cd ${WORKDIR}; para create jobList; para try; exit"
 
 # whoops, errors in the gnomadVcfBedToBigBed script caused the first 10 jobs to fail; fix up the
 # script and run those jobs separately:
 # the regular expression extracts chr*.bed out of the input file name and uses just that name
 # as the output file
 
 # first start the rest of the jobs:
 ssh ku "cd ${WORKDIR}; para push; exit"
 # then finish off:
 parallel -j10 --joblog secondhalf.run.log --plus "./gnomadVcfBedToBigBed {} hg38/genomes/{= s/.*(chr[0-9]+\.bed$)/\$1/ =}" ::: hg38/genomes/gnomad.genomes.v3.1.sites.chr{1,10,11,12,13,14,15,16,17,18}.bed
 # See para.time and secondhalf.run.log for timing info
 
 time sort -k1,1 -k2,2n hg38/genomes/chr*.bed > gnomad.v3.1.genomes.bed
 # real    26m7.992s
 # user    31m40.053s
 # sys     14m31.118s
 
 nohup time bedToBigBed -type=bed9+64 -tab -as=genomes.as gnomad.v3.1.genomes.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb &> bigBed.log &
 mkdir -p /gbdb/hg38/gnomAD/v3.1/variants/
 ln -s `pwd`/genomes.bb /gbdb/hg38/gnomAD/v3.1/variants/
 
 ##############################################################################
 # gnomAD v3.1.1 Update - Jun 29, 2021 - ChrisL - DONE
 # See /hive/data/outside/gnomAD.3/v3.1.1/make.txt for how to download
 # the new version
 ##############################################################################
 WORKDIR=/hive/data/inside/gnomAD/v3.1.1
 cd $WORKDIR
 db="hg38"
 mkdir -p $db/genomes
 
 # get the headers:
 bcftools view -h /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr1.vcf.bgz > gnomad.v3.1.1.header
 
 # The VEP field has most of the important information, format:
 bcftools view -h /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.v3.1.1.vep.fields
 
 # Now make two control scripts for specifying fields, see
 # /hive/data/inside/gnomAD/v3.1.1/runVcfToBed.sh and
 # /hive/data/inside/gnomAD/v3.1.1/convertBeds.sh
 # for more information
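 
 # (Sketch only, not the real script: based on run.sh from v3.1 above, runVcfToBed.sh presumably
 # just wraps vcfToBed with the chosen field list; the field list below is abridged and the
 # script name here is hypothetical -- check the real scripts at the paths above.)
 cat <<'_EOF' > runVcfToBed.example.sh
 #!/bin/bash
 # usage: runVcfToBed.example.sh in.vcf.bgz out.bed
 fields="AC,AN,AF,faf95,nhomalt,vep"   # abridged; the real script lists every wanted INFO field
 vcfToBed -fields="${fields}" "$1" "$2" 2>/dev/null
 _EOF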
 
 # First convert the VCFs to beds:
 #    Need a jobList for parallel to run:
 today=`date +%F`
 for c in {1..22} X Y M; do
     echo "./runVcfToBed.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.1.sites.chr$c.vcf.bgz $WORKDIR/$db/genomes/gnomad.v3.1.1.chr$c.bed"
 done > vcfToBed.jobList
 parallel --joblog vcfToBed.run.log --jobs 10 :::: vcfToBed.jobList
 
 # WHOOPS, wrong path to chrM, run separately:
 ./runVcfToBed.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz /hive/data/inside/gnomAD/v3.1.1/hg38/genomes/gnomad.v3.1.chrM.bed
 
 # Uh oh, chrM went wrong again. The chrM variant calling pipeline is different from
 # the standard variant calling pipeline gnomAD used for the rest of the genome. Thus
 # we need to get specific fields out of the chrM file and append empty fields
 # to the other tab files (the chrM-specific data will be missing for the other
 # chromosomes, while the regular genome data will be mostly missing for the chrM
 # data). This also turned up a bug in the original run where the Total counts
 # for the population data were missing, so just do a full re-run, which unifies the tab files
 # for both chrM and the rest of the genome:
 ./runVcfToBed.chrM.sh /hive/data/outside/gnomAD.3/v3.1.1/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz /hive/data/inside/gnomAD/v3.1.1/hg38/genomes/gnomad.v3.1.chrM.bed
 for c in {1..22} X Y; do
     echo "./convertBeds.sh $WORKDIR/$db/genomes/gnomad.v3.1.1.chr$c.bed $WORKDIR/$db/genomes/chr$c.bed"
 done > convertBed.jobList
 echo "./convertBeds.chrM.sh $WORKDIR/$db/genomes/gnomad.v3.1.chrM.bed $WORKDIR/$db/genomes/chrM.bed" >> convertBed.jobList
 time (parallel --joblog convertBed.run.log --jobs 25 :::: convertBed.jobList) &> convertBed.log &
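 
 # (Illustration only, not from the original notes: the "append empty fields" idea above amounts
 # to padding every non-chrM row with empty columns so all the tab files share one schema.  The
 # real logic is in convertBeds.sh / convertBeds.chrM.sh; nExtra and the file names here are
 # placeholders.)
 awk -v OFS='\t' -v nExtra=3 '{for (i = 0; i < nExtra; i++) $0 = $0 OFS ""; print}' \
     chr1.tab > chr1.padded.tab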
 
 # Now we have one bed file and one external file per chromosome, use bgzip and
 # bedJoinTabOffset to combine them together:
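 # (Not recorded here: the per-chromosome tab files presumably get merged into
 # gnomad.v3.1.1.details.pre.tab first, along the lines of the v4.1 step further below, e.g.:
 #   sort --merge hg38/genomes/*.tab > gnomad.v3.1.1.details.pre.tab )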
 time bgzip -iI gnomad.v3.1.1.details.tab.gz.gzi -c gnomad.v3.1.1.details.pre.tab > gnomad.v3.1.1.details.tab.gz &
 # real  255m53.801s
 # user    238m3.982s
 # sys 16m7.130s
 time (ls -1S hg38/genomes/chr*.bed |
     parallel --joblog join.run.log --max-procs 12 \
         bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v3.1.1.details.pre.tab {} joined/{/}) &> bedJoinTabOffset.log &
 time sort --merge joined/*.bed | grep -v "^#" > gnomad.v3.1.1.genomes.joined.bed
 # real    16m4.876s
 # user    12m2.699s
 # sys 6m30.244s
 
 # and lastly turn the merged bed into a bigBed:
 time (bedToBigBed -type=bed9+15 -tab -as=genomes.as -extraIndex=name,rsId,_displayName gnomad.v3.1.1.genomes.joined.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb) &> bigBed.log &
 
 # Max Tue Apr  5 04:14:40 PDT 2022 - adding mutation constraint subtrack
 cd /hive/data/genomes/hg38/bed/gnomad/constraint
 # Downloaded BED file from https://www.biorxiv.org/content/10.1101/2022.03.20.485034v1.supplementary-material
 # got chrX scores from konrad.j.karczewski@gmail.com because they feel unsure about them
 cat Supplementary_Data_2.bed constraint_z_genome_1kb_chrx.bed > mutConstraint.bed
 bedSort mutConstraint.bed mutConstraint.bed
 bedGraphToBigWig mutConstraint.bed ../../../chrom.sizes mutConstraint.bw
 
 
 ##############################################################################
 # gnomAD v4 - Nov 1, 2023 - ChrisL - DONE
 ##############################################################################
 # See /hive/data/outside/gnomAD.4/make.txt for how to download
 # the new version
 # Make the work dir:
 mkdir -p /hive/data/inside/gnomAD/v4
 cd /hive/data/inside/gnomAD/v4
 
 # For each VCF in the download dirs, install the files into /gbdb and load up a table with the pointers
 gbdbPath="/gbdb/hg38/gnomAD/v4/"
 dataPath="/hive/data/outside/gnomAD.4/"
 for tbl in exomes genomes
 do
     mkdir -p ${gbdbPath}${tbl}
     ln -s ${dataPath}${tbl}/*.vcf.bgz* ${gbdbPath}${tbl}
     cp /dev/null ${tbl}.txt
     if [ ${tbl} == "exomes" ]; then
         for c in {1..22} X Y
         do
             f=${gbdbPath}${tbl}/gnomad.exomes.v4.0.sites.chr${c}.vcf.bgz
             echo -e "${f}\tchr${c}" >> ${tbl}.txt
         done
     else
         for c in {1..22} X Y
         do
             f=${gbdbPath}${tbl}/gnomad.genomes.v4.0.sites.chr${c}.vcf.bgz
             echo -e "${f}\tchr${c}" >> ${tbl}.txt
         done
     fi
     tName="${tbl^}"
     hgLoadSqlTab hg38 gnomad${tName}V4 ~/kent/src/hg/lib/bbiChroms.sql ${tbl}.txt
 done
 
 
 ##############################################################################
 # Add cancer/non-cancer filter to gnomAD v3.1.1  - Feb 16, 2024 - ChrisL - DONE
 ##############################################################################
 # see /hive/data/inside/gnomAD/v3.1.1/2024-02-12/README
 # for the steps
 ##############################################################################
 
 ##############################################################################
 # Update transcript constraint scores for gnomAD v4  - Mar 1, 2024 - ChrisL - DONE
 ##############################################################################
 # The data is not the same as the hg19 version, many fewer fields, but we mostly threw a lot of them
 # away before anyway. Also this file has both ENST and NM/XM transcripts
 
 # Here are the fields used in the previous version:
 zcat ../../gnomAD.2/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz | head -1 | tawk '{print $76,$77,$78,$65,$66,$1,$2,$4,$5,$6,$34,$13,$14,$15,$33,$18,$21,$22,$25,$26,$27,$28,$29,$30,$31}'
 chromosome  start_position  end_position    gene_id transcript_level    gene    transcript  obs_mis exp_mis oe_mis  mis_z   obs_syn exp_syn oe_syn  syn_z   obs_lof exp_lof pLI oe_lof  oe_syn_lower    oe_syn_upper    oe_mis_lower    oe_mis_upper    oe_lof_lower    oe_lof_upper
 # Right off the bat we will be missing the chrom,start and end
 
 # Get the transcripts to get the coordinates and exon-intron boundaries
 hgsql -Ne "select * from wgEncodeGencodeCompV39" hg38 \
     | cut -f2- | genePredToBed -tab stdin stdout | sed -Ee 's/\.[0-9]+//' \
     | sort -k4 > hg38.gencodeCompV39.bed12
 hgsql -Ne "select * from ncbiRefSeq" hg38 \
     | cut -f2- | genePredToBed -tab stdin stdout \
     | sort -k4 > hg38.refSeq.bed12
 cat hg38.gencodeCompV39.bed12 hg38.refSeq.bed12 | sort -k4 > transcripts.coords
 
 f=gnomad.v4.0.constraint_metrics.tsv
 # Unlike the commands above, I don't think this one will work by just copying and pasting:
 tail -n +2 $f \
     | tawk '{print $1,$2,$24,$25,$27,$32,$37,$38,$40,$45,$12,$13,$17,$15,$42,$43,$29,$30,$20,$21}' \
     | sort -k2 | join -t $'\t' -1 4 -2 2 transcripts.coords - \
     | tawk '{for (i=1; i<=12; i++) {printf "%s\t", $i} printf "%s\t%s\t%s\t\t\t", $2, $3, $4; for (i=13; i <= NF; i++) {printf "%s", $i; if (i != NF) {printf "\t"}}; printf "\n"} ' \
     | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true
 bedSort pliByTranscript.tab pliByTranscript.tab
 bedSort missenseByTranscript.tab missenseByTranscript.tab
 
 # Copy the old autosql file:
 cp ../../gnomAD.2/constraint/{missense,pli}Metrics.as .
 
 # Turn into a bigBed and link
 sizes=/hive/data/genomes/hg38/chrom.sizes
 bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.tab $sizes pliByTranscript.bb
 bedToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByTranscript.tab $sizes missenseByTranscript.bb
 
 ##############################################################################
 # gnomAD v4.1 Update to bigBed - May 02, 2024 - ChrisL
 ##############################################################################
 # start a screen as this will take a while
 screen -S gnomADv4
 WORKDIR=/hive/data/inside/gnomAD/v4/v4.1
 cd $WORKDIR
 db="hg38"
 mkdir -p $db/{exomes,genomes}
 
 # get the headers:
 bcftools view -h /hive/data/outside/gnomAD.4/v4.1/genomes/gnomad.genomes.v4.1.sites.chr1.vcf.bgz > gnomad.v4.1.genomes.header
 bcftools view -h /hive/data/outside/gnomAD.4/v4.1/exomes/gnomad.exomes.v4.1.sites.chr1.vcf.bgz > gnomad.v4.1.exomes.header
 
 # Looks like there are INFO field differences between the two files, for some reason
 # there are different populations included for each?
 # The Genomes data has Amish population information, while the exomes data has UK BioBank
 # data and faf95_mid?
 # For the full list of differences, use this command:
 sdiff <(grep -Eo "^##.*=<[^,]+," gnomad.v4.1.genomes.header  | sort) <(grep -Eo "^##.*=<[^,]+," gnomad.v4.1.exomes.header | sort ) | less
 # List out the INFO fields for each set:
 grep -Eo "^##INFO=<ID=[^,]+" gnomad.v4.1.exomes.header | cut -d'=' -f3 > gnomad.v4.1.exomes.infoFields
 grep -Eo "^##INFO=<ID=[^,]+" gnomad.v4.1.genomes.header | cut -d'=' -f3 > gnomad.v4.1.genomes.infoFields
 
 # The VEP field has most of the important information, format:
 bcftools view -h /hive/data/outside/gnomAD.4/v4.1/exomes/gnomad.exomes.v4.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.exomes.v4.1.vep.fields
 bcftools view -h /hive/data/outside/gnomAD.4/v4.1/genomes/gnomad.genomes.v4.1.sites.chr1.vcf.bgz | grep vep | grep -o "Format: .*$" | cut -d' ' -f2- | tr '|' '\t' | tl > gnomad.genomes.v4.1.vep.fields
 # at least the VEP fields are the same
 
 # Use the fields from v3.1.1 as a starting point:
 grep "^fields" ../../v3.1.1/runVcfToBed.sh | cut -d'=' -f2 | tr ',' '\n' | tr -d '"' > v3.1.1.vcfToBed.fieldList
 # check at least those fields are present in both sets:
 for line in $(cat v3.1.1.vcfToBed.fieldList); do grep $line gnomad.v4.1.exomes.infoFields &> /dev/null; if [ $? != 0 ]; then echo "$line not in v4.1 exomes"; fi; done
 popmax not in v4.1 exomes
 AC_popmax not in v4.1 exomes
 AN_popmax not in v4.1 exomes
 AF_popmax not in v4.1 exomes
 AC_ami not in v4.1 exomes
 AN_ami not in v4.1 exomes
 AF_ami not in v4.1 exomes
 nhomalt_ami not in v4.1 exomes
 AC_oth not in v4.1 exomes
 AN_oth not in v4.1 exomes
 AF_oth not in v4.1 exomes
 nhomalt_oth not in v4.1 exomes
 revel_score not in v4.1 exomes
 splice_ai_max_ds not in v4.1 exomes
 splice_ai_consequence not in v4.1 exomes
 primate_ai_score not in v4.1 exomes
 
 for line in $(cat v3.1.1.vcfToBed.fieldList); do grep $line gnomad.v4.1.genomes.infoFields &> /dev/null; if [ $? != 0 ]; then echo "$line not in v4.1 genomes"; fi; done
 
 # So they changed popmax to grpmax, and renamed the spliceAI and revel score fields:
 grep -i "revel\|splice\|primate" gnomad.v4.1.exomes.infoFields
 revel_max
 spliceai_ds_max
 grep -i "revel\|splice\|primate" gnomad.v4.1.genomes.infoFields
 revel_max
 spliceai_ds_max
 cp v3.1.1.vcfToBed.fieldList v4.1.vcfToBed.exomes.fieldList
 cp v3.1.1.vcfToBed.fieldList v4.1.vcfToBed.genomes.fieldList
 # Make the fixups in vim, including the population information changes: remove ami from exomes,
 # rename oth to remaining, make the revel and splice_ai field name changes, and add variant_type.
 # I used this command while fixing up, iterating until there was nothing only on the left
 # and no '|' on the right:
 sdiff <(sort v4.1.vcfToBed.exomes.fieldList) <(sort gnomad.v4.1.exomes.infoFields) | grep -v "ukb" | less
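 
 # (Sketch only of roughly equivalent non-interactive edits; the real fixups were done by hand
 # in vim, and the exact v4.1 field spellings should be double-checked against the *.infoFields
 # lists above.)
 sed -i '/ami/d' v4.1.vcfToBed.exomes.fieldList                    # exomes have no Amish data
 sed -i -e 's/popmax/grpmax/g' -e 's/oth/remaining/g' \
        -e 's/revel_score/revel_max/' -e 's/splice_ai_max_ds/spliceai_ds_max/' \
        -e '/splice_ai_consequence/d' -e '/primate_ai_score/d' \
        v4.1.vcfToBed.exomes.fieldList v4.1.vcfToBed.genomes.fieldList
 echo "variant_type" >> v4.1.vcfToBed.exomes.fieldList
 echo "variant_type" >> v4.1.vcfToBed.genomes.fieldList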
 
 # Now make two control scripts for specifying fields, see
 # /hive/data/inside/gnomAD/v4/v4.1/runVcfToBed.{exomes,genomes}.sh and
 # /hive/data/inside/gnomAD/v4/v4.1/convertBeds.{exomes,genomes}.sh
 # for more information
 
 # First convert the VCFs to beds:
 #    Need a jobList for parallel to run:
 for c in {1..22} X Y; do
     echo "./runVcfToBed.exomes.sh /hive/data/outside/gnomAD.4/v4.1/exomes/gnomad.exomes.v4.1.sites.chr$c.vcf.bgz $WORKDIR/$db/exomes/gnomad.v4.1.chr$c.bed"
 done > vcfToBed.jobList
 for c in {1..22} X Y; do
     echo "./runVcfToBed.genomes.sh /hive/data/outside/gnomAD.4/v4.1/genomes/gnomad.genomes.v4.1.sites.chr$c.vcf.bgz $WORKDIR/$db/genomes/gnomad.v4.1.chr$c.bed"
 done >> vcfToBed.jobList
 parallel --joblog vcfToBed.run.log --jobs 15 :::: vcfToBed.jobList
 
 # Average job runtime in minutes:
 tail -n +2 /hive/data/inside/gnomAD/v4/v4.1/vcfToBed.run.log | cut -f4 | awk '{sum += $1} END {print (sum/NR) / 60}'
 54.0564
 
 # convert the beds into a smaller bed file + a tab sep file with everything else
 for c in {1..22} X Y; do
     echo "./convertBeds.exomes.sh $WORKDIR/$db/exomes/gnomad.v4.1.chr$c.bed $WORKDIR/$db/exomes/chr$c.bed"
 done > convertBed.jobList
 for c in {1..22} X Y; do
     echo "./convertBeds.genomes.sh $WORKDIR/$db/genomes/gnomad.v4.1.chr$c.bed $WORKDIR/$db/genomes/chr$c.bed"
 done >> convertBed.jobList
 
 # convert the beds into a smaller bed file + a tab sep file with everything else
 time (parallel --joblog convertBed.run.log --jobs 25 :::: convertBed.jobList) &> convertBed.log &
 # real    178m32.563s
 # user    2843m21.030s
 # sys     134m13.788s
 
 # Now we have one bed file and one external file per chromosome, use bgzip and
 # bedJoinTabOffset to combine them together:
 # Merge all the tab files together:
 # First exomes:
 time sort --merge ${WORKDIR}/hg38/exomes/gnomad.*.tab > gnomad.v4.1.exomes.details.pre.tab
 # real    15m11.762s
 # user    2m11.927s
 # sys     11m1.171s
 
 time bgzip -iI gnomad.v4.1.exomes.details.tab.gz.gzi -c gnomad.v4.1.exomes.details.pre.tab > gnomad.v4.1.exomes.details.tab.gz
 # real    159m17.597s
 # user    79m31.099s
 # sys     5m1.504s
 
 # The parallel command changes {} into the input file (example: hg38/exomes/chr1.bed), and
 # {/} to the basename (example: chr1.bed)
 time (ls -1S hg38/exomes/chr*.bed |
     parallel --joblog join.run.log --max-procs 12 \
         bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v4.1.exomes.details.pre.tab {} joined/exomes/{/}) &> bedJoinTabOffset.log
 # real    129m2.819s
 # user    379m29.001s
 # sys     58m4.237s
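 
 # (Quick demo of those placeholders, not part of the original run; --dry-run just prints the
 # command that parallel would execute.)
 parallel --dry-run "echo {} {/}" ::: hg38/exomes/chr1.bed
 # echo hg38/exomes/chr1.bed chr1.bed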
 
 time sort -S 40G --merge joined/exomes/*.bed | grep -v "^#" > gnomad.v4.1.exomes.joined.bed
 # real    7m43.575s
 # user    83m46.236s
 # sys     9m1.771s
 
 # Then genomes:
 time sort --merge ${WORKDIR}/hg38/genomes/gnomad.*.tab > gnomad.v4.1.genomes.details.pre.tab
 # real    55m0.135s
 # user    8m46.206s
 # sys     41m47.511s
 
 time bgzip -iI gnomad.v4.1.genomes.details.tab.gz.gzi -c gnomad.v4.1.genomes.details.pre.tab > gnomad.v4.1.genomes.details.tab.gz
 # real    245m46.743s
 # user    232m29.072s
 # sys     11m57.409s
 
 # The parallel command changes {} into the input file (example: hg38/genomes/chr1.bed), and
 # {/} to the basename (example: chr1.bed)
 time (ls -1S hg38/genomes/chr*.bed |
     parallel --joblog join.run.log --max-procs 12 \
         bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v4.1.genomes.details.pre.tab {} joined/genomes/{/}) &> bedJoinTabOffset.genomes.log
 # real    373m33.753s
 # user    1643m28.197s
 # sys     275m53.429s
 
 time sort -S 40G --merge joined/genomes/*.bed | grep -v "^#" > gnomad.v4.1.genomes.joined.bed
 # real    27m20.072s
 # user    16m46.584s
 # sys     12m3.441s
 
 # and lastly turn the merged beds into bigBeds:
 time (bedToBigBed -type=bed9+21 -tab -as=exomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.exomes.joined.bed /hive/data/genomes/hg38/chrom.sizes exomes.bb) &> bigBed.exomeslog &
 # pass1 - making usageList (24 chroms): 172816 millis
 # pass2 - checking and writing primary data (183717261 records, 31 fields): 1872918 millis
 # Sorting and writing extra index 0: 248199 millis
 # Sorting and writing extra index 1: 206027 millis
 # Sorting and writing extra index 2: 99569 millis
 
 # real    46m32.224s
 # user    43m8.900s
 # sys     2m32.009s
 
 time (bedToBigBed -maxAlloc=250000000000 -type=bed9+21 -tab -as=genomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.genomes.joined.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb) &> bigBed.genomeslog &
 # pass1 - making usageList (24 chroms): 776401 millis
 # pass2 - checking and writing primary data (759336320 records, 31 fields): 10139390 millis
 # Sorting and writing extra index 0: 1502079 millis
 # Sorting and writing extra index 1: 1460242 millis
 # Sorting and writing extra index 2: 433804 millis
 
 # real    254m58.161s
 # user    230m40.491s
 # sys     21m41.152s
 
 ##############################################################################
 # gnomAD v4.1 Missense and pli by transcript tracks - June 03, 2024 - ChrisL
 ##############################################################################
 wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/constraint/gnomad.v4.1.constraint_metrics.tsv
 
 # Get the transcripts to get the coordinates and exon-intron boundaries
 hgsql -Ne "select * from wgEncodeGencodeCompV39" hg38 \
     | cut -f2- | genePredToBed -tab stdin stdout | sed -Ee 's/\.[0-9]+//' \
     | sort -k4 > hg38.gencodeCompV39.bed12
 hgsql -Ne "select * from ncbiRefSeq" hg38 \
     | cut -f2- | genePredToBed -tab stdin stdout \
     | sort -k4 > hg38.refSeq.bed12
 cat hg38.gencodeCompV39.bed12 hg38.refSeq.bed12 | sort -k4 > transcripts.coords
 
 f=gnomad.v4.1.constraint_metrics.tsv
 # The order of fields is different between v4.0 and v4.1, figure out the fields we need to extract:
 head -1 ../gnomad.v4.0.constraint_metrics.tsv | tawk '{print $1,$2,$24,$25,$27,$32,$37,$38,$40,$45,$12,$13,$17,$15,$42,$43,$29,$30,$20,$21}' | tr '\t' '\n' > v4.0.wantedFields
 head -1 ../gnomad.v4.0.constraint_metrics.tsv | tr '\t' '\n' | nl > v4.0.fieldOrder
 head -1 $f | tr '\t' '\n' | nl > v4.1.fieldOrder
 grep -Fwf v4.0.wantedFields v4.1.fieldOrder  > v4.1.wantedFields
 for field in $(head -1 ../gnomad.v4.0.constraint_metrics.tsv | tawk '{print $1,$2,$24,$25,$27,$32,$37,$38,$40,$45,$12,$13,$17,$15,$42,$43,$29,$30,$20,$21}' | tr '\t' '\n'); do grep -w $field v4.1.fieldOrder; done | cut -f1 | tr '\n' ',' | tr -d ' '; echo
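 # (output not recorded; judging from the field numbers used in the tawk command below, it
 # would have been roughly 1,3,28,29,31,36,41,42,44,49,14,15,19,17,46,47,33,34,22,23)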
 # Unlike the commands above, I don't think this one will work by just copying and pasting:
 tail -n +2 $f \
     | tawk '{print $1,$3,$28,$29,$31,$36,$41,$42,$44,$49,$14,$15,$19,$17,$46,$47,$33,$34,$22,$23}' \
     | sort -k2 | join -t $'\t' -1 4 -2 2 transcripts.coords - \
     | tawk '{for (i=1; i<=12; i++) {printf "%s\t", $i} printf "%s\t%s\t%s\t\t\t", $2, $3, $4; for (i=13; i <= NF; i++) {printf "%s", $i; if (i != NF) {printf "\t"}}; printf "\n"} ' \
     | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true
 bedSort pliByTranscript.tab pliByTranscript.tab
 bedSort missenseByTranscript.tab missenseByTranscript.tab
 
 # Copy the old autosql file:
 cp ../{missense,pli}Metrics.as .
 
 # Turn into a bigBed and link
 sizes=/hive/data/genomes/hg38/chrom.sizes
 bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.tab $sizes pliByTranscript.bb
 pass1 - making usageList (376 chroms): 443 millis
 pass2 - checking and writing primary data (168326 records, 18 fields): 3529 millis
 Sorting and writing extra index 0: 91 millis
 Sorting and writing extra index 1: 83 millis
 bedToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByTranscript.tab $sizes missenseByTranscript.bb
 pass1 - making usageList (376 chroms): 505 millis
 pass2 - checking and writing primary data (168326 records, 17 fields): 2841 millis
 Sorting and writing extra index 0: 171 millis
 Sorting and writing extra index 1: 89 millis
 ##############################################################################
 # gnomAD Structural Variants v4 - Gerardo
 # Redmine #33823
 
     cd /hive/data/outside/gnomAD.4/sv
     wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.bed.gz
     wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.non_neuro_controls.sites.bed.gz
     #Used the gnomad.v4.1.sv.non_neuro_controls.sites.bed.gz file to build the track since it has
     #additional annotations of frequencies among the non-neuro samples and the control samples,
     #and the two bed files have the same records.
     for f in gnomad.v4.1.sv.non_neuro_controls.sites.bed.gz; do out=${f/.bed.gz/}; zcat $f | tail -n +2 | tawk '{print $1, $2, $3, $4, $26, $29, $27, $28, $30, $31, $32, $33, $34, $39, $41, $44, $45, $47, $48, $49, $52, $53}'  > $out.bed4Plus; done
     # variant types:
 
     zcat  gnomad.v4.1.sv.non_neuro_controls.sites.bed.gz | cut -f45 | sort | uniq -c
      356035 BND
         721 CNV
       15189 CPX
          99 CTX
     1206278 DEL
      269326 DUP
      304645 INS
        2193 INV
           1 SVTYPE
 
     # add colors based on the gnomAD website and get into proper bed9+
     cp /hive/data/outside/gnomAD.2/structuralVariants/gnomadSvToUcsc.awk .
     #Modified the gnomadSvToUcsc.awk script and named it gnomadSvToUcsc_mod.awk to get the following
     #fields:
       #Allele Count for Non-neuro (field #627)
       #Allele Number for Non-neuro (field #626)
       #Allele Frequency for Non-neuro (field #628)
       #Number of heterozygous variant carriers for Non-neuro (field #631)
       #Number of homozygous alternate variant carriers for Non-neuro (field #632)
       #Allele Count Controls (field #803)
       #Allele Number Controls (field #802)
       #Allele Frequency Controls (field #804)
       #Number of heterozygous variant carriers for Controls (field #807)
       #Number of homozygous alternate variant carriers for Controls (field #808)
 
     for f in gnomad.v4.1.sv.non_neuro_controls.sites.bed4Plus; do out=${f/.bed4Plus/}; bedClip -truncate $f /hive/data/genomes/hg38/chrom.sizes stdout | ./gnomadSvToUcsc_mod.awk | sort -k1,1 -k2,2n > $out.bed9Plus; done
 
     cp /hive/data/outside/gnomAD.2/structuralVariants/gnomadSv.as .
     #Modified the gnomadSv.as file to include the non_neuro samples, and the control samples, and
     #named the file gnomadSvMod.as 
     for f in gnomad.v4.1.sv.non_neuro_controls.sites.bed9Plus; do out=${f/.bed9Plus/}; bedToBigBed -tab -type=bed9+19 -as=gnomadSvMod.as -extraIndex=name $f /hive/data/genomes/hg38/chrom.sizes $out.bb; done
     cd /gbdb/hg38/gnomAD/v4
     mkdir structuralVariants; cd structuralVariants
     cp -s /hive/data/outside/gnomAD.4/sv/gnomad.v4.1.sv.non_neuro_controls.sites.bb .
     cd ~/kent/src/hg/makeDb/trackDb/human/hg38
     cp gnomad.alpha.ra gnomadSV.alpha.ra
     #Copied the gnomadSvFull track stanza from ~/kent/src/hg/makeDb/trackDb/human/hg19/trackDb.gnomad.ra
     # Added filters for non-neurological and control Allele Frequencies  
     cp ../hg19/gnomadSv.html .
     #Updated the gnomadSv.html
     
 ##############################################################################
+# gnomAD CNVs v4.1 - Gerardo
+# Redmine #34253
+
+
+    cd /hive/data/outside/gnomAD.4
+    mkdir cnv; cd cnv
+    wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/exome_cnv/gnomad.v4.1.cnv.non_neuro_controls.bed
+
+    #Looked at the bed file and decided to get the following columns
+    #chrom	chromStart	chromEnd	name	score	strand	thickStart	thickEnd	itemRgb	SVLEN	SVTYPE	Genes	SC	SN	SF	SC_XX	SN_XX	SF_XX	SC_XY	SN_XY	SF_XY
+    #Need to remove the "variant_is_80_" prefix from the 4th column to turn the values into gnomAD variant IDs.
+    #Need to add colors to the 9th column based on variant type: Deletion (DEL) "255,0,0" and Duplication (DUP) "0,0,255"
+    
+    #Loop through each file matching the pattern (in this case, only gnomad.v4.1.cnv.non_neuro_controls.bed).
+    for f in gnomad.v4.1.cnv.non_neuro_controls.bed; do
+
+        #Create an output filename by stripping the ".bed" extension; it is used
+        #below to save the modified file with a ".bed9Plus" suffix.
+        out=${f/.bed/}
+
+        #Process the file:
+        # Skip the first 100 header lines with `tail -n +101`.
+        #First awk block makes the bed9Plus layout:
+        # Set both the input and output field separators to tabs (FS=OFS="\t").
+        # Use gsub() to remove the "variant_is_80_" prefix from column 4.
+        # Print specific columns: columns 1, 2, 3, 4; a zero constant for column 5; a period
+        # constant for column 6; columns 2 and 3 again for columns 7 and 8; then columns
+        # 9, 13, 14, 19, 22, 32, 42, 52, 62, 72, 82, 92, and 102.
+        #Second awk block adds colors to column 9:
+        # If column 11 (SVTYPE) is "DEL", set column 9 to "255,0,0" (red).
+        # If column 11 is "DUP", set column 9 to "0,0,255" (blue).
+        #Finally, redirect the output to a new file with the ".bed9Plus" suffix.
+        tail -n +101 $f \
+            | awk 'BEGIN {FS=OFS="\t"} {gsub("variant_is_80_", "", $4); print $1, $2, $3, $4, 0, ".", $2, $3, $9, $13, $14, $19, $22, $32, $42, $52, $62, $72, $82, $92, $102}' \
+            | awk 'BEGIN {FS=OFS="\t"} {if ($11 == "DEL") $9="255,0,0"; else if ($11 == "DUP") $9="0,0,255"; print}' \
+            > $out.bed9Plus
+
+    #End of the loop.
+    done
+    
+    cat gnomad.v4.1.cnv.non_neuro_controls.bed9Plus | cut -f11 | sort | uniq -c
+     20989 DEL
+     25026 DUP
+
+    #Made the .as file: /hive/data/outside/gnomAD.4/cnv/gnomadCNV.as
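+
+    #(Sketch only, not the real file: an autoSql definition covering the 21 columns kept above
+    # would look something like this.  Field names and types here are guesses; the actual
+    # definition is the gnomadCNV.as file at the path above.)
+    cat <<'_EOF' > gnomadCNV.example.as
+table gnomadCnvExample
+"gnomAD v4.1 exome CNVs (bed9+12 sketch)"
+    (
+    string chrom;      "Reference sequence chromosome or scaffold"
+    uint   chromStart; "Start position in chromosome"
+    uint   chromEnd;   "End position in chromosome"
+    string name;       "gnomAD variant ID"
+    uint   score;      "Score, always 0"
+    char[1] strand;    "Strand, always ."
+    uint   thickStart; "Same as chromStart"
+    uint   thickEnd;   "Same as chromEnd"
+    uint   reserved;   "Item color, set by SVTYPE (DEL red, DUP blue)"
+    int    svlen;      "SVLEN from the gnomAD bed"
+    string svtype;     "SVTYPE: DEL or DUP"
+    lstring genes;     "Genes overlapped by the CNV"
+    double sc;         "SC from the gnomAD bed"
+    double sn;         "SN from the gnomAD bed"
+    double sf;         "SF from the gnomAD bed"
+    double scXX;       "SC_XX from the gnomAD bed"
+    double snXX;       "SN_XX from the gnomAD bed"
+    double sfXX;       "SF_XX from the gnomAD bed"
+    double scXY;       "SC_XY from the gnomAD bed"
+    double snXY;       "SN_XY from the gnomAD bed"
+    double sfXY;       "SF_XY from the gnomAD bed"
+    )
+_EOF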
+    
+    #Made a bigBed9plus
+    for f in gnomad.v4.1.cnv.non_neuro_controls.bed9Plus; do out=${f/.bed9Plus/}; bedToBigBed -tab -type=bed9+12 -as=gnomadCNV.as -extraIndex=name $f /hive/data/genomes/hg38/chrom.sizes $out.bb; done
+    
+    cd /gbdb/hg38/gnomAD/v4
+    mkdir cnv; cd cnv
+    ln -s /hive/data/outside/gnomAD.4/cnv/gnomad.v4.1.cnv.non_neuro_controls.bb
+    cd ~/kent/src/hg/makeDb/trackDb/human/hg38
+    #Edited the gnomad.alpha.ra file
+    cp gnomadSv.html gnomadCNV.html
+    #Updated the gnomadCNV.html
+##############################################################################