9af983b83acb335907ac9bf3da0873cb6e06c48d chmalee Wed Jun 26 10:55:48 2024 -0700 Add updates for gnomad v4.1 genomes and exomes to makedoc diff --git src/hg/makeDb/doc/hg38/gnomad.txt src/hg/makeDb/doc/hg38/gnomad.txt index 68e37f6..5acbe10 100644 --- src/hg/makeDb/doc/hg38/gnomad.txt +++ src/hg/makeDb/doc/hg38/gnomad.txt @@ -384,126 +384,112 @@ echo "./runVcfToBed.genomes.sh /hive/data/outside/gnomAD.4/v4.1/genomes/gnomad.genomes.v4.1.sites.chr$c.vcf.bgz $WORKDIR/$db/genomes/gnomad.v4.1.chr$c.bed" done >> vcfToBed.jobList parallel --joblog vcfToBed.run.log --jobs 15 :::: vcfToBed.jobList # Average of 105 minutes per job: tail -n +2 /hive/data/inside/gnomAD/v4/v4.1/vcfToBed.run.log | cut -f4 | awk '{sum += $1} END {print (sum/NR) / 60}' 54.0564 # convert the beds into a smaller bed file + a tab sep file with everything else for c in {1..22} X Y; do echo "./convertBeds.exomes.sh $WORKDIR/$db/exomes/gnomad.v4.1.chr$c.bed $WORKDIR/$db/exomes/chr$c.bed" done > convertBed.jobList for c in {1..22} X Y; do echo "./convertBeds.genomes.sh $WORKDIR/$db/genomes/gnomad.v4.1.chr$c.bed $WORKDIR/$db/genomes/chr$c.bed" done >> convertBed.jobList + +# convert the beds into a smaller bed file + a tab sep file with everything else time (parallel --joblog convertBed.run.log --jobs 25 :::: convertBed.jobList) &> convertBed.log & +# real 178m32.563s +# user 2843m21.030s +# sys 134m13.788s # Now we have one bed file and one external file per chromosome, use bgzip and # bedJoinTabOffset to combine them together: # Merge all the tab files together: -mkdir -p joined/{exomes,genomes} # First exomes: time sort --merge ${WORKDIR}/hg38/exomes/gnomad.*.tab > gnomad.v4.1.exomes.details.pre.tab -# real 14m24.203s -# user 2m17.384s -# sys 10m16.118s +# real 15m11.762s +# user 2m11.927s +# sys 11m1.171s time bgzip -iI gnomad.v4.1.exomes.details.tab.gz.gzi -c gnomad.v4.1.exomes.details.pre.tab > gnomad.v4.1.exomes.details.tab.gz -# real 62m28.397s -# user 58m29.676s -# sys 3m31.012s +# real 159m17.597s +# user 79m31.099s +# sys 5m1.504s # The parallel command changes {} into the input file (example: hg38/exomes/chr1.bed), and # {/} to the basename (example: chr1.bed) time (ls -1S hg38/exomes/chr*.bed | parallel --joblog join.run.log --max-procs 12 \\ bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v4.1.exomes.details.pre.tab {} joined/exomes/{/}) &> bedJoinTabOffset.log -# real 155m42.266s -# user 381m15.009s -# sys 50m29.407s +# real 129m2.819s +# user 379m29.001s +# sys 58m4.237s + time sort -S 40G --merge joined/exomes/*.bed | grep -v "^#" > gnomad.v4.1.exomes.joined.bed -# real 6m38.608s -# user 3m49.595s -# sys 2m43.676s +# real 7m43.575s +# user 83m46.236s +# sys 9m1.771s # Then genomes: time sort --merge ${WORKDIR}/hg38/genomes/gnomad.*.tab > gnomad.v4.1.genomes.details.pre.tab -# real 48m36.383s -# user 7m40.632s -# sys 34m7.595s +# real 55m0.135s +# user 8m46.206s +# sys 41m47.511s -# FIELD_FIX_RESTART_MARK time bgzip -iI gnomad.v4.1.genomes.details.tab.gz.gzi -c gnomad.v4.1.genomes.details.pre.tab > gnomad.v4.1.genomes.details.tab.gz -# real 230m19.470s -# user 216m16.280s -# sys 11m28.438s +# real 245m46.743s +# user 232m29.072s +# sys 11m57.409s # The parallel command changes {} into the input file (example: hg38/genomes/chr1.bed), and # {/} to the basename (example: chr1.bed) time (ls -1S hg38/genomes/chr*.bed | parallel --joblog join.run.log --max-procs 12 \\ - bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v4.1.genomes.details.pre.tab {} joined/genomes/{/}) &> bedJoinTabOffset.log -# real 430m5.112s -# user 1505m15.341s -# sys 286m35.990s + bedJoinTabOffset -verbose=2 -bedKey=3 gnomad.v4.1.genomes.details.pre.tab {} joined/genomes/{/}) &> bedJoinTabOffset.genomes.log +# real 373m33.753s +# user 1643m28.197s +# sys 275m53.429s time sort -S 40G --merge joined/genomes/*.bed | grep -v "^#" > gnomad.v4.1.genomes.joined.bed -# real 26m31.856s -# user 14m24.696s -# sys 9m57.459s +# real 27m20.072s +# user 16m46.584s +# sys 12m3.441s # and lastly turn the merged beds into bigBeds: -time (bedToBigBed -type=bed9+16 -tab -as=exomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.exomes.joined.bed /hive/data/genomes/hg38/chrom.sizes exomes.bb) &> bigBed.exomeslog & -# pass1 - making usageList (24 chroms): 239308 millis -# pass2 - checking and writing primary data (183717261 records, 25 fields): 1695174 millis -# Sorting and writing extra index 0: 258529 millis -# Sorting and writing extra index 1: 172276 millis -# Sorting and writing extra index 2: 73698 millis - -# real 43m44.216s -# user 40m4.371s -# sys 1m54.306s - -time (bedToBigBed -maxAlloc=250000000000 -type=bed9+16 -tab -as=genomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.genomes.joined.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb) &> bigBed.genomeslog & -# pass1 - making usageList (24 chroms): 720685 millis -# pass2 - checking and writing primary data (759336320 records, 25 fields): 8127679 millis -# Sorting and writing extra index 0: 1711243 millis -# Sorting and writing extra index 1: 1627587 millis -# Sorting and writing extra index 2: 436891 millis -# -# real 227m12.682s -# user 204m24.791s -# sys 20m16.421s - -# Make symlinks to /gbdb: -cd /gbdb/hg38/gnomAD/ -mkdir v4.1 -cd v4.1 -mkdir exomes -mkdir genomes -cd $WORKDIR -ln -s `pwd`/genomes.bb /gbdb/hg38/gnomAD/v4.1/genomes/ -ln -s `pwd`/gnomad.v4.1.genomes.details.tab.gz /gbdb/hg38/gnomAD/v4.1/genomes/ -ln -s `pwd`/gnomad.v4.1.genomes.details.tab.gz.gzi /gbdb/hg38/gnomAD/v4.1/genomes/ -ln -s `pwd`/exomes.bb /gbdb/hg38/gnomAD/v4.1/exomes/ -ln -s `pwd`/gnomad.v4.1.exomes.details.tab.gz /gbdb/hg38/gnomAD/v4.1/exomes/ -ln -s `pwd`/gnomad.v4.1.exomes.details.tab.gz.gzi /gbdb/hg38/gnomAD/v4.1/exomes/ - -# Woops the conversion script had the wrong header listing, change it up: -sed -i -e '1,24s/segdup/segdup\tvariant_type\tallele_type/' gnomad.v4.1.genomes.details.pre.tab -# and then re-do the genomes steps from the FIELD_FIX_RESTART_MARK mark +time (bedToBigBed -type=bed9+21 -tab -as=exomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.exomes.joined.bed /hive/data/genomes/hg38/chrom.sizes exomes.bb) &> bigBed.exomeslog & +# pass1 - making usageList (24 chroms): 172816 millis +# pass2 - checking and writing primary data (183717261 records, 31 fields): 1872918 millis +# Sorting and writing extra index 0: 248199 millis +# Sorting and writing extra index 1: 206027 millis +# Sorting and writing extra index 2: 99569 millis + +# real 46m32.224s +# user 43m8.900s +# sys 2m32.009s + +time (bedToBigBed -maxAlloc=250000000000 -type=bed9+21 -tab -as=genomes.as -extraIndex=name,rsId,_displayName gnomad.v4.1.genomes.joined.bed /hive/data/genomes/hg38/chrom.sizes genomes.bb) &> bigBed.genomeslog & +# pass1 - making usageList (24 chroms): 776401 millis +# pass2 - checking and writing primary data (759336320 records, 31 fields): 10139390 millis +# Sorting and writing extra index 0: 1502079 millis +# Sorting and writing extra index 1: 1460242 millis +# Sorting and writing extra index 2: 433804 millis + +# real 254m58.161s +# user 230m40.491s +# sys 21m41.152s ############################################################################## # gnomAD v4.1 Missense and pli by transcript tracks- June 03, 2024 - ChrisL ############################################################################## wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/constraint/gnomad.v4.1.constraint_metrics.tsv # Get the transcripts to get the coordinates and exon-intron boundaries hgsql -Ne "select * from wgEncodeGencodeCompV39" hg38 \ | cut -f2- | genePredToBed -tab stdin stdout | sed -Ee 's/\.[0-9]+//' \ | sort -k4 > hg38.gencodeCompV39.bed12 hgsql -Ne "select * from ncbiRefSeq" hg38 \ | cut -f2- | genePredToBed -tab stdin stdout \ | sort -k4 > hg38.refSeq.bed12 cat hg38.gencodeCompV39.bed12 hg38.refSeq.bed12 | sort -k4 > transcripts.coords