87d5b93cb522e7bfa4a66ae41f93e94394d2e1d1 chmalee Fri Sep 18 14:34:02 2020 -0700 Stagin gnomad pext track, refs #25869 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index d87741f..70a0649 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -33721,17 +33721,58 @@ #join -t $'\t' sort.main.bed clinSign.txt | uniq -c | sed 's/^ *//' | sed 's/ /\t/' | tawk '{print $3,$4,$5, $6, $7, $1}' | sort -k1,1 -k2,2n > sorted.bed # fawk written by "hand" #tawk -f fawk sorted.bed > bigBedInput.bed bedToBigBed -as=clinsub.as -type=bed9+2 -tab bigBedInput.bed /cluster/data/hg19/chrom.sizes clinsub.bb ln -s `pwd`/clinsub.bb /gbdb/hg19 ############################################################################# # Trios for Genome In a Bottle - DONE 08/04/2020 ChrisL # see ~/kent/src/hg/makeDb/giab/make.txt ############################################################################# # COVID GWAS from COVID-19 Host Genetics Initiative Sep 2020 Kate # see ~kent/src/hg/makeDb/doc/covid/covidHgiGwas.txt +############################################################################# +# gnomAD PEXT scores + +# PEXT data: +# The baselevel is the sum of the expression value for all transcripts touching that base +# The annotation-level is the sum of the expression of transcripts on which a variant has a +# given annotation + +# download the pext data: +wget https://storage.googleapis.com/gnomad-public/papers/2019-tx-annotation/pre_computed/all.possible.snvs.tx_annotated.GTEx.v7.021520.tsv.bgz +wget https://storage.googleapis.com/gnomad-public/papers/2019-tx-annotation/gnomad_browser/all.baselevel.021620.tsv.bgz + +# these files are humongous even with compression: +ls -lh all.* +-rw-rw-r-- 1 chmalee genecats 307M Apr 8 21:54 all.baselevel.021620.tsv.bgz +-rw-rw-r-- 1 chmalee genecats 6.7G Feb 14 2020 all.possible.snvs.tx_annotated.GTEx.v7.021520.tsv.bgz + +# how large are these files? 
+time zcat all.base* | wc -l +35305149 + +real 1m11.964s +user 1m8.725s +sys 0m10.385s + +# this is theoretically all coding bases in gencode v19; split by tissue, then run buildPext.py to +# make one bigWig per tissue: +mkdir run +seq 4 57 | parallel -j10 'zcat all.baselevel.021620.tsv.bgz | cut -f1-3,{} | gzip -c > run/tissue{}.pext.gz' + +# overlapping exons in coding regions cause problems, so don't output any scores +# for those regions +seq 4 57 | parallel --joblog run.log -j20 './buildPext.py run/tissue{}.pext.gz -o split' +tail -n +2 run.log | cut -f4 | awk '{sum += $1}END{print sum/NR}' +452.034 + +# Turn into bigWigs: +find split/ -name "*.bed" | parallel -j15 'sort -k1,1 -k2,2n {} | cut -f1-3,5 > {.}.bedGraph' +find split/ -name "*.bedGraph" | parallel -j15 'bedGraphToBigWig {} /hive/data/genomes/hg19/chrom.sizes {.}.bw' +mkdir -p /gbdb/hg19/gnomAD/pext +ln -s `pwd`/split/*.bw /gbdb/hg19/gnomAD/pext/