f872e1088fcd06308ca592418458d7787aca1bf1 kate Tue May 5 10:42:31 2020 -0700 Load GTEx V8 gene expression for hg19. refs #25130 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 4d9b4e5..31f53ef 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -33585,15 +33585,98 @@ wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterAlign.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterConflicting.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterCov.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterHapNoVar.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterMap.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterSSE.bed.gz wget ftp://ftp-trace.ncbi.nih.gov/giab/ftp/data/NA12878/analysis/NIST_union_callsets_06172013/VQSRv2.18_filterlt2Datasets.bed.gz gunzip *.gz cd .. 
for i in orig/*.bed; do out=`echo $i | sed -e 's|orig/VQSRv2.18_||g'`; out=`basename $out .bed`; echo $out; chromToUcsc -a hg19.chromAlias.tsv -i $i -o bed/$out.bed; done for i in bed/filter*.bed; do echo $i; bedSort $i $i; bedToBigBed $i /hive/data/genomes/hg19/chrom.sizes bb/`basename $i .bed`.bed -type=bed3; done cd /gbdb/hg19/bbi/special; for i in /hive/data/genomes/hg19/bed/specialRegions/bb/filter*.bb; do ln -s $i; done bedSort orig/hg19-blacklist.v2.bed orig/hg19-blacklist.v2.bed bedToBigBed orig/hg19-blacklist.v2.bed /hive/data/genomes/hg19/chrom.sizes bb/encBlacklist.bb -tab + +############################################################################# +# GTEx V8 (April 2020) Kate +# Create BED from hgFixed tables (see doc/gtex) + +cd /hive/data/outside/gtex/V8/rnaSeq + +# Lift GTEx LDACC gene models (GENCODE V26 isoforms collapsed to single gene model) +# from hg38 annotation by GTEx LDACC + +set chain = /hive/data/genomes/hg38/bed/liftOver/hg38ToHg19.over.chain.gz +liftOver -genePred gencodeV26.hg38.genePred $chain gencodeV26.hg19.lifted.genePred \ + gencodeV26.hg19.unmapped +# 1300 gencodeV26.hg19.unmapped +# (was 925 in V6 lift hg19 to hg38) + +# TODO: Consider transmap + +hgLoadGenePred hg19 gtexGeneModelV8 gencodeV26.hg19.lifted.genePred + +# Load BED table +cd /hive/data/genomes/hg19/bed/gtex +mkdir -p V8 +cd V8 + +set gencode = V26lift37 +~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \ + hg19 -noLoad -gtexVersion=V8 -gencodeVersion=$gencode gtexGeneV8 -verbose=2 >&! log.txt & + +# Max score: 267400.000000 + +wc -l gtexGeneV8.tab +#54481 gtexGeneV8.tab + +# 1070 genes not found in GencodeAttrs table +# e.g. +# Can't find geneId ENSG00000278267 in wgEncodeGencodeAttrsV26lift37 +#from Ensembl page for MIR6859-1: +#Stable ID ENSG00000278267 not present in GRCh37. + +# 650 genes not found in modelHash +# e.g. 
+# Can't find gene ENSG00000279928.2 in modelHash +# From Ensembl +# Gene: FO538757.1 ENSG00000279928 +# There is no ungapped mapping of this gene onto the GRCh37 assembly. +# Stable ID ENSG00000279928 not present in GRCh37. + +#Max score: 219385.906250 +wc -l gtexGeneV8.tab +# 54481 gtexGeneV8.tab +# 55393 gtexGeneV6.tab + +# Add scores (see hg38/gtex.txt for background) + +set bedScore = ~/kent/src/utils/bedScore/bedScore +$bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV8.tab gtexGeneBedV8.bed +textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV8.bed +0.000000 ************************************************************ 20189 +71.428643 **************************************************** 5512 +142.857286 **************************************************** 5401 +214.285929 *************************************************** 4587 +285.714571 *************************************************** 4399 +357.143214 *************************************************** 4672 +428.571857 **************************************************** 5390 +500.000500 ************************************************* 3073 +571.429143 ***************************************** 904 +642.857786 ********************************* 248 +714.286429 *************************** 80 +785.715071 ************** 10 +857.143714 ******** 4 +928.572357 *************** 12 + +# table looks OK, load it +set lib = ~/kent/src/hg/lib +hgLoadBed hg19 -noBin -tab -type=bed6+4 \ + -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \ + gtexGeneV8 gtexGeneBedV8.bed +# Read 54481 elements of size 10 from gtexGeneBedV8.bed + +# TODO: Add to gene sorter + +