15e40046b987f458aa1d8413fe95a2e3d6a3ca60 chmalee Thu Sep 23 09:06:34 2021 -0700 Fixing double append bug when creating gnomAD pli and missense constraint tracks, found by Jonathan in Bob email diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index fd309730..4813f45 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -33175,68 +33175,98 @@ # 289674 total # check that v19 has all the transcripts: comm -12 hg19.gencodeV19.transcripts pliByGene.transcripts | wc -l 19704 comm -12 hg19.gencodeV19.transcripts pliByTranscripts.transcripts | wc -l 80950 rm hg19.gencodeV19.transcripts # ok safe to use v19 exon boundaries, just need to drop the version numbers: hgsql -Ne "select * from wgEncodeGencodeCompV19" hg19 | cut -f2- | genePredToBed | sed -Ee 's/\.[0-9]+//' | sort -k4 > hg19.gencodeCompV19.bed12 gzip -cd $geneFile | tail -n +2 \ | tawk '{print $75,$76,$77,$64,$65,$1,$2,$3,$4,$5,$33,$12,$13,$14,$32,$17,$20,$21,$24,$25,$26,$27,$28,$29,$30}' \ | sort -k7 | join -t $'\t' -1 4 -2 7 hg19.gencodeCompV19.bed12 - \ - | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=false 2>genes.chromMismatches \ - | sort -k1,1 -k2,2n > pliByGene.bed + | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=false 2>genes.chromMismatches gzip -cd $transcriptFile | tail -n +2 \ | tawk '{print $76,$77,$78,$65,$66,$1,$2,$4,$5,$6,$34,$13,$14,$15,$33,$18,$21,$22,$25,$26,$27,$28,$29,$30,$31}' \ | sort -k7 | join -t $'\t' -1 4 -2 7 hg19.gencodeCompV19.bed12 - \ - | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true 2>transcripts.chromMismatches \ - | sort -k1,1 -k2,2n > pliByTranscript.bed + | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true 2>transcripts.chromMismatches +sort -k1,1 -k2,2n pliByTranscript.tab > pliByTranscript.bed +sort -k1,1 -k2,2n missenseByTranscript.tab > missenseByTranscript.bed +sort -k1,1 -k2,2n pliByGene.tab > pliByGene.bed +sort -k1,1 -k2,2n missenseByGene.tab > missenseByGene.bed -# make .as file: +# make pli .as file: # table pliMetrics -# "bed12+5 for displaying gnomAD haploinsufficiency prediction scores" +# "bed12+6 for displaying gnomAD haploinsufficiency prediction scores" # ( # string chrom; "Reference sequence chromosome or scaffold" # uint chromStart; "Start position in chromosome" # uint chromEnd; "End position in chromosome" # string name; "ENST or ENSG Name" # uint score; "pLI score between 0-1000" # char[1] strand; "strand of transcript" # uint thickStart; "Start of where display is thick" # uint thickEnd; "End of where display should be thick" # uint itemRgb; "Color of item" # int blockCount; "Number of exons" # int[blockCount] blockSizes; "Size of each exon" # int[blockCount] blockStarts; "0-based start position of each exon" # string _mouseOver; "Mouseover label" +# float _loeuf; "LOEUF value for filters" +# float _pli; "pLI value for filters" # string geneName; "Associated Gene symbol" # string synonymous; "Synonymous metrics" -# string missense; "Missense metrics" # string pLoF; "Predicted Loss of Function metrics # ) +# make missense .as file: +#table missenseMetrics +#"bed12+5 for displaying gnomAD missense prediction scores" +# ( +# string chrom; "Reference sequence chromosome or scaffold" +# uint chromStart; "Start position in chromosome" +# uint chromEnd; "End position in chromosome" +# string name; "ENST or ENSG Name" +# uint score; "pLI score between 0-1000, or -1 for NA" +# char[1] strand; "strand of transcript" +# uint thickStart; "Start of where display is thick" +# uint thickEnd; "End of where display should be thick" +# uint itemRgb; "Color of item" +# int blockCount; "Number of exons" +# int[blockCount] blockSizes; "Size of each exon" +# int[blockCount] chromStarts; "0-based start position of each exon" +# string _mouseOver; "Mouseover label" +# float _zscore; "Z-score value for filters" +# string geneName; "Gene symbol" +# string synonymous; "Synonymous metrics" +# string missense; "Missense metrics" +# ) + sizes=/hive/data/genomes/hg19/chrom.sizes -bedToBigBed -type=bed12+5 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByGene.bed $sizes pliByGene.bb -bedToBigBed -type=bed12+5 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.bed $sizes pliByTranscript.bb +bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByGene.bed $sizes pliByGene.bb +bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.bed $sizes pliByTranscript.bb +bedToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByGene.bed $sizes missenseByGene.bb +edToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByTranscript.bed $sizes missenseByTranscript.bb cd /gbdb/hg19/gnomAD/pLI/ ln -s /hive/data/outside/gnomAD.2/constraint/pliByGene.bb ln -s /hive/data/outside/gnomAD.2/constraint/pliByTranscript.bb +ln -s /hive/data/outside/gnomAD.2/constraint/missenseByGene.bb +ln -s /hive/data/outside/gnomAD.2/constraint/missenseByTranscript.bb ############################################################################## 2020-01-13: Add size filter to dgvMerged and dgvSupporting track (ChrisL) cd /hive/data/genomes/hg19/bed/dgv/160810 zcat dgvMerged.bed.gz | tawk '{print $0, $3-$2}' > dgvMergedWithSize.bed zcat dgvSupporting.bed.gz | tawk '{print $0, $3-$2}' > dgvSupportingWithSize.bed cat dgvPlusSize.as # table dgvPlus # "Database of Genomic Variants incorporating dbVar, July 2013 and later" # ( # string chrom; "Reference sequence chromosome or scaffold" # uint chromStart; "Start position in chromosome" # uint chromEnd; "End position in chromosome" # string name; "ID of merged variant or supporting variant" # uint score; "Score from 0-1000 (placeholder for BED 9+ format)" # char[1] strand; "+ or - (placeholder for BED 9+ format)"