15e40046b987f458aa1d8413fe95a2e3d6a3ca60
chmalee
  Thu Sep 23 09:06:34 2021 -0700
Fixing double append bug when creating gnomAD pli and missense constraint tracks, found by Jonathan in Bob email

diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt
index fd309730..4813f45 100644
--- src/hg/makeDb/doc/hg19.txt
+++ src/hg/makeDb/doc/hg19.txt
@@ -33175,68 +33175,98 @@
 # 289674 total
 
 # check that v19 has all the transcripts:
 comm -12 hg19.gencodeV19.transcripts pliByGene.transcripts | wc -l
 19704
 comm -12 hg19.gencodeV19.transcripts pliByTranscripts.transcripts | wc -l
 80950
 rm hg19.gencodeV19.transcripts
 
 # ok safe to use v19 exon boundaries, just need to drop the version numbers:
 hgsql -Ne "select * from wgEncodeGencodeCompV19" hg19 | cut -f2- | genePredToBed | sed -Ee 's/\.[0-9]+//' | sort -k4 > hg19.gencodeCompV19.bed12
 
 gzip -cd $geneFile | tail -n +2 \
     | tawk '{print $75,$76,$77,$64,$65,$1,$2,$3,$4,$5,$33,$12,$13,$14,$32,$17,$20,$21,$24,$25,$26,$27,$28,$29,$30}' \
     | sort -k7 | join -t $'\t' -1 4 -2 7 hg19.gencodeCompV19.bed12 - \
-    | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=false 2>genes.chromMismatches \
-    | sort -k1,1 -k2,2n > pliByGene.bed
+    | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=false 2>genes.chromMismatches
 
 gzip -cd $transcriptFile | tail -n +2 \
     | tawk '{print $76,$77,$78,$65,$66,$1,$2,$4,$5,$6,$34,$13,$14,$15,$33,$18,$21,$22,$25,$26,$27,$28,$29,$30,$31}' \
     | sort -k7 | join -t $'\t' -1 4 -2 7 hg19.gencodeCompV19.bed12 - \
-    | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true 2>transcripts.chromMismatches \
-    | sort -k1,1 -k2,2n > pliByTranscript.bed
+    | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true 2>transcripts.chromMismatches
+sort -k1,1 -k2,2n pliByTranscript.tab > pliByTranscript.bed
+sort -k1,1 -k2,2n missenseByTranscript.tab > missenseByTranscript.bed
+sort -k1,1 -k2,2n pliByGene.tab > pliByGene.bed
+sort -k1,1 -k2,2n missenseByGene.tab > missenseByGene.bed
 
-# make .as file:
+# make pli .as file:
 #  table pliMetrics
-#  "bed12+5 for displaying gnomAD haploinsufficiency prediction scores"
+#  "bed12+6 for displaying gnomAD haploinsufficiency prediction scores"
 #      (
 #      string chrom;      "Reference sequence chromosome or scaffold"
 #      uint   chromStart; "Start position in chromosome"
 #      uint   chromEnd;   "End position in chromosome"
 #      string name;       "ENST or ENSG Name"
 #      uint   score;      "pLI score between 0-1000"
 #      char[1] strand;    "strand of transcript"
 #      uint thickStart;   "Start of where display is thick"
 #      uint thickEnd;     "End of where display should be thick"
 #      uint itemRgb;    "Color of item"
 #      int blockCount;   "Number of exons"
 #      int[blockCount] blockSizes;  "Size of each exon"
 #      int[blockCount] blockStarts; "0-based start position of each exon"
 #      string _mouseOver;  "Mouseover label"
+#      float _loeuf;      "LOEUF value for filters"
+#      float _pli;        "pLI value for filters"
 #      string geneName;   "Associated Gene symbol"
 #      string synonymous; "Synonymous metrics"
-#      string missense;   "Missense metrics"
 #      string pLoF;       "Predicted Loss of Function metrics
 #      )
 
+# make missense .as file:
+#table missenseMetrics
+#"bed12+5 for displaying gnomAD missense prediction scores"
+#    (
+#    string chrom;      "Reference sequence chromosome or scaffold"
+#    uint   chromStart; "Start position in chromosome"
+#    uint   chromEnd;   "End position in chromosome"
+#    string name;       "ENST or ENSG Name"
+#    uint   score;      "pLI score between 0-1000, or  -1 for NA"
+#    char[1] strand;    "strand of transcript"
+#    uint thickStart;   "Start of where display is thick"
+#    uint thickEnd;     "End of where display should be thick"
+#    uint itemRgb;    "Color of item"
+#    int blockCount;   "Number of exons"
+#    int[blockCount] blockSizes;  "Size of each exon"
+#    int[blockCount] chromStarts; "0-based start position of each exon"
+#    string _mouseOver;  "Mouseover label"
+#    float _zscore;         "Z-score value for filters"
+#    string geneName;   "Gene symbol"
+#    string synonymous; "Synonymous metrics"
+#    string missense;   "Missense metrics"
+#    )
+
 sizes=/hive/data/genomes/hg19/chrom.sizes
-bedToBigBed -type=bed12+5 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByGene.bed $sizes pliByGene.bb
-bedToBigBed -type=bed12+5 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.bed $sizes pliByTranscript.bb
+bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByGene.bed $sizes pliByGene.bb
+bedToBigBed -type=bed12+6 -as=pliMetrics.as -tab -extraIndex=name,geneName pliByTranscript.bed $sizes pliByTranscript.bb
+bedToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByGene.bed $sizes missenseByGene.bb
+edToBigBed -type=bed12+5 -as=missenseMetrics.as -tab -extraIndex=name,geneName missenseByTranscript.bed $sizes missenseByTranscript.bb
 cd /gbdb/hg19/gnomAD/pLI/
 ln -s /hive/data/outside/gnomAD.2/constraint/pliByGene.bb
 ln -s /hive/data/outside/gnomAD.2/constraint/pliByTranscript.bb
+ln -s /hive/data/outside/gnomAD.2/constraint/missenseByGene.bb
+ln -s /hive/data/outside/gnomAD.2/constraint/missenseByTranscript.bb
 ##############################################################################
 2020-01-13: Add size filter to dgvMerged and dgvSupporting track (ChrisL)
 cd /hive/data/genomes/hg19/bed/dgv/160810
 zcat dgvMerged.bed.gz | tawk '{print $0, $3-$2}' > dgvMergedWithSize.bed
 zcat dgvSupporting.bed.gz | tawk '{print $0, $3-$2}' > dgvSupportingWithSize.bed
 cat dgvPlusSize.as
 # table dgvPlus
 # "Database of Genomic Variants incorporating dbVar, July 2013 and later"
 #     (
 #     string chrom;       "Reference sequence chromosome or scaffold"
 #     uint   chromStart;  "Start position in chromosome"
 #     uint   chromEnd;    "End position in chromosome"
 #     string name;        "ID of merged variant or supporting variant"
 #     uint   score;       "Score from 0-1000 (placeholder for BED 9+ format)"
 #     char[1] strand;     "+ or - (placeholder for BED 9+ format)"