15e40046b987f458aa1d8413fe95a2e3d6a3ca60 chmalee Thu Sep 23 09:06:34 2021 -0700 Fixing double append bug when creating gnomAD pli and missense constraint tracks, found by Jonathan in Bob email diff --git src/hg/makeDb/gnomad/combine.awk src/hg/makeDb/gnomad/combine.awk index 3ab87e2..10c6874 100755 --- src/hg/makeDb/gnomad/combine.awk +++ src/hg/makeDb/gnomad/combine.awk @@ -1,82 +1,101 @@ #!/usr/bin/awk -f -### # re-organizes the output of a join command into a valid bed12+ -# should be run in a pipe, for example: -# join -t $'\t' -1 4 -2 7 gencode.bed12 pliByTranscript.trimmed | combineGencodePli.awk -v doTranscripts=true | sort -k1,1 -k2,2n > pliByTranscript.bed +# +# re-organizes the output of a join command into a valid bed12+ +# should be run in a pipe, and this script creates two files, one for pli one for missense: +# join -t $'\t' -1 4 -2 7 gencode.bed12 pliByTranscript.trimmed | combine.awk -v doTranscripts=true +# sort -k1,1 -k2,2 pliByTranscript.tab > pliByTranscript.bed +# sort -k1,1 -k2,2 missenseByTranscript.tab > missenseByTranscript.bed # # The doTranscripts argument tells the script which accession to use as the name field -# in the final bed, which is the ENST accession for by_transcript, and the ENSG accession for -# by_gene. -### +# in the final beds, which is the ENST accession for by_transcript, and the ENSG accession for +# by_gene. It also sets the output files name correctly {missense,pli}By{Gene,Transcript}.tab +# BEGIN { FS="\t"; OFS="\t"; isTranscripts=doTranscripts } { chrom=$2 gnomadChrom = sprintf("chr%s", $13) if (chrom != gnomadChrom) { # so far just the multiple mapping PAR regions printf "bad join: %s\n", $0 > "/dev/stderr" next } chromStart=$3 chromEnd=$4 -if (isTranscripts == "true") +missOutFile="" +pliOutFile="" +if (isTranscripts == "true") { name=$1 -else + missOutFile="missenseByTranscript.tab" + pliOutFile="pliByTranscript.tab" +} else { name=$16 + missOutFile="missenseByGene.tab" + pliOutFile="pliByGene.tab" +} if ($29 == "NA") { pLI = -1 if ($28 != "NA" && $27 != "NA" && $29 != "NA" && $30 != "NA" && $35 != "NA" && $36 != "NA") { # doesn't come up with this version but you never know printf "error: 'NA' value for pLI but not other metrics, line: %d\n", NR > "/dev/stderr" next } pLof=sprintf("pLoF exp: NA, obs: NA, pLI = NA, o/e = NA (NA)") - mouseOver=sprintf("LOEUF: NA, pLI: NA") + pliMouseOver=sprintf("LOEUF: NA, pLI: NA") } else { pLI=sprintf("%0.2f", $29) pLof=sprintf("pLoF exp: %.1f, obs: %d, pLI = %.2f, o/e = %.2f (%.2f - %.2f)", $28,$27,$29,$30,$35,$36) - mouseOver=sprintf("LOEUF: %.2f, pLI: %.2f", $36, $29) + pliMouseOver=sprintf("LOEUF: %.2f, pLI: %.2f", $36, $29) loeuf=sprintf("%0.2f", $36) } strand=$6 thickStart=$7 thickEnd=$8 -itemRgb="" +pliRgb="" +missRgb="" -if (loeuf == -1) {itemRgb = "160,160,160"} -else if (loeuf >= 0 && loeuf < 0.1) {itemRgb = "244,0,2"} -else if (loeuf >= 0.1 && loeuf < 0.2) {itemRgb = "240,74,3"} -else if (loeuf >= 0.2 && loeuf < 0.3) {itemRgb = "233,127,5"} -else if (loeuf >= 0.3 && loeuf < 0.4) {itemRgb = "224,165,8"} -else if (loeuf >= 0.4 && loeuf < 0.5) {itemRgb = "210,191,13"} -else if (loeuf >= 0.5 && loeuf < 0.6) {itemRgb = "191,210,22"} -else if (loeuf >= 0.6 && loeuf < 0.7) {itemRgb = "165,224,26"} -else if (loeuf >= 0.7 && loeuf < 0.8) {itemRgb = "127,233,58"} -else if (loeuf >= 0.8 && loeuf < 0.9) {itemRgb = "74,240,94"} -else if (loeuf >= 0.9) {itemRgb = "0,244,153"} +if (loeuf == -1) {pliRgb = "160,160,160"} +else if (loeuf >= 0 && loeuf < 0.1) {pliRgb = "244,0,2"} +else if (loeuf >= 0.1 && loeuf < 0.2) {pliRgb = "240,74,3"} +else if (loeuf >= 0.2 && loeuf < 0.3) {pliRgb = "233,127,5"} +else if (loeuf >= 0.3 && loeuf < 0.4) {pliRgb = "224,165,8"} +else if (loeuf >= 0.4 && loeuf < 0.5) {pliRgb = "210,191,13"} +else if (loeuf >= 0.5 && loeuf < 0.6) {pliRgb = "191,210,22"} +else if (loeuf >= 0.6 && loeuf < 0.7) {pliRgb = "165,224,26"} +else if (loeuf >= 0.7 && loeuf < 0.8) {pliRgb = "127,233,58"} +else if (loeuf >= 0.8 && loeuf < 0.9) {pliRgb = "74,240,94"} +else if (loeuf >= 0.9) {pliRgb = "0,244,153"} else { printf "error: loeuf '%s' out of range for gene/transcript: %s\n", loeuf, name > "/dev/stderr" } +if ($22 > 3.09) + missRgb = "244,0,2" +else + missRgb = "0,0,0" + if (pLI == -1) bedScore = 0 else { score=sprintf("%0.2f",pLI) bedScore=sprintf("%d",score*1000) } blockCount=$10 blockSizes=$11 blockStarts=$12 geneName=$18 missense=sprintf("Missense exp: %.1f, obs: %d, Z = %.2f, o/e = %.2f (%.2f - %.2f)", $20,$19,$22,$21,$33,$34) synonymous=sprintf("Synonymous exp: %.1f, obs: %d, Z = %.2f, o/e = %.2f (%.2f - %.2f)", $24,$23,$26,$25,$31,$32) +missScore=sprintf("%0.2f", $22) +missMouseOver=sprintf("Z: %0.2f", $22) -print chrom, chromStart, chromEnd, name, bedScore, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts, mouseOver, loeuf, pLI, geneName, synonymous, missense, pLof +print chrom, chromStart, chromEnd, name, bedScore, strand, thickStart, thickEnd, pliRgb, blockCount, blockSizes, blockStarts, pliMouseOver, loeuf, pLI, geneName, synonymous, pLof >> pliOutFile +print chrom, chromStart, chromEnd, name, bedScore, strand, thickStart, thickEnd, missRgb, blockCount, blockSizes, blockStarts, missMouseOver, missScore, geneName, synonymous, missense >> missOutFile }