73ecf418edee1ae991841f9bbf6ec5fe623407ec chmalee Fri Mar 1 15:24:02 2024 -0800 Staging gnomAD v4 constraint track on test, refs #33084 diff --git src/hg/makeDb/gnomad/combine.awk src/hg/makeDb/gnomad/combine.awk index 10c6874..8c712ac 100755 --- src/hg/makeDb/gnomad/combine.awk +++ src/hg/makeDb/gnomad/combine.awk @@ -1,101 +1,104 @@ #!/usr/bin/awk -f # # re-organizes the output of a join command into a valid bed12+ # should be run in a pipe, and this script creates two files, one for pli one for missense: # join -t $'\t' -1 4 -2 7 gencode.bed12 pliByTranscript.trimmed | combine.awk -v doTranscripts=true # sort -k1,1 -k2,2 pliByTranscript.tab > pliByTranscript.bed # sort -k1,1 -k2,2 missenseByTranscript.tab > missenseByTranscript.bed # # The doTranscripts argument tells the script which accession to use as the name field # in the final beds, which is the ENST accession for by_transcript, and the ENSG accession for # by_gene. It also sets the output files name correctly {missense,pli}By{Gene,Transcript}.tab # BEGIN { FS="\t"; OFS="\t"; isTranscripts=doTranscripts } { chrom=$2 +gnomadChrom=$13 +if (substr($13, 1, 3) != "chr") { gnomadChrom=sprintf("chr%s", $13) +} if (chrom != gnomadChrom) { # so far just the multiple mapping PAR regions printf "bad join: %s\n", $0 > "/dev/stderr" next } chromStart=$3 chromEnd=$4 missOutFile="" pliOutFile="" if (isTranscripts == "true") { name=$1 missOutFile="missenseByTranscript.tab" pliOutFile="pliByTranscript.tab" } else { name=$16 missOutFile="missenseByGene.tab" pliOutFile="pliByGene.tab" } if ($29 == "NA") { pLI = -1 if ($28 != "NA" && $27 != "NA" && $29 != "NA" && $30 != "NA" && $35 != "NA" && $36 != "NA") { # doesn't come up with this version but you never know printf "error: 'NA' value for pLI but not other metrics, line: %d\n", NR > "/dev/stderr" next } pLof=sprintf("pLoF exp: NA, obs: NA, pLI = NA, o/e = NA (NA)") pliMouseOver=sprintf("LOEUF: NA, pLI: NA") } else { pLI=sprintf("%0.2f", $29) pLof=sprintf("pLoF exp: %.1f, obs: %d, pLI = %.2f, o/e = %.2f (%.2f - %.2f)", $28,$27,$29,$30,$35,$36) pliMouseOver=sprintf("LOEUF: %.2f, pLI: %.2f", $36, $29) loeuf=sprintf("%0.2f", $36) } strand=$6 thickStart=$7 thickEnd=$8 pliRgb="" missRgb="" if (loeuf == -1) {pliRgb = "160,160,160"} else if (loeuf >= 0 && loeuf < 0.1) {pliRgb = "244,0,2"} else if (loeuf >= 0.1 && loeuf < 0.2) {pliRgb = "240,74,3"} else if (loeuf >= 0.2 && loeuf < 0.3) {pliRgb = "233,127,5"} else if (loeuf >= 0.3 && loeuf < 0.4) {pliRgb = "224,165,8"} else if (loeuf >= 0.4 && loeuf < 0.5) {pliRgb = "210,191,13"} else if (loeuf >= 0.5 && loeuf < 0.6) {pliRgb = "191,210,22"} else if (loeuf >= 0.6 && loeuf < 0.7) {pliRgb = "165,224,26"} else if (loeuf >= 0.7 && loeuf < 0.8) {pliRgb = "127,233,58"} else if (loeuf >= 0.8 && loeuf < 0.9) {pliRgb = "74,240,94"} else if (loeuf >= 0.9) {pliRgb = "0,244,153"} else { printf "error: loeuf '%s' out of range for gene/transcript: %s\n", loeuf, name > "/dev/stderr" } if ($22 > 3.09) missRgb = "244,0,2" else missRgb = "0,0,0" if (pLI == -1) bedScore = 0 else { score=sprintf("%0.2f",pLI) bedScore=sprintf("%d",score*1000) } blockCount=$10 blockSizes=$11 blockStarts=$12 geneName=$18 missense=sprintf("Missense exp: %.1f, obs: %d, Z = %.2f, o/e = %.2f (%.2f - %.2f)", $20,$19,$22,$21,$33,$34) synonymous=sprintf("Synonymous exp: %.1f, obs: %d, Z = %.2f, o/e = %.2f (%.2f - %.2f)", $24,$23,$26,$25,$31,$32) missScore=sprintf("%0.2f", $22) missMouseOver=sprintf("Z: %0.2f", $22) print chrom, chromStart, chromEnd, name, bedScore, strand, thickStart, thickEnd, pliRgb, blockCount, blockSizes, blockStarts, pliMouseOver, loeuf, pLI, geneName, synonymous, pLof >> pliOutFile print chrom, chromStart, chromEnd, name, bedScore, strand, thickStart, thickEnd, missRgb, blockCount, blockSizes, blockStarts, missMouseOver, missScore, geneName, synonymous, missense >> missOutFile }