574399b4a7d8fc75c1745fa83564f547ddcb9e8c
chmalee
  Sat Jun 13 00:43:54 2026 -0700
gnomAD v4.1.1 gene constraint tracks for hg38, refs #37351

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

diff --git src/hg/makeDb/gnomad/combine.v4.1.1.awk src/hg/makeDb/gnomad/combine.v4.1.1.awk
new file mode 100644
index 00000000000..bdc144ec34e
--- /dev/null
+++ src/hg/makeDb/gnomad/combine.v4.1.1.awk
@@ -0,0 +1,117 @@
+#!/usr/bin/awk -f
+#
+# v4.1.1 variant of combine.awk: same bed12 + LOEUF/pLI/missense/synonymous
+# output, plus three trailing fields new in the v4.1.1 gene constraint table:
+# constraintFlags (outlier reasons), geneFlags (low-coverage/low-mappability
+# warnings), and qualityMetrics (the raw gene_quality_metrics). The metric
+# fields stay at $1-$36 exactly as in combine.awk; the new inputs are at
+# $37-$42 (constraint_flags, gene_flags, prop_bp_AN90, mean_AS_MQ,
+# prop_segdup, prop_LCR).
+#
+BEGIN {
+    # doTranscripts is not set in this file; presumably passed in with -v
+    isTranscripts = doTranscripts
+    FS = OFS = "\t"   # tab-separated input and output
+}
+
+{
+# Per input row: append one line to the pLI/LOEUF output file and one to the
+# missense output file. Metric columns are $1-$36, v4.1.1 inputs are $37-$42.
+chrom=$2
+gnomadChrom=$13
+if (substr($13, 1, 3) != "chr") {
+    gnomadChrom=sprintf("chr%s", $13)
+}
+if (chrom != gnomadChrom) {
+    # so far just the multiple mapping PAR regions
+    printf "bad join: %s\n", $0 > "/dev/stderr"
+    next
+}
+
+chromStart=$3
+chromEnd=$4
+if (isTranscripts == "true") {
+    name=$1
+    missOutFile="missenseByTranscript.tab"
+    pliOutFile="pliByTranscript.tab"
+} else {
+    name=$16
+    missOutFile="missenseByGene.tab"
+    pliOutFile="pliByGene.tab"
+}
+
+if ($29 == "NA") {
+    pLI = -1
+    # awk variables persist across records: reset loeuf to the -1 sentinel so
+    # this row is colored grey instead of reusing the previous row's LOEUF.
+    loeuf = -1
+    # NOTE(review): the $29 != "NA" term is always false inside this branch,
+    # so this check can never fire; left as-is because the intended condition
+    # (&& vs ||) is unclear and enabling it would start dropping rows.
+    if ($28 != "NA" && $27 != "NA" && $29 != "NA" && $30 != "NA" && $35 != "NA" && $36 != "NA") {
+        printf "error: 'NA' value for pLI but not other metrics, line: %d\n", NR > "/dev/stderr"
+        next
+    }
+    pLof=sprintf("pLoF exp: NA, obs: NA, pLI = NA, o/e = NA (NA)")
+    pliMouseOver=sprintf("LOEUF: NA, pLI: NA")
+} else {
+    pLI=sprintf("%0.2f", $29)
+    pLof=sprintf("pLoF exp: %.1f, obs: %d, pLI = %.2f, o/e = %.2f (%.2f - %.2f)", $28,$27,$29,$30,$35,$36)
+    pliMouseOver=sprintf("LOEUF: %.2f, pLI: %.2f", $36, $29)
+    loeuf=sprintf("%0.2f", $36)
+}
+strand=$6
+thickStart=$7
+thickEnd=$8
+pliRgb=""
+
+if (loeuf == -1) {pliRgb = "160,160,160"}
+else if (loeuf >= 0 && loeuf < 0.1) {pliRgb = "244,0,2"}
+else if (loeuf >= 0.1 && loeuf < 0.2) {pliRgb = "240,74,3"}
+else if (loeuf >= 0.2 && loeuf < 0.3) {pliRgb = "233,127,5"}
+else if (loeuf >= 0.3 && loeuf < 0.4) {pliRgb = "224,165,8"}
+else if (loeuf >= 0.4 && loeuf < 0.5) {pliRgb = "210,191,13"}
+else if (loeuf >= 0.5 && loeuf < 0.6) {pliRgb = "191,210,22"}
+else if (loeuf >= 0.6 && loeuf < 0.7) {pliRgb = "165,224,26"}
+else if (loeuf >= 0.7 && loeuf < 0.8) {pliRgb = "127,233,58"}
+else if (loeuf >= 0.8 && loeuf < 0.9) {pliRgb = "74,240,94"}
+else if (loeuf >= 0.9) {pliRgb = "0,244,153"}
+else {
+    printf "error: loeuf '%s' out of range for gene/transcript: %s\n", loeuf, name > "/dev/stderr"
+}
+
+# "NA" must be excluded explicitly: compared as a string it sorts above "3.09"
+missRgb = ($22 != "NA" && $22 + 0 > 3.09) ? "244,0,2" : "0,0,0"
+
+# pLI is already rounded to two decimals; round (not truncate) the product so
+# floating-point error cannot push the bed score one below the intended value
+bedScore = (pLI == -1) ? 0 : sprintf("%.0f", pLI * 1000)
+
+blockCount=$10
+blockSizes=$11
+blockStarts=$12
+geneName=$18
+missense=sprintf("Missense exp: %.1f, obs: %d, Z = %.2f, o/e = %.2f (%.2f - %.2f)", $20,$19,$22,$21,$33,$34)
+synonymous=sprintf("Synonymous exp: %.1f, obs: %d, Z = %.2f, o/e = %.2f (%.2f - %.2f)", $24,$23,$26,$25,$31,$32)
+missScore=sprintf("%0.2f", $22)
+missMouseOver=sprintf("Z: %0.2f", $22)
+
+# v4.1.1 additions: clean the JSON-array flag strings into plain text
+constraintFlags=$37
+gsub(/[]["]/, "", constraintFlags)
+gsub(/,/, "; ", constraintFlags)
+if (constraintFlags == "" || $37 == "NA") constraintFlags="none"
+geneFlags=$38
+gsub(/[]["]/, "", geneFlags)
+gsub(/,/, "; ", geneFlags)
+if (geneFlags == "" || $38 == "NA") geneFlags="none"
+qualityMetrics=sprintf("prop bp AN90: %s, mean AS MQ: %s, prop segdup: %s, prop LCR: %s", $39, $40, $41, $42)
+# surface the coverage/mappability warning on the mouseOver when present
+if (geneFlags != "none") {
+    pliMouseOver=sprintf("%s | flags: %s", pliMouseOver, geneFlags)
+    missMouseOver=sprintf("%s | flags: %s", missMouseOver, geneFlags)
+}
+
+print chrom, chromStart, chromEnd, name, bedScore, strand, thickStart, thickEnd, pliRgb, blockCount, blockSizes, blockStarts, pliMouseOver, loeuf, pLI, geneName, synonymous, pLof, constraintFlags, geneFlags, qualityMetrics >> pliOutFile
+print chrom, chromStart, chromEnd, name, bedScore, strand, thickStart, thickEnd, missRgb, blockCount, blockSizes, blockStarts, missMouseOver, missScore, geneName, synonymous, missense, constraintFlags, geneFlags, qualityMetrics >> missOutFile
+}