574399b4a7d8fc75c1745fa83564f547ddcb9e8c
chmalee
  Sat Jun 13 00:43:54 2026 -0700
gnomAD v4.1.1 gene constraint tracks for hg38, refs #37351

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

diff --git src/hg/makeDb/doc/hg38/gnomad.txt src/hg/makeDb/doc/hg38/gnomad.txt
index 5fadce03a90..a063ca2a3f5 100644
--- src/hg/makeDb/doc/hg38/gnomad.txt
+++ src/hg/makeDb/doc/hg38/gnomad.txt
@@ -778,15 +778,72 @@
 # details file, bedJoinTabOffset, bedToBigBed, and the /gbdb symlinks) is driven
 # by one resumable script, run once per dataset (long job, run in screen):
 #   src/hg/makeDb/scripts/gnomadV4.1.1/buildGnomadV4.1.1BigBed.sh exomes
 #   src/hg/makeDb/scripts/gnomadV4.1.1/buildGnomadV4.1.1BigBed.sh genomes
 # Workdir /hive/data/inside/gnomAD/v4/v4.1.1, progress in build.{exomes,genomes}.log.
 # Scripts, field lists and autosql:
 # https://github.com/ucscGenomeBrowser/kent/tree/master/src/hg/makeDb/scripts/gnomadV4.1.1
 
 # trackDb: the bigBed composite gnomadVariantsV4.1.1 (children
 # gnomadExomesVariantsV4_1_1, gnomadGenomesVariantsV4_1_1) was added to
 # human/hg38/gnomad.ra alongside v4.1, cloned from the v4.1 stanza. The
 # experimental vcfTabix track in gnomadV4.1.1.ra (alpha only) was renamed to
 # gnomadVariantsV4_1_1Vcf so the bigBed track could take the canonical name; its
 # per-chrom lookup tables are built by loadGnomadV4.1.1ChromTables.sh.
 
+##############################################################################
+# gnomAD v4.1.1 gene constraint update - June 12, 2026 - Claude (chmalee)
+##############################################################################
+# v4.1.1 recomputed the gene constraint metrics (LOEUF/pLI/oe changed, e.g. BRCA1
+# LOEUF 0.885 -> 0.928), re-ran LOFTEE (END_TRUNC threshold change), and annotates
+# more transcripts (211523 -> 221898 rows). Rebuilt the by-transcript LoF and
+# missense constraint tracks following the v4.1 recipe above, with two changes:
+# the v4.1.1 table has a different 113-column schema so the extracted column
+# numbers were re-derived by field name, and three new fields are carried -
+# constraint outlier flags, gene low-coverage/low-mappability flags, and the raw
+# exome quality metrics. The per-population gen_anc_* columns are not carried.
+
+WORKDIR=/hive/data/inside/gnomAD/v4/v4.1.1/constraint
+cd $WORKDIR
+wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1.1/constraint/gnomad.v4.1.1.constraint_metrics.tsv.bgz
+
+# transcript coords (same as v4.1): GENCODE V39 + ncbiRefSeq, version stripped
+hgsql -Ne "select * from wgEncodeGencodeCompV39" hg38 \
+    | cut -f2- | genePredToBed -tab stdin stdout | sed -Ee 's/\.[0-9]+//' \
+    | sort -k4 > hg38.gencodeCompV39.bed12
+hgsql -Ne "select * from ncbiRefSeq" hg38 \
+    | cut -f2- | genePredToBed -tab stdin stdout | sort -k4 > hg38.refSeq.bed12
+cat hg38.gencodeCompV39.bed12 hg38.refSeq.bed12 | sort -k4 > transcripts.coords
+
+# The field names match v4.1 (lof.oe_ci.upper = LOEUF, lof.pLI, mis.*, syn.*), so
+# the v4.1 extract was remapped by name to the v4.1.1 column numbers; the trailing
+# six columns add constraint_flags, gene_flags and the four gene_quality_metrics.
+# To re-derive the indices if the schema changes again:
+#   bgzip -cd gnomad.v4.1.1.constraint_metrics.tsv.bgz | head -1 | tr '\t' '\n' | nl
+
+# Extract -> join to transcript coords -> reformat -> combine.v4.1.1.awk, which
+# writes pliByTranscript.tab (bed12+9) and missenseByTranscript.tab (bed12+8).
+# combine.v4.1.1.awk and the autosql live in src/hg/makeDb/gnomad/ :
+#   combine.v4.1.1.awk, pliMetrics.v4.1.1.as, missenseMetrics.v4.1.1.as
+AWK=~/kent/src/hg/makeDb/gnomad/combine.v4.1.1.awk
+rm -f pliByTranscript.tab missenseByTranscript.tab
+bgzip -cd gnomad.v4.1.1.constraint_metrics.tsv.bgz | tail -n +2 \
+  | awk -F'\t' -v OFS='\t' '{print $1,$3,$43,$44,$45,$47,$22,$23,$24,$26,$88,$89,$111,$90,$27,$28,$48,$49,$93,$94,$19,$18,$14,$15,$16,$17}' \
+  | sort -k2 \
+  | join -t $'\t' -1 4 -2 2 transcripts.coords - \
+  | awk -F'\t' '{for(i=1;i<=12;i++){printf "%s\t",$i} printf "%s\t%s\t%s\t\t\t",$2,$3,$4; for(i=13;i<=NF;i++){printf "%s",$i; if(i!=NF)printf "\t"}; printf "\n"}' \
+  | awk -f $AWK -v doTranscripts=true
+bedSort pliByTranscript.tab pliByTranscript.tab
+bedSort missenseByTranscript.tab missenseByTranscript.tab
+
+# build the bigBeds (176740 transcripts)
+sizes=/hive/data/genomes/hg38/chrom.sizes
+bedToBigBed -type=bed12+9 -tab -extraIndex=name,geneName \
+    -as=~/kent/src/hg/makeDb/gnomad/pliMetrics.v4.1.1.as \
+    pliByTranscript.tab $sizes pliByTranscript.v4.1.1.bb
+bedToBigBed -type=bed12+8 -tab -extraIndex=name,geneName \
+    -as=~/kent/src/hg/makeDb/gnomad/missenseMetrics.v4.1.1.as \
+    missenseByTranscript.tab $sizes missenseByTranscript.v4.1.1.bb
+
+# trackDb: a "Constraint V4.1.1" view group (pliByTranscriptV4_1_1,
+# missenseByTranscriptV4_1_1) was added to gnomad.constraint.alpha.ra, which is
+# included alpha-only while gnomad.constraint.public.ra is included beta,public.