3972ba54c468ace338d4a5578de1d20bf6c1f9ec lrnassar Mon Apr 20 15:39:26 2026 -0700 Adding Rule 4 (long-exon rule, Lindeboom 2016) to NMD Escape tracks and releasing on Apr. 22, 2026. refs #33737 Script: added a fourth rule to genePredNmdEsc. Coding exons longer than 400 bp (excluding the last coding exon, which is already covered by the 50 bp rule) are flagged as NMD-escape regions. Rebuilt the Gencode and NCBI RefSeq bigBed files. trackDb: - nmd.ra: appended "/400nt" to the nmdEsc longLabels, set nmdEscGencode default visibility to dense so the track is visible in cart-reset views, changed all four NMDetective subtracks from "visibility full" to "visibility hide", updated pennantIcon to the Apr. 22, 2026 release date and anchor. - nmd.html: mention long internal exons in the overview description, update the rule count from three to four. - nmdEscTranscripts.html: add the long-exon rule to the rule list and color legend (gold, #FFD700), expand the Background section with mechanisms for the intronless, start-proximal, and long-exon rules, correct the 50 bp rule description to include the entire last coding exon, fix Lindeboom 2016 author initials (RG -> RGH). News: - newsarch.html: add the 2026-04-22 NMD Escape news entry covering all four rules, with acknowledgements to Guido Neidhardt and Andreas Lahner for suggesting the track and the Decipher Genome Browser team for inspiring the visualization. - indexNews.html: add the front-page news link. makedoc: - nmd.txt: dated note for the Rule 4 rebuild. diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc index f95e07be6ea..12faa973367 100755 --- src/hg/makeDb/scripts/nmd/genePredNmdEsc +++ src/hg/makeDb/scripts/nmd/genePredNmdEsc @@ -1,36 +1,41 @@ #!/usr/bin/env python import logging, sys, optparse, gzip, subprocess from collections import defaultdict from os.path import join, basename, dirname, isfile # ==== functions ===== # Colors by rule RULE_COLORS = { 1: "255,0,0", # red - last 50bp of last coding junction 2: "255,140,0", # orange - intronless transcript 3: "139,0,0", # dark red - first 100bp of coding nucleotides + 4: "255,215,0", # gold - long coding exon (>400 nt) } RULE_DESCRIPTIONS = { 1: "Rule 1 - last 50bp of last coding exon junction", 2: "Rule 2 - intronless transcript", 3: "Rule 3 - first 100bp of coding nucleotides", + 4: "Rule 4 - PTC in long exon (>400 nt)", } +# Lindeboom et al. 2016: reduced NMD efficiency for PTCs in exons longer than this +LONG_EXON_THRESHOLD = 400 + def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - " "Output BEDs with NMD escape regions. First output is decorator format, " "second is collapsed bigGenePred with gene symbols and transcript lists.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-f", "--format", dest="format", action="store", default="genePredExt", help="Input format: 'genePredExt' (with bin column, e.g. ncbiRefSeq) or " "'bigGenePred' (e.g. gencode .bb file, will use bigBedToBed). Default: genePredExt") (options, args) = parser.parse_args() if len(args) != 3: parser.print_help() @@ -220,37 +225,48 @@ regionData[key]["geneSym"] = geneSym if len(cdsExons)==1: # rule 2: intronless transcript bed = [chrom, str(cdsStart), str(cdsEnd), name] bedOut(bed, txStart, txEnd, decoOfh, 2) addRegions([(chrom, cdsStart, cdsEnd)], 2) else: if strand=="+": # rule 3: first 100bp of coding nucleotides regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) addRegions(regions, 3) # rule 1: last 50bp of last coding junction regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) addRegions(regions, 1) + # rule 4: coding exons >400 nt (excluding last; last is already covered by rule 1) + rule4Exons = cdsExons[:-1] else: # rule 3: first 100bp of coding nucleotides regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) addRegions(regions, 3) # rule 1: last 50bp of last coding junction regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) addRegions(regions, 1) + # rule 4: coding exons >400 nt (excluding last, which on - strand is cdsExons[0]) + rule4Exons = cdsExons[1:] + + # rule 4: Lindeboom et al. 2016 long-exon rule + for exStart, exEnd in rule4Exons: + if exEnd - exStart > LONG_EXON_THRESHOLD: + bed = [chrom, str(exStart), str(exEnd), name] + bedOut(bed, txStart, txEnd, decoOfh, 4) + addRegions([(chrom, exStart, exEnd)], 4) decoOfh.close() # write collapsed output as bed 9 + mouseover + transcripts collOfh = open(outCollapsedFname, "w") for (chrom, start, end, rule), data in sorted(regionData.items()): txList = sorted(set(data["transcripts"])) geneSym = data["geneSym"] if not geneSym: geneSym = txList[0] # pick strand: use the strand if all transcripts agree, else "." strands = data["strands"] strand = list(strands)[0] if len(strands) == 1 else "."