4c4eb6b4f0728fc8a985c5245267602d3bd86227
angie
  Thu Nov 21 09:49:25 2013 -0800
I misunderstood the SO term NMD_transcript_variant to mean 'a variantthat induces nonsense-mediate decay', i.e. worse than just a stop-gain.
However, the term actually means 'variant in a transcript that is
*already* subject to NMD', i.e. less serious than a variant in an
intron of a gene that gets translated.  gpFx.c now has different
logic for assigning that term, and hgVai has a new filtering option.
fixes #12205

diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index d40fb99..fdbcee1 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -262,47 +262,52 @@
 	strlen(restOfTranscript) + 1);
     }
 
 return newTranscript;
 }
 
 static void setNCExonVals(struct gpFx *gpFx, int exonIx, int cdnaPos)
 /* This gpFx is for a variant in exon of non-coding gene or UTR exon of coding gene;
  * set details.nonCodingExon values. */
 {
 gpFx->details.nonCodingExon.exonNumber = exonIx;
 gpFx->details.nonCodingExon.cDnaPosition = cdnaPos;
 }
 
 static struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred,
-				  struct txCoords *txc, int exonIx, struct lm *lm)
+				  struct txCoords *txc, int exonIx, boolean predIsNmd,
+				  struct lm *lm)
 /* check for effects in UTR of coding gene -- caller ensures it's in exon, pred is coding
  * and exonIx has been strand-adjusted */
 {
 struct gpFx *gpFx = NULL;
 enum soTerm term = 0;
 struct variant *variant = allele->variant;
 if ((variant->chromStart < pred->cdsStart && variant->chromEnd > pred->txStart) ||
     (variant->chromStart == pred->cdsStart && variant->chromEnd == pred->cdsStart)) // insertion
     // we're in left UTR
     term = (*pred->strand == '-') ? _3_prime_UTR_variant : _5_prime_UTR_variant;
 else if ((variant->chromStart < pred->txEnd && variant->chromEnd > pred->cdsEnd) ||
 	 (variant->chromStart == pred->cdsEnd && variant->chromEnd == pred->cdsEnd)) //insertion
     // we're in right UTR
     term = (*pred->strand == '-') ? _5_prime_UTR_variant : _3_prime_UTR_variant;
 if (term != 0)
     {
+    if (predIsNmd)
+	// This transcript is already subject to nonsense-mediated decay, so the effect
+	// is probably not a big deal:
+	term = NMD_transcript_variant;
     gpFx = gpFxNew(allele->sequence, pred->name, term, nonCodingExon, lm);
     setNCExonVals(gpFx, exonIx, txc->startInCdna);
     }
 return gpFx;
 }
 
 static struct gpFx *gpFxChangedNoncodingExon(struct allele *allele, struct genePred *pred,
 					     struct txCoords *txc, int exonIx, struct lm *lm)
 /* generate an effect for a variant in a non-coding transcript */
 {
 struct gpFx *gpFx = gpFxNew(allele->sequence, pred->name, non_coding_exon_variant, nonCodingExon,
 			    lm);
 setNCExonVals(gpFx, exonIx, txc->startInCdna);
 return gpFx;
 }
@@ -525,56 +530,32 @@
     reverseComplement(newAlleleSeq, newAlLen);
     }
 int variantSizeOnCds = endInCds - startInCds;
 if (variantSizeOnCds < 0)
     errAbort("gpFx: endInCds (%d) < startInCds (%d)", endInCds, startInCds);
 char *newCodingSeq = mergeAllele(oldCodingSeq, startInCds, variantSizeOnCds,
 				 newAlleleSeq, newAlLen, lm);
 // If newCodingSequence has an early stop, truncate there:
 truncateAtStopCodon(newCodingSeq);
 int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart;
 if (retCdsBasesAdded)
     *retCdsBasesAdded = allele->length - variantSizeOnRef;
 return newCodingSeq;
 }
 
-static boolean isSafeFromNMD(int exonNum, struct variant *variant, struct genePred *pred)
-/* Return TRUE if variant in strand-corrected exonNum is in the region
- * of pred that would make it safe from Nonsense-Mediated Decay (NMD).
- * NMD is triggered by the presence of a stop codon that appears
- * before ~50bp before the end of the last exon.  In other words, if
- * there's a stop codon with a detectable downstream splice junction,
- * translation is prevented -- see
- * http://en.wikipedia.org/wiki/MRNA_surveillance#Mechanism_in_mammals */
-{
-boolean lastExonNum = pred->exonCount - 1;
-if (exonNum == lastExonNum)
-    return TRUE;
-int nextToLastExonNum = pred->exonCount - 2;
-if (exonNum == nextToLastExonNum)
-    {
-    // Is it within 50bp of 3' end of exon?
-    if (pred->strand[0] == '-')
-	return (variant->chromEnd < pred->exonStarts[1] + 50);
-    else
-	return (variant->chromStart > pred->exonEnds[nextToLastExonNum] - 50);
-    }
-return FALSE;
-}
-
 static void setSpecificCodingSoTerm(struct gpFx *effect, char *oldAa, char *newAa,
-				    int cdsBasesAdded, boolean safeFromNMD)
+				    int cdsBasesAdded)
 /* Assuming that deletions are marked with dashes in newCodingSequence, 
  * and that effect fields aside from soNumber are already populated, use the
  * pep seqs and number of dashes to determine the appropriate SO term.
  * Probably the majority of cases will be synonymous_variant or missense_variant,
  * but we have to check for several other special cases esp. indels. */
 {
 struct codingChange *cc = &effect->details.codingChange;
 int oldAaSize = strlen(oldAa), newAaSize = strlen(newAa);
 if (sameString(newAa, oldAa))
     {
     if (cc->pepPosition == oldAaSize-1 && cc->aaOld[0] == 'Z')
 	effect->soNumber = stop_retained_variant;
     else
 	effect->soNumber = synonymous_variant;
     }
@@ -595,34 +576,32 @@
 	}
     else
 	{
 	// Not a deletion; could be single-base (including early stop) or insertion
 	if (newAaSize < oldAaSize)
 	    {
 	    // Not a deletion but protein got smaller; must have been an early stop codon,
 	    // possibly following a frameshift caused by an insertion.
 	    if (cc->aaNew[0] != 'Z')
 		{
 		if (newAa[newAaSize-1] != 'Z')
 		    errAbort("gpFx: new protein is smaller but last base in new sequence "
 			     "is '%c' not 'Z'", newAa[newAaSize-1]);
 		effect->soNumber = frameshift_variant;
 		}
-	    else if (safeFromNMD)
-		effect->soNumber = stop_gained;
 	    else
-		effect->soNumber = NMD_transcript_variant;
+		effect->soNumber = stop_gained;
 	    }
 	else if (newAaSize > oldAaSize)
 	    {
 	    // protein got bigger; insertion or lost stop codon
 	    if (cc->aaOld[0] == 'Z')
 		effect->soNumber = stop_lost;
 	    else if ((cdsBasesAdded % 3) == 0)
 		effect->soNumber = inframe_insertion;
 	    else
 		effect->soNumber = frameshift_variant;
 	    }
 	else
 	    {
 	    // Single aa change
 	    if (cc->pepPosition == 0 && cc->aaOld[0] == 'M')
@@ -630,31 +609,31 @@
 	    else if (cc->pepPosition == oldAaSize-1)
 		{
 		if (oldAa[oldAaSize-1] == 'Z')
 		    effect->soNumber = stop_lost;
 		else
 		    effect->soNumber = incomplete_terminal_codon_variant;
 		}
 	    else
 		effect->soNumber = missense_variant;
 	    }
 	}
     }
 }
 
 static struct gpFx *gpFxChangedCds(struct allele *allele, struct genePred *pred,
-				   struct txCoords *txc, int exonIx,
+				   struct txCoords *txc, int exonIx, boolean predIsNmd,
 				   struct dnaSeq *transcriptSequence, struct lm *lm)
 /* calculate effect of allele change on coding transcript */
 {
 // calculate original and variant coding DNA and AA's
 boolean addedBasesForFrame = FALSE;
 char *oldCodingSequence = getCodingSequence(pred, transcriptSequence->dna, &addedBasesForFrame, lm);
 int startInCds = txc->startInCds, endInCds = txc->endInCds;
 if (addedBasesForFrame)
     {
     // The annotated CDS exons were not all in frame, so getCodingSequence added 'N's
     // and now we can't simply use txc->startInCds.
     startInCds = getCorrectedCdsOffset(pred, txc->startInCds);
     endInCds = getCorrectedCdsOffset(pred, txc->endInCds);
     }
 int oldCdsLen = strlen(oldCodingSequence);
@@ -699,233 +678,253 @@
 	}
     cc->codonOld = lmCloneStringZ(lm, oldCodingSequence + pepPos*3, numOldCodons*3);
     cc->codonNew = lmCloneStringZ(lm, newCodingSequence + pepPos*3, numNewCodons*3);
     cc->aaOld = lmCloneStringZ(lm, oldaa + pepPos, numOldCodons);
     cc->aaNew = lmCloneStringZ(lm, newaa + pepPos, numNewCodons);
     }
 else
     {
     // frameshift -- who knows how many codons we can reliably predict...
     cc->codonOld = lmCloneString(lm, oldCodingSequence + pepPos*3);
     cc->codonNew = lmCloneString(lm, newCodingSequence + pepPos*3);
     cc->aaOld = lmCloneString(lm, oldaa + pepPos);
     cc->aaNew = lmCloneString(lm, newaa + pepPos);
     }
 
-boolean safeFromNMD = isSafeFromNMD(exonIx, allele->variant, pred);
-setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded, safeFromNMD);
+if (predIsNmd)
+    // This transcript is already subject to nonsense-mediated decay, so the effect
+    // is probably not a big deal:
+    effect->soNumber = NMD_transcript_variant;
+else
+    setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded);
 
 return effect;
 }
 
 
 static boolean hasAltAllele(struct allele *alleles)
 /* Make sure there's something to work on here... */
 {
 while (alleles != NULL && alleles->isReference)
     alleles = alleles->next;
 return (alleles != NULL);
 }
 
 char *firstAltAllele(struct allele *alleles)
 /* Ensembl always reports an alternate allele, even if that allele is not being used
  * to calculate any consequence.  When allele doesn't really matter, just use the
  * first alternate allele that is given. */
 {
 while (alleles != NULL && alleles->isReference)
     alleles = alleles->next;
 if (alleles == NULL)
     errAbort("firstAltAllele: no alt allele in list");
 return alleles->sequence;
 }
 
 static struct gpFx *gpFxInExon(struct variant *variant, struct txCoords *txc, int exonIx,
-			       struct genePred *pred, struct dnaSeq *transcriptSeq, struct lm *lm)
+			       struct genePred *pred, boolean predIsNmd,
+			       struct dnaSeq *transcriptSeq, struct lm *lm)
 /* Given a variant that overlaps an exon of pred, figure out what each allele does. */
 {
 struct gpFx *effectsList = NULL;
 struct allele *allele = variant->alleles;
 for ( ; allele ; allele = allele->next)
     {
     if (!allele->isReference)
 	{
 	if (pred->cdsStart != pred->cdsEnd)
 	    {
 	    // first find effects of allele in UTR, if any
 	    effectsList = slCat(effectsList,
-				gpFxCheckUtr(allele, pred, txc, exonIx, lm));
+				gpFxCheckUtr(allele, pred, txc, exonIx, predIsNmd, lm));
 	    if (txc->startInCds >= 0)
 		effectsList = slCat(effectsList,
-				    gpFxChangedCds(allele, pred, txc, exonIx, transcriptSeq, lm));
+				    gpFxChangedCds(allele, pred, txc, exonIx, predIsNmd,
+						   transcriptSeq, lm));
 	    }
 	else
 	    effectsList = slCat(effectsList,
 				gpFxChangedNoncodingExon(allele, pred, txc, exonIx, lm));
 
+	if (!predIsNmd)
+	    {
 	    // Was entire exon deleted?
 	    int exonNumPos = exonIx;
 	    if (pred->strand[0] == '-')
 		exonNumPos = pred->exonCount - 1 - exonIx;
 	    uint exonStart = pred->exonStarts[exonNumPos], exonEnd = pred->exonEnds[exonNumPos];
 	    if (variant->chromStart <= exonStart && variant->chromEnd >= exonEnd)
 		{
 		struct gpFx *effect = gpFxNew(allele->sequence, pred->name, exon_loss,
 					      nonCodingExon, lm);
 		setNCExonVals(effect, exonIx, txc->startInCdna);
 		slAddTail(&effectsList, effect);
 		}
 	    else
 		{
 		// If variant is in exon *but* within 3 bases of splice site,
 		// it also qualifies as splice_region_variant:
 		if ((variant->chromEnd > exonEnd-3 && variant->chromStart < exonEnd &&
 		     exonIx < pred->exonCount - 1) ||
 		    (variant->chromEnd > exonStart && variant->chromStart < exonStart+3 &&
 		     exonIx > 0))
 		    {
-		struct gpFx *effect = gpFxNew(allele->sequence, pred->name, splice_region_variant,
-					      nonCodingExon, lm);
+		    struct gpFx *effect = gpFxNew(allele->sequence, pred->name,
+						  splice_region_variant, nonCodingExon, lm);
 		    setNCExonVals(effect, exonIx, txc->startInCdna);
 		    slAddTail(&effectsList, effect);
 		    }
 		}
 	    }
 	}
+    }
 return effectsList;
 }
 
 static struct gpFx *gpFxInIntron(struct variant *variant, struct txCoords *txc, int intronIx,
-				 struct genePred *pred, char *altAllele, struct lm *lm)
+				 struct genePred *pred, boolean predIsNmd, char *altAllele,
+				 struct lm *lm)
 // Annotate a variant that overlaps an intron (and possibly splice region)
 //#*** TODO: watch out for "introns" that are actually indels between tx seq and ref genome!
 {
 struct gpFx *effectsList = NULL;
 boolean minusStrand = (pred->strand[0] == '-');
 // If on - strand, flip intron number back to + strand for getting intron coords:
 int intronPos = minusStrand ? (pred->exonCount - intronIx - 2) : intronIx;
 int intronStart = pred->exonEnds[intronPos];
 int intronEnd = pred->exonStarts[intronPos+1];
 if (variant->chromEnd > intronStart && variant->chromStart < intronEnd)
     {
     enum soTerm soNumber = intron_variant;
     if (variant->chromEnd > intronStart && variant->chromStart < intronStart+2)
 	// Within 2 bases of intron start(/end for '-'):
 	soNumber = minusStrand ? splice_acceptor_variant : splice_donor_variant;
     if (variant->chromEnd > intronEnd-2 && variant->chromStart < intronEnd)
 	// Within 2 bases of intron end(/start for '-'):
 	soNumber = minusStrand ? splice_donor_variant : splice_acceptor_variant;
     else if ((variant->chromEnd > intronStart+3 && variant->chromStart < intronStart+8) ||
 	     (variant->chromEnd > intronEnd-8 && variant->chromStart < intronEnd+3))
 	// Within 3 to 8 bases of intron start or end:
 	soNumber = splice_region_variant;
+    if (predIsNmd)
+	// This transcript is already subject to nonsense-mediated decay, so the effect
+	// is probably not a big deal:
+	soNumber = NMD_transcript_variant;
     struct gpFx *effects = gpFxNew(altAllele, pred->name, soNumber, intron, lm);
     effects->details.intron.intronNumber = intronIx;
     slAddTail(&effectsList, effects);
     }
 return effectsList;
 }
 
 static struct gpFx *gpFxCheckTranscript(struct variant *variant, struct genePred *pred,
 					struct dnaSeq *transcriptSeq, struct lm *lm)
 /* Check to see if variant overlaps an exon and/or intron of pred. */
 {
 struct gpFx *effectsList = NULL;
 uint varStart = variant->chromStart, varEnd = variant->chromEnd;
 if (varStart < pred->txEnd && varEnd > pred->txStart)
     {
+    boolean predIsNmd = genePredNmdTarget(pred);
     char *defaultAltAllele = firstAltAllele(variant->alleles);
     struct txCoords txc = getTxCoords(variant, pred);
     // Simplest case first: variant starts and ends in a single exon or single intron
     if (txc.startInExon == txc.endInExon && txc.startExonIx == txc.endExonIx)
 	{
 	int ix = txc.startExonIx;
 	if (txc.startInExon)
 	    {
 	    // Exonic variant; figure out what kind:
 	    effectsList = slCat(effectsList,
-				gpFxInExon(variant, &txc, ix, pred, transcriptSeq, lm));
+				gpFxInExon(variant, &txc, ix, pred, predIsNmd, transcriptSeq, lm));
 	    }
 	else
 	    {
 	    // Intronic (and/or splice) variant:
 	    effectsList = slCat(effectsList,
-				gpFxInIntron(variant, &txc, ix, pred, defaultAltAllele, lm));
+				gpFxInIntron(variant, &txc, ix, pred, predIsNmd, defaultAltAllele,
+					     lm));
 	    }
 	}
     else
 	{
+	if (!predIsNmd)
+	    {
 	    // Let the user beware -- this variant is just complex (it overlaps at least one
 	    // exon/intron boundary).  It could be an insertion, an MNV (multi-nt var) or
 	    // a deletion.
 	    struct gpFx *effect = gpFxNew(defaultAltAllele, pred->name, complex_transcript_variant,
 					  none, lm);
 	    effectsList = slCat(effectsList, effect);
+	    }
 	// But we can at least say which introns and/or exons are affected.
 	// Transform exon and intron numbers into ordered integers, -1 (upstream) through
 	// 2*lastExonIx+1 (downstream), with even numbers being exonNum*2 and odd numbers
 	// being intronNum*2 + 1:
 	int vieStart = (2 * txc.startExonIx) + (txc.startInExon ? 0 : 1);
 	int vieEnd = (2 * txc.endExonIx) + (txc.endInExon ? 0 : 1);
 	if (vieEnd < vieStart)
 	    {
 	    // Insertion at exon boundary (or bug)
 	    if (vieEnd != vieStart-1 || varStart != varEnd || txc.startInExon == txc.endInExon)
 		errAbort("gpFxCheckTranscript: expecting insertion in pred=%s "
 			 "but varStart=%d, varEnd=%d, vieStart=%d, vieEnd=%d, "
 			 "starts in %son, ends in %son",
 			 pred->name, varStart, varEnd, vieStart, vieEnd,
 			 (txc.startInExon ? "ex" : "intr"), (txc.endInExon ? "ex" : "intr"));
 	    // Since it's an insertion, remember that end is before start.
 	    if (txc.startInExon)
 		{
 		// Intronic end precedes exonic start. Watch out for upstream as "intron[-1]":
 		if (txc.endExonIx >= 0)
 		    effectsList = slCat(effectsList,
-					gpFxInIntron(variant, &txc, txc.endExonIx, pred,
+					gpFxInIntron(variant, &txc, txc.endExonIx, pred, predIsNmd,
 						     defaultAltAllele, lm));
 		effectsList = slCat(effectsList,
-				    gpFxInExon(variant, &txc, txc.startExonIx, pred,
+				    gpFxInExon(variant, &txc, txc.startExonIx, pred, predIsNmd,
 					       transcriptSeq, lm));
 		}
 	    else
 		{
 		// Exonic end precedes intronic start.
 		effectsList = slCat(effectsList,
-				    gpFxInExon(variant, &txc, txc.endExonIx, pred,
+				    gpFxInExon(variant, &txc, txc.endExonIx, pred, predIsNmd,
 					       transcriptSeq, lm));
 		// Watch out for downstream as "intron[lastExonIx]"
 		if (txc.startExonIx < txc.exonCount - 1)
 		    effectsList = slCat(effectsList,
 					gpFxInIntron(variant, &txc, txc.startExonIx, pred,
-						     defaultAltAllele, lm));
+						     predIsNmd, defaultAltAllele, lm));
 		}
 	    } // end if variant is insertion
 	else
 	    {
 	    // MNV or deletion - consider each overlapping intron and/or exon
 	    int ie;
 	    // Watch out for upstream (vieStart < 0) and downstream (vieEnd > last exon).
 	    for (ie = max(vieStart, 0);  ie <= min(vieEnd, 2*(pred->exonCount-1));  ie++)
 		{
 		boolean isExon = (ie%2 == 0);
 		int ix = ie / 2;
 		if (isExon)
 		    effectsList = slCat(effectsList,
-					gpFxInExon(variant, &txc, ix, pred, transcriptSeq, lm));
+					gpFxInExon(variant, &txc, ix, pred, predIsNmd,
+						   transcriptSeq, lm));
 		else
 		    effectsList = slCat(effectsList,
-					gpFxInIntron(variant, &txc, ix, pred,
+					gpFxInIntron(variant, &txc, ix, pred, predIsNmd,
 						     defaultAltAllele, lm));
 		} // end for each (partial) exon/intron overlapping variant
 	    } // end if variant is MNV or deletion
 	} // end if variant is complex
     } // end if variant overlaps pred
 return effectsList;
 }
 
 static struct gpFx *gpFxCheckUpDownstream(struct variant *variant, struct genePred *pred,
 					  struct lm *lm)
 // check to see if the variant is up or downstream
 {
 struct gpFx *effectsList = NULL;
 char *defaultAltAllele = firstAltAllele(variant->alleles);