4c4eb6b4f0728fc8a985c5245267602d3bd86227 angie Thu Nov 21 09:49:25 2013 -0800 I misunderstood the SO term NMD_transcript_variant to mean 'a variantthat induces nonsense-mediate decay', i.e. worse than just a stop-gain. However, the term actually means 'variant in a transcript that is *already* subject to NMD', i.e. less serious than a variant in an intron of a gene that gets translated. gpFx.c now has different logic for assigning that term, and hgVai has a new filtering option. fixes #12205 diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index d40fb99..fdbcee1 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -262,47 +262,52 @@ strlen(restOfTranscript) + 1); } return newTranscript; } static void setNCExonVals(struct gpFx *gpFx, int exonIx, int cdnaPos) /* This gpFx is for a variant in exon of non-coding gene or UTR exon of coding gene; * set details.nonCodingExon values. */ { gpFx->details.nonCodingExon.exonNumber = exonIx; gpFx->details.nonCodingExon.cDnaPosition = cdnaPos; } static struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred, - struct txCoords *txc, int exonIx, struct lm *lm) + struct txCoords *txc, int exonIx, boolean predIsNmd, + struct lm *lm) /* check for effects in UTR of coding gene -- caller ensures it's in exon, pred is coding * and exonIx has been strand-adjusted */ { struct gpFx *gpFx = NULL; enum soTerm term = 0; struct variant *variant = allele->variant; if ((variant->chromStart < pred->cdsStart && variant->chromEnd > pred->txStart) || (variant->chromStart == pred->cdsStart && variant->chromEnd == pred->cdsStart)) // insertion // we're in left UTR term = (*pred->strand == '-') ? _3_prime_UTR_variant : _5_prime_UTR_variant; else if ((variant->chromStart < pred->txEnd && variant->chromEnd > pred->cdsEnd) || (variant->chromStart == pred->cdsEnd && variant->chromEnd == pred->cdsEnd)) //insertion // we're in right UTR term = (*pred->strand == '-') ? _5_prime_UTR_variant : _3_prime_UTR_variant; if (term != 0) { + if (predIsNmd) + // This transcript is already subject to nonsense-mediated decay, so the effect + // is probably not a big deal: + term = NMD_transcript_variant; gpFx = gpFxNew(allele->sequence, pred->name, term, nonCodingExon, lm); setNCExonVals(gpFx, exonIx, txc->startInCdna); } return gpFx; } static struct gpFx *gpFxChangedNoncodingExon(struct allele *allele, struct genePred *pred, struct txCoords *txc, int exonIx, struct lm *lm) /* generate an effect for a variant in a non-coding transcript */ { struct gpFx *gpFx = gpFxNew(allele->sequence, pred->name, non_coding_exon_variant, nonCodingExon, lm); setNCExonVals(gpFx, exonIx, txc->startInCdna); return gpFx; } @@ -525,56 +530,32 @@ reverseComplement(newAlleleSeq, newAlLen); } int variantSizeOnCds = endInCds - startInCds; if (variantSizeOnCds < 0) errAbort("gpFx: endInCds (%d) < startInCds (%d)", endInCds, startInCds); char *newCodingSeq = mergeAllele(oldCodingSeq, startInCds, variantSizeOnCds, newAlleleSeq, newAlLen, lm); // If newCodingSequence has an early stop, truncate there: truncateAtStopCodon(newCodingSeq); int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart; if (retCdsBasesAdded) *retCdsBasesAdded = allele->length - variantSizeOnRef; return newCodingSeq; } -static boolean isSafeFromNMD(int exonNum, struct variant *variant, struct genePred *pred) -/* Return TRUE if variant in strand-corrected exonNum is in the region - * of pred that would make it safe from Nonsense-Mediated Decay (NMD). - * NMD is triggered by the presence of a stop codon that appears - * before ~50bp before the end of the last exon. In other words, if - * there's a stop codon with a detectable downstream splice junction, - * translation is prevented -- see - * http://en.wikipedia.org/wiki/MRNA_surveillance#Mechanism_in_mammals */ -{ -boolean lastExonNum = pred->exonCount - 1; -if (exonNum == lastExonNum) - return TRUE; -int nextToLastExonNum = pred->exonCount - 2; -if (exonNum == nextToLastExonNum) - { - // Is it within 50bp of 3' end of exon? - if (pred->strand[0] == '-') - return (variant->chromEnd < pred->exonStarts[1] + 50); - else - return (variant->chromStart > pred->exonEnds[nextToLastExonNum] - 50); - } -return FALSE; -} - static void setSpecificCodingSoTerm(struct gpFx *effect, char *oldAa, char *newAa, - int cdsBasesAdded, boolean safeFromNMD) + int cdsBasesAdded) /* Assuming that deletions are marked with dashes in newCodingSequence, * and that effect fields aside from soNumber are already populated, use the * pep seqs and number of dashes to determine the appropriate SO term. * Probably the majority of cases will be synonymous_variant or missense_variant, * but we have to check for several other special cases esp. indels. */ { struct codingChange *cc = &effect->details.codingChange; int oldAaSize = strlen(oldAa), newAaSize = strlen(newAa); if (sameString(newAa, oldAa)) { if (cc->pepPosition == oldAaSize-1 && cc->aaOld[0] == 'Z') effect->soNumber = stop_retained_variant; else effect->soNumber = synonymous_variant; } @@ -595,34 +576,32 @@ } else { // Not a deletion; could be single-base (including early stop) or insertion if (newAaSize < oldAaSize) { // Not a deletion but protein got smaller; must have been an early stop codon, // possibly following a frameshift caused by an insertion. if (cc->aaNew[0] != 'Z') { if (newAa[newAaSize-1] != 'Z') errAbort("gpFx: new protein is smaller but last base in new sequence " "is '%c' not 'Z'", newAa[newAaSize-1]); effect->soNumber = frameshift_variant; } - else if (safeFromNMD) - effect->soNumber = stop_gained; else - effect->soNumber = NMD_transcript_variant; + effect->soNumber = stop_gained; } else if (newAaSize > oldAaSize) { // protein got bigger; insertion or lost stop codon if (cc->aaOld[0] == 'Z') effect->soNumber = stop_lost; else if ((cdsBasesAdded % 3) == 0) effect->soNumber = inframe_insertion; else effect->soNumber = frameshift_variant; } else { // Single aa change if (cc->pepPosition == 0 && cc->aaOld[0] == 'M') @@ -630,31 +609,31 @@ else if (cc->pepPosition == oldAaSize-1) { if (oldAa[oldAaSize-1] == 'Z') effect->soNumber = stop_lost; else effect->soNumber = incomplete_terminal_codon_variant; } else effect->soNumber = missense_variant; } } } } static struct gpFx *gpFxChangedCds(struct allele *allele, struct genePred *pred, - struct txCoords *txc, int exonIx, + struct txCoords *txc, int exonIx, boolean predIsNmd, struct dnaSeq *transcriptSequence, struct lm *lm) /* calculate effect of allele change on coding transcript */ { // calculate original and variant coding DNA and AA's boolean addedBasesForFrame = FALSE; char *oldCodingSequence = getCodingSequence(pred, transcriptSequence->dna, &addedBasesForFrame, lm); int startInCds = txc->startInCds, endInCds = txc->endInCds; if (addedBasesForFrame) { // The annotated CDS exons were not all in frame, so getCodingSequence added 'N's // and now we can't simply use txc->startInCds. startInCds = getCorrectedCdsOffset(pred, txc->startInCds); endInCds = getCorrectedCdsOffset(pred, txc->endInCds); } int oldCdsLen = strlen(oldCodingSequence); @@ -699,233 +678,253 @@ } cc->codonOld = lmCloneStringZ(lm, oldCodingSequence + pepPos*3, numOldCodons*3); cc->codonNew = lmCloneStringZ(lm, newCodingSequence + pepPos*3, numNewCodons*3); cc->aaOld = lmCloneStringZ(lm, oldaa + pepPos, numOldCodons); cc->aaNew = lmCloneStringZ(lm, newaa + pepPos, numNewCodons); } else { // frameshift -- who knows how many codons we can reliably predict... cc->codonOld = lmCloneString(lm, oldCodingSequence + pepPos*3); cc->codonNew = lmCloneString(lm, newCodingSequence + pepPos*3); cc->aaOld = lmCloneString(lm, oldaa + pepPos); cc->aaNew = lmCloneString(lm, newaa + pepPos); } -boolean safeFromNMD = isSafeFromNMD(exonIx, allele->variant, pred); -setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded, safeFromNMD); +if (predIsNmd) + // This transcript is already subject to nonsense-mediated decay, so the effect + // is probably not a big deal: + effect->soNumber = NMD_transcript_variant; +else + setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded); return effect; } static boolean hasAltAllele(struct allele *alleles) /* Make sure there's something to work on here... */ { while (alleles != NULL && alleles->isReference) alleles = alleles->next; return (alleles != NULL); } char *firstAltAllele(struct allele *alleles) /* Ensembl always reports an alternate allele, even if that allele is not being used * to calculate any consequence. When allele doesn't really matter, just use the * first alternate allele that is given. */ { while (alleles != NULL && alleles->isReference) alleles = alleles->next; if (alleles == NULL) errAbort("firstAltAllele: no alt allele in list"); return alleles->sequence; } static struct gpFx *gpFxInExon(struct variant *variant, struct txCoords *txc, int exonIx, - struct genePred *pred, struct dnaSeq *transcriptSeq, struct lm *lm) + struct genePred *pred, boolean predIsNmd, + struct dnaSeq *transcriptSeq, struct lm *lm) /* Given a variant that overlaps an exon of pred, figure out what each allele does. */ { struct gpFx *effectsList = NULL; struct allele *allele = variant->alleles; for ( ; allele ; allele = allele->next) { if (!allele->isReference) { if (pred->cdsStart != pred->cdsEnd) { // first find effects of allele in UTR, if any effectsList = slCat(effectsList, - gpFxCheckUtr(allele, pred, txc, exonIx, lm)); + gpFxCheckUtr(allele, pred, txc, exonIx, predIsNmd, lm)); if (txc->startInCds >= 0) effectsList = slCat(effectsList, - gpFxChangedCds(allele, pred, txc, exonIx, transcriptSeq, lm)); + gpFxChangedCds(allele, pred, txc, exonIx, predIsNmd, + transcriptSeq, lm)); } else effectsList = slCat(effectsList, gpFxChangedNoncodingExon(allele, pred, txc, exonIx, lm)); + if (!predIsNmd) + { // Was entire exon deleted? int exonNumPos = exonIx; if (pred->strand[0] == '-') exonNumPos = pred->exonCount - 1 - exonIx; uint exonStart = pred->exonStarts[exonNumPos], exonEnd = pred->exonEnds[exonNumPos]; if (variant->chromStart <= exonStart && variant->chromEnd >= exonEnd) { struct gpFx *effect = gpFxNew(allele->sequence, pred->name, exon_loss, nonCodingExon, lm); setNCExonVals(effect, exonIx, txc->startInCdna); slAddTail(&effectsList, effect); } else { // If variant is in exon *but* within 3 bases of splice site, // it also qualifies as splice_region_variant: if ((variant->chromEnd > exonEnd-3 && variant->chromStart < exonEnd && exonIx < pred->exonCount - 1) || (variant->chromEnd > exonStart && variant->chromStart < exonStart+3 && exonIx > 0)) { - struct gpFx *effect = gpFxNew(allele->sequence, pred->name, splice_region_variant, - nonCodingExon, lm); + struct gpFx *effect = gpFxNew(allele->sequence, pred->name, + splice_region_variant, nonCodingExon, lm); setNCExonVals(effect, exonIx, txc->startInCdna); slAddTail(&effectsList, effect); } } } } + } return effectsList; } static struct gpFx *gpFxInIntron(struct variant *variant, struct txCoords *txc, int intronIx, - struct genePred *pred, char *altAllele, struct lm *lm) + struct genePred *pred, boolean predIsNmd, char *altAllele, + struct lm *lm) // Annotate a variant that overlaps an intron (and possibly splice region) //#*** TODO: watch out for "introns" that are actually indels between tx seq and ref genome! { struct gpFx *effectsList = NULL; boolean minusStrand = (pred->strand[0] == '-'); // If on - strand, flip intron number back to + strand for getting intron coords: int intronPos = minusStrand ? (pred->exonCount - intronIx - 2) : intronIx; int intronStart = pred->exonEnds[intronPos]; int intronEnd = pred->exonStarts[intronPos+1]; if (variant->chromEnd > intronStart && variant->chromStart < intronEnd) { enum soTerm soNumber = intron_variant; if (variant->chromEnd > intronStart && variant->chromStart < intronStart+2) // Within 2 bases of intron start(/end for '-'): soNumber = minusStrand ? splice_acceptor_variant : splice_donor_variant; if (variant->chromEnd > intronEnd-2 && variant->chromStart < intronEnd) // Within 2 bases of intron end(/start for '-'): soNumber = minusStrand ? splice_donor_variant : splice_acceptor_variant; else if ((variant->chromEnd > intronStart+3 && variant->chromStart < intronStart+8) || (variant->chromEnd > intronEnd-8 && variant->chromStart < intronEnd+3)) // Within 3 to 8 bases of intron start or end: soNumber = splice_region_variant; + if (predIsNmd) + // This transcript is already subject to nonsense-mediated decay, so the effect + // is probably not a big deal: + soNumber = NMD_transcript_variant; struct gpFx *effects = gpFxNew(altAllele, pred->name, soNumber, intron, lm); effects->details.intron.intronNumber = intronIx; slAddTail(&effectsList, effects); } return effectsList; } static struct gpFx *gpFxCheckTranscript(struct variant *variant, struct genePred *pred, struct dnaSeq *transcriptSeq, struct lm *lm) /* Check to see if variant overlaps an exon and/or intron of pred. */ { struct gpFx *effectsList = NULL; uint varStart = variant->chromStart, varEnd = variant->chromEnd; if (varStart < pred->txEnd && varEnd > pred->txStart) { + boolean predIsNmd = genePredNmdTarget(pred); char *defaultAltAllele = firstAltAllele(variant->alleles); struct txCoords txc = getTxCoords(variant, pred); // Simplest case first: variant starts and ends in a single exon or single intron if (txc.startInExon == txc.endInExon && txc.startExonIx == txc.endExonIx) { int ix = txc.startExonIx; if (txc.startInExon) { // Exonic variant; figure out what kind: effectsList = slCat(effectsList, - gpFxInExon(variant, &txc, ix, pred, transcriptSeq, lm)); + gpFxInExon(variant, &txc, ix, pred, predIsNmd, transcriptSeq, lm)); } else { // Intronic (and/or splice) variant: effectsList = slCat(effectsList, - gpFxInIntron(variant, &txc, ix, pred, defaultAltAllele, lm)); + gpFxInIntron(variant, &txc, ix, pred, predIsNmd, defaultAltAllele, + lm)); } } else { + if (!predIsNmd) + { // Let the user beware -- this variant is just complex (it overlaps at least one // exon/intron boundary). It could be an insertion, an MNV (multi-nt var) or // a deletion. struct gpFx *effect = gpFxNew(defaultAltAllele, pred->name, complex_transcript_variant, none, lm); effectsList = slCat(effectsList, effect); + } // But we can at least say which introns and/or exons are affected. // Transform exon and intron numbers into ordered integers, -1 (upstream) through // 2*lastExonIx+1 (downstream), with even numbers being exonNum*2 and odd numbers // being intronNum*2 + 1: int vieStart = (2 * txc.startExonIx) + (txc.startInExon ? 0 : 1); int vieEnd = (2 * txc.endExonIx) + (txc.endInExon ? 0 : 1); if (vieEnd < vieStart) { // Insertion at exon boundary (or bug) if (vieEnd != vieStart-1 || varStart != varEnd || txc.startInExon == txc.endInExon) errAbort("gpFxCheckTranscript: expecting insertion in pred=%s " "but varStart=%d, varEnd=%d, vieStart=%d, vieEnd=%d, " "starts in %son, ends in %son", pred->name, varStart, varEnd, vieStart, vieEnd, (txc.startInExon ? "ex" : "intr"), (txc.endInExon ? "ex" : "intr")); // Since it's an insertion, remember that end is before start. if (txc.startInExon) { // Intronic end precedes exonic start. Watch out for upstream as "intron[-1]": if (txc.endExonIx >= 0) effectsList = slCat(effectsList, - gpFxInIntron(variant, &txc, txc.endExonIx, pred, + gpFxInIntron(variant, &txc, txc.endExonIx, pred, predIsNmd, defaultAltAllele, lm)); effectsList = slCat(effectsList, - gpFxInExon(variant, &txc, txc.startExonIx, pred, + gpFxInExon(variant, &txc, txc.startExonIx, pred, predIsNmd, transcriptSeq, lm)); } else { // Exonic end precedes intronic start. effectsList = slCat(effectsList, - gpFxInExon(variant, &txc, txc.endExonIx, pred, + gpFxInExon(variant, &txc, txc.endExonIx, pred, predIsNmd, transcriptSeq, lm)); // Watch out for downstream as "intron[lastExonIx]" if (txc.startExonIx < txc.exonCount - 1) effectsList = slCat(effectsList, gpFxInIntron(variant, &txc, txc.startExonIx, pred, - defaultAltAllele, lm)); + predIsNmd, defaultAltAllele, lm)); } } // end if variant is insertion else { // MNV or deletion - consider each overlapping intron and/or exon int ie; // Watch out for upstream (vieStart < 0) and downstream (vieEnd > last exon). for (ie = max(vieStart, 0); ie <= min(vieEnd, 2*(pred->exonCount-1)); ie++) { boolean isExon = (ie%2 == 0); int ix = ie / 2; if (isExon) effectsList = slCat(effectsList, - gpFxInExon(variant, &txc, ix, pred, transcriptSeq, lm)); + gpFxInExon(variant, &txc, ix, pred, predIsNmd, + transcriptSeq, lm)); else effectsList = slCat(effectsList, - gpFxInIntron(variant, &txc, ix, pred, + gpFxInIntron(variant, &txc, ix, pred, predIsNmd, defaultAltAllele, lm)); } // end for each (partial) exon/intron overlapping variant } // end if variant is MNV or deletion } // end if variant is complex } // end if variant overlaps pred return effectsList; } static struct gpFx *gpFxCheckUpDownstream(struct variant *variant, struct genePred *pred, struct lm *lm) // check to see if the variant is up or downstream { struct gpFx *effectsList = NULL; char *defaultAltAllele = firstAltAllele(variant->alleles);