6fa1df41a2b6302d544686fb59cea1023fb6e674 angie Mon Sep 30 16:38:59 2013 -0700 Fix for crash found by Jonathan in http://redmine.soe.ucsc.edu/issues/11775#note-60 ;more careful error-checking and corner-case handling in getTxCoords. When CDS starts out of frame, we were getting early truncation of coding sequence, leading to some strange characters in output; the ideal fix would be to take exonFrames into account, but for now we can just bail if coding sequence ends before the variant hits it. Also fixing a couple bone-headed incorrect invocations of slAddTail, and making sure that an exon's splice_region_variant is listed with an exon number, not an intron number. refs #11775, #11771 diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index 19ddb6b..17cc5f1 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -54,149 +54,170 @@ } static struct txCoords txCoordsReverse(struct txCoords *txcIn) /* Return a struct txCoords with same info as txcIn, but strand-swapped. */ { if (txcIn->exonCount <= 0 || txcIn->cdnaSize <= 0) errAbort("txCoordsReverse: called with non-positive exonCount (%d) or cdnaSize (%d) ", txcIn->exonCount, txcIn->cdnaSize); struct txCoords txcSwapped; char swappedStrand = (txcIn->strand == '+') ? '-' : '+'; txCoordsInit(&txcSwapped, txcIn->exonCount, swappedStrand, txcIn->cdnaSize, txcIn->cdsSize); if (txcIn->startInExon) { txcSwapped.endInExon = TRUE; txcSwapped.endExonIx = txcIn->exonCount - 1 - txcIn->startExonIx; - if (txcIn->startInCdna >= 0) - txcSwapped.endInCdna = txcIn->cdnaSize - txcIn->startInCdna; - if (txcIn->startInCds >= 0) - txcSwapped.endInCds = txcIn->cdsSize - txcIn->startInCds; } else { // Intron number, not exon number, so subtract another 1 here: txcSwapped.endExonIx = txcIn->exonCount - 2 - txcIn->startExonIx; } +if (txcIn->startInCdna >= 0) + txcSwapped.endInCdna = txcIn->cdnaSize - txcIn->startInCdna; +if (txcIn->startInCds >= 0) + txcSwapped.endInCds = txcIn->cdsSize - txcIn->startInCds; if (txcIn->endInExon) { txcSwapped.startInExon = TRUE; txcSwapped.startExonIx = txcIn->exonCount - 1 - txcIn->endExonIx; - if (txcIn->endInCdna > 0) - txcSwapped.startInCdna = txcIn->cdnaSize - txcIn->endInCdna; - if (txcIn->endInCds > 0) - txcSwapped.startInCds = txcIn->cdsSize - txcIn->endInCds; } else { // Intron number, not exon number, so subtract another 1 here: txcSwapped.startExonIx = txcIn->exonCount - 2 - txcIn->endExonIx; } +if (txcIn->endInCdna > 0) + txcSwapped.startInCdna = txcIn->cdnaSize - txcIn->endInCdna; +if (txcIn->endInCds > 0) + txcSwapped.startInCds = txcIn->cdsSize - txcIn->endInCds; return txcSwapped; } static struct txCoords getTxCoords(struct variant *variant, struct genePred *pred) /* Figure out where the variant's start and end hit the transcript: intron, UTR, CDS? * Result is on pred->strand. */ { struct txCoords txc; txCoordsInit(&txc, pred->exonCount, '+', 0, 0); // Compute cdnaSize, cdsSize below. int exonOffset = 0, cdsOffset = 0; uint varStart = variant->chromStart, varEnd = variant->chromEnd; // If the variant begins upstream, handle that before looping on exons: if (varStart < pred->txStart && varEnd > pred->txStart) { txc.startInCdna = 0; - if (varEnd > pred->cdsStart) + if (varStart < pred->cdsEnd && varEnd > pred->cdsStart) txc.startInCds = 0; } int ii; for (ii = 0; ii < pred->exonCount; ii++) { uint exonStart = pred->exonStarts[ii], exonEnd = pred->exonEnds[ii]; uint exonCdsStart = max(pred->cdsStart, exonStart); uint exonCdsEnd = min(pred->cdsEnd, exonEnd); if (varStart >= exonStart && varStart < exonEnd) { txc.startInExon = TRUE; txc.startExonIx = ii; txc.startInCdna = exonOffset + varStart - exonStart; if (varStart >= pred->cdsStart && varStart < pred->cdsEnd) txc.startInCds = cdsOffset + varStart - exonCdsStart; + // If this is an insertion at the beginning of an exon, varEnd is at the end + // of the preceding intron and its endInC* have not been set, so copy them over: + if (varEnd == varStart) + { + txc.endInCdna = txc.startInCdna; + txc.endInCds = txc.startInCds; + } } if (varEnd > exonStart && varEnd <= exonEnd) { txc.endInExon = TRUE; txc.endExonIx = ii; txc.endInCdna = exonOffset + varEnd - exonStart; if (varEnd > pred->cdsStart && varEnd <= pred->cdsEnd) txc.endInCds = cdsOffset + varEnd - exonCdsStart; + // If this is an insertion at the end of an exon, varStart is at the beginning + // of the following intron and its startInC* have not been set, so copy them over: + if (varStart == varEnd) + { + txc.startInCdna = txc.endInCdna; + txc.startInCds = txc.endInCds; + } } if (ii < pred->exonCount - 1) { uint nextExonStart = pred->exonStarts[ii+1]; // 'exonIx' is actually an intronIx in this case: if (varStart >= exonEnd && varStart < nextExonStart) { txc.startExonIx = ii; if (varEnd > nextExonStart) { // Variant starts in an intron, but it overlaps the next exon; // note the start in cDNA (and CDS if applicable): txc.startInCdna = exonOffset + exonEnd - exonStart; - if (varEnd > pred->cdsStart) + if (varStart < pred->cdsEnd && varEnd > pred->cdsStart) { uint nextExonEnd = pred->exonEnds[ii+1]; if (nextExonEnd > pred->cdsStart) txc.startInCds = cdsOffset; else txc.startInCds = 0; } } } if (varEnd > exonEnd && varEnd <= nextExonStart) { txc.endExonIx = ii; if (varStart < exonEnd) { // Variant ends in an intron, but it also overlaps the previous exon; // note the end in cDNA (and CDS if applicable): txc.endInCdna = exonOffset + exonEnd - exonStart; - if (varStart < pred->cdsEnd) + if (varEnd > pred->cdsStart && varStart < pred->cdsEnd) { if (exonStart < pred->cdsEnd) txc.endInCds = cdsOffset + exonCdsEnd - exonCdsStart; else txc.endInCds = cdsOffset; } } } } exonOffset += exonEnd - exonStart; if (exonStart < pred->cdsEnd && exonEnd > pred->cdsStart) cdsOffset += exonCdsEnd - exonCdsStart; } txc.cdnaSize = exonOffset; txc.cdsSize = cdsOffset; // Does variant end downstream? if (varEnd > pred->txEnd) { txc.endInCdna = txc.cdnaSize; - if (varStart < pred->cdsEnd) + if (varStart < pred->cdsEnd && varEnd > pred->cdsStart) txc.endInCds = txc.cdsSize; } if (pred->strand[0] == '-') txc = txCoordsReverse(&txc); +if ((txc.startInCdna == -1) != (txc.endInCdna == -1) || + (txc.startInCds >= 0 && txc.endInCds < 0)) + errAbort("getTxCoords: inconsistent start/ends for variant %s:%d-%d in %s at %s:%d-%d: " + "startInCdna=%d, endInCdna=%d; startInCds=%d, endInCds=%d", + variant->chrom, varStart+1, varEnd, + pred->name, pred->chrom, pred->txStart, pred->txEnd, + txc.startInCdna, txc.endInCdna, txc.startInCds, txc.endInCds); return txc; } struct gpFx *gpFxNew(char *allele, char *transcript, enum soTerm soNumber, enum detailType detailType, struct lm *lm) /* Fill in the common members of gpFx; leave soTerm-specific members for caller to fill in. */ { struct gpFx *effect; lmAllocVar(lm, effect); effect->allele = collapseDashes(strUpper(lmCloneString(lm, allele))); effect->transcript = lmCloneString(lm, transcript); effect->soNumber = soNumber; effect->detailType = detailType; return effect; } @@ -378,30 +399,32 @@ static char *gpFxModifyCodingSequence(char *oldCodingSeq, struct genePred *pred, struct txCoords *txc, struct allele *allele, int *retCdsBasesAdded, struct lm *lm) /* Return a new coding sequence that is oldCodingSeq with allele applied. */ { boolean isRc = (pred->strand[0] == '-'); char *newAlleleSeq = allele->sequence; int newAlLen = strlen(newAlleleSeq); if (isRc) { newAlleleSeq = lmCloneString(lm, allele->sequence); reverseComplement(newAlleleSeq, newAlLen); } int variantSizeOnCds = txc->endInCds - txc->startInCds; +if (variantSizeOnCds < 0) + errAbort("gpFx: endInCds (%d) < startInCds (%d)", txc->endInCds, txc->startInCds); char *newCodingSeq = mergeAllele(oldCodingSeq, txc->startInCds, variantSizeOnCds, newAlleleSeq, allele->length, lm); // If newCodingSequence has an early stop, truncate there: truncateAtStopCodon(newCodingSeq); int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart; if (retCdsBasesAdded) *retCdsBasesAdded = allele->length - variantSizeOnRef; return newCodingSeq; } static boolean isSafeFromNMD(int exonNum, struct variant *variant, struct genePred *pred) /* Return TRUE if variant in strand-corrected exonNum is in the region * of pred that would make it safe from Nonsense-Mediated Decay (NMD). * NMD is triggered by the presence of a stop codon that appears * before ~50bp before the end of the last exon. In other words, if @@ -497,50 +520,55 @@ else effect->soNumber = incomplete_terminal_codon_variant; } else effect->soNumber = missense_variant; } } } } static struct gpFx *gpFxChangedCds(struct allele *allele, struct genePred *pred, struct txCoords *txc, int exonIx, struct dnaSeq *transcriptSequence, struct lm *lm) /* calculate effect of allele change on coding transcript */ { -struct gpFx *effectsList = NULL; // calculate original and variant coding DNA and AA's char *oldCodingSequence = getCodingSequence(pred, transcriptSequence->dna, lm); int oldCdsLen = strlen(oldCodingSequence); char *oldaa = lmSimpleTranslate(lm, oldCodingSequence, oldCdsLen); int cdsBasesAdded = 0; char *newCodingSequence = gpFxModifyCodingSequence(oldCodingSequence, pred, txc, allele, &cdsBasesAdded, lm); int newCdsLen = strlen(newCodingSequence); char *newaa = lmSimpleTranslate(lm, newCodingSequence, newCdsLen); // allocate the effect structure - fill in soNumber and details below struct gpFx *effect = gpFxNew(allele->sequence, pred->name, coding_sequence_variant, codingChange, lm); struct codingChange *cc = &effect->details.codingChange; cc->cDnaPosition = txc->startInCdna; cc->cdsPosition = txc->startInCds; cc->exonNumber = exonIx; int pepPos = txc->startInCds / 3; +// At this point we don't use genePredExt's exonFrames field -- we just assume that +// the CDS starts in frame. That's not always the case (e.g. ensGene has some CDSs +// that begin out of frame), so watch out for early truncation of oldCodingSequence +// due to stop codon in the wrong frame: +if (pepPos >= strlen(oldaa)) + return NULL; cc->pepPosition = pepPos; if (cdsBasesAdded % 3 == 0) { // Common case: substitution, same number of old/new codons/peps: int numOldCodons = (1 + allele->length / 3), numNewCodons = (1 + allele->length / 3); if (cdsBasesAdded > 0) { // insertion: more new codons than old numOldCodons = (cc->cdsPosition % 3) == 0 ? 0 : 1; numNewCodons = numOldCodons + (cdsBasesAdded / 3); } else if (cdsBasesAdded < 0) { // deletion: more old codons than new numNewCodons = (cc->cdsPosition % 3) == 0 ? 0 : 1; @@ -551,32 +579,31 @@ cc->aaOld = lmCloneStringZ(lm, oldaa + pepPos, numOldCodons); cc->aaNew = lmCloneStringZ(lm, newaa + pepPos, numNewCodons); } else { // frameshift -- who knows how many codons we can reliably predict... cc->codonOld = lmCloneString(lm, oldCodingSequence + pepPos*3); cc->codonNew = lmCloneString(lm, newCodingSequence + pepPos*3); cc->aaOld = lmCloneString(lm, oldaa + pepPos); cc->aaNew = lmCloneString(lm, newaa + pepPos); } boolean safeFromNMD = isSafeFromNMD(exonIx, allele->variant, pred); setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded, safeFromNMD); -slAddHead(&effectsList, effect); -return effectsList; +return effect; } static boolean hasAltAllele(struct allele *alleles) /* Make sure there's something to work on here... */ { while (alleles != NULL && alleles->isReference) alleles = alleles->next; return (alleles != NULL); } static char *firstAltAllele(struct allele *alleles) /* Ensembl always reports an alternate allele, even if that allele is not being used * to calculate any consequence. When allele doesn't really matter, just use the * first alternate allele that is given. */ @@ -609,45 +636,43 @@ } else effectsList = slCat(effectsList, gpFxChangedNoncodingExon(allele, pred, txc, exonIx, lm)); // Was entire exon deleted? int exonNumPos = exonIx; if (pred->strand[0] == '-') exonNumPos = pred->exonCount - 1 - exonIx; uint exonStart = pred->exonStarts[exonNumPos], exonEnd = pred->exonEnds[exonNumPos]; if (variant->chromStart <= exonStart && variant->chromEnd >= exonEnd) { struct gpFx *effect = gpFxNew(allele->sequence, pred->name, exon_loss, nonCodingExon, lm); setNCExonVals(effect, exonIx, txc->startInCdna); - effect->details.intron.intronNumber = exonIx; - slAddTail(effectsList, effect); + slAddTail(&effectsList, effect); } else { // If variant is in exon *but* within 3 bases of splice site, // it also qualifies as splice_region_variant: if ((variant->chromEnd > exonEnd-3 && variant->chromStart < exonEnd) || (variant->chromEnd > exonStart && variant->chromStart < exonStart+3)) { struct gpFx *effect = gpFxNew(allele->sequence, pred->name, splice_region_variant, - intron, lm); -//#*** Shouldn't we make sure to have the right intron number here? - effect->details.intron.intronNumber = exonIx; - slAddTail(effectsList, effect); + nonCodingExon, lm); + setNCExonVals(effect, exonIx, txc->startInCdna); + slAddTail(&effectsList, effect); } } } } return effectsList; } static struct gpFx *gpFxInIntron(struct variant *variant, struct txCoords *txc, int intronIx, struct genePred *pred, char *altAllele, struct lm *lm) // Annotate a variant that overlaps an intron (and possibly splice region) //#*** TODO: watch out for "introns" that are actually indels between tx seq and ref genome! { struct gpFx *effectsList = NULL; boolean minusStrand = (pred->strand[0] == '-'); // If on - strand, flip intron number back to + strand for getting intron coords: