6fa1df41a2b6302d544686fb59cea1023fb6e674
angie
  Mon Sep 30 16:38:59 2013 -0700
Fix for crash found by Jonathan in http://redmine.soe.ucsc.edu/issues/11775#note-60 ;more careful error-checking and corner-case handling in getTxCoords.
When CDS starts out of frame, we were getting early truncation of
coding sequence, leading to some strange characters in output; the
ideal fix would be to take exonFrames into account, but for now we can
just bail if coding sequence ends before the variant hits it.  Also
fixing a couple bone-headed incorrect invocations of slAddTail, and
making sure that an exon's splice_region_variant is listed with an
exon number, not an intron number.
refs #11775, #11771

diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index 19ddb6b..17cc5f1 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -54,149 +54,170 @@
 }
 
 static struct txCoords txCoordsReverse(struct txCoords *txcIn)
 /* Return a struct txCoords with same info as txcIn, but strand-swapped. */
 {
 if (txcIn->exonCount <= 0 || txcIn->cdnaSize <= 0)
     errAbort("txCoordsReverse: called with non-positive exonCount (%d) or cdnaSize (%d) ",
 	     txcIn->exonCount, txcIn->cdnaSize);
 struct txCoords txcSwapped;
 char swappedStrand = (txcIn->strand == '+') ? '-' : '+';
 txCoordsInit(&txcSwapped, txcIn->exonCount, swappedStrand, txcIn->cdnaSize, txcIn->cdsSize);
 if (txcIn->startInExon)
     {
     txcSwapped.endInExon = TRUE;
     txcSwapped.endExonIx = txcIn->exonCount - 1 - txcIn->startExonIx;
-    if (txcIn->startInCdna >= 0)
-	txcSwapped.endInCdna = txcIn->cdnaSize - txcIn->startInCdna;
-    if (txcIn->startInCds >= 0)
-	txcSwapped.endInCds = txcIn->cdsSize - txcIn->startInCds;
     }
 else
     {
     // Intron number, not exon number, so subtract another 1 here:
     txcSwapped.endExonIx = txcIn->exonCount - 2 - txcIn->startExonIx;
     }
+if (txcIn->startInCdna >= 0)
+    txcSwapped.endInCdna = txcIn->cdnaSize - txcIn->startInCdna;
+if (txcIn->startInCds >= 0)
+    txcSwapped.endInCds = txcIn->cdsSize - txcIn->startInCds;
 if (txcIn->endInExon)
     {
     txcSwapped.startInExon = TRUE;
     txcSwapped.startExonIx = txcIn->exonCount - 1 - txcIn->endExonIx;
-    if (txcIn->endInCdna > 0)
-	txcSwapped.startInCdna = txcIn->cdnaSize - txcIn->endInCdna;
-    if (txcIn->endInCds > 0)
-	txcSwapped.startInCds = txcIn->cdsSize - txcIn->endInCds;
     }
 else
     {
     // Intron number, not exon number, so subtract another 1 here:
     txcSwapped.startExonIx = txcIn->exonCount - 2 - txcIn->endExonIx;
     }
+if (txcIn->endInCdna > 0)
+    txcSwapped.startInCdna = txcIn->cdnaSize - txcIn->endInCdna;
+if (txcIn->endInCds > 0)
+    txcSwapped.startInCds = txcIn->cdsSize - txcIn->endInCds;
 return txcSwapped;
 }
 
 static struct txCoords getTxCoords(struct variant *variant, struct genePred *pred)
 /* Figure out where the variant's start and end hit the transcript: intron, UTR, CDS?
  * Result is on pred->strand. */
 {
 struct txCoords txc;
 txCoordsInit(&txc, pred->exonCount, '+', 0, 0); // Compute cdnaSize, cdsSize below.
 int exonOffset = 0, cdsOffset = 0;
 uint varStart = variant->chromStart, varEnd = variant->chromEnd;
 // If the variant begins upstream, handle that before looping on exons:
 if (varStart < pred->txStart && varEnd > pred->txStart)
     {
     txc.startInCdna = 0;
-    if (varEnd > pred->cdsStart)
+    if (varStart < pred->cdsEnd && varEnd > pred->cdsStart)
 	txc.startInCds = 0;
     }
 int ii;
 for (ii = 0;  ii < pred->exonCount;  ii++)
     {
     uint exonStart = pred->exonStarts[ii], exonEnd = pred->exonEnds[ii];
     uint exonCdsStart = max(pred->cdsStart, exonStart);
     uint exonCdsEnd = min(pred->cdsEnd, exonEnd);
     if (varStart >= exonStart && varStart < exonEnd)
 	{
 	txc.startInExon = TRUE;
 	txc.startExonIx = ii;
 	txc.startInCdna = exonOffset + varStart - exonStart;
 	if (varStart >= pred->cdsStart && varStart < pred->cdsEnd)
 	    txc.startInCds = cdsOffset + varStart - exonCdsStart;
+	// If this is an insertion at the beginning of an exon, varEnd is at the end
+	// of the preceding intron and its endInC* have not been set, so copy them over:
+	if (varEnd == varStart)
+	    {
+	    txc.endInCdna = txc.startInCdna;
+	    txc.endInCds = txc.startInCds;
+	    }
 	}
     if (varEnd > exonStart && varEnd <= exonEnd)
 	{
 	txc.endInExon = TRUE;
 	txc.endExonIx = ii;
 	txc.endInCdna = exonOffset + varEnd - exonStart;
 	if (varEnd > pred->cdsStart && varEnd <= pred->cdsEnd)
 	    txc.endInCds = cdsOffset + varEnd - exonCdsStart;
+	// If this is an insertion at the end of an exon, varStart is at the beginning
+	// of the following intron and its startInC* have not been set, so copy them over:
+	if (varStart == varEnd)
+	    {
+	    txc.startInCdna = txc.endInCdna;
+	    txc.startInCds = txc.endInCds;
+	    }
 	}
     if (ii < pred->exonCount - 1)
 	{
 	uint nextExonStart = pred->exonStarts[ii+1];
 	// 'exonIx' is actually an intronIx in this case:
 	if (varStart >= exonEnd && varStart < nextExonStart)
 	    {
 	    txc.startExonIx = ii;
 	    if (varEnd > nextExonStart)
 		{
 		// Variant starts in an intron, but it overlaps the next exon;
 		// note the start in cDNA (and CDS if applicable):
 		txc.startInCdna = exonOffset + exonEnd - exonStart;
-		if (varEnd > pred->cdsStart)
+		if (varStart < pred->cdsEnd && varEnd > pred->cdsStart)
 		    {
 		    uint nextExonEnd = pred->exonEnds[ii+1];
 		    if (nextExonEnd > pred->cdsStart)
 			txc.startInCds = cdsOffset;
 		    else
 			txc.startInCds = 0;
 		    }
 		}
 	    }
 	if (varEnd > exonEnd && varEnd <= nextExonStart)
 	    {
 	    txc.endExonIx = ii;
 	    if (varStart < exonEnd)
 		{
 		// Variant ends in an intron, but it also overlaps the previous exon;
 		// note the end in cDNA (and CDS if applicable):
 		txc.endInCdna = exonOffset + exonEnd - exonStart;
-		if (varStart < pred->cdsEnd)
+		if (varEnd > pred->cdsStart && varStart < pred->cdsEnd)
 		    {
 		    if (exonStart < pred->cdsEnd)
 			txc.endInCds = cdsOffset + exonCdsEnd - exonCdsStart;
 		    else
 			txc.endInCds = cdsOffset;
 		    }
 		}
 	    }
 	}
     exonOffset += exonEnd - exonStart;
     if (exonStart < pred->cdsEnd && exonEnd > pred->cdsStart)
 	cdsOffset += exonCdsEnd - exonCdsStart;
     }
 txc.cdnaSize = exonOffset;
 txc.cdsSize = cdsOffset;
 // Does variant end downstream?
 if (varEnd > pred->txEnd)
     {
     txc.endInCdna = txc.cdnaSize;
-    if (varStart < pred->cdsEnd)
+    if (varStart < pred->cdsEnd && varEnd > pred->cdsStart)
 	txc.endInCds = txc.cdsSize;
     }
 if (pred->strand[0] == '-')
     txc = txCoordsReverse(&txc);
+if ((txc.startInCdna == -1) != (txc.endInCdna == -1) ||
+    (txc.startInCds >= 0 && txc.endInCds < 0))
+    errAbort("getTxCoords: inconsistent start/ends for variant %s:%d-%d in %s at %s:%d-%d: "
+	     "startInCdna=%d, endInCdna=%d;  startInCds=%d, endInCds=%d",
+	     variant->chrom, varStart+1, varEnd,
+	     pred->name, pred->chrom, pred->txStart, pred->txEnd,
+	     txc.startInCdna, txc.endInCdna, txc.startInCds, txc.endInCds);
 return txc;
 }
 
 struct gpFx *gpFxNew(char *allele, char *transcript, enum soTerm soNumber,
 		     enum detailType detailType, struct lm *lm)
 /* Fill in the common members of gpFx; leave soTerm-specific members for caller to fill in. */
 {
 struct gpFx *effect;
 lmAllocVar(lm, effect);
 effect->allele = collapseDashes(strUpper(lmCloneString(lm, allele)));
 effect->transcript = lmCloneString(lm, transcript);
 effect->soNumber = soNumber;
 effect->detailType = detailType;
 return effect;
 }
@@ -378,30 +399,32 @@
 
 static char *gpFxModifyCodingSequence(char *oldCodingSeq, struct genePred *pred,
 				      struct txCoords *txc, struct allele *allele,
 				      int *retCdsBasesAdded, struct lm *lm)
 /* Return a new coding sequence that is oldCodingSeq with allele applied. */
 {
 boolean isRc = (pred->strand[0] == '-');
 char *newAlleleSeq = allele->sequence;
 int newAlLen = strlen(newAlleleSeq);
 if (isRc)
     {
     newAlleleSeq = lmCloneString(lm, allele->sequence);
     reverseComplement(newAlleleSeq, newAlLen);
     }
 int variantSizeOnCds = txc->endInCds - txc->startInCds;
+if (variantSizeOnCds < 0)
+    errAbort("gpFx: endInCds (%d) < startInCds (%d)", txc->endInCds, txc->startInCds);
 char *newCodingSeq = mergeAllele(oldCodingSeq, txc->startInCds, variantSizeOnCds,
 				 newAlleleSeq, allele->length, lm);
 // If newCodingSequence has an early stop, truncate there:
 truncateAtStopCodon(newCodingSeq);
 int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart;
 if (retCdsBasesAdded)
     *retCdsBasesAdded = allele->length - variantSizeOnRef;
 return newCodingSeq;
 }
 
 static boolean isSafeFromNMD(int exonNum, struct variant *variant, struct genePred *pred)
 /* Return TRUE if variant in strand-corrected exonNum is in the region
  * of pred that would make it safe from Nonsense-Mediated Decay (NMD).
  * NMD is triggered by the presence of a stop codon that appears
  * before ~50bp before the end of the last exon.  In other words, if
@@ -497,50 +520,55 @@
 		else
 		    effect->soNumber = incomplete_terminal_codon_variant;
 		}
 	    else
 		effect->soNumber = missense_variant;
 	    }
 	}
     }
 }
 
 static struct gpFx *gpFxChangedCds(struct allele *allele, struct genePred *pred,
 				   struct txCoords *txc, int exonIx,
 				   struct dnaSeq *transcriptSequence, struct lm *lm)
 /* calculate effect of allele change on coding transcript */
 {
-struct gpFx *effectsList = NULL;
 // calculate original and variant coding DNA and AA's
 char *oldCodingSequence = getCodingSequence(pred, transcriptSequence->dna, lm);
 int oldCdsLen = strlen(oldCodingSequence);
 char *oldaa = lmSimpleTranslate(lm, oldCodingSequence, oldCdsLen);
 int cdsBasesAdded = 0;
 char *newCodingSequence = gpFxModifyCodingSequence(oldCodingSequence, pred, txc, allele,
 						   &cdsBasesAdded, lm);
 int newCdsLen = strlen(newCodingSequence);
 char *newaa = lmSimpleTranslate(lm, newCodingSequence, newCdsLen);
 
 
 // allocate the effect structure - fill in soNumber and details below
 struct gpFx *effect = gpFxNew(allele->sequence, pred->name, coding_sequence_variant, codingChange,
 			      lm);
 struct codingChange *cc = &effect->details.codingChange;
 cc->cDnaPosition = txc->startInCdna;
 cc->cdsPosition = txc->startInCds;
 cc->exonNumber = exonIx;
 int pepPos = txc->startInCds / 3;
+// At this point we don't use genePredExt's exonFrames field -- we just assume that
+// the CDS starts in frame.  That's not always the case (e.g. ensGene has some CDSs
+// that begin out of frame), so watch out for early truncation of oldCodingSequence
+// due to stop codon in the wrong frame:
+if (pepPos >= strlen(oldaa))
+    return NULL;
 cc->pepPosition = pepPos;
 if (cdsBasesAdded % 3 == 0)
     {
     // Common case: substitution, same number of old/new codons/peps:
     int numOldCodons = (1 + allele->length / 3), numNewCodons = (1 + allele->length / 3);
     if (cdsBasesAdded > 0)
 	{
 	// insertion: more new codons than old
 	numOldCodons = (cc->cdsPosition % 3) == 0 ? 0 : 1;
 	numNewCodons = numOldCodons + (cdsBasesAdded / 3);
 	}
     else if (cdsBasesAdded < 0)
 	{
 	// deletion: more old codons than new
 	numNewCodons = (cc->cdsPosition % 3) == 0 ? 0 : 1;
@@ -551,32 +579,31 @@
     cc->aaOld = lmCloneStringZ(lm, oldaa + pepPos, numOldCodons);
     cc->aaNew = lmCloneStringZ(lm, newaa + pepPos, numNewCodons);
     }
 else
     {
     // frameshift -- who knows how many codons we can reliably predict...
     cc->codonOld = lmCloneString(lm, oldCodingSequence + pepPos*3);
     cc->codonNew = lmCloneString(lm, newCodingSequence + pepPos*3);
     cc->aaOld = lmCloneString(lm, oldaa + pepPos);
     cc->aaNew = lmCloneString(lm, newaa + pepPos);
     }
 
 boolean safeFromNMD = isSafeFromNMD(exonIx, allele->variant, pred);
 setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded, safeFromNMD);
 
-slAddHead(&effectsList, effect);
-return effectsList;
+return effect;
 }
 
 
 static boolean hasAltAllele(struct allele *alleles)
 /* Make sure there's something to work on here... */
 {
 while (alleles != NULL && alleles->isReference)
     alleles = alleles->next;
 return (alleles != NULL);
 }
 
 static char *firstAltAllele(struct allele *alleles)
 /* Ensembl always reports an alternate allele, even if that allele is not being used
  * to calculate any consequence.  When allele doesn't really matter, just use the
  * first alternate allele that is given. */
@@ -609,45 +636,43 @@
 	    }
 	else
 	    effectsList = slCat(effectsList,
 				gpFxChangedNoncodingExon(allele, pred, txc, exonIx, lm));
 
 	// Was entire exon deleted?
 	int exonNumPos = exonIx;
 	if (pred->strand[0] == '-')
 	    exonNumPos = pred->exonCount - 1 - exonIx;
 	uint exonStart = pred->exonStarts[exonNumPos], exonEnd = pred->exonEnds[exonNumPos];
 	if (variant->chromStart <= exonStart && variant->chromEnd >= exonEnd)
 	    {
 	    struct gpFx *effect = gpFxNew(allele->sequence, pred->name, exon_loss,
 					  nonCodingExon, lm);
 	    setNCExonVals(effect, exonIx, txc->startInCdna);
-	    effect->details.intron.intronNumber = exonIx;
-	    slAddTail(effectsList, effect);
+	    slAddTail(&effectsList, effect);
 	    }
 	else
 	    {
 	    // If variant is in exon *but* within 3 bases of splice site,
 	    // it also qualifies as splice_region_variant:
 	    if ((variant->chromEnd > exonEnd-3 && variant->chromStart < exonEnd) ||
 		(variant->chromEnd > exonStart && variant->chromStart < exonStart+3))
 		{
 		struct gpFx *effect = gpFxNew(allele->sequence, pred->name, splice_region_variant,
-					      intron, lm);
-//#*** Shouldn't we make sure to have the right intron number here?
-		effect->details.intron.intronNumber = exonIx;
-		slAddTail(effectsList, effect);
+					      nonCodingExon, lm);
+		setNCExonVals(effect, exonIx, txc->startInCdna);
+		slAddTail(&effectsList, effect);
 		}
 	    }
 	}
     }
 return effectsList;
 }
 
 static struct gpFx *gpFxInIntron(struct variant *variant, struct txCoords *txc, int intronIx,
 				 struct genePred *pred, char *altAllele, struct lm *lm)
 // Annotate a variant that overlaps an intron (and possibly splice region)
 //#*** TODO: watch out for "introns" that are actually indels between tx seq and ref genome!
 {
 struct gpFx *effectsList = NULL;
 boolean minusStrand = (pred->strand[0] == '-');
 // If on - strand, flip intron number back to + strand for getting intron coords: