6515c0a27505f990f9dd65f8c595b04e981e815f
angie
  Wed Oct 23 15:36:41 2013 -0700
If a genePred table includes the exonFrames column, beware... it maycontain transcripts with incomplete CDS that begin out-of-frame.
So far we've been assuming that the CDS sequence begins in frame,
which works fine for UCSC Genes and RefSeq.  However, less tidy
sets such as Ensembl and Gencode Comprehensive have lots of
out-of-frame CDSs.
In order to not have incorrect translation to peptide sequence,
we need to use a padded CDS sequence.  For example, if the
first coding exon's frame is 1, we now add an 'N' at the beginning
of the coding sequence to align it to the correct frame.  We also
look out for subsequent surprises in exonFrames, though I expect
those to be rare.  Of course, when the coding sequence is tweaked
like that, care must be taken when modifying it with an alt allele.
fixes #11990
refs #11175

diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index 9d13251..54ad3e4 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -331,55 +331,151 @@
     }
 if (strand == '+' && ii >= pred->exonCount)
     errAbort("getCodingOffsetInTx: exon number overflow (strand=+, exonCount=%d, num=%d)",
 	     pred->exonCount, ii);
 if (strand == '-' && ii < 0)
     errAbort("getCodingOffsetInTx: exon number underflow (strand=-, exonCount=%d, num=%d)",
 	     pred->exonCount, ii);
 // clip off part of UTR in exon that has CDS in it too
 int exonOffset = pred->cdsStart - pred->exonStarts[ii];
 if (strand == '-')
     exonOffset = pred->exonEnds[ii] - pred->cdsEnd;
 offset += exonOffset;
 return offset;
 }
 
-static char *getCodingSequence(struct genePred *pred, char *transcriptSequence, struct lm *lm)
-/* Extract the CDS from a transcript. Temporarily force transcriptSequence to + strand
- * so we can work in + strand coords, then restore transcriptSequence and put result
- * on correct strand. */
+static char *getCodingSequenceSimple(struct genePred *pred, char *transcriptSequence, struct lm *lm)
+/* Extract the CDS from a transcript, assuming frame is 0 for all coding exons.
+ * Temporarily force transcriptSequence to + strand so we can work in + strand coords,
+ * then restore transcriptSequence and put result on correct strand. */
 {
 if (*pred->strand == '-')
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
 
 // Get leftmost CDS boundary to trim off 5' (or 3' if on minus strand):
 int cdsOffset = getCodingOffsetInTx(pred, '+');
 char *newString = lmCloneString(lm, transcriptSequence + cdsOffset);
 
 // trim off 3' (or 5' if on minus strand)
 newString[genePredCdsSize(pred)] = 0;
 
 // correct for strand
 if (*pred->strand == '-')
     {
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
     reverseComplement(newString, strlen(newString));
     }
 
 return newString;
 }
 
+INLINE int calcMissingBases(struct genePred *pred, int exonIx, int cdsOffset)
+/* If pred's exonFrame differs from the frame that we expect based on number
+ * of CDS bases collected so far, return the number of 'N's we need to add
+ * in order to restore the reading frame. */
+{
+int missingBases = 0;
+int exonFrame = pred->exonFrames[exonIx];
+if (exonFrame >= 0)
+    {
+    int startingFrame = cdsOffset % 3;
+    missingBases = exonFrame - startingFrame;
+    if (missingBases < 0)
+	missingBases += 3;
+    }
+return missingBases;
+}
+
+static char *getCodingSequence(struct genePred *pred, char *transcriptSequence,
+			       boolean *retAddedBases, struct lm *lm)
+/* Extract the CDS sequence from a transcript.  If pred has exonFrames, add 'N' where
+ * needed (for example, if the coding region begins out-of-frame, add one or two 'N's
+ * at the beginning of the cds sequence) and set retAddedBases if we do add 'N'.
+ * If pred doesn't have exonFrames, use the simple method above. */
+{
+if (retAddedBases)
+    *retAddedBases = FALSE;
+if (pred->optFields & genePredExonFramesFld)
+    {
+    boolean isRc = (pred->strand[0] == '-');
+    int i, iStart = 0, iIncr = 1;
+    if (isRc)
+	{
+	iStart = pred->exonCount-1;
+	iIncr = -1;
+	}
+    char *cdsSeq = lmAlloc(lm, genePredCdsSize(pred) + 3 * pred->exonCount);
+    int txOffset = getCodingOffsetInTx(pred, pred->strand[0]), cdsOffset = 0;
+    for (i = iStart;  i >= 0 && i < pred->exonCount;  i += iIncr)
+	{
+	int start, end;
+	if (genePredCdsExon(pred, i, &start, &end))
+	    {
+	    int exonCdsSize = end - start;
+	    int missingBases = calcMissingBases(pred, i, cdsOffset);
+	    if (missingBases > 0)
+		{
+		if (retAddedBases)
+		    *retAddedBases = TRUE;
+		while (missingBases > 0)
+		    {
+		    cdsSeq[cdsOffset++] = 'N';
+		    missingBases--;
+		    }
+		}
+	    memcpy(&cdsSeq[cdsOffset], &transcriptSequence[txOffset], exonCdsSize);
+	    cdsOffset += exonCdsSize;
+	    txOffset += exonCdsSize;
+	    }
+	}
+    return cdsSeq;
+    }
+else
+    return getCodingSequenceSimple(pred, transcriptSequence, lm);
+}
+
+static int getCorrectedCdsOffset(struct genePred *pred, int cdsOffsetIn)
+/* Increment cdsOffset for each 'N' that getCodingSequence added prior to it. */
+{
+int totalMissingBases = 0;
+int cdsOffsetSoFar = 0;
+if (pred->optFields & genePredExonFramesFld)
+    {
+    boolean isRc = (pred->strand[0] == '-');
+    int i, iStart = 0, iIncr = 1;
+    if (isRc)
+	{
+	iStart = pred->exonCount-1;
+	iIncr = -1;
+	}
+    for (i = iStart;  i >= 0 && i < pred->exonCount;  i += iIncr)
+	{
+	int start, end;
+	if (genePredCdsExon(pred, i, &start, &end))
+	    {
+	    // Don't count missing bases after cdsOffsetIn:
+	    if (cdsOffsetSoFar > cdsOffsetIn)
+		break;
+	    int exonCdsSize = end - start;
+	    totalMissingBases += calcMissingBases(pred, i, cdsOffsetSoFar + totalMissingBases);
+	    cdsOffsetSoFar += exonCdsSize;
+	    }
+	}
+    }
+return cdsOffsetIn + totalMissingBases;
+}
+
 static char *lmSimpleTranslate(struct lm *lm, char *dna, int size)
 /* Just translate dna into pep, use 'Z' and truncate for stop codon, and use localmem. */
 {
 int aaLen = (size / 3) + 1;
 char *pep = lmAlloc(lm, aaLen + 1);
 int i;
 for (i = 0;  i < size;  i += 3)
     {
     char aa = lookupCodon(dna + i);
     pep[i/3] = aa;
     if (aa == '\0')
 	{
 	pep[i/3] = 'Z';
 	break;
 	}
@@ -396,46 +492,46 @@
 if (codingSeq == NULL)
     errAbort("truncateAtStopCodon: null input");
 char *p = codingSeq;
 while (p[0] != '\0' && p[1] != '\0' && p[2] != '\0')
     {
     if (isStopCodon(p))
 	{
 	p[3] = '\0';
 	break;
 	}
     p += 3;
     }
 }
 
 static char *gpFxModifyCodingSequence(char *oldCodingSeq, struct genePred *pred,
-				      struct txCoords *txc, struct allele *allele,
+				      int startInCds, int endInCds, struct allele *allele,
 				      int *retCdsBasesAdded, struct lm *lm)
 /* Return a new coding sequence that is oldCodingSeq with allele applied. */
 {
 boolean isRc = (pred->strand[0] == '-');
 char *newAlleleSeq = allele->sequence;
 int newAlLen = strlen(newAlleleSeq);
 if (isRc)
     {
     newAlleleSeq = lmCloneString(lm, allele->sequence);
     reverseComplement(newAlleleSeq, newAlLen);
     }
-int variantSizeOnCds = txc->endInCds - txc->startInCds;
+int variantSizeOnCds = endInCds - startInCds;
 if (variantSizeOnCds < 0)
-    errAbort("gpFx: endInCds (%d) < startInCds (%d)", txc->endInCds, txc->startInCds);
-char *newCodingSeq = mergeAllele(oldCodingSeq, txc->startInCds, variantSizeOnCds,
+    errAbort("gpFx: endInCds (%d) < startInCds (%d)", endInCds, startInCds);
+char *newCodingSeq = mergeAllele(oldCodingSeq, startInCds, variantSizeOnCds,
 				 newAlleleSeq, allele->length, lm);
 // If newCodingSequence has an early stop, truncate there:
 truncateAtStopCodon(newCodingSeq);
 int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart;
 if (retCdsBasesAdded)
     *retCdsBasesAdded = allele->length - variantSizeOnRef;
 return newCodingSeq;
 }
 
 static boolean isSafeFromNMD(int exonNum, struct variant *variant, struct genePred *pred)
 /* Return TRUE if variant in strand-corrected exonNum is in the region
  * of pred that would make it safe from Nonsense-Mediated Decay (NMD).
  * NMD is triggered by the presence of a stop codon that appears
  * before ~50bp before the end of the last exon.  In other words, if
  * there's a stop codon with a detectable downstream splice junction,
@@ -531,54 +627,63 @@
 		    effect->soNumber = incomplete_terminal_codon_variant;
 		}
 	    else
 		effect->soNumber = missense_variant;
 	    }
 	}
     }
 }
 
 static struct gpFx *gpFxChangedCds(struct allele *allele, struct genePred *pred,
 				   struct txCoords *txc, int exonIx,
 				   struct dnaSeq *transcriptSequence, struct lm *lm)
 /* calculate effect of allele change on coding transcript */
 {
 // calculate original and variant coding DNA and AA's
-char *oldCodingSequence = getCodingSequence(pred, transcriptSequence->dna, lm);
+boolean addedBasesForFrame = FALSE;
+char *oldCodingSequence = getCodingSequence(pred, transcriptSequence->dna, &addedBasesForFrame, lm);
+int startInCds = txc->startInCds, endInCds = txc->endInCds;
+if (addedBasesForFrame)
+    {
+    // The annotated CDS exons were not all in frame, so getCodingSequence added 'N's
+    // and now we can't simply use txc->startInCds.
+    startInCds = getCorrectedCdsOffset(pred, txc->startInCds);
+    endInCds = getCorrectedCdsOffset(pred, txc->endInCds);
+    }
 int oldCdsLen = strlen(oldCodingSequence);
 char *oldaa = lmSimpleTranslate(lm, oldCodingSequence, oldCdsLen);
 int cdsBasesAdded = 0;
-char *newCodingSequence = gpFxModifyCodingSequence(oldCodingSequence, pred, txc, allele,
-						   &cdsBasesAdded, lm);
+char *newCodingSequence = gpFxModifyCodingSequence(oldCodingSequence, pred, startInCds, endInCds,
+						   allele, &cdsBasesAdded, lm);
 int newCdsLen = strlen(newCodingSequence);
 char *newaa = lmSimpleTranslate(lm, newCodingSequence, newCdsLen);
 
 
 // allocate the effect structure - fill in soNumber and details below
 struct gpFx *effect = gpFxNew(allele->sequence, pred->name, coding_sequence_variant, codingChange,
 			      lm);
 struct codingChange *cc = &effect->details.codingChange;
 cc->cDnaPosition = txc->startInCdna;
-cc->cdsPosition = txc->startInCds;
+cc->cdsPosition = startInCds;
 cc->exonNumber = exonIx;
-int pepPos = txc->startInCds / 3;
+int pepPos = startInCds / 3;
 // At this point we don't use genePredExt's exonFrames field -- we just assume that
 // the CDS starts in frame.  That's not always the case (e.g. ensGene has some CDSs
 // that begin out of frame), so watch out for early truncation of oldCodingSequence
 // due to stop codon in the wrong frame:
 if (pepPos >= strlen(oldaa))
-    return NULL;
+    return effect;
 cc->pepPosition = pepPos;
 if (cdsBasesAdded % 3 == 0)
     {
     // Common case: substitution, same number of old/new codons/peps:
     int numOldCodons = (1 + allele->length / 3), numNewCodons = (1 + allele->length / 3);
     if (cdsBasesAdded > 0)
 	{
 	// insertion: more new codons than old
 	numOldCodons = (cc->cdsPosition % 3) == 0 ? 0 : 1;
 	numNewCodons = numOldCodons + (cdsBasesAdded / 3);
 	}
     else if (cdsBasesAdded < 0)
 	{
 	// deletion: more old codons than new
 	numNewCodons = (cc->cdsPosition % 3) == 0 ? 0 : 1;