dc76b7573570d1fde28a064129e8acfa4f00ab67
angie
  Fri Jun 14 09:16:24 2013 -0700
Changing how gpFx determines indel changes.  Instead of padding the non-reference allele with hyphens, and using string comparison of CDS containing
hyphens to determine effects on genes, use the genePred information directly
to determine the variant's offset in CDS, and how many CDS bases are
added/deleted.  This solves two problems: 1) the reference was never padded,
so the method didn't work for insertions because string comparison could not
pick up the trail of identical bases again; 2) the hyphens were messing up
protein translation.

diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index 5025ce1..a75cd3c 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -18,213 +18,206 @@
 }
 
 struct gpFx *gpFxNew(char *allele, char *transcript, enum soTerm soNumber,
 		     enum detailType detailType, struct lm *lm)
 /* Fill in the common members of gpFx; leave soTerm-specific members for caller to fill in. */
 {
 struct gpFx *effect;
 lmAllocVar(lm, effect);
 effect->allele = collapseDashes(strUpper(lmCloneString(lm, allele)));
 effect->transcript = lmCloneString(lm, transcript);
 effect->soNumber = soNumber;
 effect->detailType = detailType;
 return effect;
 }
 
-static unsigned countDashes(char *string)
-/* count the number of dashes in a string */
-{
-int count = 0;
-
-while(*string)
-    {
-    if (*string == '-')
-	count++;
-    string++;
-    }
-
-return count;
-}
-
 static char * mergeAllele(char *transcript, int offset, int variantWidth,
 			  char *newAlleleSeq, int alleleLength, struct lm *lm)
 /* merge a variant into an allele */
 {
 char *newTranscript = NULL;
 
 if (variantWidth == alleleLength)
     {
     newTranscript = lmCloneString(lm, transcript);
     memcpy(&newTranscript[offset], newAlleleSeq, alleleLength);
     }
-else  if (variantWidth > alleleLength)
-    errAbort("gpFx: allele not padded");
 else 
     {
     int insertionSize = alleleLength - variantWidth;
     int newLength = strlen(transcript) + insertionSize;
     newTranscript = lmAlloc(lm, newLength + 1);
     char *restOfTranscript = &transcript[offset + variantWidth];
 
     // copy over the part before the variant
     memcpy(newTranscript, transcript, offset);
 
     // copy in the new allele
     memcpy(&newTranscript[offset], newAlleleSeq, alleleLength);
 
-    // copy in the new allele
+    // copy in the part after the variant
     memcpy(&newTranscript[offset + alleleLength], restOfTranscript, 
 	strlen(restOfTranscript) + 1);
     }
 
 return newTranscript;
 }
 
+static int getCodingOffsetInTx(struct genePred *pred, char strand)
+/* Skip past UTR (portions of) exons to get offset of CDS relative to transcript start.
+ * The strand arg is used instead of pred->strand. */
+{
+int offset = 0;
+int iStart = 0, iIncr = 1;
+boolean isRc = (strand == '-');
+if (isRc)
+    {
+    // Work our way left from the last exon.
+    iStart = pred->exonCount - 1;
+    iIncr = -1;
+    }
+int ii;
+// trim off the 5' UTR (or 3' UTR if strand is '-')
+for (ii = iStart; ii >= 0 && ii < pred->exonCount; ii += iIncr)
+    {
+    if ((!isRc &&
+	 (pred->cdsStart >= pred->exonStarts[ii]) && (pred->cdsStart < pred->exonEnds[ii])) ||
+	(isRc && (pred->cdsEnd > pred->exonStarts[ii]) && (pred->cdsEnd <= pred->exonEnds[ii])))
+	break;
+    int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii];
+    offset += exonSize;
+    }
+assert( !(strand == '+' && ii >= pred->exonCount) );
+assert( !(strand == '-' && ii < 0) );
+// clip off part of UTR in exon that has CDS in it too
+int exonOffset = pred->cdsStart - pred->exonStarts[ii];
+if (strand == '-')
+    exonOffset = pred->exonEnds[ii] - pred->cdsEnd;
+offset += exonOffset;
+return offset;
+}
+
 char *gpFxModifySequence(struct allele *allele, struct genePred *pred, 
 			 int exonNum, int exonQStart, struct dnaSeq *transcriptSequence,
-			 struct genePred **retNewPred, struct lm *lm)
+			 struct genePred **retNewPred,
+			 int *retTranscriptOffset, int *retBasesAdded,
+			 int *retCdsOffset, int *retCdsBasesAdded, struct lm *lm)
 /* modify a transcript to what it'd be if the alternate allele were present;
  * if transcript length changes, make retNewPred a shallow copy of pred w/tweaked coords. */
 {
 // clip allele to exon
 struct allele *clipAllele = alleleClip(allele, pred->exonStarts[exonNum], pred->exonEnds[exonNum],
 				       lm);
-
+struct variant *clipVariant = clipAllele->variant;
+boolean predIsRc = (*pred->strand == '-');
 // change transcript at variant point
-int exonOffset = clipAllele->variant->chromStart - pred->exonStarts[exonNum];
+int exonOffset = clipVariant->chromStart - pred->exonStarts[exonNum];
+if (predIsRc)
+    exonOffset = clipVariant->chromEnd - pred->exonStarts[exonNum];
 int transcriptOffset = exonQStart + exonOffset;
+int variantWidth = clipVariant->chromEnd - clipVariant->chromStart;
+int basesAdded = clipAllele->length - variantWidth;
+char *newAlleleSeq = lmCloneString(lm, clipAllele->sequence);
+if (predIsRc)
+    {
+    transcriptOffset = transcriptSequence->size - transcriptOffset;
+    reverseComplement(newAlleleSeq, strlen(newAlleleSeq));
+    }
 
-int variantWidth = clipAllele->variant->chromEnd - clipAllele->variant->chromStart;
+// If clipAllele overlaps CDS, figure out cdsOffset and cdsBasesAdded:
+int cdsOffset = -1, cdsBasesAdded = 0;
+if (clipVariant->chromStart < pred->cdsEnd && clipVariant->chromEnd > pred->cdsStart)
+    {
+    struct allele *clipAlleleCds = alleleClip(clipAllele, pred->cdsStart, pred->cdsEnd, lm);
+    struct variant *clipVariantCds = clipAlleleCds->variant;
+    int exonOffsetCds = clipVariantCds->chromStart - pred->exonStarts[exonNum];
+    if (predIsRc)
+	exonOffsetCds = clipVariantCds->chromEnd - pred->exonStarts[exonNum];
+    int cdsOffsetInTx = exonQStart + exonOffsetCds;
+    if (predIsRc)
+	cdsOffsetInTx = transcriptSequence->size - cdsOffsetInTx;
+    int cdsQStart = getCodingOffsetInTx(pred, pred->strand[0]);
+    cdsOffset = cdsOffsetInTx - cdsQStart;
+    int variantWidthCds = clipVariantCds->chromEnd - clipVariantCds->chromStart;
+    cdsBasesAdded = clipAlleleCds->length - variantWidthCds;
+    }
 
-if (variantWidth != clipAllele->length && retNewPred != NULL)
+if (basesAdded != 0 && retNewPred != NULL)
     {
     // OK, now we have to make a new genePred that matches the changed sequence
-    int basesAdded = clipAllele->length - variantWidth;
     struct genePred *newPred = lmCloneVar(lm, pred);
     size_t size = pred->exonCount * sizeof(pred->exonStarts[0]);
     newPred->exonStarts = lmCloneMem(lm, pred->exonStarts, size);
     newPred->exonEnds = lmCloneMem(lm, pred->exonEnds, size);
-    int alEnd = clipAllele->variant->chromEnd;
-    if (newPred->cdsStart > alEnd)
+    if (newPred->cdsStart > clipVariant->chromEnd)
 	newPred->cdsStart += basesAdded;
-    if (newPred->cdsEnd > alEnd)
+    else if (newPred->cdsStart > clipVariant->chromStart && cdsBasesAdded < 0)
+	newPred->cdsStart += cdsBasesAdded;
+    if (newPred->cdsEnd > clipVariant->chromEnd)
 	newPred->cdsEnd += basesAdded;
+    else if (newPred->cdsEnd > clipVariant->chromStart && cdsBasesAdded < 0)
+	newPred->cdsEnd += cdsBasesAdded;
     newPred->exonEnds[exonNum] += basesAdded;
     int ii;
     for (ii = exonNum+1;  ii < pred->exonCount;  ii++)
 	{
 	newPred->exonStarts[ii] += basesAdded;
 	newPred->exonEnds[ii] += basesAdded;
 	}
     newPred->txEnd = newPred->exonEnds[newPred->exonCount - 1];
     *retNewPred = newPred;
     }
 
-char *retSequence = lmCloneString(lm, transcriptSequence->dna);
-char *newAlleleSeq = lmCloneString(lm, clipAllele->sequence);
-if (*pred->strand == '-')
-    {
-    transcriptOffset = transcriptSequence->size - (transcriptOffset + strlen(newAlleleSeq));
-    reverseComplement(newAlleleSeq, strlen(newAlleleSeq));
-    }
 
 // make the change in the sequence
+char *retSequence = lmCloneString(lm, transcriptSequence->dna);
 retSequence = mergeAllele( retSequence, transcriptOffset, variantWidth, newAlleleSeq,
 			   clipAllele->length, lm);
-
+if (retTranscriptOffset != NULL)
+    *retTranscriptOffset = transcriptOffset;
+if (retBasesAdded != NULL)
+    *retBasesAdded = basesAdded;
+if (retCdsOffset != NULL)
+    *retCdsOffset = cdsOffset;
+if (retCdsBasesAdded != NULL)
+    *retCdsBasesAdded = cdsBasesAdded;
 return retSequence;
 }
 
-static int getCodingOffsetInTx(struct genePred *pred, char strand)
-/* Skip past UTR (portions of) exons to get offset of CDS relative to transcript start.
- * The strand arg is used instead of pred->strand. */
-{
-int offset = 0;
-int iStart = 0, iIncr = 1;
-if (strand == '-')
-    {
-    // Work our way left from the last exon.
-    iStart = pred->exonCount - 1;
-    iIncr = -1;
-    }
-int ii;
-// trim off the 5' UTR (or 3' UTR if strand is '-')
-for (ii = iStart; ii >= 0 && ii < pred->exonCount; ii += iIncr)
-    {
-    if ((pred->cdsStart >= pred->exonStarts[ii]) &&
-	(pred->cdsStart < pred->exonEnds[ii]))
-	break;
-    int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii];
-    offset += exonSize;
-    }
-assert( !(strand == '+' && ii >= pred->exonCount) );
-assert( !(strand == '-' && ii < 0) );
-// clip off part of UTR in exon that has CDS in it too
-int exonOffset = pred->cdsStart - pred->exonStarts[ii];
-offset += exonOffset;
-return offset;
-}
-
 static char *getCodingSequence(struct genePred *pred, char *transcriptSequence, struct lm *lm)
 /* extract the CDS from a transcript */
 {
 if (*pred->strand == '-')
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
 
 // Get leftmost CDS boundary to trim off 5' (or 3' if on minus strand):
 int cdsOffset = getCodingOffsetInTx(pred, '+');
 char *newString = lmCloneString(lm, transcriptSequence + cdsOffset);
 
 // trim off 3' (or 5' if on minus strand)
 newString[genePredCdsSize(pred)] = 0;
 
 // correct for strand
 if (*pred->strand == '-')
     {
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
     reverseComplement(newString, strlen(newString));
     }
 
 return newString;
 }
 
-static int firstChange(char *string1, char *string2, int *numDifferent)
-/* return the position of the first difference between the two sequences */
-/* if numDifferent is non-NULL, return number of characters between first 
- * difference, and the last difference */
-{
-int count = 0;
-
-for(; *string1 == *string2 && (*string1); count++, string1++, string2++)
-    ;
-
-int firstChange = count;
-int lastChange = firstChange;
-
-if (numDifferent != NULL)
-    {
-    for (; (*string1) && (*string2); string1++, string2++, count++)
-	{
-	if (*string1 != *string2)
-	    lastChange = count;
-	}
-    *numDifferent = lastChange - firstChange + 1;
-    }
-
-return firstChange;
-}
-
 static struct dnaSeq *lmNewSimpleSeq(struct lm *lm, char *seq)
 /* Like newDnaSeq but no name and using localmem. */
 {
 struct dnaSeq *simpleSeq;
 lmAllocVar(lm, simpleSeq);
 simpleSeq->dna = seq;
 simpleSeq->size = strlen(seq);
 return simpleSeq;
 }
 
 static char *lmSimpleTranslate(struct lm *lm, struct dnaSeq *dnaSeq)
 /* Just translate dnaSeq into pep, use 'Z' and truncate for stop codon, and use localmem. */
 {
 int aaLen = (dnaSeq->size / 3) + 1;
 char *pep = lmAlloc(lm, aaLen + 1);
@@ -309,189 +302,203 @@
  * of pred that would make it safe from Nonsense-Mediated Decay (NMD).
  * NMD is triggered by the presence of a stop codon that appears
  * before ~50bp before the end of the last exon.  In other words, if
  * there's a stop codon with a detectable downstream splice junction,
  * translation is prevented -- see
  * http://en.wikipedia.org/wiki/MRNA_surveillance#Mechanism_in_mammals */
 {
 boolean lastExonNum = pred->exonCount - 1;
 if (exonNum == lastExonNum)
     return TRUE;
 int nextToLastExonNum = pred->exonCount - 2;
 if (exonNum == nextToLastExonNum)
     {
     // Is it within 50bp of 3' end of exon?
     if (pred->strand[0] == '-')
-	return (variant->chromEnd < pred->exonStarts[1]);
+	return (variant->chromEnd < pred->exonStarts[1] + 50);
     else
 	return (variant->chromStart > pred->exonEnds[nextToLastExonNum] - 50);
     }
 return FALSE;
 }
 
-void setSpecificCodingSoTerm(struct gpFx *effect, char *oldAa, char *newAa, int numDashes,
-			     int cdsChangeLength, boolean safeFromNMD)
+void setSpecificCodingSoTerm(struct gpFx *effect, char *oldAa, char *newAa, int cdsBasesAdded,
+			     boolean safeFromNMD)
 /* Assuming that deletions are marked with dashes in newCodingSequence, 
  * and that effect fields aside from soNumber are already populated, use the
  * pep seqs and number of dashes to determine the appropriate SO term.
  * Probably the majority of cases will be synonymous_variant or missense_variant,
  * but we have to check for several other special cases esp. indels. */
 {
 struct codingChange *cc = &effect->details.codingChange;
 int oldAaSize = strlen(oldAa), newAaSize = strlen(newAa);
 if (sameString(newAa, oldAa))
     {
     if (cc->pepPosition == oldAaSize-1 && cc->aaOld[0] == 'Z')
 	effect->soNumber = stop_retained_variant;
     else
 	effect->soNumber = synonymous_variant;
     }
 else
     {
-    if (numDashes != 0)
+    if (cdsBasesAdded < 0)
 	{
 	// Got a deletion variant -- check frame:
-	//#*** if numDashes == cdsChangeLength, then we don't need numDashes...
-	if ((numDashes % 3) == 0)
-	    {
-	    if ((cdsChangeLength % 3) != 0)
-		errAbort("gpFx: assumption about cdsChangeLength vs. numDashes not holding; "
-			 "cdsChangeLength=%d, numDashes=%d", cdsChangeLength, numDashes);
+	if ((cdsBasesAdded % 3) == 0)
 	    effect->soNumber = inframe_deletion;
-	    }
 	else
 	    effect->soNumber = frameshift_variant;
 	}
     else
 	{
 	// Not a deletion; could be single-base (including early stop) or insertion
 	if (newAaSize < oldAaSize)
 	    {
 	    // Not a deletion but protein got smaller; must have been an early stop codon,
 	    // possibly following a frameshift caused by an insertion.
 	    if (cc->aaNew[0] != 'Z')
 		{
 		if (newAa[newAaSize-1] != 'Z')
 		    errAbort("gpFx: new protein is smaller but last base in new sequence "
 			     "is '%c' not 'Z'", newAa[newAaSize-1]);
 		effect->soNumber = frameshift_variant;
 		}
 	    else if (safeFromNMD)
 		effect->soNumber = stop_gained;
 	    else
 		effect->soNumber = NMD_transcript_variant;
 	    }
 	else if (newAaSize > oldAaSize)
 	    {
 	    // protein got bigger; insertion or lost stop codon
 	    if (cc->aaOld[0] == 'Z')
 		effect->soNumber = stop_lost;
-	    else if ((cdsChangeLength % 3) == 0)
+	    else if ((cdsBasesAdded % 3) == 0)
 		effect->soNumber = inframe_insertion;
 	    else
 		effect->soNumber = frameshift_variant;
 	    }
 	else
 	    {
 	    // Single aa change
 	    if (cc->pepPosition == 0 && cc->aaOld[0] == 'M')
 		effect->soNumber = initiator_codon_variant;
 	    else if (cc->pepPosition == oldAaSize-1)
 		{
 		if (oldAa[oldAaSize-1] == 'Z')
 		    effect->soNumber = stop_lost;
 		else
 		    effect->soNumber = incomplete_terminal_codon_variant;
 		}
 	    else
 		effect->soNumber = missense_variant;
 	    }
 	}
     }
 }
 
 struct gpFx *gpFxChangedCodingExon(struct allele *allele, struct genePred *pred, int exonNum,
-				   char *transcriptSequence, char *newSequence,
-				   struct genePred *newPred, struct lm *lm)
+				   struct dnaSeq *transcriptSequence, char *newSequence,
+				   struct genePred *newPred, int cDnaPosition, int basesAdded,
+				   int cdsPosition, int cdsBasesAdded, struct lm *lm)
 /* calculate effect of allele change on coding transcript */
 {
 struct gpFx *effectsList = NULL;
 if (*pred->strand == '-')
     exonNum = pred->exonCount - exonNum - 1;
 
 // first find effects of allele in UTR, if any
-int dnaChangeLength;
-int cDnaPosition = firstChange(newSequence, transcriptSequence, &dnaChangeLength);
 effectsList = gpFxCheckUtr(allele, pred, exonNum, cDnaPosition, lm);
 
 // calculate original and variant coding DNA and AA's
 char *oldCodingSequence, *newCodingSequence, *oldaa, *newaa;
-getSequences(pred, transcriptSequence, &oldCodingSequence, &oldaa, lm);
+getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa, lm);
 getSequences(newPred, newSequence, &newCodingSequence, &newaa, lm);
 
 // if coding sequence hasn't changed, we're done (allele == reference allele)
 if (sameString(oldCodingSequence, newCodingSequence))
     return effectsList;
 
 // allocate the effect structure - fill in soNumber and details below
 struct gpFx *effect = gpFxNew(allele->sequence, pred->name, coding_sequence_variant, codingChange,
 			      lm);
 struct codingChange *cc = &effect->details.codingChange;
 cc->cDnaPosition = cDnaPosition;
-int cdsChangeLength;
-cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence, &cdsChangeLength);
+cc->cdsPosition = cdsPosition;
 cc->exonNumber = exonNum;
+int pepPos = cdsPosition / 3;
+cc->pepPosition = pepPos;
+if (cdsBasesAdded % 3 == 0)
+    {
+    // Common case: substitution, same number of old/new codons/peps:
+    int numOldCodons = (1 + allele->length / 3), numNewCodons = (1 + allele->length / 3);
+    if (cdsBasesAdded > 0)
+	{
+	// insertion: more new codons than old
+	numOldCodons = (cdsPosition % 3) == 0 ? 0 : 1;
+	numNewCodons = numOldCodons + (cdsBasesAdded / 3);
+	}
+    else if (cdsBasesAdded < 0)
+	{
+	// deletion: more old codons than new
+	numNewCodons = (cdsPosition % 3) == 0 ? 0 : 1;
+	numOldCodons = numNewCodons + (-cdsBasesAdded / 3);
+	}
+    cc->codonOld = lmCloneStringZ(lm, oldCodingSequence + pepPos*3, numOldCodons*3);
+    cc->codonNew = lmCloneStringZ(lm, newCodingSequence + pepPos*3, numNewCodons*3);
+    cc->aaOld = lmCloneStringZ(lm, oldaa + pepPos, numOldCodons);
+    cc->aaNew = lmCloneStringZ(lm, newaa + pepPos, numNewCodons);
+    }
+else
+    {
+    // frameshift -- who knows how many codons we can reliably predict...
+    cc->codonOld = lmCloneString(lm, oldCodingSequence + pepPos*3);
+    cc->codonNew = lmCloneString(lm, newCodingSequence + pepPos*3);
+    cc->aaOld = lmCloneString(lm, oldaa + pepPos);
+    cc->aaNew = lmCloneString(lm, newaa + pepPos);
+    }
 
 // show specific coding changes by making before-and-after subsequences:
-int codonPosStart = (cc->cdsPosition / 3) ;
-int codonPosEnd = ((cdsChangeLength - 1 + cc->cdsPosition) / 3) ;
-int numCodons = (codonPosEnd - codonPosStart + 1);
-cc->codonOld = lmCloneStringZ(lm, oldCodingSequence + codonPosStart*3, numCodons*3);
-cc->codonNew = lmCloneStringZ(lm, newCodingSequence + codonPosStart*3, numCodons*3);
-
-cc->pepPosition = codonPosStart;
-cc->aaOld = lmCloneStringZ(lm, oldaa + cc->pepPosition, numCodons);
-cc->aaNew = lmCloneStringZ(lm, newaa + cc->pepPosition, numCodons);
 
 boolean safeFromNMD = isSafeFromNMD(exonNum, allele->variant, pred);
-setSpecificCodingSoTerm(effect, oldaa, newaa, countDashes(newCodingSequence), cdsChangeLength,
-			safeFromNMD);
+setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded, safeFromNMD);
 
 slAddHead(&effectsList, effect);
 return effectsList;
 }
 
 
 struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, int exonNum,
-			int exonQStart, struct dnaSeq *transcriptSequence,
-			struct lm *lm)
+			int exonQStart, struct dnaSeq *transcriptSequence, struct lm *lm)
 /* generate an effect from a different allele in an exon */
 {
 struct genePred *newPred = pred;
-char *newSequence = gpFxModifySequence(allele, pred, exonNum, exonQStart, transcriptSequence,
-				       &newPred, lm);
+int cDnaPosition = 0, basesAdded = 0, cdsPosition = 0, cdsBasesAdded = 0;
+char *newSequence = gpFxModifySequence(allele, pred, exonNum, exonQStart,
+				       transcriptSequence, &newPred, &cDnaPosition, &basesAdded,
+				       &cdsPosition, &cdsBasesAdded, lm);
 
 if (sameString(newSequence, transcriptSequence->dna))
     return NULL;  // no change in transcript
 
 struct gpFx *effectsList;
 if (pred->cdsStart != pred->cdsEnd)
-    effectsList = gpFxChangedCodingExon(allele, pred, exonNum, transcriptSequence->dna,
-					newSequence, newPred, lm);
+    effectsList = gpFxChangedCodingExon(allele, pred, exonNum, transcriptSequence,
+					newSequence, newPred, cDnaPosition, basesAdded,
+					cdsPosition, cdsBasesAdded, lm);
 else
     {
-    int cDnaPosition = firstChange(newSequence, transcriptSequence->dna, NULL);
     effectsList = gpFxChangedNoncodingExon(allele, pred, exonNum, cDnaPosition, lm);
     }
 
 return effectsList;
 }
 
 static struct gpFx *gpFxCheckExons(struct variant *variantList, struct genePred *pred,
 				   struct dnaSeq *transcriptSequence, struct lm *lm)
 // check to see if the variant is in an exon
 {
 struct gpFx *effectsList = NULL;
 int exonQStart = 0;
 int ii;
 for (ii=0; ii < pred->exonCount; ii++)
     {