8bbed14087acb42d639fb045bc09c11394a0b82b
angie
  Wed Jul 2 16:49:07 2014 -0700
Fixed two bugs in the way cdsBasesAdded was computed -- that causedsome strange functional assignments for "<DEL>" (large deletion).
Also fixed a miscall of frameshift instead of stop_gained (from
insertion) brought to my attention by GoldenHelix's comparison of VEP,
Annovar and SnpEff: http://blog.goldenhelix.com/?p=2486

diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index 1d92811..d4e365b 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -527,33 +527,32 @@
     newAlleleSeq = "";
     newAlLen = 0;
     }
 if (isRc && newAlLen > 0)
     {
     newAlleleSeq = lmCloneString(lm, newAlleleSeq);
     reverseComplement(newAlleleSeq, newAlLen);
     }
 int variantSizeOnCds = endInCds - startInCds;
 if (variantSizeOnCds < 0)
     errAbort("gpFx: endInCds (%d) < startInCds (%d)", endInCds, startInCds);
 char *newCodingSeq = mergeAllele(oldCodingSeq, startInCds, variantSizeOnCds,
 				 newAlleleSeq, newAlLen, lm);
 // If newCodingSequence has an early stop, truncate there:
 truncateAtStopCodon(newCodingSeq);
-int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart;
 if (retCdsBasesAdded)
-    *retCdsBasesAdded = allele->length - variantSizeOnRef;
+    *retCdsBasesAdded = newAlLen - variantSizeOnCds;
 return newCodingSeq;
 }
 
 static void setSpecificCodingSoTerm(struct gpFx *effect, char *oldAa, char *newAa,
 				    int cdsBasesAdded)
 /* Assuming that deletions are marked with dashes in newCodingSequence, 
  * and that effect fields aside from soNumber are already populated, use the
  * pep seqs and number of dashes to determine the appropriate SO term.
  * Probably the majority of cases will be synonymous_variant or missense_variant,
  * but we have to check for several other special cases esp. indels. */
 {
 struct codingChange *cc = &effect->details.codingChange;
 int oldAaSize = strlen(oldAa), newAaSize = strlen(newAa);
 if (sameString(newAa, oldAa))
     {
@@ -571,42 +570,54 @@
 	    {
 	    if (strchr(cc->aaOld, 'Z') && !strchr(cc->aaNew, 'Z'))
 		effect->soNumber = stop_lost;
 	    else
 		effect->soNumber = inframe_deletion;
 	    }
 	else
 	    effect->soNumber = frameshift_variant;
 	}
     else
 	{
 	// Not a deletion; could be single-base (including early stop) or insertion
 	if (newAaSize < oldAaSize)
 	    {
 	    // Not a deletion but protein got smaller; must have been an early stop codon,
-	    // possibly following a frameshift caused by an insertion.
-	    if (cc->aaNew[0] != 'Z')
+	    // possibly inserted or following a frameshift caused by an insertion.
+	    int frame = cc->cdsPosition % 3;
+	    int alleleLength = strlen(effect->allele);
+	    if (! isAllNt(effect->allele, alleleLength))
+		// symbolic -- may be deletion or insertion, but we can't tell. :(
+		alleleLength = 0;
+	    int i, affectedCodons = (frame + alleleLength + 2) / 3;
+	    boolean stopGain = FALSE;
+	    for (i = 0;  i < affectedCodons;  i++)
+		if (cc->aaNew[i] == 'Z')
+		    {
+		    effect->soNumber = stop_gained;
+		    stopGain = TRUE;
+		    break;
+		    }
+	    if (! stopGain)
 		{
 		if (newAa[newAaSize-1] != 'Z')
 		    errAbort("gpFx: new protein is smaller but last base in new sequence "
 			     "is '%c' not 'Z'.\n"
 			     "oldAa (%daa): %s\nnewAa (%daa): %s\n"
 			     , newAa[newAaSize-1], oldAaSize, oldAa, newAaSize, newAa);
 		effect->soNumber = frameshift_variant;
 		}
-	    else
-		effect->soNumber = stop_gained;
 	    }
 	else if (newAaSize > oldAaSize)
 	    {
 	    // protein got bigger; insertion or lost stop codon
 	    if (cc->aaOld[0] == 'Z')
 		effect->soNumber = stop_lost;
 	    else if ((cdsBasesAdded % 3) == 0)
 		effect->soNumber = inframe_insertion;
 	    else
 		effect->soNumber = frameshift_variant;
 	    }
 	else
 	    {
 	    // Single aa change
 	    if (cc->pepPosition == 0 && cc->aaOld[0] == 'M')