2f19b4081ef0f0d93daf8f7ff1c26cf8879a4d14
braney
  Thu Apr 26 17:26:10 2012 -0700
more work on #6152.   Now with in-frame deletions!
diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index 5af7432..1689539 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -1,48 +1,94 @@
 
 /* gpFx --- routines to calculate the effect of variation on a genePred */
 
 #include "common.h"
 #include "genePred.h"
 #include "gpFx.h"
 
 
+unsigned countDashes(char *string)
+{
+int count = 0;
+
+while(*string)
+    {
+    if (*string == '-')
+	count++;
+    string++;
+    }
+
+return count;
+}
+
+static void mergeAllele(char *transcript, int variantWidth,
+    char *newAlleleSeq, int alleleLength)
+{
+if (variantWidth == alleleLength)
+    {
+    // for the moment, we're sticking the dashes into the transcripts
+    if (1) //noDashes(newAlleleSeq))
+	memcpy(transcript, newAlleleSeq, alleleLength);
+    else
+	{
+	char *transcriptSource = transcript;
+	int ii;
+	for(ii=0; ii < variantWidth; ii++)
+	    {
+	    if (*newAlleleSeq == '-')
+		{
+		transcriptSource++;
+		}
+	    else
+		{
+		*transcript++ = *newAlleleSeq++;
+		transcriptSource++;
+		}
+	    }
+	while(*transcriptSource)
+	    *transcript++ = *transcriptSource++;
+	*transcript = 0;
+	}
+    }
+}
+
 char *gpFxModifySequence(struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence)
 /* modify a transcript to what it'd be if the alternate allele were present */
 {
 // clip allele to exon
 struct allele *clipAllele = alleleClip(allele, transcriptPsl->tStarts[exonNum], 
 	transcriptPsl->tStarts[exonNum] + transcriptPsl->blockSizes[exonNum]);
 
 // change transcript at variant point
 int exonOffset = clipAllele->variant->chromStart - transcriptPsl->tStarts[exonNum];
 int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset;
 
-if (clipAllele->length != clipAllele->variant->chromEnd - clipAllele->variant->chromStart)
+int variantWidth = clipAllele->variant->chromEnd - clipAllele->variant->chromStart;
+if (clipAllele->length !=  variantWidth)
     errAbort("only support alleles the same length as the reference");
 
 char *retSequence = cloneString(transcriptSequence->dna);
 char *newAlleleSeq = cloneString(clipAllele->sequence);
 if (*pred->strand == '-')
     {
-    transcriptOffset = transcriptSequence->size - (transcriptOffset + 1);
+    transcriptOffset = transcriptSequence->size - (transcriptOffset + strlen(newAlleleSeq));
     reverseComplement(newAlleleSeq, strlen(newAlleleSeq));
     }
 
 // make the change in the sequence
-memcpy(&retSequence[transcriptOffset], newAlleleSeq, allele->length);
+mergeAllele( &retSequence[transcriptOffset], variantWidth, newAlleleSeq, allele->length);
 
 // clean up
 freeMem(newAlleleSeq);
 
 return retSequence;
 }
 
 static char *getCodingSequence(struct genePred *pred, char *transcriptSequence)
 /* extract the CDS from a transcript */
 {
 int ii;
 
 if (*pred->strand == '-')
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
 
@@ -67,44 +113,52 @@
 // trim off 3' (or 5' if on minus strand)
 newString[genePredCdsSize(pred)] = 0;
 
 // correct for strand
 if (*pred->strand == '-')
     {
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
     reverseComplement(newString, strlen(newString));
     }
 
 return newString;
 }
 
 static int firstChange(char *string1, char *string2, int *numDifferent)
 /* return the position of the first difference between the two sequences */
+/* if numDifferent is non-NULL, return number of characters between first 
+ * difference, and the last difference */
 {
 int count = 0;
 
-while (*string1++ == *string2++)
-    count++;
+for(; *string1 == *string2; count++, string1++, string2++)
+    ;
+
+int firstChange = count;
+int lastChange = firstChange;
 
 if (numDifferent != NULL)
     {
-    *numDifferent = 1;
-    while ((*string1) && (*string2) && (*string1++ != *string2++))
-	(*numDifferent)++;
+    for (; (*string1) && (*string2); string1++, string2++, count++)
+	{
+	if (*string1 != *string2)
+	    lastChange = count;
+	}
+    *numDifferent = lastChange - firstChange + 1;
     }
 
-return count;
+return firstChange;
 }
 
 static void getSequences(struct genePred *pred, char *transcriptSequence,
     char **codingSequence,  aaSeq **aaSeq)
 /* get coding sequences for a transcript and a variant transcript */
 {
 *codingSequence = getCodingSequence(pred, transcriptSequence);
 struct dnaSeq *codingDna = newDnaSeq(*codingSequence, strlen(*codingSequence), NULL);
 *aaSeq = translateSeq(codingDna, 0, FALSE);
 freez(codingDna);
 }
 
 static char *
 codonChangeString(int codonPos, int numCodons, char *oldCodingSequence, char *newCodingSequence)
 /* generate string that describes a codon change */
@@ -135,30 +189,39 @@
 {
 struct dyString *dy = newDyString(100);
 int ii;
 
 for(ii=0; ii < numaa; ii++)
     {
     dyStringPrintf(dy, "%c > %c", 
 	toupper(oldaa[pepPosition+ii]), toupper(newaa[pepPosition+ii]));
     if (ii < numaa - 1)
 	dyStringPrintf(dy, ",");
     }
 
 return dy->string;
 }
 
+struct gpFx *gpFxInCdsDeletion( struct allele *allele, struct genePred *pred, 
+    int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
+    char *newSequence)
+{
+struct gpFx *effects;
+AllocVar(effects);
+return effects;
+}
+
 struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
     char *newSequence)
 /* check for effects in UTR of coding gene */
 {
 if (positiveRangeIntersection(pred->txStart, pred->cdsStart,
 	allele->variant->chromStart, allele->variant->chromEnd))
     {
     // we're in 5' UTR ( or UTR intron )
     errAbort("don't support variants in 5' UTR");
     }
 
 if (positiveRangeIntersection(pred->txStart, pred->cdsStart,
 	allele->variant->chromStart, allele->variant->chromEnd))
     {
@@ -189,70 +252,81 @@
 effectsList = gpFxCheckUtr(allele, pred, exonNum, transcriptPsl, 
     transcriptSequence, newSequence);
 
 // check to see if coding sequence is changed
 // calculate original and variant coding AA's
 char *oldCodingSequence, *newCodingSequence;
 aaSeq *oldaa, *newaa;
 
 getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa);
 getSequences(pred, newSequence, &newCodingSequence, &newaa);
 
 // if coding region hasn't changed, we're done
 if (sameString(oldCodingSequence, newCodingSequence))
     return effectsList;
 
+
 // start allocating the effect structure
 struct gpFx *effects;
 AllocVar(effects);
 slAddHead(&effectsList, effects);
 
 struct codingChange *cc = &effects->so.sub.codingChange;
 cc->transcript = cloneString(pred->name);
 int dnaChangeLength;
 cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna, &dnaChangeLength);
 int cdsChangeLength;
 cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence, &cdsChangeLength);
 if (*pred->strand == '-')
     cc->exonNumber = pred->exonCount - exonNum;
 else
     cc->exonNumber = exonNum;
 
 // calc codon change
-int codonPosStart = (cc->cdsPosition / 3) * 3;
-int codonPosEnd = ((cdsChangeLength + cc->cdsPosition) / 3) * 3;
-int numCodons = (codonPosEnd - codonPosStart) / 3 + 1;
-cc->codonChanges = codonChangeString( codonPosStart, numCodons, oldCodingSequence, newCodingSequence);
+int codonPosStart = (cc->cdsPosition / 3) ;
+int codonPosEnd = ((cdsChangeLength - 1 + cc->cdsPosition) / 3) ;
+int numCodons = (codonPosEnd - codonPosStart + 1);
+cc->codonChanges = codonChangeString( codonPosStart*3, numCodons, oldCodingSequence, newCodingSequence);
+// by convention we zero out these fields if they aren't used
+cc->pepPosition = 0;
+cc->aaChanges = "";
 
+int numDashes;
 if (sameString(newaa->dna, oldaa->dna))
     {
     // synonymous change
     effects->so.soNumber = synonymous_variant;
-    
-    // by convention we zero out these fields since they aren't used
-    cc->pepPosition = 0;
-    cc->aaChanges = "";
     }
 else
     {
-    // non-synonymous change
-    effects->so.soNumber = non_synonymous_variant;
-
     int numDifferent;
     cc->pepPosition = firstChange( newaa->dna, oldaa->dna, &numDifferent);
     cc->aaChanges = aaChangeString( cc->pepPosition, numDifferent, 
 	oldaa->dna, newaa->dna);
+
+    if ((numDashes = countDashes(newCodingSequence)) != 0)
+	{
+	if ((numDashes % 3) == 0)
+	    effects->so.soNumber = inframe_deletion;
+	else
+	    effects->so.soNumber = frameshift_variant;
+	}
+    else
+	{
+	// non-synonymous change
+	effects->so.soNumber = non_synonymous_variant;
+	}
     }
 
 return effectsList;
 }
 
 
 struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence)
 /* generate an effect from a different allele in an exon */
 {
 char *newSequence = gpFxModifySequence(allele, pred, exonNum,
 	transcriptPsl, transcriptSequence);
 
 if (sameString(newSequence, transcriptSequence->dna))
     return NULL;  // no change in transcript