5c583c55ff0181c5f12ee21e6aa73bda6a5e7079
braney
  Mon May 21 14:47:12 2012 -0700
support for additional indel types in gpFx (#6152)
diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index 1689539..d059d01 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -1,120 +1,149 @@
 
 /* gpFx --- routines to calculate the effect of variation on a genePred */
 
 #include "common.h"
 #include "genePred.h"
 #include "gpFx.h"
 
-
-unsigned countDashes(char *string)
+static unsigned countDashes(char *string)
+/* count the number of dashes in a string */
 {
 int count = 0;
 
 while(*string)
     {
     if (*string == '-')
 	count++;
     string++;
     }
 
 return count;
 }
 
-static void mergeAllele(char *transcript, int variantWidth,
+static char * mergeAllele(char *transcript, int offset, int variantWidth,
     char *newAlleleSeq, int alleleLength)
+/* merge a variant into an allele */
 {
+char *newTranscript = transcript;
+
 if (variantWidth == alleleLength)
     {
     // for the moment, we're sticking the dashes into the transcripts
     if (1) //noDashes(newAlleleSeq))
-	memcpy(transcript, newAlleleSeq, alleleLength);
+	{
+	memcpy(&transcript[offset], newAlleleSeq, alleleLength);
+	}
+#ifdef DOESNTKNOWABOUTTHEOFFSETARGUMENT
+    // if we decide not to put the dashes into the resulting transcripts
+    // this will have to be re-written to grok the offset argument
     else
 	{
 	char *transcriptSource = transcript;
 	int ii;
 	for(ii=0; ii < variantWidth; ii++)
 	    {
 	    if (*newAlleleSeq == '-')
 		{
 		transcriptSource++;
 		}
 	    else
 		{
 		*transcript++ = *newAlleleSeq++;
 		transcriptSource++;
 		}
 	    }
 	while(*transcriptSource)
 	    *transcript++ = *transcriptSource++;
 	*transcript = 0;
 	}
+#endif
     }
+else  if (variantWidth > alleleLength)
+    errAbort("allele not padded");
+else 
+    {
+    int insertionSize = alleleLength - variantWidth;
+    int newLength = strlen(transcript) + insertionSize;
+    newTranscript = needMem(newLength + 1);
+    char *restOfTranscript = &transcript[offset + variantWidth];
+
+    // copy over the part before the variant
+    memcpy(newTranscript, transcript, offset);
+
+    // copy in the new allele
+    memcpy(&newTranscript[offset], newAlleleSeq, alleleLength);
+
+    // copy in the new allele
+    memcpy(&newTranscript[offset + alleleLength], restOfTranscript, 
+	strlen(restOfTranscript) + 1);
+    }
+
+return newTranscript;
 }
 
 char *gpFxModifySequence(struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence)
 /* modify a transcript to what it'd be if the alternate allele were present */
 {
 // clip allele to exon
 struct allele *clipAllele = alleleClip(allele, transcriptPsl->tStarts[exonNum], 
 	transcriptPsl->tStarts[exonNum] + transcriptPsl->blockSizes[exonNum]);
 
 // change transcript at variant point
 int exonOffset = clipAllele->variant->chromStart - transcriptPsl->tStarts[exonNum];
 int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset;
 
 int variantWidth = clipAllele->variant->chromEnd - clipAllele->variant->chromStart;
-if (clipAllele->length !=  variantWidth)
-    errAbort("only support alleles the same length as the reference");
 
 char *retSequence = cloneString(transcriptSequence->dna);
 char *newAlleleSeq = cloneString(clipAllele->sequence);
 if (*pred->strand == '-')
     {
     transcriptOffset = transcriptSequence->size - (transcriptOffset + strlen(newAlleleSeq));
     reverseComplement(newAlleleSeq, strlen(newAlleleSeq));
     }
 
 // make the change in the sequence
-mergeAllele( &retSequence[transcriptOffset], variantWidth, newAlleleSeq, allele->length);
+retSequence = mergeAllele( retSequence, transcriptOffset, variantWidth, newAlleleSeq, allele->length);
 
 // clean up
 freeMem(newAlleleSeq);
 
 return retSequence;
 }
 
 static char *getCodingSequence(struct genePred *pred, char *transcriptSequence)
 /* extract the CDS from a transcript */
 {
 int ii;
 
 if (*pred->strand == '-')
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
 
 // trim off the 5' UTR ( or 3' if on the minus strand)
 char *ptr = transcriptSequence;
 for(ii=0; ii < pred->exonCount; ii++)
     {
     int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii];
-    if ((pred->cdsStart > pred->exonStarts[ii]) &&
+    if ((pred->cdsStart >= pred->exonStarts[ii]) &&
 	(pred->cdsStart < pred->exonEnds[ii]))
 	break;
 
     ptr += exonSize;
     }
+assert (ii != pred->exonCount);
 
 // clip off part of UTR in exon that has CDS in it too
 int exonOffset = pred->cdsStart - pred->exonStarts[ii];
 ptr += exonOffset;
 
 char *newString = cloneString(ptr);
 
 // trim off 3' (or 5' if on minus strand)
 newString[genePredCdsSize(pred)] = 0;
 
 // correct for strand
 if (*pred->strand == '-')
     {
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
     reverseComplement(newString, strlen(newString));
@@ -189,39 +218,30 @@
 {
 struct dyString *dy = newDyString(100);
 int ii;
 
 for(ii=0; ii < numaa; ii++)
     {
     dyStringPrintf(dy, "%c > %c", 
 	toupper(oldaa[pepPosition+ii]), toupper(newaa[pepPosition+ii]));
     if (ii < numaa - 1)
 	dyStringPrintf(dy, ",");
     }
 
 return dy->string;
 }
 
-struct gpFx *gpFxInCdsDeletion( struct allele *allele, struct genePred *pred, 
-    int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
-    char *newSequence)
-{
-struct gpFx *effects;
-AllocVar(effects);
-return effects;
-}
-
 struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
     char *newSequence)
 /* check for effects in UTR of coding gene */
 {
 if (positiveRangeIntersection(pred->txStart, pred->cdsStart,
 	allele->variant->chromStart, allele->variant->chromEnd))
     {
     // we're in 5' UTR ( or UTR intron )
     errAbort("don't support variants in 5' UTR");
     }
 
 if (positiveRangeIntersection(pred->txStart, pred->cdsStart,
 	allele->variant->chromStart, allele->variant->chromEnd))
     {
@@ -252,31 +272,30 @@
 effectsList = gpFxCheckUtr(allele, pred, exonNum, transcriptPsl, 
     transcriptSequence, newSequence);
 
 // check to see if coding sequence is changed
 // calculate original and variant coding AA's
 char *oldCodingSequence, *newCodingSequence;
 aaSeq *oldaa, *newaa;
 
 getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa);
 getSequences(pred, newSequence, &newCodingSequence, &newaa);
 
 // if coding region hasn't changed, we're done
 if (sameString(oldCodingSequence, newCodingSequence))
     return effectsList;
 
-
 // start allocating the effect structure
 struct gpFx *effects;
 AllocVar(effects);
 slAddHead(&effectsList, effects);
 
 struct codingChange *cc = &effects->so.sub.codingChange;
 cc->transcript = cloneString(pred->name);
 int dnaChangeLength;
 cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna, &dnaChangeLength);
 int cdsChangeLength;
 cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence, &cdsChangeLength);
 if (*pred->strand == '-')
     cc->exonNumber = pred->exonCount - exonNum;
 else
     cc->exonNumber = exonNum;
@@ -291,37 +310,43 @@
 cc->aaChanges = "";
 
 int numDashes;
 if (sameString(newaa->dna, oldaa->dna))
     {
     // synonymous change
     effects->so.soNumber = synonymous_variant;
     }
 else
     {
     int numDifferent;
     cc->pepPosition = firstChange( newaa->dna, oldaa->dna, &numDifferent);
     cc->aaChanges = aaChangeString( cc->pepPosition, numDifferent, 
 	oldaa->dna, newaa->dna);
 
+    // this is assuming that deletions are marked with dashes
+    // in newCodingSequence
     if ((numDashes = countDashes(newCodingSequence)) != 0)
 	{
 	if ((numDashes % 3) == 0)
 	    effects->so.soNumber = inframe_deletion;
 	else
 	    effects->so.soNumber = frameshift_variant;
 	}
+    else if (strlen(newaa->dna) != strlen(oldaa->dna)) 
+	{
+	effects->so.soNumber = inframe_insertion;
+	}
     else
 	{
 	// non-synonymous change
 	effects->so.soNumber = non_synonymous_variant;
 	}
     }
 
 return effectsList;
 }
 
 
 struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence)
 /* generate an effect from a different allele in an exon */
 {
@@ -384,36 +409,39 @@
 {
 int ii;
 struct gpFx *effectsList = NULL;
 struct psl *transcriptPsl = genePredToPsl(pred);
 
 // should copy transcriptSequence if we have more than one variant!!
 for(; variant ; variant = variant->next)
     {
     struct allele *allele = variant->alleles;
     for(; allele ; allele = allele->next)
 	{
 	for(ii=0; ii < pred->exonCount; ii++)
 	    {
 	    // check if in an exon
 	    int size;
+	    int end = variant->chromEnd;
+	    if (variant->chromStart == end)
+		end++;
 	    if ((size = positiveRangeIntersection(pred->exonStarts[ii], 
 		pred->exonEnds[ii], 
-		variant->chromStart, variant->chromEnd)) > 0)
+		variant->chromStart, end)) > 0)
 		{
 
-		if (size != variant->chromEnd - variant->chromStart)
+		if (size != end - variant->chromStart)
 		    errAbort("variant crosses exon boundary");
 
 		effectsList = slCat(effectsList, gpFxInExon(allele, pred, ii,
 		    transcriptPsl, transcriptSequence));
 		}
 	    }
 	}
     }
 
 return effectsList;
 }
 
 static struct gpFx *gpFxCheckIntrons(struct variant *variant, 
     struct genePred *pred)
 // check to see if a single variant is in an exon or an intron