04c67a0e7bcb65c217598e5e44084bf4c5c0674d
angie
  Tue Mar 5 12:16:10 2013 -0800
Feature #6152 (Variant Annotation Integrator): first cut of annoFormatVep whichproduces output in the format of Ensembl's Variant Effect Predictor (VEP), along
with modifications to soTerm, gpFx and annoGratorGpVar to enable VEP-like output.

diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index 6395bd3..325148e 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -1,549 +1,559 @@
-
 /* gpFx --- routines to calculate the effect of variation on a genePred */
 
 #include "common.h"
 #include "genePred.h"
 #include "gpFx.h"
 
+static char *collapseDashes(char *str)
+// Trim extra hyphen characters at end of str.
+{
+int len = strlen(str);
+if (len > 1)
+    {
+    char *p = str + len - 1;
+    while (*p == '-' && p > str)
+	*p-- = '\0';
+    }
+return str;
+}
+
+struct gpFx *gpFxNew(char *allele, char *transcript, enum soTerm soNumber)
+/* Fill in the common members of gpFx; leave soTerm-specific members for caller to fill in. */
+{
+struct gpFx *effect;
+AllocVar(effect);
+effect->allele = collapseDashes(strUpper(cloneString(allele)));
+effect->so.transcript = cloneString(transcript);
+effect->so.soNumber = soNumber;
+return effect;
+}
+
 static unsigned countDashes(char *string)
 /* count the number of dashes in a string */
 {
 int count = 0;
 
 while(*string)
     {
     if (*string == '-')
 	count++;
     string++;
     }
 
 return count;
 }
 
 static char * mergeAllele(char *transcript, int offset, int variantWidth,
     char *newAlleleSeq, int alleleLength)
 /* merge a variant into an allele */
 {
 char *newTranscript = transcript;
 
 if (variantWidth == alleleLength)
     {
     // for the moment, we're sticking the dashes into the transcripts
     if (1) //noDashes(newAlleleSeq))
 	{
 	memcpy(&transcript[offset], newAlleleSeq, alleleLength);
 	}
 #ifdef DOESNTKNOWABOUTTHEOFFSETARGUMENT
     // if we decide not to put the dashes into the resulting transcripts
     // this will have to be re-written to grok the offset argument
     else
 	{
 	char *transcriptSource = transcript;
 	int ii;
 	for(ii=0; ii < variantWidth; ii++)
 	    {
 	    if (*newAlleleSeq == '-')
 		{
 		transcriptSource++;
 		}
 	    else
 		{
 		*transcript++ = *newAlleleSeq++;
 		transcriptSource++;
 		}
 	    }
 	while(*transcriptSource)
 	    *transcript++ = *transcriptSource++;
 	*transcript = 0;
 	}
 #endif
     }
 else  if (variantWidth > alleleLength)
     errAbort("allele not padded");
 else 
     {
     int insertionSize = alleleLength - variantWidth;
     int newLength = strlen(transcript) + insertionSize;
     newTranscript = needMem(newLength + 1);
     char *restOfTranscript = &transcript[offset + variantWidth];
 
     // copy over the part before the variant
     memcpy(newTranscript, transcript, offset);
 
     // copy in the new allele
     memcpy(&newTranscript[offset], newAlleleSeq, alleleLength);
 
     // copy in the new allele
     memcpy(&newTranscript[offset + alleleLength], restOfTranscript, 
 	strlen(restOfTranscript) + 1);
     }
 
 return newTranscript;
 }
 
 char *gpFxModifySequence(struct allele *allele, struct genePred *pred, 
-    int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence)
-/* modify a transcript to what it'd be if the alternate allele were present */
+			 int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
+			 struct genePred **retNewPred)
+/* modify a transcript to what it'd be if the alternate allele were present;
+ * if transcript length changes, make retNewPred a shallow copy of pred w/tweaked coords. */
 {
 // clip allele to exon
 struct allele *clipAllele = alleleClip(allele, transcriptPsl->tStarts[exonNum], 
 	transcriptPsl->tStarts[exonNum] + transcriptPsl->blockSizes[exonNum]);
 
 // change transcript at variant point
 int exonOffset = clipAllele->variant->chromStart - transcriptPsl->tStarts[exonNum];
 int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset;
 
 int variantWidth = clipAllele->variant->chromEnd - clipAllele->variant->chromStart;
 
+if (variantWidth != clipAllele->length && retNewPred != NULL)
+    {
+    // OK, now we have to make a new genePred that matches the changed sequence
+    int basesAdded = clipAllele->length - variantWidth;
+    struct genePred *newPred = CloneVar(pred);
+    int alEnd = clipAllele->variant->chromEnd;
+    if (newPred->cdsStart > alEnd)
+	newPred->cdsStart += basesAdded;
+    if (newPred->cdsEnd > alEnd)
+	newPred->cdsEnd += basesAdded;
+    newPred->exonEnds[exonNum] += basesAdded;
+    int ii;
+    for (ii = exonNum+1;  ii < pred->exonCount;  ii++)
+	{
+	newPred->exonStarts[ii] += basesAdded;
+	newPred->exonEnds[ii] += basesAdded;
+	}
+    newPred->txEnd = newPred->exonEnds[newPred->exonCount - 1];
+    *retNewPred = newPred;
+    }
+
 char *retSequence = cloneString(transcriptSequence->dna);
 char *newAlleleSeq = cloneString(clipAllele->sequence);
 if (*pred->strand == '-')
     {
     transcriptOffset = transcriptSequence->size - (transcriptOffset + strlen(newAlleleSeq));
     reverseComplement(newAlleleSeq, strlen(newAlleleSeq));
     }
 
 // make the change in the sequence
-retSequence = mergeAllele( retSequence, transcriptOffset, variantWidth, newAlleleSeq, allele->length);
+retSequence = mergeAllele( retSequence, transcriptOffset, variantWidth, newAlleleSeq,
+			   clipAllele->length);
 
 // clean up
 freeMem(newAlleleSeq);
 
 return retSequence;
 }
 
 static char *getCodingSequence(struct genePred *pred, char *transcriptSequence)
 /* extract the CDS from a transcript */
 {
 int ii;
 
 if (*pred->strand == '-')
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
 
 // trim off the 5' UTR ( or 3' if on the minus strand)
 char *ptr = transcriptSequence;
 for(ii=0; ii < pred->exonCount; ii++)
     {
     int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii];
     if ((pred->cdsStart >= pred->exonStarts[ii]) &&
 	(pred->cdsStart < pred->exonEnds[ii]))
 	break;
 
     ptr += exonSize;
     }
 assert (ii != pred->exonCount);
 
 // clip off part of UTR in exon that has CDS in it too
 int exonOffset = pred->cdsStart - pred->exonStarts[ii];
 ptr += exonOffset;
 
 char *newString = cloneString(ptr);
 
 // trim off 3' (or 5' if on minus strand)
 newString[genePredCdsSize(pred)] = 0;
 
 // correct for strand
 if (*pred->strand == '-')
     {
     reverseComplement(transcriptSequence, strlen(transcriptSequence));
     reverseComplement(newString, strlen(newString));
     }
 
 return newString;
 }
 
 static int firstChange(char *string1, char *string2, int *numDifferent)
 /* return the position of the first difference between the two sequences */
 /* if numDifferent is non-NULL, return number of characters between first 
  * difference, and the last difference */
 {
 int count = 0;
 
 for(; *string1 == *string2; count++, string1++, string2++)
     ;
 
 int firstChange = count;
 int lastChange = firstChange;
 
 if (numDifferent != NULL)
     {
     for (; (*string1) && (*string2); string1++, string2++, count++)
 	{
 	if (*string1 != *string2)
 	    lastChange = count;
 	}
     *numDifferent = lastChange - firstChange + 1;
     }
 
 return firstChange;
 }
 
 static void getSequences(struct genePred *pred, char *transcriptSequence,
     char **codingSequence,  aaSeq **aaSeq)
 /* get coding sequences for a transcript and a variant transcript */
 {
 *codingSequence = getCodingSequence(pred, transcriptSequence);
 struct dnaSeq *codingDna = newDnaSeq(*codingSequence, strlen(*codingSequence), NULL);
-*aaSeq = translateSeq(codingDna, 0, FALSE);
-freez(codingDna);
-}
-
-static char *
-codonChangeString(int codonPos, int numCodons, char *oldCodingSequence, char *newCodingSequence)
-/* generate string that describes a codon change */
-{
-struct dyString *dy = newDyString(100);
-int ii;
-
-for(ii=0; ii < numCodons; ii++)
-    {
-    dyStringPrintf(dy, "%c%c%c > %c%c%c",
-	toupper(oldCodingSequence[codonPos + 0]),
-	toupper(oldCodingSequence[codonPos + 1]),
-	toupper(oldCodingSequence[codonPos + 2]),
-	toupper(newCodingSequence[codonPos + 0]),
-	toupper(newCodingSequence[codonPos + 1]),
-	toupper(newCodingSequence[codonPos + 2]));
-    if (ii < numCodons - 1)
-	dyStringPrintf(dy, ",");
-    codonPos += 3;
-    }
-
-return dy->string;
-}
-
-static char *
-aaChangeString(int pepPosition, int numaa, char *oldaa, char *newaa)
-/* generate string that describes an amino acid change */
+*aaSeq = translateSeq(codingDna, 0, FALSE); // TRUE truncates at stop, but doesn't include the Z
+char *stop = strchr((*aaSeq)->dna, 'Z');
+if (stop != NULL)
 {
-struct dyString *dy = newDyString(100);
-int ii;
-
-for(ii=0; ii < numaa; ii++)
-    {
-    dyStringPrintf(dy, "%c > %c", 
-	toupper(oldaa[pepPosition+ii]), toupper(newaa[pepPosition+ii]));
-    if (ii < numaa - 1)
-	dyStringPrintf(dy, ",");
+    *(stop+1) = '\0';
+    // If aaSeq was truncated by a stop codon, truncate codingSequence as well:
+    int codingLength = 3*strlen((*aaSeq)->dna);
+    (*codingSequence)[codingLength] = '\0';
     }
-
-return dy->string;
+freez(codingDna);
 }
 
 struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
     char *newSequence)
 /* check for effects in UTR of coding gene */
 {
+//#*** positiveRangeInt doesn't work for 0-length insertion variants
 if (positiveRangeIntersection(pred->txStart, pred->cdsStart,
 	allele->variant->chromStart, allele->variant->chromEnd))
     {
     // we're in 5' UTR ( or UTR intron )
     //errAbort("don't support variants in 5' UTR");
     }
 
 if (positiveRangeIntersection(pred->txStart, pred->cdsStart,
 	allele->variant->chromStart, allele->variant->chromEnd))
     {
     // we're in 3' UTR
     //errAbort("don't support variants in 3' UTR");
     }
 
 return NULL;
 }
 
 struct gpFx *gpFxChangedNoncodingTranscript( struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
-    char *newSequence)
+    char *newSequence, struct genePred *newPred)
 /* generate an effect for a variant in a non-coding transcript */
 {
 return NULL;
 //    errAbort("found a change in non-coding gene. we don't support non-coding genes at the moment");
 }
 
 struct gpFx *gpFxChangedCodingTranscript( struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
-    char *newSequence)
+    char *newSequence, struct genePred *newPred)
 /* calculate effect of allele change on coding transcript */
 {
 struct gpFx *effectsList = NULL;
 
 // first find effects of allele in UTR
 effectsList = gpFxCheckUtr(allele, pred, exonNum, transcriptPsl, 
     transcriptSequence, newSequence);
 
 // check to see if coding sequence is changed
 // calculate original and variant coding AA's
 char *oldCodingSequence, *newCodingSequence;
 aaSeq *oldaa, *newaa;
 
 getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa);
-getSequences(pred, newSequence, &newCodingSequence, &newaa);
+getSequences(newPred, newSequence, &newCodingSequence, &newaa);
 
 // if coding region hasn't changed, we're done
 if (sameString(oldCodingSequence, newCodingSequence))
     return effectsList;
 
-// start allocating the effect structure
-struct gpFx *effects;
-AllocVar(effects);
+// start allocating the effect structure - fill in soNumber below
+struct gpFx *effects = gpFxNew(allele->sequence, pred->name, 0);
 slAddHead(&effectsList, effects);
 
 struct codingChange *cc = &effects->so.sub.codingChange;
-cc->transcript = cloneString(pred->name);
 int dnaChangeLength;
 cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna, &dnaChangeLength);
 int cdsChangeLength;
 cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence, &cdsChangeLength);
 if (*pred->strand == '-')
     cc->exonNumber = pred->exonCount - exonNum;
 else
     cc->exonNumber = exonNum;
 
-// calc codon change
+// show specific coding changes by making before-and-after subsequences:
 int codonPosStart = (cc->cdsPosition / 3) ;
 int codonPosEnd = ((cdsChangeLength - 1 + cc->cdsPosition) / 3) ;
 int numCodons = (codonPosEnd - codonPosStart + 1);
-cc->codonChanges = codonChangeString( codonPosStart*3, numCodons, oldCodingSequence, newCodingSequence);
-// by convention we zero out these fields if they aren't used
-cc->pepPosition = 0;
-cc->aaChanges = "";
+cc->codonOld = cloneStringZ(oldCodingSequence + codonPosStart*3, numCodons*3);
+cc->codonNew = cloneStringZ(newCodingSequence + codonPosStart*3, numCodons*3);
+
+cc->pepPosition = codonPosStart;
+cc->aaOld = cloneStringZ(oldaa->dna + cc->pepPosition, numCodons);
+cc->aaNew = cloneStringZ(newaa->dna + cc->pepPosition, numCodons);
 
 int numDashes;
 if (sameString(newaa->dna, oldaa->dna))
     {
     // synonymous change
     effects->so.soNumber = synonymous_variant;
     }
 else
     {
-    int numDifferent;
-    cc->pepPosition = firstChange( newaa->dna, oldaa->dna, &numDifferent);
-    cc->aaChanges = aaChangeString( cc->pepPosition, numDifferent, 
-	oldaa->dna, newaa->dna);
-
     // this is assuming that deletions are marked with dashes
     // in newCodingSequence
     if ((numDashes = countDashes(newCodingSequence)) != 0)
 	{
 	if ((numDashes % 3) == 0)
 	    effects->so.soNumber = inframe_deletion;
 	else
 	    effects->so.soNumber = frameshift_variant;
 	}
-    else if (strlen(newaa->dna) != strlen(oldaa->dna)) 
+    else if (newaa->size < oldaa->size)
+	{
+	effects->so.soNumber = stop_gained;
+	}
+    else if (newaa->size != oldaa->size)
 	{
 	effects->so.soNumber = inframe_insertion;
 	}
     else
 	{
-	// non-synonymous change
-	effects->so.soNumber = non_synonymous_variant;
+	char *stopPos = strchr(newaa->dna, 'Z');
+	// non-synonymous change: nonsense (early stop codon) or missense change?
+	if (stopPos != NULL && stopPos < newaa->dna + newaa->size - 1)
+	    effects->so.soNumber = stop_gained;
+	else
+	    effects->so.soNumber = missense_variant;
 	}
     }
 
 return effectsList;
 }
 
 
 struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, 
     int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence)
 /* generate an effect from a different allele in an exon */
 {
+struct genePred *newPred = pred;
 char *newSequence = gpFxModifySequence(allele, pred, exonNum,
-	transcriptPsl, transcriptSequence);
+				       transcriptPsl, transcriptSequence, &newPred);
 
 if (sameString(newSequence, transcriptSequence->dna))
     return NULL;  // no change in transcript
 
 struct gpFx *effectsList;
 if (pred->cdsStart != pred->cdsEnd)
     effectsList = gpFxChangedCodingTranscript( allele, pred, exonNum, transcriptPsl, 
-	transcriptSequence, newSequence);
+					       transcriptSequence, newSequence, newPred);
 else
     effectsList = gpFxChangedNoncodingTranscript( allele, pred, exonNum, transcriptPsl, 
-	transcriptSequence, newSequence);
+						  transcriptSequence, newSequence, newPred);
 
+if (newPred != pred)
+    freeMem(newPred);
 return effectsList;
 }
 
 static struct psl *genePredToPsl(struct genePred *pred)
 /* generate a PSL alignment to a transcript from a genePred */
 {
 int qSize = genePredBases(pred);
 #define BOGUS_CHROM_SIZE  0
 struct psl *psl = pslNew(pred->name, qSize, 0, qSize,
                          pred->chrom, BOGUS_CHROM_SIZE, 
 			 pred->txStart, pred->txEnd,
 			 pred->strand, pred->exonCount, 0);
 psl->match = psl->qSize;
 
 int i, qNext = 0;
 for (i = 0; i < pred->exonCount; i++)
     {
     int exonSize =  pred->exonEnds[i] - pred->exonStarts[i];
 
     psl->qStarts[i] = qNext;
     //psl->tStarts[i] = pred->exonStarts[i] + pred->txStart;
     psl->tStarts[i] = pred->exonStarts[i];
     psl->blockSizes[i] = exonSize;
 
 /* notnow
     if (i > 0)
         {
         psl->tNumInsert += 1;
         psl->tBaseInsert += psl->tStarts[i] - pslTEnd(psl, i-1);
         }
 */
 
     psl->blockCount++;
     qNext += psl->blockSizes[i];
     }
 
 return psl;
 }
 
 static struct gpFx *gpFxCheckExons(struct variant *variant, 
     struct genePred *pred, struct dnaSeq *transcriptSequence)
 // check to see if the variant is in an exon
 {
 int ii;
 struct gpFx *effectsList = NULL;
 struct psl *transcriptPsl = genePredToPsl(pred);
 
 // should copy transcriptSequence if we have more than one variant!!
 for(; variant ; variant = variant->next)
     {
     struct allele *allele = variant->alleles;
     for(; allele ; allele = allele->next)
 	{
 	for(ii=0; ii < pred->exonCount; ii++)
 	    {
 	    // check if in an exon
 	    int size;
 	    int end = variant->chromEnd;
 	    if (variant->chromStart == end)
 		end++;
 	    if ((size = positiveRangeIntersection(pred->exonStarts[ii], 
 		pred->exonEnds[ii], 
 		variant->chromStart, end)) > 0)
 		{
-
 		if (size != end - variant->chromStart)
-		    errAbort("variant crosses exon boundary");
-
+		    {
+		    struct gpFx *effect = gpFxNew(allele->sequence, pred->name,
+						  complex_transcript_variant);
+		    effectsList = slCat(effectsList, effect);
+		    }
 		effectsList = slCat(effectsList, gpFxInExon(allele, pred, ii,
 		    transcriptPsl, transcriptSequence));
 		}
 	    }
 	}
     }
 
 return effectsList;
 }
 
 static struct gpFx *gpFxCheckIntrons(struct variant *variant, 
     struct genePred *pred)
 // check to see if a single variant is in an exon or an intron
 {
 int ii;
 struct gpFx *effectsList = NULL;
 
 for(ii=0; ii < pred->exonCount - 1; ii++)
     {
     // check if in intron
-    if (positiveRangeIntersection(pred->exonEnds[ii], 
-	    pred->exonStarts[ii+1],
-	    variant->chromStart, variant->chromEnd))
-	{
-	struct gpFx *effects;
-	AllocVar(effects);
-	effects->so.soNumber = intron_variant;
-	effects->so.sub.intron.transcript = cloneString(pred->name);
+    if (variant->chromEnd > pred->exonEnds[ii] &&
+	variant->chromStart < pred->exonStarts[ii+1])
+	{
+	struct gpFx *effects = gpFxNew("", pred->name, intron_variant);
 	effects->so.sub.intron.intronNumber = ii;
 	slAddHead(&effectsList, effects);
 	}
     }
 
 return effectsList;
 }
 
 
 static struct gpFx *gpFxCheckBackground(struct variant *variant, 
     struct genePred *pred)
 // check to see if the variant is up or downstream or in intron of the gene 
 {
-struct gpFx *effectsList = NULL, *effects;
+struct gpFx *effectsList = NULL;
 
 for(; variant ; variant = variant->next)
     {
     // is this variant in an intron
     effectsList = slCat(effectsList, gpFxCheckIntrons(variant, pred));
 
-    if (positiveRangeIntersection(pred->txStart - GPRANGE, pred->txStart,
-	    variant->chromStart, variant->chromEnd))
+    // Is this variant to the left or right of transcript?
+    enum soTerm soNumber = 0;
+    if (variant->chromStart < pred->txStart &&
+	variant->chromEnd > pred->txStart - GPRANGE)
 	{
-	AllocVar(effects);
 	if (*pred->strand == '+')
-	    ;//effects->gpFxType = gpFxUpstream;
+	    soNumber = upstream_gene_variant;
 	else
-	    ;//ffects->gpFxType = gpFxDownstream;
-	effectsList = slCat(effectsList, effects);
+	    soNumber = downstream_gene_variant;
 	}
-
-    if (positiveRangeIntersection(pred->txEnd, pred->txEnd + GPRANGE,
-	    variant->chromStart, variant->chromEnd))
+    else if (variant->chromEnd > pred->txEnd &&
+	     variant->chromStart < pred->txEnd + GPRANGE)
 	{
-	AllocVar(effects);
 	if (*pred->strand == '+')
-	    ;//ffects->gpFxType = gpFxDownstream;
+	    soNumber = downstream_gene_variant;
 	else
-	    ;//ffects->gpFxType = gpFxUpstream;
+	    soNumber = upstream_gene_variant;
+	}
+    if (soNumber != 0)
+	{
+	struct gpFx *effects = gpFxNew("", pred->name, soNumber);
 	effectsList = slCat(effectsList, effects);
 	}
     }
 
 return effectsList;
 }
 
 static void checkVariantList(struct variant *variant)
 // check to see that we either have one variant (possibly with multiple
 // alleles) or that if we have a list of variants, they only have
 // one allele a piece.
 {
 if (variant->next == NULL)	 // just one variant
     return;
 
 for(; variant; variant = variant->next)
     if (variant->numAlleles != 1)
 	errAbort("gpFxPredEffect needs either 1 variant, or only 1 allele in all variants");
 }
 
 struct gpFx *gpFxPredEffect(struct variant *variant, struct genePred *pred,
     struct dnaSeq *transcriptSequence)
 // return the predicted effect(s) of a variation list on a genePred
 {
 struct gpFx *effectsList = NULL;
 
 // make sure we can deal with the variants that are coming in
 checkVariantList(variant);
 
 // check to see if SNP is up or downstream in intron 
 effectsList = slCat(effectsList, gpFxCheckBackground(variant, pred));
 
 // check to see if SNP is in the transcript
 effectsList = slCat(effectsList, gpFxCheckExons(variant, pred, 
     transcriptSequence));
 
-if (effectsList != NULL)
     return effectsList;
-
-// default is no effect
-struct gpFx *noEffect;
-
-AllocVar(noEffect);
-noEffect->next = NULL;
-;//oEffect->gpFxType = gpFxNone;
-
-return noEffect;
 }