26d6cd9597d40b55635b1fbacc8dd64978887474
angie
  Mon Jun 17 10:51:53 2013 -0700
As Jonathan pointed out in note 46, Ensembl VEP always shows a/the alternate allele,even when the allele isn't used to make the functional prediction -- follow suit.
refs #6152

diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index a75cd3c..ed25514 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -268,41 +268,42 @@
 			   int exonNum, int cDnaPosition, struct lm *lm)
 /* check for effects in UTR of coding gene -- caller ensures it's in exon, pred is coding
  * and exonNum has been strand-adjusted */
 {
 struct gpFx *gpFx = NULL;
 enum soTerm term = 0;
 struct variant *variant = allele->variant;
 if (variant->chromStart < pred->cdsStart && variant->chromEnd > pred->txStart)
     // we're in left UTR
     term = (*pred->strand == '-') ? _3_prime_UTR_variant : _5_prime_UTR_variant;
 else if (variant->chromStart < pred->txEnd && variant->chromEnd > pred->cdsEnd)
     // we're in right UTR
     term = (*pred->strand == '-') ? _5_prime_UTR_variant : _3_prime_UTR_variant;
 if (term != 0)
     {
-    gpFx = gpFxNew("", pred->name, term, nonCodingExon, lm);
+    gpFx = gpFxNew(allele->sequence, pred->name, term, nonCodingExon, lm);
     setNCExonVals(gpFx, exonNum, cDnaPosition);
     }
 return gpFx;
 }
 
 struct gpFx *gpFxChangedNoncodingExon(struct allele *allele, struct genePred *pred,
 				      int exonNum, int cDnaPosition, struct lm *lm)
 /* generate an effect for a variant in a non-coding transcript */
 {
-struct gpFx *gpFx = gpFxNew("", pred->name, non_coding_exon_variant, nonCodingExon, lm);
+struct gpFx *gpFx = gpFxNew(allele->sequence, pred->name, non_coding_exon_variant, nonCodingExon,
+			    lm);
 if (*pred->strand == '-')
     exonNum = pred->exonCount - exonNum - 1;
 setNCExonVals(gpFx, exonNum, cDnaPosition);
 return gpFx;
 }
 
 boolean isSafeFromNMD(int exonNum, struct variant *variant, struct genePred *pred)
 /* Return TRUE if variant in strand-corrected exonNum is in the region
  * of pred that would make it safe from Nonsense-Mediated Decay (NMD).
  * NMD is triggered by the presence of a stop codon that appears
  * before ~50bp before the end of the last exon.  In other words, if
  * there's a stop codon with a detectable downstream splice junction,
  * translation is prevented -- see
  * http://en.wikipedia.org/wiki/MRNA_surveillance#Mechanism_in_mammals */
 {
@@ -402,31 +403,31 @@
 				   int cdsPosition, int cdsBasesAdded, struct lm *lm)
 /* calculate effect of allele change on coding transcript */
 {
 struct gpFx *effectsList = NULL;
 if (*pred->strand == '-')
     exonNum = pred->exonCount - exonNum - 1;
 
 // first find effects of allele in UTR, if any
 effectsList = gpFxCheckUtr(allele, pred, exonNum, cDnaPosition, lm);
 
 // calculate original and variant coding DNA and AA's
 char *oldCodingSequence, *newCodingSequence, *oldaa, *newaa;
 getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa, lm);
 getSequences(newPred, newSequence, &newCodingSequence, &newaa, lm);
 
-// if coding sequence hasn't changed, we're done (allele == reference allele)
+// if coding sequence hasn't changed (just UTR), we're done
 if (sameString(oldCodingSequence, newCodingSequence))
     return effectsList;
 
 // allocate the effect structure - fill in soNumber and details below
 struct gpFx *effect = gpFxNew(allele->sequence, pred->name, coding_sequence_variant, codingChange,
 			      lm);
 struct codingChange *cc = &effect->details.codingChange;
 cc->cDnaPosition = cDnaPosition;
 cc->cdsPosition = cdsPosition;
 cc->exonNumber = exonNum;
 int pepPos = cdsPosition / 3;
 cc->pepPosition = pepPos;
 if (cdsBasesAdded % 3 == 0)
     {
     // Common case: substitution, same number of old/new codons/peps:
@@ -455,195 +456,232 @@
     cc->codonNew = lmCloneString(lm, newCodingSequence + pepPos*3);
     cc->aaOld = lmCloneString(lm, oldaa + pepPos);
     cc->aaNew = lmCloneString(lm, newaa + pepPos);
     }
 
 // show specific coding changes by making before-and-after subsequences:
 
 boolean safeFromNMD = isSafeFromNMD(exonNum, allele->variant, pred);
 setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded, safeFromNMD);
 
 slAddHead(&effectsList, effect);
 return effectsList;
 }
 
 
-struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, int exonNum,
+struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, char *refAllele, int exonNum,
 			int exonQStart, struct dnaSeq *transcriptSequence, struct lm *lm)
 /* generate an effect from a different allele in an exon */
 {
 struct genePred *newPred = pred;
 int cDnaPosition = 0, basesAdded = 0, cdsPosition = 0, cdsBasesAdded = 0;
 char *newSequence = gpFxModifySequence(allele, pred, exonNum, exonQStart,
 				       transcriptSequence, &newPred, &cDnaPosition, &basesAdded,
 				       &cdsPosition, &cdsBasesAdded, lm);
 
 if (sameString(newSequence, transcriptSequence->dna))
-    return NULL;  // no change in transcript
+    {
+    struct variant *variant = allele->variant;
+    errAbort("gpFxInExon: %s:%d-%d, allele='%s', refAllele='%s', no change, shouldn't be here",
+	     variant->chrom, variant->chromStart+1, variant->chromEnd, allele->sequence,
+	     refAllele);
+    return NULL;  // no change in transcript -- but allele should always be non-reference
+    }
 
 struct gpFx *effectsList;
 if (pred->cdsStart != pred->cdsEnd)
     effectsList = gpFxChangedCodingExon(allele, pred, exonNum, transcriptSequence,
 					newSequence, newPred, cDnaPosition, basesAdded,
 					cdsPosition, cdsBasesAdded, lm);
 else
     {
     effectsList = gpFxChangedNoncodingExon(allele, pred, exonNum, cDnaPosition, lm);
     }
 
 return effectsList;
 }
 
+static boolean hasAltAllele(struct allele *alleles, char *refAllele)
+/* Make sure there's something to work on here... */
+{
+while (alleles != NULL && sameString(alleles->sequence, refAllele))
+    alleles = alleles->next;
+return (alleles != NULL);
+}
+
+static char *firstAltAllele(struct allele *alleles, char *refAllele)
+/* Ensembl always reports an alternate allele, even if that allele is not being used
+ * to calculate any consequence.  When allele doesn't really matter, just use the
+ * first alternate allele that is given. */
+{
+while (alleles != NULL && sameString(alleles->sequence, refAllele))
+    alleles = alleles->next;
+if (alleles == NULL)
+    errAbort("firstAltAllele: no alt allele in list");
+return alleles->sequence;
+}
+
 static struct gpFx *gpFxCheckExons(struct variant *variantList, struct genePred *pred,
-				   struct dnaSeq *transcriptSequence, struct lm *lm)
+				   char *refAllele, struct dnaSeq *transcriptSequence,
+				   struct lm *lm)
 // check to see if the variant is in an exon
 {
 struct gpFx *effectsList = NULL;
 int exonQStart = 0;
 int ii;
 for (ii=0; ii < pred->exonCount; ii++)
     {
     struct variant *variant = variantList;
     for(; variant ; variant = variant->next)
 	{
 	// check if in an exon
 	int end = variant->chromEnd;
 	if (variant->chromStart == end)
 	    end++;
 	int size;
 	if ((size = positiveRangeIntersection(pred->exonStarts[ii], pred->exonEnds[ii], 
 					      variant->chromStart, end)) > 0)
 	    {
 	    if (size != end - variant->chromStart)
 		{
 		// variant is not completely in exon
-		struct gpFx *effect = gpFxNew("", pred->name,
-					      complex_transcript_variant, none, lm);
+		char *altAllele = firstAltAllele(variant->alleles, refAllele);
+		struct gpFx *effect = gpFxNew(altAllele, pred->name, complex_transcript_variant,
+					      none, lm);
 		effectsList = slCat(effectsList, effect);
 		}
 	    struct allele *allele = variant->alleles;
 	    for(; allele ; allele = allele->next)
 		{
-		effectsList = slCat(effectsList, gpFxInExon(allele, pred, ii, exonQStart,
+		if (differentString(allele->sequence, refAllele))
+		    effectsList = slCat(effectsList,
+					gpFxInExon(allele, pred, refAllele, ii, exonQStart,
 							    transcriptSequence, lm));
 		}
 	    }
 	}
     exonQStart += (pred->exonEnds[ii] - pred->exonStarts[ii]);
     }
 
 return effectsList;
 }
 
-static struct gpFx *gpFxCheckIntrons(struct variant *variant, struct genePred *pred, struct lm *lm)
+static struct gpFx *gpFxCheckIntrons(struct variant *variant, struct genePred *pred,
+				     char *altAllele, struct lm *lm)
 // check to see if a single variant overlaps an intron (and possibly splice region)
 //#*** TODO: watch out for "introns" that are actually indels between tx seq and ref genome!
 {
 struct gpFx *effectsList = NULL;
 boolean minusStrand = (pred->strand[0] == '-');
 int ii;
 for (ii=0; ii < pred->exonCount - 1; ii++)
     {
     int intronStart = pred->exonEnds[ii];
     int intronEnd = pred->exonStarts[ii+1];
     if (variant->chromEnd > intronStart && variant->chromStart < intronEnd)
 	{
 	enum soTerm soNumber = intron_variant;
 	if (variant->chromEnd > intronStart && variant->chromStart < intronStart+2)
 	    // Within 2 bases of intron start(/end for '-'):
 	    soNumber = minusStrand ? splice_acceptor_variant : splice_donor_variant;
 	if (variant->chromEnd > intronEnd-2 && variant->chromStart < intronEnd)
 	    // Within 2 bases of intron end(/start for '-'):
 	    soNumber = minusStrand ? splice_donor_variant : splice_acceptor_variant;
 	else if ((variant->chromEnd > intronStart+3 && variant->chromStart < intronStart+8) ||
 		 (variant->chromEnd > intronEnd-8 && variant->chromStart < intronEnd+3))
 	    // Within 3 to 8 bases of intron start or end:
 	    soNumber = splice_region_variant;
-	struct gpFx *effects = gpFxNew("", pred->name, soNumber, intron, lm);
+	struct gpFx *effects = gpFxNew(altAllele, pred->name, soNumber, intron, lm);
 	effects->details.intron.intronNumber = minusStrand ? (pred->exonCount - ii - 1) : ii;
 	slAddHead(&effectsList, effects);
 	}
     else if ((variant->chromEnd > intronStart-3 && variant->chromStart < intronStart) ||
 	     (variant->chromEnd > intronEnd && variant->chromStart < intronEnd+3))
 	{
 	// if variant is in exon *but* within 3 bases of splice site,
 	// it also qualifies as splice_region_variant:
-	struct gpFx *effects = gpFxNew("", pred->name, splice_region_variant, intron, lm);
+	struct gpFx *effects = gpFxNew(altAllele, pred->name,
+				       splice_region_variant, intron, lm);
 	effects->details.intron.intronNumber = minusStrand ? (pred->exonCount - ii - 1) : ii;
 	slAddHead(&effectsList, effects);
 	}
     }
 return effectsList;
 }
 
 
 static struct gpFx *gpFxCheckBackground(struct variant *variant, struct genePred *pred,
-					struct lm *lm)
+					char *refAllele, struct lm *lm)
 // check to see if the variant is up or downstream or in intron of the gene 
 {
 struct gpFx *effectsList = NULL;
+char *altAllele = firstAltAllele(variant->alleles, refAllele);
 
 for(; variant ; variant = variant->next)
     {
     // is this variant in an intron
-    effectsList = slCat(effectsList, gpFxCheckIntrons(variant, pred, lm));
+    effectsList = slCat(effectsList, gpFxCheckIntrons(variant, pred, altAllele, lm));
 
     // Is this variant to the left or right of transcript?
     enum soTerm soNumber = 0;
     if (variant->chromStart < pred->txStart &&
 	variant->chromEnd > pred->txStart - GPRANGE)
 	{
 	if (*pred->strand == '+')
 	    soNumber = upstream_gene_variant;
 	else
 	    soNumber = downstream_gene_variant;
 	}
     else if (variant->chromEnd > pred->txEnd &&
 	     variant->chromStart < pred->txEnd + GPRANGE)
 	{
 	if (*pred->strand == '+')
 	    soNumber = downstream_gene_variant;
 	else
 	    soNumber = upstream_gene_variant;
 	}
     if (soNumber != 0)
 	{
-	struct gpFx *effects = gpFxNew("", pred->name, soNumber, none, lm);
+	struct gpFx *effects = gpFxNew(altAllele, pred->name, soNumber, none, lm);
 	effectsList = slCat(effectsList, effects);
 	}
     }
 
 return effectsList;
 }
 
 static void checkVariantList(struct variant *variant)
 // check to see that we either have one variant (possibly with multiple
 // alleles) or that if we have a list of variants, they only have
 // one allele a piece.
 {
 if (variant->next == NULL)	 // just one variant
     return;
 
 for(; variant; variant = variant->next)
     if (variant->numAlleles != 1)
 	errAbort("gpFxPredEffect needs either 1 variant, or only 1 allele in all variants");
 }
 
-struct gpFx *gpFxPredEffect(struct variant *variant, struct genePred *pred,
+struct gpFx *gpFxPredEffect(struct variant *variant, struct genePred *pred, char *refAllele,
 			    struct dnaSeq *transcriptSequence, struct lm *lm)
 // return the predicted effect(s) of a variation list on a genePred
 {
 struct gpFx *effectsList = NULL;
 
 // make sure we can deal with the variants that are coming in
 checkVariantList(variant);
 
-// check to see if SNP is up or downstream in intron 
-effectsList = slCat(effectsList, gpFxCheckBackground(variant, pred, lm));
+// If only the reference allele has been observed, skip it:
+if (! hasAltAllele(variant->alleles, refAllele))
+    return NULL;
+
+// check to see if SNP is up or downstream or in intron
+effectsList = slCat(effectsList, gpFxCheckBackground(variant, pred, refAllele, lm));
 
 // check to see if SNP is in the transcript
-effectsList = slCat(effectsList, gpFxCheckExons(variant, pred, transcriptSequence, lm));
+effectsList = slCat(effectsList, gpFxCheckExons(variant, pred, refAllele, transcriptSequence, lm));
 
-//#*** sort by position
+//#*** sort by transcript name?  transcript start?
 
 return effectsList;
 }