1ffd9a045c83c4b4a90493323e4dc3d3ac677f31
angie
  Fri Sep 22 16:07:10 2017 -0700
When start codon is lost, HGVS p. should be p.? because there may or may not be something that rescues the protein like a downstream in-frame start codon.

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 43fc4e5..ed54650 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -2547,55 +2547,63 @@
 struct dyString *dy = dyStringCreate("%s:c.", vpTx->txName);
 // Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup.
 struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end;
 int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq);
 appendHgvsNucPos(dy, &startPos, TRUE, cds);
 if (!vpTxPosRangeIsSingleBase(&startPos, &endPos))
     {
     dyStringAppendC(dy, '_');
     appendHgvsNucPos(dy, &endPos, FALSE, cds);
     }
 char *ref = refFromVpTx(vpTx);
 hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns);
 return dyStringCannibalize(&dy);
 }
 
+static boolean isStartLoss(struct vpPep *vpPep)
+/* Return TRUE if start codon is lost: vpPep->start is 0, ref begins with 'M' and alt is empty or does not. */
+{
+return (vpPep->start == 0 &&
+        isNotEmpty(vpPep->ref) && vpPep->ref[0] == 'M' &&
+        (isEmpty(vpPep->alt) || vpPep->alt[0] != 'M'));
+}
+
 char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens)
 /* Return an HGVS p. (protein) term for a variant projected into protein space.
  * Strict HGVS compliance requires parentheses around predicted protein changes, but
  * nobody seems to do that in practice.
  * Return NULL if an input is NULL. */
 {
 if (vpPep == NULL || protSeq == NULL)
     return NULL;
 struct dyString *dy = dyStringCreate("%s:p.", vpPep->name);
 if (addParens)
     dyStringAppendC(dy, '(');
 int refLen = vpPep->end - vpPep->start;
 // When predicting frameshift/extension, the length of ref may be different from refLen
 int refExtLen = vpPep->ref ? strlen(vpPep->ref) : refLen;
 int altLen = vpPep->alt ? strlen(vpPep->alt) : 0;
 char refStartAbbr[4];
 if (vpPep->ref)
     aaToAbbr(vpPep->ref[0], refStartAbbr, sizeof(refStartAbbr));
 else
     // If ref is null then we should be writing just '=' or '?' but prevent garbage just in case:
     safecpy(refStartAbbr, sizeof(refStartAbbr), "?");
 // protSeq may or may not end with X, so treat protSeq->size accordingly
 boolean hitsStopCodon = (vpPep->end > protSeq->size ||
                          ((protSeq->dna[protSeq->size-1] == 'X') && vpPep->end == protSeq->size));
-if (vpPep->cantPredict || vpPep->spansUtrCds)
+if (vpPep->cantPredict || vpPep->spansUtrCds || isStartLoss(vpPep))
     dyStringAppend(dy, "?");
 else if (vpPep->frameshift)
     {
     dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1);
     if (altLen == 1)
         dyStringAppend(dy, "Ter");
     else
         {
         char altStartAbbr[4];
         aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr));
         // For stop-loss extension, make it "ext*"
         if (hitsStopCodon && altLen > refExtLen)
             dyStringPrintf(dy, "%sext*%d", altStartAbbr, altLen - refExtLen);
         else
             dyStringPrintf(dy, "%sfsTer%d", altStartAbbr, altLen);