904b4e4ad971a5be51a26dee4809efbe9c7b6430
angie
  Thu Aug 10 13:22:26 2017 -0700
Support XM_/XR_/XP_ accs in HGVS position search.  Generate correct HGVS p.*ext terms for stop codon loss.  refs #19968

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 77fc0eb..e00c269 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -30,35 +30,35 @@
 #define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])"
 #define lrgRegionExp "(LRG_[0-9+])"
 
 // NC = RefSeq reference assembly chromosome
 // NG = RefSeq incomplete genomic region (e.g. gene locus)
 // NM = RefSeq curated mRNA
 // NP = RefSeq curated protein
 // NR = RefSeq curated (non-coding) RNA
 #define geneSymbolExp "([A-Za-z0-9./_-]+)"
 #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?"
 #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp
 //                                 ........................       accession and optional dot version
 //                                              ...........       optional dot version
 //                                                         ...... optional gene symbol in ()s
 //                                                          ....  optional gene symbol
-#define versionedRefSeqNCExp versionedAccPrefixExp("NC")
-#define versionedRefSeqNGExp versionedAccPrefixExp("NG")
-#define versionedRefSeqNMExp versionedAccPrefixExp("NM")
-#define versionedRefSeqNPExp versionedAccPrefixExp("NP")
-#define versionedRefSeqNMRExp versionedAccPrefixExp("N[MR]")
+#define versionedRefSeqNCExp versionedAccPrefixExp("[NX]C")
+#define versionedRefSeqNGExp versionedAccPrefixExp("[NX]G")
+#define versionedRefSeqNMExp versionedAccPrefixExp("[NX]M")
+#define versionedRefSeqNPExp versionedAccPrefixExp("[NX]P")
+#define versionedRefSeqNMRExp versionedAccPrefixExp("[NX][MR]")
 
 // Nucleotide position regexes
 // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA)
 #define posIntExp "([0-9]+)"
 #define hgvsGenoPosExp posIntExp "(_" posIntExp ")?"
 //                   ......                     1-based start position
 //                           .............      optional range separator and end position
 //                               ......         1-based end position
 // n. terms can have exonic anchor base and intron offset for both start and end:
 #define offsetExp "([-+])"
 // c. terms may also have a UTR indicator before the anchor base (- for UTR5, * for UTR3)
 #define anchorExp "([-*])?"
 #define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?"
 #define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?"
 //                    ...                               optional UTR anchor "-" or "*"
@@ -1324,31 +1324,32 @@
     pslFree(&variantPsl);
     pslFree(&txAli);
     }
 sqlFreeResult(&sr);
 hFreeConn(&conn);
 return mappedToGenome;
 }
 
 static char *pslTableForAcc(char *db, char *acc)
 /* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where
  * acc should be found (table may or may not exist).  Don't free the returned string. */
 {
 char *pslTable = NULL;
 if (startsWith("LRG_", acc))
     pslTable = "lrgTranscriptAli";
-else if (startsWith("NM_", acc) || startsWith("NR_", acc))
+else if (startsWith("NM_", acc) || startsWith("NR_", acc) ||
+         startsWith("XM_", acc) || startsWith("XR_", acc))
     {
     // Use NCBI's alignments if they are available
     if (hDbHasNcbiRefSeq(db))
         pslTable = "ncbiRefSeqPsl";
     else
         pslTable = "refSeqAli";
     }
 return pslTable;
 }
 
 #define limitToRange(val, min, max) { if (val < min) { val = min; }  \
                                       if (val > max) { val = max; } }
 
 static struct bed *pslAndFriendsToRegion(struct psl *psl, struct hgvsVariant *hgvs,
                                           int upstream, int downstream)
@@ -1429,31 +1430,31 @@
 if (region)
     {
     adjustInsStartEnd(hgvs, &region->chromStart, &region->chromEnd);
     if (retPslTable)
         *retPslTable = cloneString(pslTable);
     }
 return region;
 }
 
 static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 struct bed *region = NULL;
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
-if (acc && startsWith("NP_", acc))
+if (acc && (startsWith("NP_", acc) || startsWith("XP_", acc)))
     {
     // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome.
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
     if (hDbHasNcbiRefSeq(db))
         sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'",
                  acc);
     else if (hTableExists(db, "refGene"))
         sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
                  "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
     else
         return NULL;
     char *nmAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     if (nmAcc)
@@ -2538,73 +2539,100 @@
 return dyStringCannibalize(&dy);
 }
 
 char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens)
 /* Return an HGVS p. (protein) term for a variant projected into protein space.
  * Strict HGVS compliance requires parentheses around predicted protein changes, but
  * nobody seems to do that in practice.
  * Return NULL if an input is NULL. */
 {
 if (vpPep == NULL || protSeq == NULL)
     return NULL;
 struct dyString *dy = dyStringCreate("%s:p.", vpPep->name);
 if (addParens)
     dyStringAppendC(dy, '(');
 int refLen = vpPep->end - vpPep->start;
+// When predicting frameshift/extension, the length of ref may be different from refLen
+int refExtLen = vpPep->ref ? strlen(vpPep->ref) : refLen;
 int altLen = vpPep->alt ? strlen(vpPep->alt) : 0;
-char *pSeq = protSeq->dna;
 char refStartAbbr[4];
-aaToAbbr(pSeq[vpPep->start], refStartAbbr, sizeof(refStartAbbr));
+if (vpPep->ref)
+    aaToAbbr(vpPep->ref[0], refStartAbbr, sizeof(refStartAbbr));
+else
+    // If ref is null then we should be writing just '=' or '?' but prevent garbage just in case:
+    safecpy(refStartAbbr, sizeof(refStartAbbr), "?");
+// protSeq may or may not end with X, so treat protSeq->size accordingly
+boolean hitsStopCodon = (vpPep->end > protSeq->size ||
+                         ((protSeq->dna[protSeq->size-1] == 'X') && vpPep->end == protSeq->size));
 if (vpPep->likelyNoChange)
     dyStringAppend(dy, "=");
 else if (vpPep->cantPredict || vpPep->spansUtrCds)
     dyStringAppend(dy, "?");
 else if (vpPep->frameshift)
     {
+    dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1);
     if (altLen == 1)
-        dyStringPrintf(dy, "%s%dTer", refStartAbbr, vpPep->start+1);
+        dyStringAppend(dy, "Ter");
     else
         {
         char altStartAbbr[4];
         aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr));
-        dyStringPrintf(dy, "%s%d%sfsTer%d", refStartAbbr, vpPep->start+1, altStartAbbr, altLen);
+        // For stop-loss extension, make it "ext*"
+        if (hitsStopCodon && altLen > refExtLen)
+            dyStringPrintf(dy, "%sext*%d", altStartAbbr, altLen - refExtLen);
+        else
+            dyStringPrintf(dy, "%sfsTer%d", altStartAbbr, altLen);
         }
     }
+else if (hitsStopCodon && altLen > refExtLen)
+    {
+    // Stop loss extension from something that doesn't disrupt frame
+    char altStartAbbr[4];
+    aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr));
+    dyStringPrintf(dy, "%s%d%sext*%d", refStartAbbr, vpPep->start+1,
+                   altStartAbbr, altLen - refExtLen);
+    }
 else
     {
     int dupLen = 0;
     if (refLen == 0 && altLen > 0)
         {
         // It's an insertion; is it a duplication?
         struct seqWindow *pSeqWin = memSeqWindowNew(protSeq->name, protSeq->dna);
         dupLen = findDup(vpPep->alt, pSeqWin, vpPep->start, FALSE);
         memSeqWindowFree(&pSeqWin);
         }
     if (refLen == 1)
         dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1);
     else
         {
         int rangeStart = vpPep->start, rangeEnd = vpPep->end;
+        char *pSeq = protSeq->dna;
         if (dupLen > 0)
             {
             // Duplication; position range changes to preceding bases.
             rangeEnd = rangeStart;
             rangeStart -= dupLen;
             aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr));
             }
         else if (refLen == 0)
             {
             // Insertion; expand to two-AA range around insertion point
             rangeStart--;
             aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr));
             rangeEnd++;
             }
+        hitsStopCodon = (rangeEnd > protSeq->size ||
+                         ((protSeq->dna[protSeq->size-1] == 'X') && rangeEnd == protSeq->size));
         char refLastAbbr[4];
+        if (hitsStopCodon)
+            aaToAbbr('X', refLastAbbr, sizeof(refLastAbbr));
+        else
             aaToAbbr(pSeq[rangeEnd-1], refLastAbbr, sizeof(refLastAbbr));
         dyStringPrintf(dy, "%s%d_%s%d", refStartAbbr, rangeStart+1, refLastAbbr, rangeEnd);
         }
     hgvsAppendChangesFromPepRefAlt(dy, vpPep->ref, vpPep->alt, dupLen);
     }
 if (addParens)
     dyStringAppendC(dy, ')');
 return dyStringCannibalize(&dy);
 }