a3493ed2ff9a01552e8fe05540bd88b39541879d
angie
  Fri Sep 2 10:33:51 2016 -0700
Expand pseudo-HGVS regex to accept p. after gene name.

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index d1913fe..38ed891 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -1,531 +1,531 @@
 /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */
 /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */
 
 /* Copyright (C) 2016 The Regents of the University of California
  * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "hgHgvs.h"
 #include "genbank.h"
 #include "hdb.h"
 #include "pslTransMap.h"
 #include "regexHelper.h"
 
 // Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq:
 #define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])"
 #define lrgRegionExp "(LRG_[0-9+])"
 
 // NC = RefSeq reference assembly chromosome
 // NG = RefSeq incomplete genomic region (e.g. gene locus)
 // NM = RefSeq curated mRNA
 // NP = RefSeq curated protein
 // NR = RefSeq curated (non-coding) RNA
 #define geneSymbolExp "([A-Z0-9./_-]+)"
 #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?"
 #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp
 //                                 ........................       accession and optional dot version
 //                                              ...........       optional dot version
 //                                                         ...... optional gene symbol in ()s
 //                                                          ....  optional gene symbol
 #define versionedRefSeqNCExp versionedAccPrefixExp("NC")
 #define versionedRefSeqNGExp versionedAccPrefixExp("NG")
 #define versionedRefSeqNMExp versionedAccPrefixExp("NM")
 #define versionedRefSeqNPExp versionedAccPrefixExp("NP")
 #define versionedRefSeqNRExp versionedAccPrefixExp("NR")
 
 // Nucleotide substitution regexes
 // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA)
 #define hgvsNtSubstExp "([0-9]+)([ACGT]+)>([ACGT]+)"
 //                       ......                     1-based position
 //                               .......            original sequence
 //                                         .......  replacement sequence
 #define hgvsCDotSubstExp "c\\." hgvsNtSubstExp
 #define hgvsGDotSubstExp "g\\." hgvsNtSubstExp
 #define hgvsMDotSubstExp "m\\." hgvsNtSubstExp
 #define hgvsNDotSubstExp "n\\." hgvsNtSubstExp
 #define hgvsRDotSubstExp "r\\." hgvsNtSubstExp
 
 // Protein substitution regex
 #define AA3 "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter"
 #define hgvsAminoAcidExp "([ARNDCQEGHILKMFPSTWYVX*]|" AA3 ")"
 #define hgvsAminoAcidSubstExp hgvsAminoAcidExp "([0-9]+)" hgvsAminoAcidExp
 #define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp
 //                                 ...                                  // original sequence
 //                                              ......                  // 1-based position
 //                                                           ...        // replacement sequence
 
 
 // Complete HGVS term regexes combining sequence identifiers and change operations
 #define hgvsFullRegex(seq, op) "^" seq ":" op "$"
 
 #define hgvsLrgCDotSubstExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotSubstExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1...................                                    LRG transcript
 //                                   2.....                     1-based position
 //                                           3......            original sequence
 //                                                     4......  replacement sequence
 
 #define hgvsRefSeqNMCDotSubstExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotSubstExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1...............                                        accession and optional dot version
 //             2........                                        optional dot version
 //                         3.....                               optional gene symbol in ()s
 //                          4...                                optional gene symbol
 //                                   5.....                     1-based position
 //                                           6......            original sequence
 //                                                     7......  replacement sequence
 
 #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1................                                       accession and optional dot version
 //                 2.....                                       optional dot version
 //                         3.....                               optional gene symbol in ()s
 //                          4...                                optional gene symbol
 //                                   5.....                     original sequence
 //                                           6......            1-based position
 //                                                     7......  replacement sequence
 
 // Pseudo-HGVS in common usage
-#define pseudoHgvsGeneSymbolSubstExp "^" geneSymbolExp "[: ]" hgvsAminoAcidSubstExp
+#define pseudoHgvsGeneSymbolSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                                   2.....                     original sequence
 //                                           3......            1-based position
 //                                                     4......  replacement sequence
 
 static struct hgvsVariant *hgvsParseCDotSubst(char *term)
 /* If term is parseable as an HGVS CDS substitution, return the parsed representation,
  * otherwise NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 boolean matches = FALSE;
 int accIx = 1;
 int posIx = 0;
 int refIx = 0;
 int altIx = 0;
 int geneSymbolIx = -1;
 regmatch_t substrs[8];
 if (regexMatchSubstrNoCase(term, hgvsLrgCDotSubstExp, substrs, ArraySize(substrs)))
     {
     matches = TRUE;
     posIx = 2;
     refIx = 3;
     altIx = 4;
     }
 else if (regexMatchSubstrNoCase(term, hgvsRefSeqNMCDotSubstExp, substrs, ArraySize(substrs)))
     {
     matches = TRUE;
     geneSymbolIx = 4;
     posIx = 5;
     refIx = 6;
     altIx = 7;
     }
 if (matches)
     {
     AllocVar(hgvs);
     hgvs->type = hgvstCoding;
     hgvs->changeSymbol = cloneString(">");
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
     hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
     hgvs->refAllele = regexSubstringClone(term, substrs[refIx]);
     hgvs->altAllele = regexSubstringClone(term, substrs[altIx]);
     if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
         hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
     hgvs->end = hgvs->start1 - 1 + strlen(hgvs->refAllele);
     }
 return hgvs;
 }
 
 static char *aaAbbrsToLetters(char *aaStr)
 /* If the input string is composed of AA abbreviations like "Ala", "Asp" etc., convert them to
  * single letter AAs like "A", "D" etc.  Otherwise return a copy of aaStr. */
 {
 char *letters = NULL;
 if (regexMatch(aaStr, "^(" AA3 ")+$"))
     {
     int len = strlen(aaStr) / 3;
     letters = needMem(len + 1);
     int i;
     for (i = 0;  i < len;  i++)
         letters[i] = aaAbbrToLetter(&aaStr[i*3]);
     }
 else
     letters = cloneString(aaStr);
 return letters;
 }
 
 static struct hgvsVariant *hgvsParsePDotSubst(char *term)
 /* If term is parseable as an HGVS protein substitution, return the parsed representation,
  * otherwise NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 regmatch_t substrs[8];
 if (regexMatchSubstrNoCase(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs)))
     {
     int accIx = 1;
     int geneSymbolIx = 4;
     int refIx = 5;
     int posIx = 6;
     int altIx = 7;
     AllocVar(hgvs);
     hgvs->type = hgvstProtein;
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
     if (regexSubstrMatched(substrs[geneSymbolIx]))
         hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
     hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
     // Convert 3-letter abbreviations to single-letter codes like "Ala" -> "A",
     // so strlen(allele) is the number of amino acids.
     char buf[2048];
     regexSubstringCopy(term, substrs[refIx], buf, sizeof(buf));
     hgvs->refAllele = aaAbbrsToLetters(buf);
     regexSubstringCopy(term, substrs[altIx], buf, sizeof(buf));
     hgvs->altAllele = aaAbbrsToLetters(buf);
     hgvs->end = hgvs->start1 - 1 + strlen(hgvs->refAllele);
     }
 return hgvs;
 }
 
 struct hgvsVariant *hgvsParseTerm(char *term)
 /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL.
  * This does not check validity of accessions or alleles. */
 {
 struct hgvsVariant *hgvs = hgvsParseCDotSubst(term);
 if (hgvs == NULL)
     hgvs = hgvsParsePDotSubst(term);
 return hgvs;
 }
 
 struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
 /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */
 {
 struct hgvsVariant *hgvs = NULL;
 regmatch_t substrs[5];
 if (regexMatchSubstrNoCase(term, pseudoHgvsGeneSymbolSubstExp, substrs, ArraySize(substrs)))
     {
     int geneSymbolIx = 1;
     int refIx = 2;
     int posIx = 3;
     int altIx = 4;
     char *geneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
     // Try to find an NP_ for geneSymbol, using refGene to make sure it's an NP for this species.
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
     sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
              "where l.name = '%s' and l.mrnaAcc = r.name "
              "and l.protAcc != '' order by l.protAcc"
              , refLinkTable, geneSymbol);
     char *npAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     if (isNotEmpty(npAcc))
         {
         AllocVar(hgvs);
         hgvs->type = hgvstProtein;
         hgvs->seqAcc = npAcc;
         hgvs->seqGeneSymbol = geneSymbol;
         hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
         // Convert 3-letter abbreviations to single-letter codes like "Ala" -> "A",
         // so strlen(allele) is the number of amino acids.
         char buf[2048];
         regexSubstringCopy(term, substrs[refIx], buf, sizeof(buf));
         hgvs->refAllele = aaAbbrsToLetters(buf);
         regexSubstringCopy(term, substrs[altIx], buf, sizeof(buf));
         hgvs->altAllele = aaAbbrsToLetters(buf);
         int refLen = strlen(hgvs->refAllele);
         hgvs->end = hgvs->start1 - 1 + refLen;
         }
     // Note: this doesn't support non-coding gene symbol substs (which should have nt alleles)
     }
 return hgvs;
 }
 
 static char *getCdnaSeq(char *db, char *acc,  char **retFoundAcc)
 /* Return cdna sequence for acc, or NULL if not found.  If retFoundAcc is not NULL,
  * set it to the accession for which we grabbed seq (might have a .version chopped off). */
 {
 char *seq = NULL;
 char *foundAcc = NULL;
 if (startsWith("LRG_", acc))
     {
     if (hTableExists(db, "lrgCdna"))
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         if (seq)
             foundAcc = cloneString(acc);
         }
     }
 else
     {
     char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.');
     struct dnaSeq *gbSeq = hGenBankGetMrna(db, trimmedAcc, gbSeqTable);
     if (gbSeq)
         {
         seq = gbSeq->dna;
         foundAcc = trimmedAcc;
         }
     }
 if (retFoundAcc)
     *retFoundAcc = foundAcc;
 return seq;
 }
 
 static char *getProteinSeq(char *db, char *acc,  char **retFoundAcc)
 /* Return amino acid sequence for acc, or NULL if not found.  If retFoundAcc is not NULL,
  * set it to the accession for which we grabbed seq (might have a .version chopped off). */
 {
 char *seq = NULL;
 char *foundAcc = NULL;
 if (startsWith("LRG_", acc))
     {
     if (hTableExists(db, "lrgPep"))
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         if (seq)
             foundAcc = cloneString(acc);
         }
     }
 else
     {
     char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.');
     aaSeq *aaSeq = hGenBankGetPep(db, trimmedAcc, gbSeqTable);
     if (aaSeq)
         {
         seq = aaSeq->dna;
         foundAcc = trimmedAcc;
         }
     }
 if (retFoundAcc)
     *retFoundAcc = foundAcc;
 return seq;
 }
 
 static int getCdsStart(char *db, char *acc)
 /* Return the start coordinate for CDS in genbank or LRG acc, or -1 if not found. */
 {
 int cdsStart = -1;
 char query[1024];
 if (startsWith("LRG_", acc))
     sqlSafef(query, sizeof(query),
              "select cds from lrgCds where id = '%s'", acc);
 else
     sqlSafef(query, sizeof(query),
              "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND "
              "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc);
 struct sqlConnection *conn = hAllocConn(db);
 char cdsBuf[2048];
 char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
 hFreeConn(&conn);
 if (isNotEmpty(cdsStr))
     {
     struct genbankCds cds;
     if (genbankCdsParse(cdsStr, &cds) && cds.startComplete && cds.start != cds.end)
         cdsStart = cds.start;
     }
 return cdsStart;
 }
 
 static int getGbVersion(char *db, char *acc)
 /* Return the local version that we have for acc. */
 {
 char query[2048];
 sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc);
 struct sqlConnection *conn = hAllocConn(db);
 int version = sqlQuickNum(conn, query);
 hFreeConn(&conn);
 return version;
 }
 
 boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion,
                      char **retDiffRefAllele)
 /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
  * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
  * of hgvs->seqAcc) or NULL if we can't find any match.
  * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
  * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
  * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
  * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
 {
 boolean coordsOK = FALSE;
 char *acc = hgvs->seqAcc;
 char *foundAcc = NULL;
 char *accSeq = NULL;
 int cdsOffset = 0;
 int refLen = strlen(hgvs->refAllele);
 if (hgvs->type == hgvstProtein)
     accSeq = getProteinSeq(db, acc, &foundAcc);
 else
     accSeq = getCdnaSeq(db, acc, &foundAcc);
 if (retFoundVersion)
     {
     *retFoundVersion = 0;
     if (foundAcc && ! startsWith("LRG_", foundAcc))
         *retFoundVersion = getGbVersion(db, foundAcc);
     }
 if (hgvs->type == hgvstCoding && foundAcc)
     cdsOffset = getCdsStart(db, foundAcc);
 if (accSeq && cdsOffset >= 0 &&
     (cdsOffset + hgvs->start1 - 1 + refLen) <= strlen(accSeq))
     {
     coordsOK = TRUE;
     }
 if (coordsOK && retDiffRefAllele)
     {
     char *ourSeq = cloneStringZ(accSeq + cdsOffset + hgvs->start1 - 1, refLen);
     toUpperN(ourSeq, refLen);
     if (sameString(hgvs->refAllele, ourSeq))
         *retDiffRefAllele = NULL;
     else
         *retDiffRefAllele = ourSeq;
     }
 if (retFoundAcc)
     *retFoundAcc = foundAcc;
 return coordsOK;
 }
 
 static struct psl *pslFromHgvs(struct hgvsVariant *hgvs, int accSize, int cdsStart)
 /* Allocate and return a PSL modeling the variant in transcript named acc. */
 {
 if (hgvs == NULL)
     return NULL;
 struct psl *psl;
 AllocVar(psl);
 int refLen = strlen(hgvs->refAllele);
 int altLen = strlen(hgvs->altAllele);
 struct dyString *dy = dyStringCreate("%s%s%s",
                                      hgvs->refAllele, hgvs->changeSymbol, hgvs->altAllele);
 psl->qName = dyStringCannibalize(&dy);
 psl->qSize = altLen;
 psl->qStart = 0;
 psl->qEnd = altLen;
 psl->tName = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
 psl->tSize = accSize;
 safecpy(psl->strand, sizeof(psl->strand), "+");
 psl->tStart = cdsStart + hgvs->start1 - 1;
 psl->tEnd = psl->tStart + refLen;
 psl->blockCount = 1;
 AllocArray(psl->blockSizes, psl->blockCount);
 AllocArray(psl->qStarts, psl->blockCount);
 AllocArray(psl->tStarts, psl->blockCount);
 psl->blockSizes[0] = refLen <= altLen ? refLen : altLen;
 psl->qStarts[0] = psl->qStart;
 psl->tStarts[0] = psl->tStart;
 return psl;
 }
 
 static struct psl *hgvsMapCDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Return a psl with target coords from db, mapping the variant to the genome.
  * Return NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 struct psl *mappedToGenome = NULL;
 char *trackTable = NULL;
 char *pslTable = NULL;
 char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
 int cdsStart = -1;
 if (startsWith("NM_", acc))
     {
     // TODO: replace these with NCBI's alignments
     trackTable = "refGene";
     pslTable = "refSeqAli";
     }
 else if (startsWith("LRG_", acc))
     {
     trackTable = "lrgTranscriptAli";
     pslTable = "lrgTranscriptAli";
     }
 cdsStart = getCdsStart(db, acc);
 if (trackTable && pslTable && cdsStart >= 0 &&
     hTableExists(db, trackTable) && hTableExists(db, pslTable))
     {
     struct dyString *dyQuery = sqlDyStringCreate("select * from %s where qName = '%s'",
                                                  pslTable, acc);
     struct sqlConnection *conn = hAllocConn(db);
     struct sqlResult *sr = sqlGetResult(conn, dyQuery->string);
     int bin = hIsBinned(db, pslTable);
     char **row;
     if (sr && (row = sqlNextRow(sr)))
         {
         struct psl *txAli = pslLoad(row+bin);
         struct psl *variantPsl = pslFromHgvs(hgvs, txAli->qSize, cdsStart);
         mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli);
         }
     sqlFreeResult(&sr);
     hFreeConn(&conn);
     }
 if (mappedToGenome && retPslTable)
     *retPslTable = cloneString(pslTable);
 return mappedToGenome;
 }
 
 static char *aaToArbitraryCodons(char *aaStr)
 /* Translate amino acid string into codon string where we just take the first codon we
  * find for each amino. */
 {
 int aaLen = strlen(aaStr);
 char *codonStr = needMem(aaLen*3 + 1);
 int i;
 for (i = 0;  i < aaLen;  i++)
     aaToArbitraryCodon(aaStr[i], &codonStr[i*3]);
 return codonStr;
 }
 
 static struct psl *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Return a psl with target coords from db, mapping the variant to the genome.
  * Return NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 struct psl *mappedToGenome = NULL;
 char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
 if (startsWith("NP_", acc))
     {
     // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome.
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
     sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
           "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
     char *nmAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     if (nmAcc)
         {
         struct hgvsVariant cDot;
         zeroBytes(&cDot, sizeof(cDot));
         cDot.type = hgvstCoding;
         cDot.seqAcc = nmAcc;
         cDot.changeSymbol = ">";
         // These aren't necessarily right since AA doesn't have as much info as codon.
         // We could get the refAllele right using the reference sequence -- but altAllele
         // could still be ambiguous.  In the meantime we're not really using the alleles
         // for anything besides length so far...
         cDot.refAllele = aaToArbitraryCodons(hgvs->refAllele);
         cDot.altAllele = aaToArbitraryCodons(hgvs->altAllele);
         cDot.start1 = ((hgvs->start1 - 1) * 3) + 1;
         cDot.end = ((hgvs->end - 1) * 3) + 3;
         mappedToGenome = hgvsMapCDotToGenome(db, &cDot, retPslTable);
         }
     }
 return mappedToGenome;
 }
 
 struct psl *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Return a psl with target coords from db, mapping the variant to the genome.
  * Return NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 if (hgvs == NULL || !hgvsValidate(db, hgvs, NULL, NULL, NULL))
     return NULL;
 struct psl *mappedToGenome = NULL;
 if (hgvs->type == hgvstCoding)
     mappedToGenome = hgvsMapCDotToGenome(db, hgvs, retPslTable);
 else if (hgvs->type == hgvstProtein)
     mappedToGenome = hgvsMapPDotToGenome(db, hgvs, retPslTable);
 return mappedToGenome;
 }