b534e5e167df93880881df0478ecec0225fdf136
angie
  Wed Aug 31 17:01:24 2016 -0700
This commit adds the capability to pick apart complex HGVS sequence change descriptions, and apply those changes to reference sequence, in order to translate HGVS nucleotide terms into a variant representation suitable for functional prediction in hgVai.  VCF was chosen since it is easy to integrate into hgVai.  refs #11460

Changes to existing code:
* hgvsMapToGenome maps to BED6 instead of BED3 because we need to know strand in order to convert transcript changes into VCF forward-strand genomic changes.
* hgvsMapToGenome maps insertions to zero-length points instead of 2-base ranges as in HGVS.

New file hgHgvsParse.c contains a tokenizer and parser for HGVS sequence change descriptions; top-level interface is hgvsParseNucleotideChange.

hgHgvs.c has new code to translate parsed HGVS nucleotide change(s) into VCF, optionally left-shifting ambiguous alignments (VCF convention, at odds with HGVS right-shifting convention); top-level interface is hgvsToVcfRow.

New hgvsToVcf utility enables testing of corner cases and may come in handy as a command-line util.

HGVS terms for testing have been taken from ClinVar and do not reflect the diversity of terms in the wild, nor do they cover the full HGVS spec.

For example, the HGVS repeat notation can be parsed but not mapped to the genome because all of the ClinVar repeat terms that I looked at looked wonky to me and I believe the HGVS repeat notation is inherently error-prone.  The repeat notation is supposed to use the position of the first repeat unit and to specify the number of repeated copies starting at that point (right-shifted if ambiguous).  However, in ClinVar, sometimes the given repeat unit sequence did not match the reference sequence at the given position; sometimes the number of sepeats made sense only if they were not perfect repeats (some differing bases); sometimes ranges of repeat numbers were given.  Also, the reference assembly's number of repeats can change from one assembly to the next.  So it is hard given an HGVS repeat term to determine 1) whether it makes sense in relation to the reference assembly with/without fuzzy matching and 2) what the exact change is relative to the reference assembly.

Insertions of inverted sequence from elsewhere in the same reference have not yet been tested.  http://varnomen.hgvs.org/recommendations/DNA/variant/inversion/ gives some complicated examples like "g.122_123ins213_234invinsAins123_211inv" but I have not yet seen terms like that in the wild.

diff --git src/hg/lib/hgFind.c src/hg/lib/hgFind.c
index da9fe46..4911eb9 100644
--- src/hg/lib/hgFind.c
+++ src/hg/lib/hgFind.c
@@ -3026,90 +3026,60 @@
 	hfs = hfsFind(longList, search);
     if (hfs != NULL)
 	foundIt = hgFindUsingSpec(db, hfs, term, limitResults, hgp, FALSE, 0,0, FALSE);
     else
 	warn("Unrecognized singleSearch=%s in URL", search);
     }
 if (foundIt)
     {
     fixSinglePos(hgp);
     if (cart != NULL)
         cartSetString(cart, "hgFind.matches", hgp->tableList->posList->browserName);
     }
 return foundIt;
 }
 
-static int getDotVersion(char *acc)
-/* If acc ends with a .version, return the version, else 0. */
-{
-char *p = strchr(acc, '.');
-if (p)
-    return atoi(p+1);
-return 0;
-}
-
 static boolean matchesHgvs(char *db, char *term, struct hgPositions *hgp)
 /* Return TRUE if the search term looks like a variant encoded using the HGVS nomenclature */
 /* See http://varnomen.hgvs.org/ */
 {
 boolean foundIt = FALSE;
 struct hgvsVariant *hgvs = hgvsParseTerm(term);
 if (hgvs == NULL)
     hgvs = hgvsParsePseudoHgvs(db, term);
 if (hgvs)
     {
-    char *foundAcc = NULL, *diffRefAllele = NULL;
-    int foundVersion = 0;
-    boolean coordsOk = hgvsValidate(db, hgvs, &foundAcc, &foundVersion, &diffRefAllele);
-    if (foundAcc == NULL)
-        warn("Can't find accession for HGVS term '%s'", term);
-    else
-        {
-        int hgvsVersion = getDotVersion(hgvs->seqAcc);
-        char foundAccWithV[strlen(foundAcc)+20];
-        if (foundVersion)
-            safef(foundAccWithV, sizeof(foundAccWithV), "%s.%d", foundAcc, foundVersion);
-        else
-            safecpy(foundAccWithV, sizeof(foundAccWithV), foundAcc);
-        if (hgvsVersion && hgvsVersion != foundVersion)
-            warn("HGVS term '%s' is based on %s but UCSC has version %s",
-                 term, hgvs->seqAcc, foundAccWithV);
-        if (! coordsOk)
-            warn("HGVS term '%s' has coordinates outside the bounds of %s", term, foundAccWithV);
-        else if (diffRefAllele != NULL)
-            warn ("HGVS term '%s' reference value does not match %s value '%s'",
-                  term, foundAccWithV, diffRefAllele);
-        if (coordsOk)
-            {
+    struct dyString *dyWarn = dyStringNew(0);
     char *pslTable = NULL;
-            struct bed3 *mapping = hgvsMapToGenome(db, hgvs, &pslTable);
+    struct bed *mapping = hgvsValidateAndMap(hgvs, db, term, dyWarn, &pslTable);
+    if (dyStringLen(dyWarn) > 0)
+        warn("%s", dyStringContents(dyWarn));
     if (mapping)
         {
         int padding = 5;
         char *trackTable;
         if (isEmpty(pslTable))
             trackTable = "chromInfo";
         else if (startsWith("lrg", pslTable))
             trackTable = "lrgTranscriptAli";
         else
             trackTable = "refGene";
         singlePos(hgp, "HGVS", NULL, trackTable, term, "",
                   mapping->chrom, mapping->chromStart-padding, mapping->chromEnd+padding);
         foundIt = TRUE;
         }
-            }
-        }
+    dyStringFree(&dyWarn);
     }
 return foundIt;
 }
 
 struct hgPositions *hgPositionsFind(char *db, char *term, char *extraCgi,
 	char *hgAppNameIn, struct cart *cart, boolean multiTerm)
 /* Return table of positions that match term or NULL if none such. */
 {
 struct hgPositions *hgp = NULL, *hgpItem = NULL;
 regmatch_t substrs[4];
 boolean canonicalSpec = FALSE;
 boolean gbrowserSpec = FALSE;
 boolean lengthSpec = FALSE;
 boolean singleBaseSpec = FALSE;
 boolean relativeFlag = FALSE;