src/hg/lib/hgHgvs.c b9af5d88785ba21a316027a5e22e5fbd5fb9a1af

b9af5d88785ba21a316027a5e22e5fbd5fb9a1af
angie
  Mon Jul 31 10:34:03 2017 -0700
Added support for "chr" sequence names in HGVS genomic (g.) terms.  refs #19832

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 7bc0ae6..9e490a7 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -206,30 +206,41 @@
 #define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1................                                       accession and optional dot version
 //                 2.....                                       optional dot version
 //                         3.....                               optional gene symbol in ()s
 //                          4...                                optional gene symbol
 //                                 5...                         original start AA
 //                                       6...                   1-based start position
 //                                          7.................  optional range sep and AA+pos
 //                                            8...              original end AA
 //                                                 9...         1-based end position
 //                                                     10.....  change description
 
 // Pseudo-HGVS in common usage
+// g. (or m.) term whose sequence ID is a "chr" name instead of an accession:
+#define pseudoHgvsChrGDotExp hgvsFullRegex("(chr[0-9A-Za-z_]+)", hgvsGMDotPosExp) "(.*)"
+// substring numbering:
+//      0.....................................  whole matching string
+//      1...........                            "chr" sequence name
+//                   2.                         g or m
+//                     3......                  1-based start position
+//                            4.......          optional range separator and end position
+//                              5.....          1-based end position (matched only for a range)
+//                                    6....     change description
+
 // Sometimes users give an NM_ accession, but a protein change (after spaces/colons; "p" and "." optional).
 #define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp "[ :]+p?\\.?" hgvsAminoAcidSubstExp
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1...............                                        acc & optional dot version
 //             2........                                        optional dot version
 //                       3.....                                 optional gene sym in ()s
 //                        4...                                  optional gene symbol
 //                                   5.....                     original sequence
 //                                           6......            1-based position
 //                                                     7......  replacement sequence
 
 #define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp "[ :]+p?\\.?" hgvsAaRangeExp
 // substring numbering:
 //      0.....................................................  whole matching string
@@ -699,30 +710,46 @@
     char geneSymbol[len+1];
     safencpy(geneSymbol, sizeof(geneSymbol), term, len);
     char *nmAcc = nmForGeneSymbol(db, geneSymbol);
     if (isNotEmpty(nmAcc))
         {
         // Make it a real HGVS term with the NM and pass that on to the usual parser.
         int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3;
         char *description = term + substrs[descStartIx].rm_so;
         struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s",
                                                  nmAcc, geneSymbol, description);
         hgvs = hgvsParseTerm(nmTerm->string);
         dyStringFree(&nmTerm);
         freeMem(nmAcc);
         }
     }
+else if (regexMatchSubstr(term, pseudoHgvsChrGDotExp, substrs, ArraySize(substrs)))
+    {
+    int chrIx = 1;
+    int startPosIx = 3;
+    int endPosIx = 5;
+    int changeIx = 6;
+    AllocVar(hgvs);
+    hgvs->type = hgvstGenomic;
+    hgvs->seqAcc = regexSubstringClone(term, substrs[chrIx]);
+    hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
+    if (regexSubstrMatched(substrs[endPosIx]))
+        hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
+    else
+        hgvs->end = hgvs->start1;
+    hgvs->changes = regexSubstringClone(term, substrs[changeIx]);
+    }
 return hgvs;
 }
 
 static char refBaseFromNucSubst(char *change)
 /* Return the upper-cased first character of change when change starts like a nucleotide
  * substitution (one of A, C, G, T, U followed by '>'); otherwise return the null char. */
 {
 char refBase = '\0';
 if (regexMatchNoCase(change, "^([ACGTU])>"))
     refBase = toupper(change[0]);
 return refBase;
 }
 
 static void hgvsStartEndToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int *retStart, int *retEnd)
 /* Convert HGVS's fully closed, 1-based-unless-negative start and end to UCSC start and end. */
 {
 // If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive.