cc6ca259385f385c252b6a675801025e96783818
angie
  Thu May 11 12:40:15 2017 -0700
Converting HGVS coords to 0-based needs to be done differently when the HGVS coords are negative because when negative they are already 0-based (but end is closed and needs to be converted to open).  fixes #19396

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index ab6b363..be8e96d 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -656,62 +656,79 @@
         dyStringFree(&nmTerm);
         freeMem(nmAcc);
         }
     }
 return hgvs;
 }
 
 static char refBaseFromNucSubst(char *change)
 /* If sequence change is a nucleotide substitution, return the reference base, else null char. */
 {
 if (regexMatchNoCase(change, "^([ACGTU])>"))
     return toupper(change[0]);
 return '\0';
 }
 
+static void hgvsStartEndToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int *retStart, int *retEnd)
+/* Convert HGVS's fully closed, 1-based-unless-negative start and end to UCSC start and end. */
+{
+// If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive.
+int start = hgvs->start1;
+if (start > 0)
+    start--;
+// If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end.
+int end = hgvs->end;
+if (end < 0)
+    end++;
+if (retStart)
+    *retStart = start;
+if (retEnd)
+    *retEnd = end;
+}
+
 static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs,
                                    char **retFoundAcc, int *retFoundVersion,
                                    char **retDiffRefAllele)
 /* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom.
  * If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL.
  * If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL.
  * If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to
  * the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to
  * genomic sequence if they differ. */
 {
 boolean coordsOK = FALSE;
 if (retDiffRefAllele)
     *retDiffRefAllele = NULL;
 if (retFoundAcc)
     *retFoundAcc = NULL;
 if (retFoundVersion)
     *retFoundVersion = 0;
 char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
 if (isNotEmpty(chrom))
     {
     struct chromInfo *ci = hGetChromInfo(db, chrom);
-    if ((hgvs->start1 >= 1 && hgvs->start1 <= ci->size) &&
-        (hgvs->end >=1 && hgvs->end <= ci->size))
+    int start, end;
+    hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
+    if (start >= 0 && start < ci->size && end > 0 && end < ci->size)
         {
         coordsOK = TRUE;
         if (retDiffRefAllele)
             {
             char hgvsBase = refBaseFromNucSubst(hgvs->changes);
             if (hgvsBase != '\0')
                 {
-                struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom,
-                                                   hgvs->start1-1, hgvs->start1);
+                struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom, start, end);
                 touppers(refBase->dna);
                 if (refBase->dna[0] != hgvsBase)
                     *retDiffRefAllele = cloneString(refBase->dna);
                 dnaSeqFree(&refBase);
                 }
             }
         }
     // Since we found hgvs->seqAcc, split it into versionless and version parts.
     if (retFoundAcc)
         *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
     if (retFoundVersion)
         {
         char *p = strchr(hgvs->seqAcc, '.');
         if (p)
             *retFoundVersion = atoi(p+1);
@@ -808,40 +825,40 @@
     else
         return aaAbbrToLetter(match);
     }
 return '\0';
 }
 
 static char *cloneStringFromChar(char c)
 /* Return a cloned string from a single character. */
 {
 char str[2];
 str[0] = c;
 str[1] = '\0';
 return cloneString(str);
 }
 
-static void checkRefAllele(struct hgvsVariant *hgvs, int start1, char *accSeq,
+static void checkRefAllele(struct hgvsVariant *hgvs, int start, char *accSeq,
                            char **retDiffRefAllele)
-/* If hgvs change includes a reference allele, and if accSeq at start1 does not match,
+/* If hgvs change includes a reference allele, and if accSeq at start does not match,
  *  then set retDiffRefAllele to the accSeq version.  retDiffRefAllele must be non-NULL. */
 {
 char hgvsRefBase = (hgvs->type == hgvstProtein) ? refBaseFromProt(hgvs->changes) :
                                                   refBaseFromNucSubst(hgvs->changes);
 if (hgvsRefBase != '\0')
     {
-    char seqRefBase = toupper(accSeq[start1-1]);
+    char seqRefBase = toupper(accSeq[start]);
     if (seqRefBase != hgvsRefBase)
         *retDiffRefAllele = cloneStringFromChar(seqRefBase);
     }
 if (hgvs->type == hgvstProtein && *retDiffRefAllele == NULL)
     {
     // Protein ranges have a second ref allele base to check
     char *p = strchr(hgvs->changes, '_');
     if (p != NULL)
         {
         hgvsRefBase = refBaseFromProt(p+1);
         if (hgvsRefBase != '\0')
             {
             int end1 = atoi(skipToNumeric(p+1));
             char seqRefBase = toupper(accSeq[end1-1]);
             if (seqRefBase != hgvsRefBase)
@@ -922,52 +939,53 @@
  * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
  * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
  * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
 {
 char *acc = normalizeVersion(db, hgvs->seqAcc, retFoundVersion);
 if (isEmpty(acc))
     return FALSE;
 boolean coordsOK = FALSE;
 char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc));
 if (accSeq)
     {
     // By convention, foundAcc is always versionless because it's accompanied by foundVersion.
     if (retFoundAcc)
         *retFoundAcc = cloneFirstWordByDelimiter(acc, '.');
     int seqLen = strlen(accSeq);
+    int start, end;
+    hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
     if (hgvs->type == hgvstCoding)
         {
         // Coding term coords can extend beyond the bounds of the transcript so
         // we can't check them without mapping to the genome.  However, if the coords
         // are in bounds and a reference allele is provided, we can check that.
         struct genbankCds cds;
         coordsOK = getCds(db, acc, &cds);
         if (coordsOK && retDiffRefAllele)
             {
-            int start = hgvs->start1 + (hgvs->startIsUtr3 ? cds.end : cds.start);
-            if (hgvs->startOffset == 0 && start >= 1 && start <= seqLen)
+            start += (hgvs->startIsUtr3 ? cds.end : cds.start);
+            if (hgvs->startOffset == 0 && start >= 0 && start < seqLen)
                 checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
             }
         }
     else
         {
-        if (hgvs->start1 >= 1 && hgvs->start1 <= seqLen &&
-            hgvs->end >= 1 && hgvs->end <= seqLen)
+        if (start >= 0 && start < seqLen && end > 0 && end <= seqLen)
             {
             coordsOK = TRUE;
             if (retDiffRefAllele)
-                checkRefAllele(hgvs, hgvs->start1, accSeq, retDiffRefAllele);
+                checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
             }
         }
     }
 freeMem(accSeq);
 freeMem(acc);
 return coordsOK;
 }
 
 boolean hgvsValidate(char *db, struct hgvsVariant *hgvs,
                      char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
 /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
  * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so
  * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found.
  * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
  * of hgvs->seqAcc) or NULL if we can't find any match.
@@ -1007,56 +1025,50 @@
 /* HGVS insertion coordinates include the base before and the base after the insertion point,
  * while we treat it as a zero-length point.  So if this is insertion, adjust the start and end. */
 {
 if (hgvsIsInsertion(hgvs))
     {
     *retStart0 = *retStart0 + 1;
     *retEnd = *retEnd - 1;
     }
 }
 
 static struct bed *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */
 {
 struct bed *region = NULL;
 char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
-if (isNotEmpty(chrom))
+if (isNotEmpty(chrom) && hgvs->start1 > 0)
     {
     region = bed6New(chrom, hgvs->start1 - 1, hgvs->end, "", 0, '+');
     adjustInsStartEnd(hgvs, &region->chromStart, &region->chromEnd);
     }
 if (retPslTable)
     *retPslTable = NULL;
 return region;
 }
 
 static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs,
                                           int maxCoord, struct genbankCds *cds,
                                           int *retStart, int *retEnd,
                                           int *retUpstreamBases, int *retDownstreamBases)
 /* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths
  * for when the coding HGVS has coordinates that extend beyond its sequence boundaries.
  * ret* args must be non-NULL. */
 {
-// If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive.
-int start = hgvs->start1;
-if (start > 0)
-    start -= 1;
-// If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end.
-int end = hgvs->end;
-if (end < 0)
-    end += 1;
+int start, end;
+hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
 // If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart
 if (hgvs->startIsUtr3)
     *retStart = cds->end + start;
 else
     *retStart = cds->start + start;
 if (hgvs->endIsUtr3)
     *retEnd = cds->end + end;
 else
     *retEnd = cds->start + end;
 // Now check for coords that extend beyond the transcript('s alignment to the genome)
 if (*retStart < 0)
     {
     // hgvs->start1 is upstream of coding transcript.
     *retUpstreamBases = -*retStart;
     *retStart = 0;
@@ -1099,32 +1111,31 @@
  * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point
  * and the caller will have to map that to the genome and then apply the intron offset. */
 {
 if (hgvs == NULL)
     return NULL;
 if (hgvs->type == hgvstProtein)
     errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein.");
 struct psl *psl;
 AllocVar(psl);
 psl->tName = cloneString(acc);
 safecpy(psl->strand, sizeof(psl->strand), "+");
 psl->tSize = accSize;
 if (hgvs->type != hgvstCoding)
     {
     // Sane 1-based fully closed coords.
-    psl->tStart = hgvs->start1 - 1;
-    psl->tEnd = hgvs->end;
+    hgvsStartEndToZeroBasedHalfOpen(hgvs, &psl->tStart, &psl->tEnd);
     }
 else
     {
     // Simple or insanely complex CDS-relative coords.
     hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd,
                                   retUpstreamBases, retDownstreamBases);
     }
 int refLen = psl->tEnd - psl->tStart;
 // Just use refLen for alt until we parse the sequence change portion of the term:
 int altLen = refLen;
 psl->qName = cloneString(hgvs->changes);
 psl->qSize = altLen;
 psl->qStart = 0;
 psl->qEnd = altLen;
 psl->blockCount = 1;
@@ -1478,32 +1489,32 @@
         seq = newDnaSeq(dna, size, name);
         }
     freez(&wholeSeq);
     }
 return seq;
 }
 
 static boolean hgvsToTxCoords(struct hgvsVariant *hgvs, char *db, uint *retStart, uint *retEnd)
 /* If hgvs is not coding then set *retStart to hgvs->start1-1 and *retEnd to hgvs->end & ret TRUE.
  * If coding and we have complete CDS info then apply cdsStart and/or cdsEnd offset to start & end.
  * If start or end is intronic, or start is negative (genomic upstream of tx) then return FALSE.
  * Note: at this point we don't know the length of the sequence so can't tell if start and/or end
  * fall past the end of the transcript (genomic downstream). */
 {
 boolean coordsOk = TRUE;
-int start = hgvs->start1 - 1;
-int end = hgvs->end;
+int start, end;
+hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
 if (hgvs->type == hgvstCoding)
     {
     struct genbankCds cds;
     if (getCds(db, hgvs->seqAcc, &cds))
         {
         // Determine whether to use cdsStart or cdsEnd for hgvs start and hgvs end.
         // Make sure the cds start and end are marked as complete.
         if (hgvs->startIsUtr3 && cds.endComplete)
             start += cds.end;
         else if (!hgvs->startIsUtr3 && cds.startComplete)
             start += cds.start;
         else
             coordsOk = FALSE;
         if (hgvs->endIsUtr3 && cds.endComplete)
             end += cds.end;