e6e8263c1a4a85000be7cc4df0797d75dceba589
angie
  Fri Jun 30 09:51:18 2017 -0700
Noncoding introns were not being mapped to the genome correctly, hgvsTranscriptToZeroBasedHalfOpen had a fencepost error for upstream/downstream and mapToDeletion was using q coords instead of t coords for upstream/downstream, doh!  helpful testcase: NR_111987:n.-1 .  fixes #19702 note-1.

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index e10ef5a..47f37dd 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -355,42 +355,48 @@
     {
     matches = TRUE;
     geneSymbolIx = 4;
     startAnchorIx += refSeqExtra;
     endAnchorIx += refSeqExtra;
     endPosIx += refSeqExtra;
     changeIx += refSeqExtra;
     }
 if (matches)
     {
     AllocVar(hgvs);
     hgvs->type = isNoncoding ? hgvstNoncoding : hgvstCoding;
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
     extractComplexNum(term, substrs, startAnchorIx,
                       &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset);
-    if (isNoncoding && (hgvs->startIsUtr3 || hgvs->start1 < 0))
-        warn("hgvsParseCNDotPos: noncoding term '%s' appears to start in UTR, "
+    if (isNoncoding && hgvs->startIsUtr3)
+        {
+        warn("hgvsParseCNDotPos: noncoding term '%s' appears to start in UTR3 (*), "
              "not applicable for noncoding", term);
+        hgvs->startIsUtr3 = FALSE;
+        }
     if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
         hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
     if (regexSubstrMatched(substrs[endPosIx]))
         {
         extractComplexNum(term, substrs, endAnchorIx,
                           &hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset);
-        if (isNoncoding && (hgvs->endIsUtr3 || hgvs->end < 0))
-            warn("hgvsParseCNDotPos: noncoding term '%s' appears to end in UTR, "
+        if (isNoncoding && hgvs->endIsUtr3)
+            {
+            warn("hgvsParseCNDotPos: noncoding term '%s' appears to end in UTR3 (*), "
                  "not applicable for noncoding", term);
+            hgvs->endIsUtr3 = FALSE;
+            }
         }
     else
         {
         hgvs->end = hgvs->start1;
         hgvs->endIsUtr3 = hgvs->startIsUtr3;
         hgvs->endOffset = hgvs->startOffset;
         }
     hgvs->changes = regexSubstringClone(term, substrs[changeIx]);
     }
 return hgvs;
 }
 
 static struct hgvsVariant *hgvsParsePDotSubst(char *term)
 /* If term is parseable as an HGVS protein substitution, return the parsed representation,
  * otherwise NULL. */
@@ -967,85 +973,79 @@
     }
 else
     {
     // genbank tables -- no version
     normalizedAcc = cloneFirstWordByDelimiter(acc, '.');
     foundVersion = getGbVersion(db, normalizedAcc);
     }
 if (retFoundVersion)
     *retFoundVersion = foundVersion;
 return normalizedAcc;
 }
 
 static boolean hgvsValidateGene(char *db, struct hgvsVariant *hgvs,
                                 char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
 /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
- * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so
+ * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so
  * those can't be checked without mapping the term to the genome.
  * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
  * of hgvs->seqAcc) or NULL if we can't find any match.
  * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
  * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
  * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
  * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
 {
 char *acc = normalizeVersion(db, hgvs->seqAcc, retFoundVersion);
 if (isEmpty(acc))
     return FALSE;
 boolean coordsOK = FALSE;
 char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc));
 if (accSeq)
     {
     // By convention, foundAcc is always versionless because it's accompanied by foundVersion.
     if (retFoundAcc)
         *retFoundAcc = cloneFirstWordByDelimiter(acc, '.');
     int seqLen = strlen(accSeq);
     int start, end;
     hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
     if (hgvs->type == hgvstCoding)
         {
-        // Coding term coords can extend beyond the bounds of the transcript so
-        // we can't check them without mapping to the genome.  However, if the coords
-        // are in bounds and a reference allele is provided, we can check that.
         struct genbankCds cds;
         coordsOK = getCds(db, acc, &cds);
         if (coordsOK && retDiffRefAllele)
             {
             start += (hgvs->startIsUtr3 ? cds.end : cds.start);
             if (hgvs->startOffset == 0 && start >= 0 && start < seqLen)
                 checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
             }
         }
     else
         {
-        if (start >= 0 && start < seqLen && end > 0 && end <= seqLen)
-            {
         coordsOK = TRUE;
-            if (retDiffRefAllele)
+        if (retDiffRefAllele && hgvs->startOffset == 0 && start >= 0 && start < seqLen)
             checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
         }
     }
-    }
 freeMem(accSeq);
 freeMem(acc);
 return coordsOK;
 }
 
 boolean hgvsValidate(char *db, struct hgvsVariant *hgvs,
                      char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
 /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
- * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so
+ * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so
  * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found.
  * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
  * of hgvs->seqAcc) or NULL if we can't find any match.
  * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
  * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
  * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
  * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
 {
 if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito)
     return hgvsValidateGenomic(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele);
 else
     return hgvsValidateGene(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele);
 }
 
 static struct bed *bed6New(char *chrom, unsigned chromStart, unsigned chromEnd, char *name,
@@ -1083,112 +1083,108 @@
 static struct bed *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */
 {
 struct bed *region = NULL;
 char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
 if (isNotEmpty(chrom) && hgvs->start1 > 0)
     {
     region = bed6New(chrom, hgvs->start1 - 1, hgvs->end, "", 0, '+');
     adjustInsStartEnd(hgvs, &region->chromStart, &region->chromEnd);
     }
 if (retPslTable)
     *retPslTable = NULL;
 return region;
 }
 
-static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs,
+static void hgvsTranscriptToZeroBasedHalfOpen(struct hgvsVariant *hgvs,
                                               int maxCoord, struct genbankCds *cds,
                                               int *retStart, int *retEnd,
                                               int *retUpstreamBases, int *retDownstreamBases)
-/* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths
- * for when the coding HGVS has coordinates that extend beyond its sequence boundaries.
+/* Convert a transcript HGVS's start1 and end into UCSC coords plus upstream and downstream lengths
+ * for when the transcript HGVS has coordinates that extend beyond its sequence boundaries.
  * ret* args must be non-NULL. */
 {
-int start, end;
-hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
+hgvsStartEndToZeroBasedHalfOpen(hgvs, retStart, retEnd);
+if (hgvs->type == hgvstCoding)
+    {
     // If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart
-if (hgvs->startIsUtr3)
-    *retStart = cds->end + start;
-else
-    *retStart = cds->start + start;
-if (hgvs->endIsUtr3)
-    *retEnd = cds->end + end;
-else
-    *retEnd = cds->start + end;
+    *retStart += (hgvs->startIsUtr3 ? cds->end : cds->start);
+    *retEnd += (hgvs->endIsUtr3 ? cds->end : cds->start);
+    }
 // Now check for coords that extend beyond the transcript('s alignment to the genome)
 if (*retStart < 0)
     {
-    // hgvs->start1 is upstream of coding transcript.
+    // hgvs->start1 is upstream of transcript.
     *retUpstreamBases = -*retStart;
     *retStart = 0;
     }
-else if (*retStart > maxCoord)
+else if (*retStart >= maxCoord)
     {
-    // Even the start coord is downstream of coding transcript -- make a negative "upstream"
+    // Even the start coord is downstream of transcript -- make a negative "upstream"
     // for adjusting start.
     *retUpstreamBases = -(*retStart - maxCoord + 1);
     *retStart = maxCoord - 1;
     }
 else
     *retUpstreamBases = 0;
 if (*retEnd > maxCoord)
     {
-    // hgvs->end is downstream of coding transcript.
+    // hgvs->end is downstream of transcript.
     *retDownstreamBases = *retEnd - maxCoord;
     *retEnd = maxCoord;
     }
-else if (*retEnd < 0)
+else if (*retEnd <= 0)
     {
-    // Even the end coord is upstream of coding transcript -- make a negative "downstream"
+    // Even the end coord is upstream of transcript -- make a negative "downstream"
     // for adjusting end.
     *retEnd += *retUpstreamBases;
     *retDownstreamBases = -*retUpstreamBases;
     }
 else
     *retDownstreamBases = 0;
 }
 
 static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, char *acc, int accSize, int accEnd,
                                   struct genbankCds *cds,
                                   int *retUpstreamBases, int *retDownstreamBases)
 /* Allocate and return a PSL modeling the variant in nucleotide sequence acc.
  * The PSL target is the sequence and the query is the changed part of the sequence.
  * accEnd is given in case accSize includes an unaligned poly-A tail.
  * If hgvs is coding ("c.") then the caller must pass in a valid cds.
  * In case the start or end position is outside the bounds of the sequence, set retUpstreamBases
  * or retDownstreamBases to the number of bases beyond the beginning or end of sequence.
- * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point
+ * NOTE: intron offsets are ignored; the PSL contains the exon anchor point
  * and the caller will have to map that to the genome and then apply the intron offset. */
 {
 if (hgvs == NULL)
     return NULL;
 if (hgvs->type == hgvstProtein)
     errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein.");
 struct psl *psl;
 AllocVar(psl);
 psl->tName = cloneString(acc);
 safecpy(psl->strand, sizeof(psl->strand), "+");
 psl->tSize = accSize;
-if (hgvs->type != hgvstCoding)
+if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito)
     {
     // Sane 1-based fully closed coords.
     hgvsStartEndToZeroBasedHalfOpen(hgvs, &psl->tStart, &psl->tEnd);
     }
 else
     {
     // Simple or insanely complex CDS-relative coords.
-    hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd,
+    hgvsTranscriptToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd,
                                       retUpstreamBases, retDownstreamBases);
     }
 int refLen = psl->tEnd - psl->tStart;
 // Just use refLen for alt until we parse the sequence change portion of the term:
 int altLen = refLen;
 psl->qName = cloneString(hgvs->changes);
 psl->qSize = altLen;
 psl->qStart = 0;
 psl->qEnd = altLen;
 psl->blockCount = 1;
 AllocArray(psl->blockSizes, psl->blockCount);
 AllocArray(psl->qStarts, psl->blockCount);
 AllocArray(psl->tStarts, psl->blockCount);
 psl->blockSizes[0] = refLen <= altLen ? refLen : altLen;
 psl->qStarts[0] = psl->qStart;
@@ -1212,45 +1208,46 @@
 del->blockCount = 2;
 AllocArray(del->blockSizes, del->blockCount);
 AllocArray(del->qStarts, del->blockCount);
 AllocArray(del->tStarts, del->blockCount);
 // I wonder if zero-length blockSizes would trigger crashes somewhere...
 del->blockSizes[0] = del->blockSizes[1] = 0;
 del->tStarts[0] = del->tStarts[1] = del->tStart;
 del->qStarts[0] = del->qStart;
 del->qStarts[1] = del->qEnd;
 return del;
 }
 
 
 static struct psl *mapToDeletion(struct psl *variantPsl, struct psl *txAli)
 /* If the variant falls on a transcript base that is deleted in the reference genome,
+ * (or upstream/downstream mapped to a zero-length point),
  * return the deletion coords (pslTransMap returns NULL), otherwise return NULL. */
 {
 // variant start and end coords, in transcript coords:
 int vStart = variantPsl->tStart;
 int vEnd = variantPsl->tEnd;
-// If txAli->strand is '-', reverse coords
-if (txAli->strand[0] == '-')
+boolean isRc = (pslQStrand(txAli) == '-');
+if (isRc)
     {
     vStart = variantPsl->tSize - variantPsl->tEnd;
     vEnd = variantPsl->tSize - variantPsl->tStart;
     }
 if (vEnd < txAli->qStart)
-    return pslDelFromCoord(txAli, txAli->qStart, variantPsl);
+    return pslDelFromCoord(txAli, isRc ? txAli->tEnd : txAli->tStart, variantPsl);
 else if (vStart > txAli->qEnd)
-    return pslDelFromCoord(txAli, txAli->qEnd, variantPsl);
+    return pslDelFromCoord(txAli, isRc ? txAli->tStart : txAli->tEnd, variantPsl);
 int i;
 for (i = 0;  i < txAli->blockCount - 1;  i++)
     {
     int qBlockEnd = txAli->qStarts[i] + txAli->blockSizes[i];
     int qNextBlockStart = txAli->qStarts[i+1];
     int tBlockEnd = txAli->tStarts[i] + txAli->blockSizes[i];
     int tNextBlockStart = txAli->tStarts[i+1];
     if (vStart >= qBlockEnd && vEnd <= qNextBlockStart &&
         tBlockEnd == tNextBlockStart)
         return pslDelFromCoord(txAli, tBlockEnd, variantPsl);
     }
 // Not contained in a deletion from reference genome (txAli target) -- return NULL.
 return NULL;
 }