61647b42c0de4b0d11abf375cd713cb28758a954
angie
  Wed Aug 9 12:53:14 2017 -0700
New lib modules in support of adding HGVS output to hgVai:
- seqWindow: generic interface to fetch portions of a sequence
- indelShift: slide to the left or right of an ambiguous alignment range (e.g. AG to AGAGAG)
- variantProjector: use PSL+CDS to project a genomic variant to a transcript variant; project transcript variant to protein variant
refs #19968

diff --git src/hg/lib/indelShift.c src/hg/lib/indelShift.c
new file mode 100644
index 0000000..873b4ae
--- /dev/null
+++ src/hg/lib/indelShift.c
@@ -0,0 +1,217 @@
+/* indelShift.c - find the leftmost or rightmost possible location of an insertion/deletion
+ * variant with ambiguous placement, e.g. 'AAA' -> 'AAAA' -- where is the extra A inserted? */
+
+/* Copyright (C) 2017 The Regents of the University of California
+ * See README in this or parent directory for licensing information. */
+#include "common.h"
+#include "dnaseq.h"
+#include "indelShift.h"
+
+boolean indelShiftIsApplicable(int refLen, int altLen)
+/* Return true if it makes sense to try to shift this variant, i.e. if it's pure ins or del. */
+{
+return ((refLen == 0 && altLen > 0) ||  // insertion into reference
+        (refLen > 0 && altLen == 0));   // deletion from reference
+}
+
+void indelShiftRangeForVariant(uint varStart, int refLen, int altLen,
+                                 uint *retStart, uint *retEnd)
+/* Given a variant start position and the lengths of reference and alternate alleles,
+ * set retStart and retEnd to a range around varStart suitable for shifting around.
+ * Caller must truncate retEnd to sequence size if necessary. */
+{
+int padding = max(128, 2*refLen+altLen);
+*retStart = (varStart > padding) ? varStart - padding : 0;
+*retEnd = varStart + refLen + altLen + padding;
+}
+
+static void expandWindowIfNecessary(struct seqWindow *seqWin, uint varStart, int refLen, int altLen)
+/* Make sure there's enough padding to start a sliding-window search around variant. */
+{
+uint maxStart, minEnd;
+indelShiftRangeForVariant(varStart, refLen, altLen, &maxStart, &minEnd);
+seqWin->fetch(seqWin, seqWin->seqName, maxStart, minEnd);
+}
+
+static int fetchLeft(struct seqWindow *seqWin, int basesShifted)
+/* We need to fetch more sequence to the left; update seqWin and p* params
+ * with the distance that we have shifted so far, and fetch a larger block of sequence.
+ * Return the new offset of the variant start within seqWin. */
+{
+// Double the size of our padding on the left (but don't underflow seq)
+int padding = basesShifted * 2;
+if (padding > seqWin->start)
+    padding = seqWin->start;
+if (padding > 0)
+    {
+    uint newStart = seqWin->start - padding;
+    seqWin->fetch(seqWin, seqWin->seqName, newStart, seqWin->end);
+    }
+return padding;
+}
+
+static uint slideLeftOne(struct seqWindow *seqWin, uint start, int *pBasesShifted)
+/* Advance one base to the left; if we hit the beginning of the sequence window, fetch more. */
+{
+uint newStart = start - 1;
+*pBasesShifted += 1;
+if (newStart == 0)
+    newStart = fetchLeft(seqWin, *pBasesShifted);
+return newStart;
+}
+
+INLINE boolean notMax(int basesShifted, int maxShift)
+/* Return TRUE if we have not yet hit the limit or if there is no limit. */
+{
+return (maxShift <= INDEL_SHIFT_NO_MAX) || (basesShifted < maxShift);
+}
+
+static int slideLeft(struct seqWindow *seqWin, uint varStart, int refLen, char *alt, int maxShift)
+/* Slide left within seqWin as far as we can; if we still haven't found a mismatch then
+ * fetch more sequence and keep going.  Return the number of bases shifted.
+ * This can modify all params (except maxShift). */
+{
+int basesShifted = 0;
+int altLen = strlen(alt);
+// Relative coords within seqWin:
+uint start = varStart - seqWin->start;
+boolean done = FALSE;
+// If insertion, first compare inserted seq to genomic seq to the left of insertion point.
+int ix;
+for (ix = altLen-1;  !done && ix >= 0 && start > 0 && notMax(basesShifted, maxShift);  ix--)
+    {
+    if (seqWin->seq[start-1] == alt[ix])
+        start = slideLeftOne(seqWin, start, &basesShifted);
+    else
+        done = TRUE;
+    }
+// If we're not done, keep trying to shift left.
+while (!done && start > 0 && notMax(basesShifted, maxShift))
+    {
+    if (seqWin->seq[start-1] == seqWin->seq[start-1+refLen+altLen])
+        start = slideLeftOne(seqWin, start, &basesShifted);
+    else
+        done = TRUE;
+    }
+return basesShifted;
+}
+
+static void fetchRight(struct seqWindow *seqWin, int extraPadding, int basesShifted)
+/* We need to fetch more sequence to the right; update seqWin and p* params
+ * with the distance that we have shifted so far, and fetch a larger block of sequence.
+ * Return the number of bases by which we expanded to the right. */
+{
+// Double the size of our padding on the right
+int padding = basesShifted * 2 + extraPadding;
+uint newEnd = seqWin->end + padding;
+seqWin->fetch(seqWin, seqWin->seqName, seqWin->start, newEnd);
+}
+
+static uint slideRightOne(struct seqWindow *seqWin, uint start, int refLen, int altLen,
+                          int *pBasesShifted)
+/* Advance one base to the right; if we hit the end of the sequence window, fetch more. */
+{
+uint newStart = start + 1;
+*pBasesShifted += 1;
+if (newStart >= seqWin->end - seqWin->start - refLen - altLen)
+    fetchRight(seqWin, refLen+altLen, *pBasesShifted);
+return newStart;
+}
+
+static int slideRight(struct seqWindow *seqWin, uint varStart, int refLen, char *alt, int maxShift)
+/* Slide right within seqWin as far as we can; if we still haven't found a mismatch then
+ * fetch more sequence and keep going.  Return the number of bases shifted.
+ * This can modify all params (except maxShift). */
+{
+int basesShifted = 0;
+int altLen = strlen(alt);
+// Relative coords within seqWin:
+uint start = varStart - seqWin->start;
+boolean done = FALSE;
+// If insertion, first compare inserted seq to genomic seq to the right of insertion point.
+int ix;
+for (ix = 0; !done && ix < altLen && start < seqWin->end - seqWin->start - refLen &&
+             notMax(basesShifted, maxShift); ix++)
+    {
+    if (seqWin->seq[start] == alt[ix])
+        start = slideRightOne(seqWin, start, refLen, altLen, &basesShifted);
+    else
+        done = TRUE;
+    }
+// If we're not done, keep trying to shift right.
+while (!done && start < seqWin->end - seqWin->start - refLen && notMax(basesShifted, maxShift))
+    {
+    if (seqWin->seq[start - altLen] == seqWin->seq[start + refLen])
+        start = slideRightOne(seqWin, start, refLen, altLen, &basesShifted);
+    else
+        done = TRUE;
+    }
+return basesShifted;
+}
+
+int indelShift(struct seqWindow *seqWin, uint *pVarStart, uint *pVarEnd, char *alt,
+               int maxShift, enum indelShiftDirection direction)
+/* Shift variant as far left or right as possible in seqName, expanding seqWindow as necessary,
+ * updating pVarStart and pVarEnd, and possibly rotating the bases in alt (if non-empty).
+ * If maxShift > INDEL_SHIFT_NO_MAX, shift at most maxShift bases.
+ * Return the number of bases by which we shifted, usually 0.
+ * NOTES:
+ * - This can modify all params (except direction).  Recompute anything derived from them
+ *   if result is not 0.
+ * - seqWin must have the correct seqName for variant and ideally will already contain a
+ *   sufficiently large region around variant.
+ * - This works only on pure insertions or deletions; if original ref and alt have identical
+ *   bases at the beginning and/or end, trim those before passing into this function. */
+{
+int basesShifted = 0;
+int refLen = *pVarEnd - *pVarStart;
+if (refLen < 0)
+    errAbort("indelShift: variant end (%u) is less than variant start (%u)",
+             *pVarEnd, *pVarStart);
+int altLen = strlen(alt);
+if (indelShiftIsApplicable(refLen, altLen))
+    {
+    // It's a pure insertion or deletion; attempt to slide the variant left
+    expandWindowIfNecessary(seqWin, *pVarStart, refLen, altLen);
+    if (direction == isdLeft)
+        basesShifted = slideLeft(seqWin, *pVarStart, refLen, alt, maxShift);
+    else if (direction == isdRight)
+        basesShifted = slideRight(seqWin, *pVarStart, refLen, alt, maxShift);
+    else
+        errAbort("indelShift: invalid direction (%d), should be isdLeft or isdRight", direction);
+    if (basesShifted)
+        {
+        // Update coords and alt sequence (if non-empty)
+        int coordOffset = (direction == isdRight) ? basesShifted : -basesShifted;
+        *pVarStart += coordOffset;
+        *pVarEnd += coordOffset;
+        if (altLen > 0)
+            {
+            if (basesShifted >= altLen)
+                // All of alt is a shifted portion of the reference.
+                seqWindowCopy(seqWin, *pVarStart - (direction == isdLeft ? 0 : altLen), altLen,
+                              alt, altLen+1);
+            else if (direction == isdRight)
+                {
+                // The beginning of alt is a left-shifted portion of the original.  The remainder
+                // is from reference seq.  Do the shifting first, then the copying from ref.
+                int shiftedLen = altLen - basesShifted;
+                memmove(alt, alt+basesShifted, shiftedLen);
+                seqWindowCopy(seqWin, *pVarStart - basesShifted, basesShifted,
+                              alt+shiftedLen, basesShifted+1);
+                }
+            else
+                {
+                // The beginning of alt is from reference seq. The remainder is a right-shifted
+                // portion of the original.  Do the shifting first, then the copying from ref.
+                memmove(alt+basesShifted, alt, (altLen - basesShifted));
+                // Note: can't use seqWindowCopy because it zero-terminates.
+                char *refAlStart = seqWin->seq + (*pVarStart - seqWin->start);
+                memcpy(alt, refAlStart, basesShifted);
+                }
+            }
+        }
+    }
+return basesShifted;
+}
+