61647b42c0de4b0d11abf375cd713cb28758a954 angie Wed Aug 9 12:53:14 2017 -0700 New lib modules in support of adding HGVS output to hgVai: - seqWindow: generic interface to fetch portions of a sequence - indelShift: slide to the left or right of an ambiguous alignment range (e.g. AG to AGAGAG) - variantProjector: use PSL+CDS to project a genomic variant to a transcript variant; project transcript variant to protein variant refs #19968 diff --git src/hg/lib/indelShift.c src/hg/lib/indelShift.c new file mode 100644 index 0000000..873b4ae --- /dev/null +++ src/hg/lib/indelShift.c @@ -0,0 +1,217 @@ +/* indelShift.c - find the leftmost or rightmost possible location of an insertion/deletion + * variant with ambiguous placement, e.g. 'AAA' -> 'AAAA' -- where is the extra A inserted? */ + +/* Copyright (C) 2017 The Regents of the University of California + * See README in this or parent directory for licensing information. */ +#include "common.h" +#include "dnaseq.h" +#include "indelShift.h" + +boolean indelShiftIsApplicable(int refLen, int altLen) +/* Return true if it makes sense to try to shift this variant, i.e. if it's pure ins or del. */ +{ +return ((refLen == 0 && altLen > 0) || // insertion into reference + (refLen > 0 && altLen == 0)); // deletion from reference +} + +void indelShiftRangeForVariant(uint varStart, int refLen, int altLen, + uint *retStart, uint *retEnd) +/* Given a variant start position and the lengths of reference and alternate alleles, + * set retStart and retEnd to a range around varStart suitable for shifting around. + * Caller must truncate retEnd to sequence size if necessary. */ +{ +int padding = max(128, 2*refLen+altLen); +*retStart = (varStart > padding) ? varStart - padding : 0; +*retEnd = varStart + refLen + altLen + padding; +} + +static void expandWindowIfNecessary(struct seqWindow *seqWin, uint varStart, int refLen, int altLen) +/* Make sure there's enough padding to start a sliding-window search around variant. */ +{ +uint maxStart, minEnd; +indelShiftRangeForVariant(varStart, refLen, altLen, &maxStart, &minEnd); +seqWin->fetch(seqWin, seqWin->seqName, maxStart, minEnd); +} + +static int fetchLeft(struct seqWindow *seqWin, int basesShifted) +/* We need to fetch more sequence to the left; update seqWin and p* params + * with the distance that we have shifted so far, and fetch a larger block of sequence. + * Return the new offset of the variant start within seqWin. */ +{ +// Double the size of our padding on the left (but don't underflow seq) +int padding = basesShifted * 2; +if (padding > seqWin->start) + padding = seqWin->start; +if (padding > 0) + { + uint newStart = seqWin->start - padding; + seqWin->fetch(seqWin, seqWin->seqName, newStart, seqWin->end); + } +return padding; +} + +static uint slideLeftOne(struct seqWindow *seqWin, uint start, int *pBasesShifted) +/* Advance one base to the left; if we hit the beginning of the sequence window, fetch more. */ +{ +uint newStart = start - 1; +*pBasesShifted += 1; +if (newStart == 0) + newStart = fetchLeft(seqWin, *pBasesShifted); +return newStart; +} + +INLINE boolean notMax(int basesShifted, int maxShift) +/* Return TRUE if we have not yet hit the limit or if there is no limit. */ +{ +return (maxShift <= INDEL_SHIFT_NO_MAX) || (basesShifted < maxShift); +} + +static int slideLeft(struct seqWindow *seqWin, uint varStart, int refLen, char *alt, int maxShift) +/* Slide left within seqWin as far as we can; if we still haven't found a mismatch then + * fetch more sequence and keep going. Return the number of bases shifted. + * This can modify all params (except maxShift). */ +{ +int basesShifted = 0; +int altLen = strlen(alt); +// Relative coords within seqWin: +uint start = varStart - seqWin->start; +boolean done = FALSE; +// If insertion, first compare inserted seq to genomic seq to the left of insertion point. +int ix; +for (ix = altLen-1; !done && ix >= 0 && start > 0 && notMax(basesShifted, maxShift); ix--) + { + if (seqWin->seq[start-1] == alt[ix]) + start = slideLeftOne(seqWin, start, &basesShifted); + else + done = TRUE; + } +// If we're not done, keep trying to shift left. +while (!done && start > 0 && notMax(basesShifted, maxShift)) + { + if (seqWin->seq[start-1] == seqWin->seq[start-1+refLen+altLen]) + start = slideLeftOne(seqWin, start, &basesShifted); + else + done = TRUE; + } +return basesShifted; +} + +static void fetchRight(struct seqWindow *seqWin, int extraPadding, int basesShifted) +/* We need to fetch more sequence to the right; update seqWin and p* params + * with the distance that we have shifted so far, and fetch a larger block of sequence. + * Return the number of bases by which we expanded to the right. */ +{ +// Double the size of our padding on the right +int padding = basesShifted * 2 + extraPadding; +uint newEnd = seqWin->end + padding; +seqWin->fetch(seqWin, seqWin->seqName, seqWin->start, newEnd); +} + +static uint slideRightOne(struct seqWindow *seqWin, uint start, int refLen, int altLen, + int *pBasesShifted) +/* Advance one base to the right; if we hit the end of the sequence window, fetch more. */ +{ +uint newStart = start + 1; +*pBasesShifted += 1; +if (newStart >= seqWin->end - seqWin->start - refLen - altLen) + fetchRight(seqWin, refLen+altLen, *pBasesShifted); +return newStart; +} + +static int slideRight(struct seqWindow *seqWin, uint varStart, int refLen, char *alt, int maxShift) +/* Slide right within seqWin as far as we can; if we still haven't found a mismatch then + * fetch more sequence and keep going. Return the number of bases shifted. + * This can modify all params (except maxShift). */ +{ +int basesShifted = 0; +int altLen = strlen(alt); +// Relative coords within seqWin: +uint start = varStart - seqWin->start; +boolean done = FALSE; +// If insertion, first compare inserted seq to genomic seq to the right of insertion point. +int ix; +for (ix = 0; !done && ix < altLen && start < seqWin->end - seqWin->start - refLen && + notMax(basesShifted, maxShift); ix++) + { + if (seqWin->seq[start] == alt[ix]) + start = slideRightOne(seqWin, start, refLen, altLen, &basesShifted); + else + done = TRUE; + } +// If we're not done, keep trying to shift right. +while (!done && start < seqWin->end - seqWin->start - refLen && notMax(basesShifted, maxShift)) + { + if (seqWin->seq[start - altLen] == seqWin->seq[start + refLen]) + start = slideRightOne(seqWin, start, refLen, altLen, &basesShifted); + else + done = TRUE; + } +return basesShifted; +} + +int indelShift(struct seqWindow *seqWin, uint *pVarStart, uint *pVarEnd, char *alt, + int maxShift, enum indelShiftDirection direction) +/* Shift variant as far left or right as possible in seqName, expanding seqWindow as necessary, + * updating pVarStart and pVarEnd, and possibly rotating the bases in alt (if non-empty). + * If maxShift > INDEL_SHIFT_NO_MAX, shift at most maxShift bases. + * Return the number of bases by which we shifted, usually 0. + * NOTES: + * - This can modify all params (except direction). Recompute anything derived from them + * if result is not 0. + * - seqWin must have the correct seqName for variant and ideally will already contain a + * sufficiently large region around variant. + * - This works only on pure insertions or deletions; if original ref and alt have identical + * bases at the beginning and/or end, trim those before passing into this function. */ +{ +int basesShifted = 0; +int refLen = *pVarEnd - *pVarStart; +if (refLen < 0) + errAbort("indelShift: variant end (%u) is less than variant start (%u)", + *pVarEnd, *pVarStart); +int altLen = strlen(alt); +if (indelShiftIsApplicable(refLen, altLen)) + { + // It's a pure insertion or deletion; attempt to slide the variant left + expandWindowIfNecessary(seqWin, *pVarStart, refLen, altLen); + if (direction == isdLeft) + basesShifted = slideLeft(seqWin, *pVarStart, refLen, alt, maxShift); + else if (direction == isdRight) + basesShifted = slideRight(seqWin, *pVarStart, refLen, alt, maxShift); + else + errAbort("indelShift: invalid direction (%d), should be isdLeft or isdRight", direction); + if (basesShifted) + { + // Update coords and alt sequence (if non-empty) + int coordOffset = (direction == isdRight) ? basesShifted : -basesShifted; + *pVarStart += coordOffset; + *pVarEnd += coordOffset; + if (altLen > 0) + { + if (basesShifted >= altLen) + // All of alt is a shifted portion of the reference. + seqWindowCopy(seqWin, *pVarStart - (direction == isdLeft ? 0 : altLen), altLen, + alt, altLen+1); + else if (direction == isdRight) + { + // The beginning of alt is a left-shifted portion of the original. The remainder + // is from reference seq. Do the shifting first, then the copying from ref. + int shiftedLen = altLen - basesShifted; + memmove(alt, alt+basesShifted, shiftedLen); + seqWindowCopy(seqWin, *pVarStart - basesShifted, basesShifted, + alt+shiftedLen, basesShifted+1); + } + else + { + // The beginning of alt is from reference seq. The remainder is a right-shifted + // portion of the original. Do the shifting first, then the copying from ref. + memmove(alt+basesShifted, alt, (altLen - basesShifted)); + // Note: can't use seqWindowCopy because it zero-terminates. + char *refAlStart = seqWin->seq + (*pVarStart - seqWin->start); + memcpy(alt, refAlStart, basesShifted); + } + } + } + } +return basesShifted; +} +