src/jkOwnLib/fuzzyFind.c 1.18

1.18 2009/08/20 22:31:46 kent
Initializing a field to zero which otherwise would be garbage.
Index: src/jkOwnLib/fuzzyFind.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/jkOwnLib/fuzzyFind.c,v
retrieving revision 1.17
retrieving revision 1.18
diff -b -B -U 1000000 -r1.17 -r1.18
--- src/jkOwnLib/fuzzyFind.c	9 May 2006 01:05:55 -0000	1.17
+++ src/jkOwnLib/fuzzyFind.c	20 Aug 2009 22:31:46 -0000	1.18
@@ -1,1522 +1,1523 @@
 /* Copyright 1999-2003 Jim Kent.  All rights reserved. */
 /* fuzzyFind - searches a large DNA sequence (the haystack) for 
  * a smaller DNA sequence (the needle).  What makes this tricky
  * is that the match need not be exact.
  *
  * The algorithm looks for exact matches to (typically ten) evenly 
  * spaced subsequences of the needle long enough for the match to
  * occur only once or less by chance in the haystack.  If
  * in fact they occur more than 3 times, the subsequence is
  * extended by a base.
  *
  * Next these ten-mers (called "tiles" in the code)
  * are extended as far as possible exactly.  Each tile at
  * this point may match the haystack in multiple places.
  *
  * Next the "weave" routines try to find a good 
  * way to link the tiles together.  Good is scored
  * by a routine which awards a point for a matching
  * base, and subtracts the log-base-two of gaps.  It
  * penalizes an additionallly if a tile is
  * placed before instead of after a previous tile.
  *
  * Now that a preliminary alignment exists, the algorithm
  * searches for exact matches between tiles. These matches
  * are constrained to be long enough that they only occur
  * once in the space between tiles.  These matches are 
  * extended as far as they can be exactly, and added as 
  * further tiles to the alignment list.
  *
  * Finally the tiles are extended inexactly until they
  * abutt each other, or until even inexact extension is
  * fruitless.  
  * 
  * The inexact extension is perhaps the trickiest part of the
  * algorithm.  It's implemented in expandLeft and expandRight.
  * In these the extension is first taken as far as it can go
  * exactly.  Then if four of the next five bases match it's
  * taken five bases further. Then a break is generated, and the
  * next significant match between the needle and haystack is
  * scanned for.
  */
 
 static char const rcsid[] = "$Id$";
 
 #include "common.h"
 #include "dnautil.h"
 #include "localmem.h"
 #include "errabort.h"
 #include "fuzzyFind.h"
 #include "obscure.h"
 
 /* ffMemory routines - 
  *    FuzzyFinder allocates memory internally from the
  *    following routines, which allocate blocks from the
  *    main allocator and then dole them out.
  *
  *    At the end of fuzzy-finding, the actual alignment
  *    is copied to more permanent memory, and all the
  *    blocks freed.  This saves from having to remember
  *    to free every little thing, and also is faster.
  *
  *    If memory can't be allocated this will throw
  *    back to the root of the fuzzy-finder system.
  *
  *    (In typical usage this only allocates about
  *    as much memory as the size of the DNA
  *    you're scanning though.)
  *    
  */
 
 /* debugging stuff. */
 void dumpFf(struct ffAli *left, DNA *needle, DNA *hay); 
 
 /* settable parameter, defaults to constant value */
 static jmp_buf ffRecover;
 
 static void ffAbort()
 /* Abort fuzzy finding. */
 {
 longjmp(ffRecover, -1);
 }
 
 static struct lm *ffMemPool = NULL;
 
 static void ffMemInit()
 /* Initialize fuzzyFinder local memory system. */
 {
 ffMemPool = lmInit(2048);
 }
 
 static void ffMemCleanup()
 /* Free up fuzzyFinder local memory system. */
 {
 lmCleanup(&ffMemPool);
 }
 
 static void *ffNeedMem(size_t size)
 /* Allocate from fuzzyFinder local memory system. */
 {
 return lmAlloc(ffMemPool, size);
 }
 
 static void makeFreqTable(DNA *dna, int dnaSize, double freq[4])
 {
 int histo[4];
 double total;
 int i;
 
 dnaBaseHistogram(dna, dnaSize, histo);
 total = histo[0] + histo[1] + histo[2] + histo[3];
 if (total == 0) total = 1;
 for (i=0; i<4; ++i)
     freq[i] = (double)histo[i] / total;
 }
 
 static double oligoProb(DNA *oligo, int size, double freq[4])
 {
 double prob = 1.0;
 int i;
 int baseVal;
 
 for (i=0; i<size; ++i)
     {
     if ((baseVal = ntVal[(int)oligo[i]]) >= 0)
         prob *= freq[baseVal];
     }
 return prob;
 }
 
 static boolean findImprobableOligo(DNA *needle, int needleLength, double maxProb, double freq[4],
     DNA **rOligo, int *rOligoLength, double *rOligoProb)
 /* Find an oligo that's got less than maxProb probability of matching to
  * random DNA with the given base frequency distribution. */
 {
 int i;
 double totalProb = 1.0;
 int base;
 int startIx = 0;
 
 for (i=0;i<needleLength;++i)
     {
     if ((base = ntVal[(int)needle[i]]) < 0)
         {
         totalProb = 1.0;
         startIx = i+1;
         }
     else
         {
         if ((totalProb *= freq[base]) <= maxProb)
             {
             *rOligo = needle+startIx;
             *rOligoLength = i-startIx+1;
             *rOligoProb = totalProb;
             return TRUE;
             }
         }
     }
 return FALSE;
 }
 
 
 static boolean hasRepeat(DNA *oligo, int oligoLen)
 /* Returns TRUE if oligo has an internal repeat mod 1, 2, or 3. */
 {
 int mod;
 int i;
 DNA b;
 boolean gotRepeat;
 int repSize;
 int maxRep = (oligoLen+1)/2;
 
 b = oligo[0];
 gotRepeat = TRUE;
 for (i=1; i<oligoLen; ++i)
     {
     if (oligo[i] != b)
         {
         gotRepeat = FALSE;
         break;
         }
     }
 if (gotRepeat)
     return TRUE;
 
 gotRepeat = TRUE;
 for (i=2; i<oligoLen; ++i)
     {
     if (oligo[i&1] != oligo[i])
         {
         gotRepeat = FALSE;
         break;
         }
     }
 if (gotRepeat)
     return TRUE;
 
 for (repSize = 3; repSize <= maxRep; ++repSize)
     {
     mod = 0;
     gotRepeat = TRUE;
     for (i=repSize; i<oligoLen; ++i)
         {
         if (oligo[mod] != oligo[i])
             {
             gotRepeat = FALSE;
             break;
             }
         if (++mod == repSize)
             mod = 0;
         }
     if (gotRepeat)
         return TRUE;
     }
 return gotRepeat;
 }
 
 
 
 static boolean ffFindGoodOligo(DNA *needle, int needleLength, double maxProb, double freq[4],
     DNA **rOligo, int *rOligoLength, double *rOligoProb)
 /* Find an oligo that's suitably improbable and doesn't contain 
  * short internal repeats. */
 {
 int oligoLen;
 DNA *oligo;
 
 /* Loop around until you get one that doesn't repeat. */
 for (;;)
     {
     if (!findImprobableOligo(needle, needleLength, maxProb, freq, rOligo, rOligoLength, rOligoProb))
         return FALSE;
     oligoLen = *rOligoLength;
     oligo = *rOligo;
     if (hasRepeat(oligo, oligoLen))
         {
         DNA *newNeedle = oligo+oligoLen;
         int newSize = needleLength - (newNeedle-needle);
         if (newSize <= 0)
             return FALSE;
         needle = newNeedle;
         needleLength = newSize;
         }
     else
         return TRUE;
     }
 }
 
 static boolean leftNextMatch(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, 
     int gapPenalty, int maxSkip)
 /* Scan to the left for something that matches the next bit of the needle
  * in the haystack. */
 {
 int haySize = he - hs;
 int needleSize = ne - ns;
 int diagSize = haySize + needleSize;
 int matchSize;
 int i;
 
 /* We take care of bigger skips on the "tile" level. */
 if (diagSize > maxSkip)
     diagSize = maxSkip;
 /* Scan diagonally... */
 /*
 0 1 2 3
 1 2 3 4
 2 3 4 5
 3 4 5 6
 */
 for (i=1; i<=diagSize; ++i)
     {
     int hOff = i;
     int nOff = 0;
     int hDiff = hOff - haySize;
     matchSize = gapPenalty + digitsBaseTwo(i);
     if (hDiff > 0)
         {
         nOff += hDiff;
         hOff -= hDiff;
         }
     for (;hOff >= 0; --hOff, ++nOff)
         {
         int needleLeft = needleSize - nOff;
         int hayLeft = haySize - hOff;
         if (matchSize > needleLeft) break;
         if (matchSize > hayLeft) continue;
         if (ne[-nOff-1] == he[-hOff-1] && memcmp(ne-nOff-matchSize, he-hOff-matchSize, matchSize) == 0)
             {
             ali->nStart = ne - nOff - matchSize;
             ali->nEnd = ne - nOff;
             ali->hStart = he - hOff - matchSize;
             ali->hEnd = he- hOff;
             return TRUE;
             }
         }
     }
 return FALSE;
 }
 
 
 static boolean rightNextMatch(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, 
     int gapPenalty, int maxSkip)
 /* Scan to the right for something that matches the next bit of the needle
  * in the haystack. */
 {
 int haySize = he - hs;
 int needleSize = ne - ns;
 int diagSize = haySize + needleSize;
 int matchSize;
 int i;
 
 /* We take care of bigger skips on the "tile" level. */
 if (diagSize > maxSkip)
     diagSize = maxSkip;
 /* Scan diagonally... */
 /*
 0 1 2 3
 1 2 3 4
 2 3 4 5
 3 4 5 6
 */
 for (i=1; i<=diagSize; ++i)
     {
     int hOff = i;
     int nOff = 0;
     int hDiff = hOff - haySize;
     matchSize = gapPenalty + digitsBaseTwo(i);
     if (hDiff > 0)
         {
         nOff += hDiff;
         hOff -= hDiff;
         }
     for (;hOff >= 0; --hOff, ++nOff)
         {
         int needleLeft = needleSize - nOff;
         int hayLeft = haySize - hOff;
         if (matchSize > needleLeft) break;
         if (matchSize > hayLeft) continue;
         if (ns[nOff] == hs[hOff] && memcmp(ns+nOff, hs+hOff, matchSize) == 0)
             {
             ali->nStart = ns + nOff;
             ali->nEnd = ns + nOff + matchSize;
             ali->hStart = hs + hOff;
             ali->hEnd = hs + hOff + matchSize;
             return TRUE;
             }
         }
     }
 return FALSE;
 }
 
 static boolean expandRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
     DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip);
 
 
 static boolean expandLeft(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
     DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip)
 /* Given a matching segment, try to expand the aligned parts to the
  * right. */
 {
 DNA *ns = ali->nStart;
 DNA *hs = ali->hStart;
 int score;
 DNA *oldNs = ns;
 
 for (;;)
     {
     while (ns > needleStart && hs > hayStart && ns[-1] == hs[-1])
         {
         --hs;
         --ns;
         }
     if (ns <= needleStart || hs <= hayStart)
         {
         ali->nStart = ns;
         ali->hStart = hs;
         return ns != oldNs;
         }
     /* Find fuzzy score to the left. */
         {
         int windowSize = 5;
         int nSize = ns-needleStart;
         int hSize = hs-hayStart;
         if (windowSize > nSize) windowSize = nSize;
         if (windowSize > hSize) windowSize = hSize;
         if (windowSize > 0)
             score = dnaScoreMatch(ns-windowSize, hs-windowSize, windowSize); 
         else
             score = -1;
 
         if (score >= windowSize-2)
             {
             hs -= windowSize;
             ns -= windowSize;
             }
         else if (--numSkips >= 0)
             {
             struct ffAli *newAli = ffNeedMem(sizeof(*newAli));
             ali->nStart = ns;
             ali->hStart = hs;
             if (ns - needleStart < 3 || 
                 !leftNextMatch(newAli, needleStart, ns, hayStart, hs, gapPenalty, maxSkip))
                 {
                 return ns != oldNs; /* We did our best... */
                 }
             newAli->right = ali;
             newAli->left = ali->left;
             if (ali->left)
                 ali->left->right = newAli;
             ali->left = newAli;
             ali = newAli;
             expandRight(ali, needleStart, ns, hayStart, hs, 0, gapPenalty, maxSkip);
             ns = ali->nStart;
             hs = ali->hStart;
             }
         else
             {
             ali->nStart = ns;
             ali->hStart = hs;
             return ns != oldNs;
             }
         }
     }
 }
 
 static boolean expandRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
     DNA *hayStart, DNA *hayEnd, int numSkips, int gapPenalty, int maxSkip)
 /* Given a matched segment, try to expand it to the right. */
 {
 int score;
 DNA *ne = ali->nEnd;
 DNA *he = ali->hEnd;
 DNA *oldNe = ne;
 
 for (;;)
     {
     /* Expand as far to the right as you can keeping perfect match */
     while (ne < needleEnd && he < hayEnd && *ne == *he)
         {
         ++he;
         ++ne;
         }
 
     /* Check to see if have come to the end. */
     if (ne >= needleEnd || he >= hayEnd)
         {
         ali->nEnd = ne;
         ali->hEnd = he;
         return oldNe != ne;
         }
 
     /* Find fuzzy score to the right. */
         {
         int windowSize = 5;
         int nSize = needleEnd-ne;
         int hSize = hayEnd - he;
         if (windowSize > nSize) windowSize = nSize;
         if (windowSize > hSize) windowSize = hSize;
         if (windowSize > 0)
             score = dnaScoreMatch(ne, he, windowSize); 
         else
             score = -1;
 
         if (score >= windowSize-2)
             {
             he += windowSize;
             ne += windowSize;
             }
         else if (--numSkips >= 0)
             {
             struct ffAli *newAli = ffNeedMem(sizeof(*newAli));
             ali->nEnd = ne;
             ali->hEnd = he;
             if (needleEnd - ne < 3 || 
                 !rightNextMatch(newAli, ne, needleEnd, he, hayEnd, gapPenalty, maxSkip))
                 {
                 return oldNe != ne; /* We did our best... */
                 }
             newAli->left = ali;
             newAli->right = ali->right;
             if (ali->right)
                 ali->right->left = newAli;
             ali->right = newAli;
             ali = newAli;
             expandLeft(ali, ne, needleEnd, he, hayEnd, 0, gapPenalty, maxSkip);
             ne = ali->nEnd;
             he = ali->hEnd;
             }
         else 
             {
             ali->nEnd = ne;
             ali->hEnd = he;
             return oldNe != ne;
             }
         }
     }
 }
 
 static boolean exactFind(DNA *needle, int nSize, DNA *hay, int hSize, int *hOffset)
 /* Look for exact match of needle in haystack. */
 {
 DNA firstC = needle[0];
 DNA *restOfNeedle = needle+1;
 int i;
 int endIx = hSize - nSize;
 int restSize = nSize-1;
 
 for (i = 0; i<=endIx; ++i)
     {
     if (firstC == hay[i])
         {
 	if (memcmp(restOfNeedle, hay+i+1, restSize) == 0)
 	    {
 	    *hOffset = i;
 	    return TRUE;
 	    }
 	}
     }
 *hOffset = -1;
 return FALSE;
 }
 
 #ifdef UNUSED
 static int countAlis(struct ffAli *ali)
 /* Count number of blocks in alignment. */
 {
 int count = 0;
 if (ali == NULL)
     return 0;
 while (ali->left)
     ali = ali->left;
 while (ali)
     {
     count += 1;
     ali = ali->right;
     }
 return count;
 }
 #endif /* UNUSED */
 
 
 static struct ffAli *reconsiderAlignedGaps(struct ffAli *ali)
 /* If the gap between two blocks is the same in both needle and
  * haystack, see if we're actually better off with the two
  * blocks merged together. */
 {
 struct ffAli *a = NULL;
 struct ffAli *left = NULL;
 
 if (ali == NULL)
     return NULL;
 a = ali;
 for (;;)
     {
     int gapSize;
 
     /* Advance to next ali */
     left = a;
     a = a->right;
     if (a == NULL)
         break;
     gapSize = a->nStart - left->nEnd;
     if (gapSize == a->hStart - left->hEnd)
         {
         int gapScore;
         int matchScore;
         gapScore = -ffCdnaGapPenalty(left, a);
         matchScore = dnaScoreMatch(left->nEnd, left->hEnd, gapSize);
         if (matchScore > gapScore)
             {
             /* Make current cover left. RemoveEmpty will take
              * care of empty shell of left later. */
             a->hStart = left->hEnd = left->hStart;
             a->nStart = left->nEnd = left->nStart;
             }
         }
     }
 return ali;
 }
 
 static struct ffAli *trimAlis(struct ffAli *aliList)
 /* If ends of an ali don't match trim them off. */
 {
 struct ffAli *ali;
 int trimCount = 0;
 for (ali = aliList; ali != NULL; ali = ali->right)
     {
     while (ali->nStart[0] != ali->hStart[0] && ali->nStart < ali->nEnd)
         {
         ali->nStart += 1;
         ali->hStart += 1;
         ++trimCount;
         }
     while (ali->nEnd[-1] != ali->hEnd[-1] && ali->nEnd > ali->nStart)
         {
         ali->nEnd -= 1;
         ali->hEnd -= 1;
         ++trimCount;
         }
     }
 return aliList;
 }
 
 static int extendThroughN;  /* Can extend through blocks of N's? */
 
 void setFfExtendThroughN(boolean val)
 /* Set whether or not can extend through N's. */
 {
 extendThroughN = val;
 }
 
 boolean expandThroughNRight(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
     DNA *hayStart, DNA *hayEnd)
 /* Expand through up to three N's to the left. */
 {
 DNA *nEnd = ali->nEnd;
 DNA *hEnd = ali->hEnd;
 DNA n,h;
 boolean expanded = FALSE;
 while (nEnd < needleEnd && hEnd < hayEnd)
     {
     n = *nEnd;
     h = *hEnd;
     if ((n == h) || 
     	(n == 'n' && 
     	(extendThroughN || nEnd + 3 >= needleEnd || nEnd[1] != 'n' || nEnd[2] != 'n' || nEnd[3] != 'n')) ||
 	(h == 'n' && 
 	(extendThroughN || hEnd + 3 >= hayEnd || hEnd[1] != 'n' || hEnd[2] != 'n' || hEnd[3] != 'n')))
         {
         nEnd += 1;
         hEnd += 1;
         expanded = TRUE;
         }
     else
         break;
     }
 ali->nEnd = nEnd;
 ali->hEnd = hEnd;
 return expanded;
 }
 
 boolean expandThroughNLeft(struct ffAli *ali, DNA *needleStart, DNA *needleEnd,
     DNA *hayStart, DNA *hayEnd)
 /* Expand through up to three N's to the left. */
 {
 DNA *nStart = ali->nStart-1;
 DNA *hStart = ali->hStart-1;
 DNA n,h;
 boolean expanded = FALSE;
 while (nStart >= needleStart && hStart >= hayStart)
     {
     n = *nStart;
     h = *hStart;
     if ((n == h) ||
 	(n == 'n' && (extendThroughN || nStart - 3 < needleStart || nStart[-1] != 'n' || nStart[-2] != 'n' || nStart[-3] != 'n')) ||
 	(h == 'n' && (extendThroughN || hStart - 3 < hayStart || hStart[-1] != 'n' || hStart[-2] != 'n' || hStart[-3] != 'n')))
         {
         nStart -= 1;
         hStart -= 1;
         expanded = TRUE;
         }
     else
         break;
     }
 ali->nStart = nStart + 1;
 ali->hStart = hStart + 1;
 return expanded;
 }
 
 static struct ffAli *expandAlis(struct ffAli *ali, DNA *nStart, DNA *nEnd, DNA *hStart, DNA *hEnd, 
     int gapPenalty, int maxSkip)
 /* Expand alignment to cover in-between tiles as well. */
 {
 struct ffAli *a, *left, *right;
 DNA *ns, *ne, *hs, *he;
 boolean expanded = TRUE;
 
 while (expanded)
     {
     expanded = FALSE;
     /* Loop through expanding three times. */
     /* First just expand through N's that don't require an insertion/deletion. */
     for (a = ali; a != NULL; a = a->right)
         {
         if ((left = a->left) != NULL)
             {
             ns = left->nEnd;
             hs = left->hEnd;
             }
         else
             {
             ns = nStart;
             hs = hStart;
             }
         if ((right = a->right) != NULL)
             {
             ne = right->nStart;
             he = right->hStart;
             }
         else
             {
             ne = nEnd;
             he = hEnd;
             }
         expanded |= expandThroughNLeft(a, ns, ne, hs, he);
         expanded |= expandThroughNRight(a, ns, ne, hs, he);
         }
     /* Second do other expansion that doesn't require an insertion/deletion. */
     for (a = ali; a != NULL; a = a->right)
         {
         if ((left = a->left) != NULL)
             {
             ns = left->nEnd;
             hs = left->hEnd;
             }
         else
             {
             ns = nStart;
             hs = hStart;
             }
         if ((right = a->right) != NULL)
             {
             ne = right->nStart;
             he = right->hStart;
             }
         else
             {
             ne = nEnd;
             he = hEnd;
             }
         expanded |= expandLeft(a, ns, ne, hs, he, 0, gapPenalty, maxSkip);
         expanded |= expandRight(a, ns, ne, hs, he, 0, gapPenalty, maxSkip);
         }
     /* Finally do insertion/deletion. */
     for (a = ali; a != NULL; a = a->right)
         {
         if ((left = a->left) != NULL)
             {
             ns = left->nEnd;
             hs = left->hEnd;
             }
         else
             {
             ns = nStart;
             hs = hStart;
             }
         if ((right = a->right) != NULL)
             {
             ne = right->nStart;
             he = right->hStart;
             }
         else
             {
             ne = nEnd;
             he = hEnd;
             }
         expanded |= expandLeft(a, ns, ne, hs, he, 1, gapPenalty, maxSkip);
         expanded |= expandRight(a, ns, ne, hs, he, 1, gapPenalty, maxSkip);
         }
     while (ali->left)
         ali = ali->left;
     }
 return ali;
 }
 
 
 
 DNA *matchInMem(DNA *ns, DNA *ne, DNA *hs, DNA *he)
 {
 int nLen = ne - ns;
 DNA c = *ns++;
 he -= nLen;
 nLen -= 1;
 while (hs < he)
     {   
     if (*hs++ == c && memcmp(ns, hs, nLen) == 0)
         {
         return hs-1;
         }
     }
 return NULL;
 }
 
 struct ffAli *findAliBetween(DNA *tile, int tileSize, DNA *ns, DNA *ne, DNA *hs, DNA *he)
 {
 DNA *match;
 DNA *tileEnd = tile + tileSize;
 int tries = 0;
 for (;;)
     {
     if ((match = matchInMem(tile, tileEnd, hs, he)) == NULL)
         return NULL;
     if (matchInMem(tile, tileEnd, match+1, he) == NULL)
         {
         /* Got exactly one match, whoopie! */
         struct ffAli *ali = ffNeedMem(sizeof(*ali));
         ali->nStart = tile;
         ali->nEnd = tileEnd;
         ali->hStart = match;
         ali->hEnd = match + (tileEnd-tile);
         ffExpandExactLeft(ali, ns, hs);
         ffExpandExactRight(ali, ne, he);
         return ali;
         }
 
     /* Got more than one match.  Expand tile to make it more specific.
      * In general first add base to start, then base to beginning.  If
      * at beginning or end already are constrained.  If already spanning
      * full needle, can't expand, so you're done. */
     if (tile <= ns)
         {
         if (tileEnd >= ne)
             return NULL;
         ++tileEnd;
         }
     else if (tile >= tileEnd)
         {
         --tile;
         }
     else if (tries & 1)
         {
         ++tileEnd;
         }
     else
         {
         --tile;
         }    
     ++tries;
     }    
 }
 
 double evalExactAli(struct ffAli *ali, DNA *ns, DNA *ne, DNA *hs, DNA *he, int numTiles,
     double freq[4])
 {
 int haySize = he-hs;
 double prob = 1.0;
 double allPossibles = haySize*numTiles;
 
 for (;ali != NULL;ali=ali->right)
     {
     double p = oligoProb(ali->nStart, ali->nEnd - ali->nStart, freq) * allPossibles;
     if (p < 1.0)
         prob *= p;
     }
 return prob;
 }
 
 void ffUnlink(struct ffAli **pList, struct ffAli *el)
 /* Unlink element from doubly linked list. If leftmost
  * element update list pointer. */
 {
 struct ffAli *list = *pList;
 struct ffAli *right = el->right;
 struct ffAli *left = el->left;
 
 if (el == list)    /* If first element need to update list. */
     *pList = right;
 if (right != NULL)
     right->left = left;
 if (left != NULL)
     left->right = right;
 el->left = el->right = NULL;
 }
 
 void unlinkAli(struct ffAli **pList, struct ffAli *ali)
 /* Unlink ali from list. */
 {
 ffUnlink(pList, ali);
 }
 
 struct protoGene
     {
     struct protoGene *left;
     struct protoGene *right;
     struct ffAli *hits;
     DNA *hStart, *hEnd, *nStart, *nEnd;
     int score;
     };
 
 int cmpProtoSize(const void *va, const void *vb)
 /* Sort biggest size first. */
 {
 const struct protoGene *a = *((struct protoGene **)va);
 const struct protoGene *b = *((struct protoGene **)vb);
 int aSize, bSize;
 aSize = a->nEnd - a->nStart;
 bSize = b->nEnd - b->nStart;
 return (bSize - aSize);
 }
 
 int cmpProtoNeedle(const void *va, const void *vb)
 /* Sort smallest offset into needle first. */
 {
 const struct protoGene *a = *((struct protoGene **)va);
 const struct protoGene *b = *((struct protoGene **)vb);
 return (a->nStart - b->nStart);
 }
 
 int cmpProtoScore(const void *va, const void *vb)
 /* Sort biggest score first. */
 {
 const struct protoGene *a = *((struct protoGene **)va);
 const struct protoGene *b = *((struct protoGene **)vb);
 return (b->score - a->score);
 }
 
 boolean canAdd(struct protoGene *a, struct protoGene *b)
 /* Can b fit into a?  It can if it doesn't overlap
  * by more than 1/4 with what's already there. */
 {
 struct ffAli *pa;
 DNA *bnEnd = b->nEnd;
 DNA *bnStart = b->nStart;
 DNA *bhEnd = b->hEnd;
 DNA *bhStart = b->hStart;
 int bSize = bnEnd - bnStart;
 int maxOverlap;
 for (pa = a->hits; pa != NULL; pa = pa->right)
     {
     int aSize = pa->nEnd - pa->nStart;
     DNA *start = (bnStart > pa->nStart ? bnStart : pa->nStart);
     DNA *end = (bnEnd < pa->nEnd ? bnEnd : pa->nEnd);
     maxOverlap = (aSize < bSize ? aSize : bSize) / 4;
     if (maxOverlap < 2) maxOverlap = 2;
     if (end - start >= maxOverlap)
        return FALSE;
     start = (bhStart > pa->hStart ? bhStart : pa->hStart);
     end = (bhEnd < pa->hEnd ? bhEnd : pa->hEnd);
     if (end - start >= maxOverlap)
         return FALSE;
     }
 return TRUE;
 }
 
 boolean bestMerger(struct protoGene *list, enum ffStringency stringency,
     DNA *ns, DNA *hs, struct protoGene **retA, struct protoGene **retB)
 /* Figure out best scoring merger in list. */
 {
 int bestScore = -0x7fffffff;
 int score;
 struct protoGene *bestA = NULL, *bestB = NULL;
 struct protoGene *a, *b;
 boolean isCdna = (stringency == ffCdna);
 
 if (list == NULL)
     return FALSE;
 
 for (a = list; a != NULL; a = a->right)
     {
     for (b = a->right; b != NULL; b = b->right)
         {
         if (canAdd(a,b))
             {
             int hGap = b->hStart - a->hEnd;
             int nGap = b->nStart - a->nEnd;
             if (hGap < 0)
                 {
                 hGap = -8*hGap;
                 if (!isCdna || hGap < 32)
                     hGap = (hGap*hGap);
                 }
             if (nGap < 0)
                 nGap = -8*nGap;
             score = -hGap - nGap*nGap;
             if (score > bestScore)
                 {
                 bestA = a;
                 bestB = b;
                 bestScore = score;
                 }
             }
         }
     }
 *retA = bestA;
 *retB = bestB;
 return bestA != NULL;
 }
 
 void mergeProtoGenes(struct protoGene **pList, struct protoGene *a, struct protoGene *b)
 /* Absorb protoGene b into protoGene a. */
 {
 ffUnlink((struct ffAli**)pList, (struct ffAli *)b);
 ffCat(&a->hits, &b->hits);
 if (a->hStart > b->hStart)
     a->hStart = b->hStart;
 if (a->nStart > b->nStart)
     a->nStart = b->nStart;
 if (a->hEnd < b->hEnd)
     a->hEnd = b->hEnd;
 if (a->nEnd < b->nEnd)
     a->nEnd = b->nEnd;
 }
 
 void lumpProtoGenes(struct protoGene **pList,
     DNA *ns, DNA *hs, enum ffStringency stringency)
 /* Lump together as many protogenes as we can. */
 {
 struct protoGene *a, *b;
 
 while (bestMerger(*pList, stringency, ns, hs, &a, &b))
     {
     mergeProtoGenes(pList, a, b);
     }
 }
 
 struct protoGene *lumpHits(struct ffAli **pHitList, DNA *ns, DNA *hs)
 /* Lump together as many hits as can. Criteria - they must be close
  * to same "diagonal." That is the distance between them must be
  * nearly the same in the needle and the haystack. */
 {
 struct protoGene proto;
 struct protoGene *retProto;
 struct ffAli *ali, *nextAli;
 int dif, lastDif;
 int lumpCount = 1;
 
 if ((ali = *pHitList) == NULL)
     return NULL;
 
 /* Move first hit onto our crude exon. */
 nextAli = ali->right;
 proto.left = proto.right = NULL;
 unlinkAli(pHitList, ali);
 proto.hits = ali;
 proto.hStart = ali->hStart;
 proto.hEnd = ali->hEnd;
 proto.nStart = ali->nStart;
 proto.nEnd = ali->nEnd;
+proto.score = 0;
 lastDif = ali->hStart - ali->nStart;
 
 /* Go through rest of list and put others that are close to
  * on the same diagonal onto this exon. */
 while ((ali = nextAli) != NULL)
     {
     nextAli = ali->right;
     dif = ali->hStart - ali->nStart;
     if (lastDif - 2 <= dif && dif <= lastDif + 2)
         {
         lastDif = dif;
         unlinkAli(pHitList, ali);
         ali->left = proto.hits;
         proto.hits = ali;
         proto.hEnd = ali->hEnd;
         proto.nEnd = ali->nEnd;
         ++lumpCount;
         }
     }
 proto.hits = ffMakeRightLinks(proto.hits);
 retProto = ffNeedMem(sizeof(*retProto));
 memcpy(retProto, &proto, sizeof(*retProto));
 return retProto;
 }
 
 #ifdef OLD
 void removeThrowbackHits(struct protoGene *proto)
 /* Remove hits that cause backtracking in hay coordinates. */
 {
 struct ffAli *left, *right;
 boolean gotThrowback = FALSE;
 
 right = proto->hits;
 for (;;)
     {
     left = right;
     right = right->right;
     if (right == NULL)
         break;
     if (left->hStart > right->hStart)
         {
 	right->hStart = right->hEnd = left->hEnd;
 	right->nStart = right->nEnd = left->nEnd;
 	gotThrowback = TRUE;
         }
     }
 if (gotThrowback)
     proto->hits = ffRemoveEmptyAlis(proto->hits, FALSE);
 }
 #endif /* OLD */
 
 
 void thinProtoList(struct protoGene **pList, int maxToTake)
 /* Reduce list to only top scoring members.  At this
  * point protoList is singly linked on left. */
 {
 struct protoGene *newList = NULL, *pg, *next;
 int i;
 
 slSort(pList, cmpProtoScore);
 for (pg=*pList, i=0; pg != NULL && i < maxToTake; pg = next, ++i)
     {
     next = pg->left;
     pg->left = newList;
     newList = pg;
     }
 *pList = newList;
 }
 
 static struct ffAli *weaveAli(struct ffAli *hitList, 
     DNA *ns, DNA *ne, DNA *hs, DNA *he, double freq[4], int *rBestVal, 
     enum ffStringency stringency)
 /* Weave together best looking alignment out of the hitList list. */
 {
 struct ffAli *ali, *lastAli;
 struct protoGene *proto, *protoList = NULL;
 int protoCount = 0;
 
 /* Sort initial hits and remove duplicates. */
 ffAliSort(&hitList, ffCmpHitsHayFirst);
 ali = hitList;
 for (;;)
     {
     lastAli = ali;
     if ((ali = ali->right) == NULL)
         break;
     if (ali->hStart == lastAli->hStart && ali->hEnd == lastAli->hEnd
        && ali->nStart == lastAli->nStart && ali->nEnd == lastAli->nEnd)
        {
        unlinkAli(&hitList, lastAli);
        }
     }
 
 /* Lump together things which look to be separated only by small
  * amounts of noise. */
 while ((proto = lumpHits(&hitList, ns, hs)) != NULL)
     {
     proto->left = protoList;
     protoList = proto;
     ++protoCount;
     }
 if (protoCount > 200)
     {
     thinProtoList(&protoList, 200);
     }
 protoList = (struct protoGene *)ffMakeRightLinks((struct ffAli*)protoList);
 
 
 /* Lump together any other ones that can, starting with ones
  * that go together best. */
 ffAliSort((struct ffAli**)(void*)&protoList, cmpProtoNeedle);
 lumpProtoGenes(&protoList, ns, hs, stringency );
 for (proto = protoList; proto != NULL; proto = proto->right)
     {
     ffAliSort((struct ffAli**)&proto->hits, ffCmpHitsNeedleFirst);
 /*    removeThrowbackHits(proto); */
     proto->score = ffScore(proto->hits,stringency);
     }
 
 /* Sort by score and return the best. */
 ffAliSort((struct ffAli **)(void*)&protoList, cmpProtoScore);
 *rBestVal = protoList->score;
 return protoList->hits;
 }
 
 /* This set of variables is set before calling the recursive tile finders - the
  * below two routines. */
 static double rwFreq[4];
 static boolean rwIsCdna;
 static boolean rwCheckGoodEnough;
 
 static struct ffAli *rwFindTilesBetween(DNA *ns, DNA *ne, DNA *hs, DNA *he, 
     enum ffStringency stringency, double probMax)
 /* Search for more or less regularly spaced exact matches that are
  * in the right order. */
 {
 int searchOffset;
 int endTileOffset;
 int haySize = he - hs;
 int needleSize = ne - ns;
 int numTiles = 0;
 int bestWeaveVal;
 int tileIx;
 struct ffAli *hitList = NULL;
 struct ffAli *bestAli;
 struct ffAli *ali;
 int possibleTiles;
 double tileProbOne;
 
 possibleTiles = (haySize - nextPowerOfFour(needleSize));
 if (possibleTiles < 1) possibleTiles = 1;
 tileProbOne = probMax/possibleTiles;
 
 searchOffset = 0;
 endTileOffset = 0;
 tileIx = 0;
 for (;;)
     {
     DNA *tile;
     int tileSize;
     double tileProb;
     DNA *h = hs;
 
     searchOffset = endTileOffset;
     if (!ffFindGoodOligo(ns+searchOffset, needleSize-searchOffset, tileProbOne,
         rwFreq, &tile, &tileSize, &tileProb))
         {
         break;
         }
     /* Find all parts that match the tile. */
     for (;;)
         {
         if ((h = matchInMem(tile, tile+tileSize, h, he)) == NULL)
             break;
         ali = ffNeedMem(sizeof(*ali));
         ali->hStart = h;
         ali->hEnd = ali->hStart + tileSize;
         ali->nStart = tile;
         ali->nEnd = tile + tileSize;
         ali->left = hitList;
         hitList = ali;
         h += tileSize;
         } 
     endTileOffset = tile + tileSize - ns;
     ++numTiles;
     }
 if (hitList == NULL)
     return NULL;
 
 hitList = ffMakeRightLinks(hitList);
 for (ali = hitList; ali != NULL; ali = ali->right)
     {
     ffExpandExactLeft(ali, ns, hs);
     ffExpandExactRight(ali, ne, he);
     }
 bestAli = weaveAli(hitList, ns, ne, hs, he, rwFreq, &bestWeaveVal, stringency);
 if (rwCheckGoodEnough)
     {
     double prob;
     prob = evalExactAli(bestAli, ns, ne, hs, he, numTiles, rwFreq);
     if (prob > 0.1)
         return NULL;
     rwCheckGoodEnough = FALSE;
     }
 
 return bestAli;
 }
 
 static struct ffAli *exactAli(DNA *ns, DNA *ne, DNA *hs, DNA *he)
 /* Return alignment based on exact match. */
 {
 int exactOffset;
 if (exactFind(ns, ne-ns, hs, he-hs, &exactOffset))
     {
     struct ffAli *ali = ffNeedMem(sizeof(*ali));
     ali->nStart = ns;
     ali->nEnd = ne;
     ali->hStart = hs + exactOffset;
     ali->hEnd = ali->hStart + (ne - ns);
     if (ali->hEnd > he)
         ali->hEnd = he;
     return ali;
     }
 else
     return NULL;
 }
 
 static struct ffAli *recursiveWeave(DNA *ns, DNA *ne, DNA *hs, DNA *he, 
     enum ffStringency stringency, double probMax, int level, int orientation)
 /* Find a set of tiles, then recurse to find set of tiles between the tiles
  * at somewhat lower stringency. */
 {
 struct ffAli *left = NULL, *right = NULL, *aliList;
 
 if ((left = exactAli(ns, ne, hs, he)) != NULL)
     return left;
 if (stringency == ffExact)
     return NULL;
 
 aliList = rwFindTilesBetween(ns, ne, hs, he, stringency, probMax);
 if (aliList != NULL)
     {
     DNA *lne, *rns, *lhe, *rhs;
     int ndif, hdif;
     right = aliList;
     if (orientation == 0)
         orientation = ffIntronOrientation(aliList);
     for (;;)
         {
         /* Figure out the end points to recurse to. */
         if (left != NULL)
             {
             lne = left->nEnd;
             lhe = left->hEnd;
             }
         else
             {
             lne = ns;
             lhe = hs;
             }
         if (right != NULL)
             {
             rns = right->nStart;
             rhs = right->hStart;
             }
         else
             {
             rns = ne;
             rhs = he;
             }
         ndif = rns-lne;
         hdif = rhs-lhe;
         /* If a big enough gap left recurse. */
         if (ndif >= 5 && hdif >= 5)
             {
             struct ffAli *newLeft = NULL, *newRight;
 
 	    newLeft = recursiveWeave(lne, rns, lhe, rhs, stringency, probMax*2, level+1, 
 		orientation);
             if (newLeft != NULL)
                 {
                 /* Insert new tiles between left and right. */
                 
                 /* Find right end of tile set. */
                 newRight = newLeft;
                 while (newRight->right != NULL)
                     newRight = newRight->right;
                 
                 
                 if (left != NULL)
                     {
                     left->right = newLeft;
                     newLeft->left = left;
                     }
                 else
                     {
                     aliList = newLeft;
                     }
                 if (right != NULL)
                     {
                     right->left = newRight;
                     newRight->right = right;
                     }
                 }
             }
         if ((left = right) == NULL)
             break;
         right = right->right;
         }
     }
 return aliList;
 }
 
 
 static struct ffAli *findWovenTiles(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency)
 {
 struct ffAli *bestAli;
 int haySize = he - hs;
 int needleSize = ne - ns;
 static double tileStrinProbMult[] = { 0.0001, 0.0005, 0.0005, 0.5, };
                                     /* exact  cDNA        tight    loose */
 if (needleSize < 2 || haySize < 2)  /* Be serious man! */
     return NULL;
 
 /* Set up rwVariables - essentially locals except that they don't change over the course
  * of the recursion. */
 makeFreqTable(hs, haySize, rwFreq);
 rwIsCdna = (stringency == ffCdna);
 rwCheckGoodEnough = (stringency == ffTight || stringency == ffCdna);
 
 bestAli = recursiveWeave(ns, ne, hs, he, stringency, tileStrinProbMult[stringency], 1, 0);
 
 return bestAli;
 }
 
 
 static struct ffAli *findBestAli(DNA *ns, DNA *ne, DNA *hs, DNA *he, enum ffStringency stringency)
 {
 static int iniExpGapPen[] = {0 /* (exact) */, 4 /* cDna */, 4 /* tight */, 4 /* loose */};
 static int addExpGapPen[] = {0 /* (exact) */, 3 /* cDna */, 3 /* tight */, 3 /* loose */};
 static int midTileMinSize[] ={0 /*(exact) */,12 /* cDNA */, 12 /* tight */, 4 /* loose */};
 
 struct ffAli *bestAli;
 int matchSize;
 
 matchSize = nextPowerOfFour(he-hs)+1;
 if (matchSize < midTileMinSize[stringency])
     matchSize = midTileMinSize[stringency];
 
 bestAli = findWovenTiles(ns, ne, hs, he, stringency);
 if (bestAli == NULL)
     return NULL;
 
 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
 bestAli = expandAlis(bestAli,ns,ne,hs,he,iniExpGapPen[stringency], 1);
 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
 bestAli = expandAlis(bestAli,ns,ne,hs,he,addExpGapPen[stringency], 2*matchSize);
 bestAli = trimAlis(bestAli);
 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
 bestAli = ffMergeHayOverlaps(bestAli);
 bestAli = reconsiderAlignedGaps(bestAli);
 bestAli = ffRemoveEmptyAlis(bestAli, FALSE);
 bestAli = ffMergeNeedleAlis(bestAli, FALSE);
 if (stringency == ffCdna)
     ffSlideIntrons(bestAli);
 bestAli = ffRemoveEmptyAlis(bestAli, FALSE);
 return bestAli;
 }
 
 
 static struct ffAli *saveAliToPermanentMem(struct ffAli *volatileAli)
 /* Save alignment to memory that doesn't get thrown away. */
 {
 struct ffAli *leftList = NULL;
 struct ffAli *newAli, *ali;
 struct ffAli *rightList = NULL;
 
 /* Allocate memory, copy contents and set left pointer. */
 for (ali = volatileAli; ali != NULL; ali = ali->right)
     {
     newAli = needMem(sizeof(*newAli));
     if (newAli == NULL)
         {
         slFreeList(leftList);
         ffAbort();
         }
     memcpy(newAli, ali, sizeof(*newAli));
     newAli->left = leftList;
     leftList = newAli;
     }
 
 /* Set right pointer. */
 for (ali = leftList; ali != NULL; ali = ali->left)
     {
     ali->right = rightList;
     rightList = ali;
     }
 return rightList;
 }
 
 struct ffAli *ffFind(DNA *needleStart, DNA *needleEnd, DNA *hayStart, DNA *hayEnd,
      enum ffStringency stringency)
 /* Return an alignment of needle in haystack. */
 {
 struct ffAli *bestAli;
 int status;
 
 assert(needleStart <= needleEnd);
 assert(hayStart <= hayEnd);
 
 ffMemInit();
 dnaUtilOpen();
 
 
 /* Set up error recovery. */
 status = setjmp(ffRecover);
 if (status == 0)    /* Always true except after long jump. */
     {
     pushAbortHandler(ffAbort);
     bestAli = findBestAli(needleStart, needleEnd, hayStart, hayEnd, stringency);
     ffCountGoodEnds(bestAli);
     bestAli = saveAliToPermanentMem(bestAli);
     }
 else    /* They long jumped here because of an error. */
     {
     bestAli = NULL;
     }
 popAbortHandler();
 ffMemCleanup();
 return bestAli;
 }
 
 boolean ffFindAndScore(DNA *needle, int needleSize, DNA *haystack, int haySize,
     enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle, int *pScore)
 /* Return TRUE if find an allignment using needle, or reverse complement of 
  * needle to search haystack. DNA must be lower case. If pScore is non-NULL returns
  * score of alignment. */
 {
 int fScore, rScore;
 struct ffAli *fAli, *rAli;
 
 /* Get forward alignment and score it. */
 fAli = ffFind(needle, needle+needleSize, haystack, haystack+haySize, stringency);
 fScore = ffScore(fAli,stringency);
 
 /* Get reverse alignment and score it. */
 reverseComplement(needle, needleSize);
 rAli = ffFind(needle, needle+needleSize, haystack, haystack+haySize, stringency);
 rScore = ffScore(rAli,stringency);
 reverseComplement(needle, needleSize);
 
 /* If no good alignment on either strand return FALSE. */
 if (fAli == NULL && rAli == NULL)
     {
     *pAli = NULL;
     return FALSE;
     }
 
 /* Return TRUE with best alignment.  Free the other one. */
 if (fScore > rScore)
     {
     *pAli = fAli;
     *pRcNeedle = FALSE;
     if (pScore != NULL)
         *pScore = fScore;
     ffFreeAli(&rAli);
     }   
 else
     {
     *pAli = rAli;
     *pRcNeedle = TRUE;
     if (pScore != NULL)
         *pScore = rScore;
     ffFreeAli(&fAli);
     }
 return TRUE;
 }
 
 boolean ffFindEitherStrandN(DNA *needle, int needleSize, DNA *haystack, int haySize,
     enum ffStringency stringency, struct ffAli **pAli, boolean *pRcNeedle)
 /* Return TRUE if find an allignment using needle, or reverse complement of 
  * needle to search haystack. DNA must be lower case. */
 {
 return ffFindAndScore(needle, needleSize, haystack, haySize, stringency, pAli, pRcNeedle, NULL);
 }
 
 boolean ffFindEitherStrand(DNA *needle, DNA *haystack, enum ffStringency stringency,
     struct ffAli **pAli, boolean *pRcNeedle)
 /* Return TRUE if find an alignment using needle, or reverse complement of 
  * needle to search haystack. DNA must be lower case. Needle and haystack
  * are zero terminated. */
 {
 return ffFindEitherStrandN(needle, strlen(needle), haystack, strlen(haystack),
 stringency, pAli, pRcNeedle);
 }