b31907d700c1fe956e4e4c20e64d91de027d7c84
markd
  Tue May 14 02:03:33 2024 -0700
merge blatHuge implementation

diff --git src/jkOwnLib/patSpace.c src/jkOwnLib/patSpace.c
index 80f1de7..2e73ecc 100644
--- src/jkOwnLib/patSpace.c
+++ src/jkOwnLib/patSpace.c
@@ -1,463 +1,463 @@
 /* patSpace - a homology finding algorithm that occurs mostly in 
  * pattern space (as opposed to offset space). */
 /* Copyright 1999-2003 Jim Kent.  All rights reserved. */
 #include "common.h"
 #include "portable.h"
 #include "dnaseq.h"
 #include "ooc.h"
 #include "patSpace.h"
 
 
 #define blockSize (256)
 
 #define BIGONE	/* define this for 32 bit version. */
 
 #ifdef BIGONE
 
 #define maxBlockCount (2*230*1024 - 1)
 #define psBits bits32
 /* psBits is the size of an index word.  If 16 bits
  * patSpace will use less memory, but be limited to
  * 16 meg or less genome size. */
 
 #else /* BIGONE */
 
 #define maxBlockCount (64*1024 - 1)
 #define psBits bits16
 /* psBits is the size of an index word.  If 16 bits
  * patSpace will use less memory, but be limited to
  * 16 meg or less genome size. */
 
 #endif /* BIGONE */
 
 
 #define maxPatCount (1024*16)    /* Maximum allowed count for one pattern. */
 
 struct blockPos
 /* Structure that stores where in a BAC we are - 
  * essentially an unpacked block index. */
     {
     bits16 bacIx;          /* Which bac. */
     bits16 seqIx;          /* Which subsequence in bac. */
     struct dnaSeq *seq; /* Pointer to subsequence. */
     int offset;         /* Start position of block within subsequence. */
     int size;           /* Size of block within subsequence. */
     };
 
 struct patSpace
 /* A pattern space - something that holds an index of all 10-mers in
  * genome. */
     {
     psBits **lists;                      /* A list for each 10-mer */
     psBits *listSizes;                   /* Size of list for each 10-mer */
     psBits *allocated;                   /* Storage space for all lists. */
     int    blocksUsed;			 /* Number of blocks used, <= maxBlockCount */
     int posBuf[maxBlockCount];           /* Histogram of hits to each block. */
     int hitBlocks[maxBlockCount];        /* Indexes of blocks with hits. */
     struct blockPos blockPos[maxBlockCount]; /* Relates block to sequence. */
     psBits maxPat;                       /* Max # of times pattern can occur
                                           * before it is ignored. */
     int minMatch;                        /* Minimum number of tile hits needed
                                           * to trigger a clump hit. */
     int maxGap;                          /* Max gap between tiles in a clump. */
     int seedSize;			 /* Size of alignment seed. */
     int seedSpaceSize;                    /* Number of possible seed values. */
     };
 
 void freePatSpace(struct patSpace **pPatSpace)
 /* Free up a pattern space. */
 {
 struct patSpace *ps = *pPatSpace;
 if (ps != NULL)
     {
     freeMem(ps->allocated);
     freez(pPatSpace);
     }
 }
 
 void tooManyBlocks()
 /* Complain about too many blocks and abort. */
 {
 errAbort("Too many blocks, can only handle %d\n", maxBlockCount);
 }
 
 static void fixBlockSize(struct blockPos *bp, int blockIx)
 /* Update size field from rest of blockPos. */
 {
 struct dnaSeq *seq = bp->seq;
 int size = seq->size - bp->offset;
 if (size > blockSize)
 	size = blockSize;
 bp->size = size;
 }
 
 static struct patSpace *newPatSpace(int minMatch, int maxGap, int seedSize)
 /* Return an empty pattern space. */
 {
 struct patSpace *ps;
 int seedBitSize = seedSize*2;
 int seedSpaceSize;
 
 AllocVar(ps);
 ps->seedSize = seedSize;
 seedSpaceSize = ps->seedSpaceSize = (1<<seedBitSize);
 ps->lists = needLargeZeroedMem(seedSpaceSize * sizeof(ps->lists[0]));
 ps->listSizes = needLargeZeroedMem(seedSpaceSize * sizeof(ps->listSizes[0]));
 ps->minMatch = minMatch;
 ps->maxGap = maxGap;
 return ps;
 }
 
 static void countPatSpace(struct patSpace *ps, struct dnaSeq *seq)
 /* Add up number of times each pattern occurs in sequence into ps->listSizes. */
 {
 int size = seq->size;
 int mask = ps->seedSpaceSize-1;
 DNA *dna = seq->dna;
 int i;
 int bits = 0;
 int bVal;
 int ls;
 
 for (i=0; i<ps->seedSize-1; ++i)
     {
     bVal = ntValNoN[(int)dna[i]];
     bits <<= 2;
     bits += bVal;
     }
 for (i=ps->seedSize-1; i<size; ++i)
     {
     bVal = ntValNoN[(int)dna[i]];
     bits <<= 2;
     bits += bVal;
     bits &= mask;
     ls = ps->listSizes[bits];
     if (ls < maxPatCount)
         ps->listSizes[bits] = ls+1;
     }
 }
 
 static int allocPatSpaceLists(struct patSpace *ps)
 /* Allocate pat space lists and set up list pointers. 
  * Returns size of all lists. */
 {
 int oneCount;
 int count = 0;
 int i;
 psBits *listSizes = ps->listSizes;
 psBits **lists = ps->lists;
 psBits *allocated;
 psBits maxPat = ps->maxPat;
 int size;
 int usedCount = 0, overusedCount = 0;
 int seedSpaceSize = ps->seedSpaceSize;
 
 for (i=0; i<seedSpaceSize; ++i)
     {
     /* If pattern is too much used it's no good to us, ignore. */
     if ((oneCount = listSizes[i]) < maxPat)
         {
         count += oneCount;
         usedCount += 1;
         }
     else
         {
         overusedCount += 1;
         }
     }
 ps->allocated = allocated = needLargeMem(count*sizeof(allocated[0]));
 for (i=0; i<seedSpaceSize; ++i)
     {
     if ((size = listSizes[i]) < maxPat)
         {
         lists[i] = allocated;
         allocated += size;
         }
     }
 return count;
 }
 
 static int addToPatSpace(struct patSpace *ps, int bacIx, int seqIx, struct dnaSeq *seq,int startBlock)
 /* Add contents of one sequence to pattern space. Returns end block. */
 {
 int size = seq->size;
 int mask = ps->seedSpaceSize-1;
 DNA *dna = seq->dna;
 int i;
 int bits = 0;
 int bVal;
 int blockMod = blockSize;
 int curCount;
 psBits maxPat = ps->maxPat;
 psBits *listSizes = ps->listSizes;
 psBits **lists = ps->lists;
 struct blockPos *bp = &ps->blockPos[startBlock];
 
 bp->bacIx = bacIx;
 bp->seqIx = seqIx;
 bp->seq = seq;
 bp->offset = 0;
 fixBlockSize(bp, startBlock);
 ++bp;
 for (i=0; i<ps->seedSize-1; ++i)
     {
     bVal = ntValNoN[(int)dna[i]];
     bits <<= 2;
     bits += bVal;
     }
 for (i=ps->seedSize-1; i<size; ++i)
     {
     bVal = ntValNoN[(int)dna[i]];
     bits <<= 2;
     bits += bVal;
     bits &= mask;
     if ((curCount = listSizes[bits]) < maxPat)
         {
         listSizes[bits] = curCount+1;
         lists[bits][curCount] = startBlock;
         }
     if (--blockMod == 0)
         {
         if (++startBlock >= maxBlockCount)
 	    tooManyBlocks();
         blockMod = blockSize;
 	bp->bacIx = bacIx;
 	bp->seqIx = seqIx;
 	bp->seq = seq;
 	bp->offset = i - (ps->seedSize-1) + 1;
 	fixBlockSize(bp, startBlock);
 	++bp;
         }
     }
 for (i=0; i<ps->seedSize-1; ++i)
     {
     if (--blockMod == 0)
         {
         if (++startBlock >= maxBlockCount)
 	    tooManyBlocks();
         blockMod = blockSize;
         }
     }
 if (blockMod != blockSize)
     ++startBlock;
 return startBlock;
 }
 
 struct patSpace *makePatSpace(
     struct dnaSeq **seqArray,       /* Array of sequence lists. */
     int seqArrayCount,              /* Size of above array. */
     int seedSize,	            /* Alignment seed size - 10 or 11. Must match oocFileName */
     char *oocFileName,              /* File with tiles to filter out, may be NULL. */
     int minMatch,                   /* Minimum # of matching tiles.  4 is good. */
     int maxGap                      /* Maximum gap size - 32k is good for 
                                        cDNA (introns), 500 is good for DNA assembly. */
     )
 /* Allocate a pattern space and fill from sequences.  (Each element of
    seqArray is a list of sequences. */
 {
 struct patSpace *ps = newPatSpace(minMatch, maxGap,seedSize);
 int i;
 int startIx = 0;
 int total = 0;
 struct dnaSeq *seq;
 psBits maxPat;
-psBits *listSizes;
+bits32 *listSizes;
 int seedSpaceSize = ps->seedSpaceSize;
 
 maxPat = ps->maxPat = maxPatCount;
 for (i=0; i<seqArrayCount; ++i)
     {
     for (seq = seqArray[i]; seq != NULL; seq = seq->next)
         {
         total += seq->size;
         countPatSpace(ps, seq);
         }
     }
 
 listSizes = ps->listSizes;
 
 /* Scan through over-popular patterns and set their count to value 
  * where they won't be added to pat space. */
 oocMaskCounts(oocFileName, listSizes, ps->seedSize, maxPat);
 
 /* Get rid of simple repeats as well. */
 oocMaskSimpleRepeats(listSizes, ps->seedSize, maxPat);
 
 
 allocPatSpaceLists(ps);
 
 /* Zero out pattern counts that aren't oversubscribed. */
 for (i=0; i<ps->seedSpaceSize; ++i)
     {
     if (listSizes[i] < maxPat)
         listSizes[i] = 0;
     }
 for (i=0; i<seqArrayCount; ++i)
     {
 	int j;
     for (seq = seqArray[i], j=0; seq != NULL; seq = seq->next, ++j)
         {
         startIx = addToPatSpace(ps, i, j, seq, startIx);
         if (startIx >= maxBlockCount)
 	    tooManyBlocks();
         }
     }
 ps->blocksUsed = startIx;
 
 /* Zero local over-popular patterns. */
 for (i=0; i<seedSpaceSize; ++i)
     {
     if (listSizes[i] >= maxPat)
         listSizes[i] = 0;
     }
 
 return ps;
 }
 
 
 void bfFind(struct patSpace *ps, int block, struct blockPos *ret)
 /* Find dna sequence and position within sequence that corresponds to block. */
 {
 assert(block >= 0 && block < maxBlockCount);
 *ret = ps->blockPos[block];
 }
 
 
 static struct patClump *newClump(struct patSpace *ps, struct blockPos *first, struct blockPos *last)
 /* Make new clump . */
 {
 struct dnaSeq *seq = first->seq;
 int seqIx = first->seqIx;
 int bacIx = first->bacIx;
 int start = first->offset;
 int end = last->offset+last->size;
 int extraAtEnds = blockSize/2;
 struct patClump *cl;
 
 start -= extraAtEnds;
 if (start < 0)
     start = 0;
 end += extraAtEnds;
 if (end >seq->size)
     end = seq->size;
 AllocVar(cl);
 cl->bacIx = bacIx;
 cl->seqIx = seqIx;
 cl->seq = seq;
 cl->start = start;
 cl->size = end-start;
 return cl;
 }
 
 
 static struct patClump *clumpHits(struct patSpace *ps,
     int hitBlocks[], int hitCount, int posBuf[], 
     DNA *dna, int dnaSize)
 /* Clumps together hits and returns a list. */
 {
 /* Clump together hits. */
 int block;
 int i;
 int maxGap = ps->maxGap;
 struct blockPos first, cur, pre;
 struct patClump *patClump = NULL, *cl;
 
 bfFind(ps, hitBlocks[0], &first);
 pre = first;
 for (i=1; i<hitCount; ++i)
     {
     block = hitBlocks[i];
     bfFind(ps, block, &cur);
     if (cur.seq != pre.seq || cur.offset - (pre.offset + pre.size) > maxGap)
         {
         /* Write old clump and start new one. */
         cl = newClump(ps, &first, &pre);
         slAddHead(&patClump, cl);
         first = cur;
         }
     else
         {
         /* Extend old clump. */
         }
     pre = cur;
     }
 /* Write hitOut last clump. */
 cl = newClump(ps, &first, &pre);
 slAddHead(&patClump, cl);
 slReverse(&patClump);
 return patClump;
 }
 
 struct patClump *patSpaceFindOne(struct patSpace *ps, DNA *dna, int dnaSize)
 /* Find occurrences of DNA in patSpace. */
 {
 int lastStart = dnaSize - ps->seedSize;
 int i,j;
 int pat;
 int hitBlockCount = 0;
 int totalSigHits = 0;
 DNA *tile = dna;
 int blocksUsed = ps->blocksUsed;
 int *posBuf = ps->posBuf;
 int *hitBlocks = ps->hitBlocks;
 int minMatch = ps->minMatch;
 
 memset(ps->posBuf, 0, sizeof(ps->posBuf[0]) * blocksUsed);
 for (i=0; i<=lastStart; i += ps->seedSize)
     {
     psBits *list;
     psBits count;
 
     pat = 0;
     for (j=0; j<ps->seedSize; ++j)
         {
         int bVal = ntValNoN[(int)tile[j]];
         pat <<= 2;
         pat += bVal;
         }
     list = ps->lists[pat];
     if ((count = ps->listSizes[pat]) > 0)
         {
         for (j=0; j<count; ++j)
             posBuf[list[j]] += 1;            
         }
     tile += ps->seedSize;
     }
 
 /* Scan through array that records counts of hits at positions. */
 for (i=0; i<blocksUsed-1; ++i)
     {
     /* Save significant hits in a more compact array */
     int a = posBuf[i], b = posBuf[i+1];
     int sum = a + b;
     if (sum >= minMatch)
         {
         if (a > 0)
             {
             if (hitBlockCount == 0 || hitBlocks[hitBlockCount-1] != i)
                 {
                 hitBlocks[hitBlockCount++] = i;
                 totalSigHits += a;
                 }
             }
         if (b > 0)
             {
             hitBlocks[hitBlockCount++] = i+1;
             totalSigHits += b;
             }
         }
     }
 
 /* Output data with significant hits. */
 if (hitBlockCount > 0 && totalSigHits*ps->seedSize*8 > dnaSize)
     {
     return clumpHits(ps, hitBlocks, hitBlockCount, posBuf, dna, 
         dnaSize);
     }        
 else
     return NULL;
 }