c3c65fde6dd5aa6f20860c7113eb9ee22cf35b96 markd Wed Jan 15 08:37:28 2020 -0800 Initial pass at 64bit blat index diff --git src/jkOwnLib/patSpace.c src/jkOwnLib/patSpace.c index 80f1de7..88b29d5 100644 --- src/jkOwnLib/patSpace.c +++ src/jkOwnLib/patSpace.c @@ -1,463 +1,463 @@ /* patSpace - a homology finding algorithm that occurs mostly in * pattern space (as opposed to offset space). */ /* Copyright 1999-2003 Jim Kent. All rights reserved. */ #include "common.h" #include "portable.h" #include "dnaseq.h" #include "ooc.h" #include "patSpace.h" #define blockSize (256) #define BIGONE /* define this for 32 bit version. */ #ifdef BIGONE #define maxBlockCount (2*230*1024 - 1) -#define psBits bits32 +#define psBits bits64 /* psBits is the size of an index word. If 16 bits * patSpace will use less memory, but be limited to * 16 meg or less genome size. */ #else /* BIGONE */ #define maxBlockCount (64*1024 - 1) #define psBits bits16 /* psBits is the size of an index word. If 16 bits * patSpace will use less memory, but be limited to * 16 meg or less genome size. */ #endif /* BIGONE */ #define maxPatCount (1024*16) /* Maximum allowed count for one pattern. */ struct blockPos /* Structure that stores where in a BAC we are - * essentially an unpacked block index. */ { bits16 bacIx; /* Which bac. */ bits16 seqIx; /* Which subsequence in bac. */ struct dnaSeq *seq; /* Pointer to subsequence. */ int offset; /* Start position of block within subsequence. */ int size; /* Size of block within subsequence. */ }; struct patSpace /* A pattern space - something that holds an index of all 10-mers in * genome. */ { psBits **lists; /* A list for each 10-mer */ psBits *listSizes; /* Size of list for each 10-mer */ psBits *allocated; /* Storage space for all lists. */ int blocksUsed; /* Number of blocks used, <= maxBlockCount */ int posBuf[maxBlockCount]; /* Histogram of hits to each block. */ int hitBlocks[maxBlockCount]; /* Indexes of blocks with hits. */ struct blockPos blockPos[maxBlockCount]; /* Relates block to sequence. */ psBits maxPat; /* Max # of times pattern can occur * before it is ignored. */ int minMatch; /* Minimum number of tile hits needed * to trigger a clump hit. */ int maxGap; /* Max gap between tiles in a clump. */ int seedSize; /* Size of alignment seed. */ int seedSpaceSize; /* Number of possible seed values. */ }; void freePatSpace(struct patSpace **pPatSpace) /* Free up a pattern space. */ { struct patSpace *ps = *pPatSpace; if (ps != NULL) { freeMem(ps->allocated); freez(pPatSpace); } } void tooManyBlocks() /* Complain about too many blocks and abort. */ { errAbort("Too many blocks, can only handle %d\n", maxBlockCount); } static void fixBlockSize(struct blockPos *bp, int blockIx) /* Update size field from rest of blockPos. */ { struct dnaSeq *seq = bp->seq; int size = seq->size - bp->offset; if (size > blockSize) size = blockSize; bp->size = size; } static struct patSpace *newPatSpace(int minMatch, int maxGap, int seedSize) /* Return an empty pattern space. */ { struct patSpace *ps; int seedBitSize = seedSize*2; int seedSpaceSize; AllocVar(ps); ps->seedSize = seedSize; seedSpaceSize = ps->seedSpaceSize = (1<lists = needLargeZeroedMem(seedSpaceSize * sizeof(ps->lists[0])); ps->listSizes = needLargeZeroedMem(seedSpaceSize * sizeof(ps->listSizes[0])); ps->minMatch = minMatch; ps->maxGap = maxGap; return ps; } static void countPatSpace(struct patSpace *ps, struct dnaSeq *seq) /* Add up number of times each pattern occurs in sequence into ps->listSizes. */ { int size = seq->size; int mask = ps->seedSpaceSize-1; DNA *dna = seq->dna; int i; int bits = 0; int bVal; int ls; for (i=0; iseedSize-1; ++i) { bVal = ntValNoN[(int)dna[i]]; bits <<= 2; bits += bVal; } for (i=ps->seedSize-1; ilistSizes[bits]; if (ls < maxPatCount) ps->listSizes[bits] = ls+1; } } static int allocPatSpaceLists(struct patSpace *ps) /* Allocate pat space lists and set up list pointers. * Returns size of all lists. */ { int oneCount; int count = 0; int i; psBits *listSizes = ps->listSizes; psBits **lists = ps->lists; psBits *allocated; psBits maxPat = ps->maxPat; int size; int usedCount = 0, overusedCount = 0; int seedSpaceSize = ps->seedSpaceSize; for (i=0; iallocated = allocated = needLargeMem(count*sizeof(allocated[0])); for (i=0; isize; int mask = ps->seedSpaceSize-1; DNA *dna = seq->dna; int i; int bits = 0; int bVal; int blockMod = blockSize; int curCount; psBits maxPat = ps->maxPat; psBits *listSizes = ps->listSizes; psBits **lists = ps->lists; struct blockPos *bp = &ps->blockPos[startBlock]; bp->bacIx = bacIx; bp->seqIx = seqIx; bp->seq = seq; bp->offset = 0; fixBlockSize(bp, startBlock); ++bp; for (i=0; iseedSize-1; ++i) { bVal = ntValNoN[(int)dna[i]]; bits <<= 2; bits += bVal; } for (i=ps->seedSize-1; i= maxBlockCount) tooManyBlocks(); blockMod = blockSize; bp->bacIx = bacIx; bp->seqIx = seqIx; bp->seq = seq; bp->offset = i - (ps->seedSize-1) + 1; fixBlockSize(bp, startBlock); ++bp; } } for (i=0; iseedSize-1; ++i) { if (--blockMod == 0) { if (++startBlock >= maxBlockCount) tooManyBlocks(); blockMod = blockSize; } } if (blockMod != blockSize) ++startBlock; return startBlock; } struct patSpace *makePatSpace( struct dnaSeq **seqArray, /* Array of sequence lists. */ int seqArrayCount, /* Size of above array. */ int seedSize, /* Alignment seed size - 10 or 11. Must match oocFileName */ char *oocFileName, /* File with tiles to filter out, may be NULL. */ int minMatch, /* Minimum # of matching tiles. 4 is good. */ int maxGap /* Maximum gap size - 32k is good for cDNA (introns), 500 is good for DNA assembly. */ ) /* Allocate a pattern space and fill from sequences. (Each element of seqArray is a list of sequences. */ { struct patSpace *ps = newPatSpace(minMatch, maxGap,seedSize); int i; int startIx = 0; int total = 0; struct dnaSeq *seq; psBits maxPat; psBits *listSizes; int seedSpaceSize = ps->seedSpaceSize; maxPat = ps->maxPat = maxPatCount; for (i=0; inext) { total += seq->size; countPatSpace(ps, seq); } } listSizes = ps->listSizes; /* Scan through over-popular patterns and set their count to value * where they won't be added to pat space. */ oocMaskCounts(oocFileName, listSizes, ps->seedSize, maxPat); /* Get rid of simple repeats as well. */ oocMaskSimpleRepeats(listSizes, ps->seedSize, maxPat); allocPatSpaceLists(ps); /* Zero out pattern counts that aren't oversubscribed. */ for (i=0; iseedSpaceSize; ++i) { if (listSizes[i] < maxPat) listSizes[i] = 0; } for (i=0; inext, ++j) { startIx = addToPatSpace(ps, i, j, seq, startIx); if (startIx >= maxBlockCount) tooManyBlocks(); } } ps->blocksUsed = startIx; /* Zero local over-popular patterns. */ for (i=0; i= maxPat) listSizes[i] = 0; } return ps; } void bfFind(struct patSpace *ps, int block, struct blockPos *ret) /* Find dna sequence and position within sequence that corresponds to block. */ { assert(block >= 0 && block < maxBlockCount); *ret = ps->blockPos[block]; } static struct patClump *newClump(struct patSpace *ps, struct blockPos *first, struct blockPos *last) /* Make new clump . */ { struct dnaSeq *seq = first->seq; int seqIx = first->seqIx; int bacIx = first->bacIx; int start = first->offset; int end = last->offset+last->size; int extraAtEnds = blockSize/2; struct patClump *cl; start -= extraAtEnds; if (start < 0) start = 0; end += extraAtEnds; if (end >seq->size) end = seq->size; AllocVar(cl); cl->bacIx = bacIx; cl->seqIx = seqIx; cl->seq = seq; cl->start = start; cl->size = end-start; return cl; } static struct patClump *clumpHits(struct patSpace *ps, int hitBlocks[], int hitCount, int posBuf[], DNA *dna, int dnaSize) /* Clumps together hits and returns a list. */ { /* Clump together hits. */ int block; int i; int maxGap = ps->maxGap; struct blockPos first, cur, pre; struct patClump *patClump = NULL, *cl; bfFind(ps, hitBlocks[0], &first); pre = first; for (i=1; i maxGap) { /* Write old clump and start new one. */ cl = newClump(ps, &first, &pre); slAddHead(&patClump, cl); first = cur; } else { /* Extend old clump. */ } pre = cur; } /* Write hitOut last clump. */ cl = newClump(ps, &first, &pre); slAddHead(&patClump, cl); slReverse(&patClump); return patClump; } struct patClump *patSpaceFindOne(struct patSpace *ps, DNA *dna, int dnaSize) /* Find occurrences of DNA in patSpace. */ { int lastStart = dnaSize - ps->seedSize; int i,j; int pat; int hitBlockCount = 0; int totalSigHits = 0; DNA *tile = dna; int blocksUsed = ps->blocksUsed; int *posBuf = ps->posBuf; int *hitBlocks = ps->hitBlocks; int minMatch = ps->minMatch; memset(ps->posBuf, 0, sizeof(ps->posBuf[0]) * blocksUsed); for (i=0; i<=lastStart; i += ps->seedSize) { psBits *list; psBits count; pat = 0; for (j=0; jseedSize; ++j) { int bVal = ntValNoN[(int)tile[j]]; pat <<= 2; pat += bVal; } list = ps->lists[pat]; if ((count = ps->listSizes[pat]) > 0) { for (j=0; jseedSize; } /* Scan through array that records counts of hits at positions. */ for (i=0; i= minMatch) { if (a > 0) { if (hitBlockCount == 0 || hitBlocks[hitBlockCount-1] != i) { hitBlocks[hitBlockCount++] = i; totalSigHits += a; } } if (b > 0) { hitBlocks[hitBlockCount++] = i+1; totalSigHits += b; } } } /* Output data with significant hits. */ if (hitBlockCount > 0 && totalSigHits*ps->seedSize*8 > dnaSize) { return clumpHits(ps, hitBlocks, hitBlockCount, posBuf, dna, dnaSize); } else return NULL; }