c3c65fde6dd5aa6f20860c7113eb9ee22cf35b96 markd Wed Jan 15 08:37:28 2020 -0800 Initial pass at 64bit blat index diff --git src/jkOwnLib/genoFind.c src/jkOwnLib/genoFind.c index ae05696..c6d2a82 100644 --- src/jkOwnLib/genoFind.c +++ src/jkOwnLib/genoFind.c @@ -217,31 +217,31 @@ return -1; tile += aa; } return tile; } static void gfCountSeq(struct genoFind *gf, bioSeq *seq) /* Add all N-mers in seq. */ { char *poly = seq->dna; int tileSize = gf->tileSize; int stepSize = gf->stepSize; int tileHeadSize = gf->tileSize - gf->segSize; int maxPat = gf->maxPat; int tile; -bits32 *listSizes = gf->listSizes; +gfOffset *listSizes = gf->listSizes; int i, lastTile = seq->size - tileSize; int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile); initNtLookup(); for (i=0; i<=lastTile; i += stepSize) { if ((tile = makeTile(poly, tileHeadSize)) >= 0) { if (listSizes[tile] < maxPat) { listSizes[tile] += 1; } } poly += stepSize; } @@ -310,34 +310,34 @@ ++seqCount; freeDnaSeq(&seq); } twoBitClose(&tbf); *retSeqCount = seqCount; *retBaseCount = baseCount; } static int gfAllocLists(struct genoFind *gf) /* Allocate index lists and set up list pointers. * Returns size of all lists. */ { int oneCount; int count = 0; int i; -bits32 *listSizes = gf->listSizes; -bits32 **lists = gf->lists; -bits32 *allocated = NULL; -bits32 maxPat = gf->maxPat; +gfOffset *listSizes = gf->listSizes; +gfOffset **lists = gf->lists; +void *allocated = NULL; +gfOffset maxPat = gf->maxPat; int size; int usedCount = 0, overusedCount = 0; int tileSpaceSize = gf->tileSpaceSize; for (i=0; i<tileSpaceSize; ++i) { /* If pattern is too much used it's no good to us, ignore. */ if ((oneCount = listSizes[i]) < maxPat) { count += oneCount; usedCount += 1; } else { overusedCount += 1; @@ -350,197 +350,196 @@ if ((size = listSizes[i]) < maxPat) { lists[i] = allocated; allocated += size; } } return count; } static int gfAllocLargeLists(struct genoFind *gf) /* Allocate large index lists and set up list pointers. * Returns size of all lists. */ { int count = 0; int i; -bits32 *listSizes = gf->listSizes; -bits16 **endLists = gf->endLists; -bits16 *allocated = NULL; +gfOffset *listSizes = gf->listSizes; +struct endList **endLists = gf->endLists; +void *allocated = NULL; int size; int tileSpaceSize = gf->tileSpaceSize; for (i=0; i<tileSpaceSize; ++i) count += listSizes[i]; if (count > 0) - gf->allocated = allocated = needHugeMem(3*count*sizeof(allocated[0])); + gf->allocated = allocated = needHugeMem(count * sizeof(struct endList)); for (i=0; i<tileSpaceSize; ++i) { size = listSizes[i]; endLists[i] = allocated; - allocated += 3*size; + allocated += size; } return count; } -static void gfAddSeq(struct genoFind *gf, bioSeq *seq, bits32 offset) +static void gfAddSeq(struct genoFind *gf, bioSeq *seq, gfOffset offset) /* Add all N-mers in seq. Done after gfCountSeq. */ { char *poly = seq->dna; int tileSize = gf->tileSize; int stepSize = gf->stepSize; int i, lastTile = seq->size - tileSize; int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile); int maxPat = gf->maxPat; int tile; -bits32 *listSizes = gf->listSizes; -bits32 **lists = gf->lists; +gfOffset *listSizes = gf->listSizes; +gfOffset **lists = gf->lists; initNtLookup(); for (i=0; i<=lastTile; i += stepSize) { tile = makeTile(poly, tileSize); if (tile >= 0) { if (listSizes[tile] < maxPat) { lists[tile][listSizes[tile]++] = offset; } } offset += stepSize; poly += stepSize; } } -static void gfAddLargeSeq(struct genoFind *gf, bioSeq *seq, bits32 offset) +static void gfAddLargeSeq(struct genoFind *gf, bioSeq *seq, gfOffset offset) /* Add all N-mers to segmented index. Done after gfCountSeq. */ { char *poly = seq->dna; int tileSize = gf->tileSize; int stepSize = gf->stepSize; int tileTailSize = gf->segSize; int tileHeadSize = tileSize - tileTailSize; int i, lastTile = seq->size - tileSize; int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile); int tileHead; int tileTail; -bits32 *listSizes = gf->listSizes; -bits16 **endLists = gf->endLists; -bits16 *endList; +gfOffset *listSizes = gf->listSizes; +struct endList **endLists = gf->endLists; +struct endList *endList; int headCount; initNtLookup(); for (i=0; i<=lastTile; i += stepSize) { tileHead = makeTile(poly, tileHeadSize); tileTail = makeTile(poly + tileHeadSize, tileTailSize); if (tileHead >= 0 && tileTail >= 0) { endList = endLists[tileHead]; headCount = listSizes[tileHead]++; endList += headCount * 3; /* Because have three slots per. */ - endList[0] = tileTail; - endList[1] = (offset >> 16); - endList[2] = (offset&0xffff); + endList->tileTail = tileTail; + endList->offset = offset; } offset += stepSize; poly += stepSize; } } -static int gfAddTilesInNib(struct genoFind *gf, char *fileName, bits32 offset, +static int gfAddTilesInNib(struct genoFind *gf, char *fileName, gfOffset offset, int stepSize) /* Add all tiles in nib file. Returns size of nib file. */ { FILE *f; int tileSize = gf->tileSize; int bufSize = tileSize * 16*1024; int nibSize, i; int endBuf, basesInBuf; struct dnaSeq *seq; printf("Adding tiles in %s\n", fileName); nibOpenVerify(fileName, &f, &nibSize); for (i=0; i < nibSize; i = endBuf) { endBuf = i + bufSize; if (endBuf >= nibSize) endBuf = nibSize; basesInBuf = endBuf - i; seq = nibLdPart(fileName, f, nibSize, i, basesInBuf); gfAddSeq(gf, seq, i + offset); freeDnaSeq(&seq); } fclose(f); return nibSize; } static void gfZeroOverused(struct genoFind *gf) /* Zero out counts of overused tiles. */ { -bits32 *sizes = gf->listSizes; +gfOffset *sizes = gf->listSizes; int tileSpaceSize = gf->tileSpaceSize, i; int maxPat = gf->maxPat; int overCount = 0; for (i=0; i<tileSpaceSize; ++i) { if (sizes[i] >= maxPat) { sizes[i] = 0; ++overCount; } } } static void gfZeroNonOverused(struct genoFind *gf) /* Zero out counts of non-overused tiles. */ { -bits32 *sizes = gf->listSizes; +gfOffset *sizes = gf->listSizes; int tileSpaceSize = gf->tileSpaceSize, i; int maxPat = gf->maxPat; int overCount = 0; for (i=0; i<tileSpaceSize; ++i) { if (sizes[i] < maxPat) { sizes[i] = 0; ++overCount; } } } struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean allowOneMismatch, int stepSize, boolean noSimpRepMask) /* Make index for all nibs and .2bits in list. * minMatch - minimum number of matching tiles to trigger alignments * maxGap - maximum deviation from diagonal of tiles * tileSize - size of tile in nucleotides * maxPat - maximum use of tile to not be considered a repeat * oocFile - .ooc format file that lists repeat tiles. May be NULL. * allowOneMismatch - allow one mismatch in a tile. * stepSize - space between tiles. Zero means default (which is tileSize). * noSimpRepMask - skip simple repeat masking. */ { struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize, maxPat, oocFile, FALSE, allowOneMismatch, noSimpRepMask); int i; -bits32 offset = 0, nibSize; +gfOffset offset = 0, nibSize; char *fileName; struct gfSeqSource *ss; long long totalBases = 0, warnAt = maxTotalBases(); int totalSeq = 0; if (allowOneMismatch) errAbort("Don't currently support allowOneMismatch in gfIndexNibsAndTwoBits"); if (stepSize == 0) stepSize = gf->tileSize; for (i=0; i<fileCount; ++i) { fileName = fileNames[i]; if (twoBitIsFile(fileName)) { int seqCount; @@ -604,31 +603,31 @@ } static void maskSimplePepRepeat(struct genoFind *gf) /* Remove tiles from index that represent repeats * of period one and two. */ { if (gf->noSimpRepMask) { return; } int i; int tileSize = gf->tileSize; int maxPat = gf->maxPat; -bits32 *listSizes = gf->listSizes; +gfOffset *listSizes = gf->listSizes; for (i=0; i<20; ++i) { int j, k; for (j=0; j<20; ++j) { int tile = 0; for (k=0; k<tileSize; ++k) { tile *= 20; if (k&1) tile += j; else tile += i; } @@ -681,31 +680,31 @@ for (isRc=0; isRc <= 1; ++isRc) { if (isRc) reverseComplement(seq->dna, seq->size); t3 = trans3New(seq); for (frame = 0; frame < 3; ++frame) { struct genoFind *gf = transGf[isRc][frame]; gfCountSeq(gf, t3->trans[frame]); } trans3Free(&t3); } } static void transIndexBothStrands(struct dnaSeq *seq, - struct genoFind *transGf[2][3], bits32 offset[2][3], + struct genoFind *transGf[2][3], gfOffset offset[2][3], int sourceIx, char *fileName) /* Add unmasked tiles on both strands of sequence to * index. As a side effect this will reverse-complement seq. */ { int isRc, frame; struct trans3 *t3; struct gfSeqSource *ss; for (isRc=0; isRc <= 1; ++isRc) { if (isRc) { reverseComplement(seq->dna, seq->size); } t3 = trans3New(seq); for (frame = 0; frame < 3; ++frame) @@ -718,31 +717,31 @@ offset[isRc][frame] += t3->trans[frame]->size; ss->end = offset[isRc][frame]; } trans3Free(&t3); } } void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean allowOneMismatch, boolean doMask, int stepSize, boolean noSimpRepMask) /* Make translated (6 frame) index for all .nib and .2bit files. */ { struct genoFind *gf; int i,isRc, frame; -bits32 offset[2][3]; +gfOffset offset[2][3]; char *fileName; struct dnaSeq *seq; int sourceCount = 0; long long totalBases = 0, warnAt = maxTotalBases(); if (allowOneMismatch) errAbort("Don't currently support allowOneMismatch in gfIndexTransNibsAndTwoBits"); /* Allocate indices for all reading frames. */ for (isRc=0; isRc <= 1; ++isRc) { for (frame = 0; frame < 3; ++frame) { transGf[isRc][frame] = gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch, noSimpRepMask); } @@ -838,31 +837,31 @@ gf = transGf[isRc][frame]; gf->totalSeqSize = offset[isRc][frame]; gfZeroOverused(gf); } } } static struct genoFind *gfSmallIndexSeq(struct genoFind *gf, bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean maskUpper) /* Make index for all seqs in list. */ { int seqCount = slCount(seqList); bioSeq *seq; int i; -bits32 offset = 0; +gfOffset offset = 0; struct gfSeqSource *ss; if (isPep) maskSimplePepRepeat(gf); for (seq = seqList; seq != NULL; seq = seq->next) gfCountSeq(gf, seq); gfAllocLists(gf); gfZeroNonOverused(gf); if (seqCount > 0) AllocArray(gf->sources, seqCount); gf->sourceCount = seqCount; for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next) { gfAddSeq(gf, seq, offset); ss = gf->sources+i; @@ -874,31 +873,31 @@ ss->maskedBits = maskFromUpperCaseSeq(seq); } gf->totalSeqSize = offset; gfZeroOverused(gf); return gf; } static struct genoFind *gfLargeIndexSeq(struct genoFind *gf, bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean maskUpper) /* Make index for all seqs in list. */ { int seqCount = slCount(seqList); bioSeq *seq; int i; -bits32 offset = 0; +gfOffset offset = 0; struct gfSeqSource *ss; for (seq = seqList; seq != NULL; seq = seq->next) gfCountSeq(gf, seq); gfAllocLargeLists(gf); gfZeroNonOverused(gf); AllocArray(gf->sources, seqCount); gf->sourceCount = seqCount; for (i=0, seq = seqList; i<seqCount; ++i, seq = seq->next) { gfAddLargeSeq(gf, seq, offset); ss = gf->sources+i; ss->seq = seq; ss->start = offset; offset += seq->size; @@ -940,46 +939,46 @@ stepSize = tileSize; if (gf->segSize > 0) { gfLargeIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper); } else { gfSmallIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper); } return gf; } static int bCmpSeqSource(const void *vTarget, const void *vRange) /* Compare function for binary search of gfSeqSource. */ { -const bits32 *pTarget = vTarget; -bits32 target = *pTarget; +const gfOffset *pTarget = vTarget; +gfOffset target = *pTarget; const struct gfSeqSource *ss = vRange; if (target < ss->start) return -1; if (target >= ss->end) return 1; return 0; } -static struct gfSeqSource *findSource(struct genoFind *gf, bits32 targetPos) +static struct gfSeqSource *findSource(struct genoFind *gf, gfOffset targetPos) /* Find source given target position. */ { struct gfSeqSource *ss = bsearch(&targetPos, gf->sources, gf->sourceCount, sizeof(gf->sources[0]), bCmpSeqSource); if (ss == NULL) - errAbort("Couldn't find source for %d", targetPos); + errAbort("Couldn't find source for %lld", targetPos); return ss; } void gfClumpFree(struct gfClump **pClump) /* Free a single clump. */ { struct gfClump *clump; if ((clump = *pClump) == NULL) return; freez(pClump); } void gfClumpFreeList(struct gfClump **pList) /* Free a list of dynamically allocated gfClump's */ { @@ -988,31 +987,31 @@ for (el = *pList; el != NULL; el = next) { next = el->next; gfClumpFree(&el); } *pList = NULL; } void gfClumpDump(struct genoFind *gf, struct gfClump *clump, FILE *f) /* Print out info on clump */ { struct gfSeqSource *ss = clump->target; char *name = ss->fileName; if (name == NULL) name = ss->seq->name; -fprintf(f, "%u-%u %s %u-%u, hits %d\n", +fprintf(f, "%llu-%llu %s %llu-%llu, hits %d\n", clump->qStart, clump->qEnd, name, clump->tStart - ss->start, clump->tEnd - ss->start, clump->hitCount); #ifdef SOMETIMES for (hit = clump->hitList; hit != NULL; hit = hit->next) fprintf(f, " q %d, t %d, diag %d\n", hit->qStart, hit->tStart, hit->diagonal); #endif } /* Fast sorting routines for sorting gfHits on diagonal. * More or less equivalent to system qsort, but with * comparison function inline. Worth a little tweaking * since this is the bottleneck for the whole procedure. */ static void gfHitSort2(struct gfHit **ptArray, int n); @@ -1293,31 +1292,31 @@ } clump->hitList = NULL; /* We ate up the hit list. */ gfClumpFree(&clump); } } static int gfNearEnough = 300; static struct gfClump *clumpNear(struct genoFind *gf, struct gfClump *oldClumps, int minMatch) /* Go through clump list and make sure hits are also near each other. * If necessary divide clumps. */ { struct gfClump *newClumps = NULL, *clump, *nextClump; struct gfHit *hit, *nextHit; int tileSize = gf->tileSize; -bits32 lastT; +gfOffset lastT; int nearEnough = (gf->isPep ? gfNearEnough/3 : gfNearEnough); for (clump = oldClumps; clump != NULL; clump = nextClump) { struct gfHit *newHits = NULL, *oldHits = clump->hitList; int clumpSize = 0; clump->hitCount = 0; clump->hitList = NULL; /* Clump no longer owns list. */ nextClump = clump->next; slSort(&oldHits, gfHitCmpTarget); lastT = oldHits->tStart; for (hit = oldHits; hit != NULL; hit = nextHit) { nextHit = hit->next; if (hit->tStart > nearEnough + lastT) @@ -1376,31 +1375,31 @@ struct gfHit **buckets = NULL, **pb; /* Sort hit list into buckets. */ AllocArray(buckets, bucketCount); for (hit = hitList; hit != NULL; hit = nextHit) { nextHit = hit->next; pb = buckets + (hit->tStart >> bucketShift); slAddHead(pb, hit); } /* Sort each bucket on diagonal and clump. */ for (i=0; i<bucketCount; ++i) { int clumpSize; - bits32 maxT; + gfOffset maxT; struct gfHit *clumpHits; pb = buckets + i; gfHitSortDiagonal(pb); for (hit = *pb; hit != NULL; ) { /* Each time through this loop will get info on a clump. Will only * actually create clump if it is big enough though. */ clumpSize = 0; clumpHits = lastHit = NULL; maxT = 0; for (; hit != NULL; hit = nextHit) { nextHit = hit->next; if (lastHit != NULL && hit->diagonal - lastHit->diagonal > maxGap) break; @@ -1426,54 +1425,54 @@ usedHits += clumpSize; ++clumpCount; } } *pb = NULL; boundary += bucketSize; } clumpList = clumpNear(gf, clumpList, minMatch); gfClumpComputeQueryCoverage(clumpList, tileSize); /* Thanks AG */ slSort(&clumpList, gfClumpCmpQueryCoverage); #ifdef DEBUG uglyf("Dumping clumps B\n"); for (clump = clumpList; clump != NULL; clump = clump->next) /* uglyf */ { - uglyf(" %d %d %s %d %d (%d hits)\n", clump->qStart, clump->qEnd, clump->target->seq->name, clump->tStart, clump->tEnd, clump->hitCount); + uglyf(" %lld %lld %s %lld %lld (%d hits)\n", clump->qStart, clump->qEnd, clump->target->seq->name, clump->tStart, clump->tEnd, clump->hitCount); } #endif /* DEBUG */ freez(&buckets); return clumpList; } static struct gfHit *gfFastFindDnaHits(struct genoFind *gf, struct dnaSeq *seq, - Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, - struct gfSeqSource *target, int tMin, int tMax) + Bits *qMaskBits, gfOffset qMaskOffset, struct lm *lm, int *retHitCount, + struct gfSeqSource *target, gfOffset tMin, gfOffset tMax) /* Find hits associated with one sequence. This is is special fast * case for DNA that is in an unsegmented index. */ { struct gfHit *hitList = NULL, *hit; -int size = seq->size; +gfOffset size = seq->size; int tileSizeMinusOne = gf->tileSize - 1; int mask = gf->tileMask; DNA *dna = seq->dna; int i, j; -bits32 bits = 0; -bits32 bVal; +gfOffset bits = 0; +gfOffset bVal; int listSize; -bits32 qStart, *tList; +gfOffset qStart, *tList; int hitCount = 0; for (i=0; i<tileSizeMinusOne; ++i) { bVal = ntValNoN[(int)dna[i]]; bits <<= 2; bits += bVal; } for (i=tileSizeMinusOne; i<size; ++i) { bVal = ntValNoN[(int)dna[i]]; bits <<= 2; bits += bVal; bits &= mask; listSize = gf->listSizes[bits]; @@ -1493,63 +1492,63 @@ hit->qStart = qStart; hit->tStart = tStart; hit->diagonal = tStart + size - qStart; slAddHead(&hitList, hit); ++hitCount; } } } } } *retHitCount = hitCount; return hitList; } static struct gfHit *gfStraightFindHits(struct genoFind *gf, aaSeq *seq, - Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, - struct gfSeqSource *target, int tMin, int tMax) + Bits *qMaskBits, gfOffset qMaskOffset, struct lm *lm, int *retHitCount, + struct gfSeqSource *target, gfOffset tMin, gfOffset tMax) /* Find hits associated with one sequence in a non-segmented * index where hits match exactly. */ { struct gfHit *hitList = NULL, *hit; int size = seq->size; int tileSize = gf->tileSize; int lastStart = size - tileSize; char *poly = seq->dna; int i, j; int tile; int listSize; -bits32 qStart, *tList; +gfOffset qStart, *tList; int hitCount = 0; int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile); initNtLookup(); for (i=0; i<=lastStart; ++i) { tile = makeTile(poly+i, tileSize); if (tile < 0) continue; listSize = gf->listSizes[tile]; if (listSize > 0) { qStart = i; if (qMaskBits == NULL || bitCountRange(qMaskBits, qStart+qMaskOffset, tileSize) == 0) { tList = gf->lists[tile]; for (j=0; j<listSize; ++j) { - int tStart = tList[j]; + gfOffset tStart = tList[j]; if (target == NULL || (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) { lmAllocVar(lm,hit); hit->qStart = qStart; hit->tStart = tStart; hit->diagonal = tStart + size - qStart; slAddHead(&hitList, hit); ++hitCount; } } } } } *retHitCount = hitCount; @@ -1558,31 +1557,31 @@ static struct gfHit *gfStraightFindNearHits(struct genoFind *gf, aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax) /* Find hits associated with one sequence in a non-segmented * index where hits can mismatch in one letter. */ { struct gfHit *hitList = NULL, *hit; int size = seq->size; int tileSize = gf->tileSize; int lastStart = size - tileSize; char *poly = seq->dna; int i, j; int tile; int listSize; -bits32 qStart, *tList; +gfOffset qStart, *tList; int hitCount = 0; int varPos, varVal; /* Variable position. */ int (*makeTile)(char *poly, int n); int alphabetSize; char oldChar, zeroChar; int *seqValLookup; int posMul, avoid; initNtLookup(); if (gf->isPep) { makeTile = gfPepTile; alphabetSize = 20; zeroChar = 'A'; seqValLookup = aaVal; @@ -1658,55 +1657,55 @@ Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax) /* Find hits associated with one sequence in general case in a segmented * index. */ { struct gfHit *hitList = NULL, *hit; int size = seq->size; int tileSize = gf->tileSize; int tileTailSize = gf->segSize; int tileHeadSize = gf->tileSize - tileTailSize; int lastStart = size - tileSize; char *poly = seq->dna; int i, j; int tileHead, tileTail; int listSize; -bits32 qStart; -bits16 *endList; +gfOffset qStart; +struct endList *endList; int hitCount = 0; int (*makeTile)(char *poly, int n) = (gf->isPep ? gfPepTile : gfDnaTile); initNtLookup(); for (i=0; i<=lastStart; ++i) { if (qMaskBits == NULL || bitCountRange(qMaskBits, i+qMaskOffset, tileSize) == 0) { tileHead = makeTile(poly+i, tileHeadSize); if (tileHead < 0) continue; tileTail = makeTile(poly+i+tileHeadSize, tileTailSize); if (tileTail < 0) continue; listSize = gf->listSizes[tileHead]; qStart = i; endList = gf->endLists[tileHead]; for (j=0; j<listSize; ++j) { - if (endList[0] == tileTail) + if (endList->tileTail == tileTail) { - int tStart = (endList[1]<<16) + endList[2]; + int tStart = endList->offset; if (target == NULL || (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) { lmAllocVar(lm,hit); hit->qStart = qStart; hit->tStart = tStart; hit->diagonal = tStart + size - qStart; slAddHead(&hitList, hit); ++hitCount; } } endList += 3; } } @@ -1719,32 +1718,32 @@ aaSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount, struct gfSeqSource *target, int tMin, int tMax) /* Find hits associated with one sequence in a segmented * index where one mismatch is allowed. */ { struct gfHit *hitList = NULL, *hit; int size = seq->size; int tileSize = gf->tileSize; int tileTailSize = gf->segSize; int tileHeadSize = gf->tileSize - tileTailSize; int lastStart = size - tileSize; char *poly = seq->dna; int i, j; int tileHead, tileTail; int listSize; -bits32 qStart; -bits16 *endList; +gfOffset qStart; +struct endList *endList; int hitCount = 0; int varPos, varVal; /* Variable position. */ int (*makeTile)(char *poly, int n); int alphabetSize; char oldChar, zeroChar; int headPosMul, tailPosMul, avoid; boolean modTail; int *seqValLookup; initNtLookup(); if (gf->isPep) { makeTile = gfPepTile; alphabetSize = 20; @@ -1779,33 +1778,33 @@ avoid = -1; else avoid = seqValLookup[(int)oldChar]; if (tileHead >= 0 && tileTail >= 0) { for (varVal=0; varVal<alphabetSize; ++varVal) { if (varVal != avoid) { listSize = gf->listSizes[tileHead]; qStart = i; endList = gf->endLists[tileHead]; for (j=0; j<listSize; ++j) { - if (endList[0] == tileTail) + if (endList->tileTail == tileTail) { - int tStart = (endList[1]<<16) + endList[2]; + int tStart = endList->offset; if (target == NULL || (target == findSource(gf, tStart) && tStart >= tMin && tStart < tMax) ) { lmAllocVar(lm,hit); hit->qStart = qStart; hit->tStart = tStart; hit->diagonal = tStart + size - qStart; slAddHead(&hitList, hit); ++hitCount; } } endList += 3; } } @@ -1964,31 +1963,31 @@ for (qFrame = 0; qFrame<3; ++qFrame) { gfTransFindClumps(gfs, seqs[qFrame], clumps[qFrame], lm, &oneHit); hitCount += oneHit; } *retHitCount = hitCount; } void gfMakeOoc(char *outName, char *files[], int fileCount, int tileSize, bits32 maxPat, enum gfType tType, boolean noSimpRepMask) /* Count occurences of tiles in seqList and make a .ooc file. */ { boolean dbIsPep = (tType == gftProt || tType == gftDnaX || tType == gftRnaX); struct genoFind *gf = gfNewEmpty(gfMinMatch, gfMaxGap, tileSize, tileSize, maxPat, NULL, dbIsPep, FALSE, noSimpRepMask); -bits32 *sizes = gf->listSizes; +gfOffset *sizes = gf->listSizes; int tileSpaceSize = gf->tileSpaceSize; bioSeq *seq, *seqList; bits32 sig = oocSig, psz = tileSize; bits32 i; int oocCount = 0; char *inName; FILE *f = mustOpen(outName, "w"); if (gf->segSize > 0) errAbort("Don't yet know how to make ooc files for large tile sizes."); for (i=0; i<fileCount; ++i) { inName = files[i]; printf("Loading %s\n", inName); if (nibIsFile(inName)) @@ -2095,32 +2094,32 @@ slFreeList(&iList); binKeeperAdd(bk, start, end, src); } static struct gfClump *pcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minDistance, int maxDistance) /* Find possible PCR hits. The fPrimer and rPrimer are on the same strand. */ { struct gfClump *clumpList = NULL; int tileSize = gf->tileSize; int fTile; int fTileCount = fPrimerSize - tileSize; int *rTiles, rTile; int rTileCount = rPrimerSize - tileSize; int fTileIx,rTileIx,fPosIx,rPosIx; -bits32 *fPosList, fPos, *rPosList, rPos; -int fPosListSize, rPosListSize; +gfOffset *fPosList, fPos, *rPosList, rPos; +gfOffset fPosListSize, rPosListSize; struct hash *targetHash = newHash(0); /* Build up array of all tiles in reverse primer. */ AllocArray(rTiles, rTileCount); for (rTileIx = 0; rTileIx<rTileCount; ++rTileIx) { rTiles[rTileIx] = gfDnaTile(rPrimer + rTileIx, tileSize); if (rTiles[rTileIx] == -1) errAbort("Bad char in reverse primer sequence: %s", rPrimer); } /* Loop through all tiles in forward primer. */ for (fTileIx=0; fTileIx<fTileCount; ++fTileIx) { fTile = gfDnaTile(fPrimer + fTileIx, tileSize);