src/shortReads/itsaFind/itsaFind.c 1.5
1.5 2009/11/24 15:50:23 kent
Commenting out some debugging statements.
Index: src/shortReads/itsaFind/itsaFind.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/shortReads/itsaFind/itsaFind.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/shortReads/itsaFind/itsaFind.c 6 Nov 2008 07:03:00 -0000 1.4
+++ src/shortReads/itsaFind/itsaFind.c 24 Nov 2009 15:50:23 -0000 1.5
@@ -1,284 +1,284 @@
/* itsaFind - Find sequence by searching indexed traversable suffix array.. */
/* Copyright 2008 Jim Kent all rights reserved. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "dnautil.h"
#include "dnaseq.h"
#include "dnaLoad.h"
#include "verbose.h"
#include "itsa.h"
static char const rcsid[] = "$Id$";
boolean mmap;
int maxMismatch = 0;
int maxRepeat = 10;
void usage()
/* Explain usage and exit. */
{
errAbort(
"itsaFind - Find sequence by searching indexed traversable suffix array.\n"
"usage:\n"
" itsaFind target.itsa query.fa output\n"
"options:\n"
" -maxRepeat=N - maximum number of alignments to output on one query sequence. Default %d\n"
" -mmap - Use memory mapping. Faster just a few reads, but much slower for many reads\n"
" -maxMismatch - maximum number of mismatches allowed. Default %d. NOT IMPLEMENTED\n"
, maxRepeat, maxMismatch
);
}
static struct optionSpec options[] = {
{"maxRepeat", OPTION_INT},
{"mmap", OPTION_BOOLEAN},
{"maxMismatch", OPTION_INT},
{NULL, 0},
};
void finalSearch(DNA *tDna, bits32 *suffixArray, bits32 searchStart, bits32 searchEnd,
DNA *qDna, int qSize, int alreadyMatched, struct slInt **pHitList)
/* Our search has been narrowed to be between searchStart and searchEnd.
* We know within the interval a prefix of size alreadyMatched is already
* the same. Here we check if anything in this interval to see if there is
* a full match to anything. If so we add it to hitList. Due to the peculiarities
* of the array seach with each successive item in the window we've checked one
* more letter already. */
{
bits32 searchIx;
for (searchIx = searchStart; searchIx < searchEnd; ++searchIx)
{
int diff = memcmp(qDna+alreadyMatched, tDna+alreadyMatched+suffixArray[searchIx],
qSize-alreadyMatched);
/* Todo - break without a hit when diff is the wrong sign. */
if (diff == 0)
{
struct slInt *hit = slIntNew(searchIx);
slAddHead(pHitList, hit);
break;
}
++alreadyMatched;
}
}
void itsaFindWithin(DNA *tDna, bits32 *suffixArray, bits32 *traverseArray,
DNA *qDna, int qSize, int cursor, bits32 searchStart, bits32 searchEnd,
struct slInt **pHitList)
/* Search the part of the suffix array between searchStart and searchEnd for a match.
* The searchStart/searchEnd and cursor position must agree. */
{
-uglyf("itsaFindWithin(qDna=%s cursor=%d searchStart=%d searchEnd=%d\n", qDna, cursor, searchStart, searchEnd);
+// uglyf("itsaFindWithin(qDna=%s cursor=%d searchStart=%d searchEnd=%d\n", qDna, cursor, searchStart, searchEnd);
bits32 arrayPos = searchStart;
/* We step through each base of the query */
for (; cursor<qSize; ++cursor)
{
bits32 nextOffset = traverseArray[arrayPos];
DNA qBase = qDna[cursor];
DNA tBase = tDna[suffixArray[arrayPos]+cursor];
/* Skip to next matching base. */
if (qBase != tBase)
{
bits32 nextPos = arrayPos;
for (;;)
{
if ((nextPos += nextOffset) >= searchEnd)
{
finalSearch(tDna, suffixArray, searchStart, arrayPos, qDna, qSize,
cursor-(arrayPos-searchStart), pHitList);
return; /* Ran through all variations of letters at this position. */
}
nextOffset = traverseArray[nextPos];
tBase = tDna[suffixArray[nextPos]+cursor];
if (qBase == tBase)
{
searchStart = arrayPos = nextPos;
break;
}
}
}
searchEnd = arrayPos + nextOffset;
/* We are going to advance to next position in query and in array.
* This will only possibly yield a match if the next item in the suffix
* array has a prefix that matches the current position up to our current
* offset. Happily this is encoded in the traverse array in a subtle way.
* The step to find another letter at this position has to be greater than
* one for the prefix to be shared. */
if (nextOffset <= 1)
{
finalSearch(tDna, suffixArray, searchStart, searchEnd, qDna, qSize,
cursor - (arrayPos-searchStart), pHitList);
return; /* No match since prefix of next position doesn't match us. */
}
++arrayPos;
}
/* If we got here we matched the whole query sequence, and actually there's multiple
* matches to it. It's actually rare to get to here on query sequences larger than
* 20 bases or so. */
finalSearch(tDna, suffixArray, searchStart, searchEnd, qDna, qSize,
qSize - (arrayPos-searchStart-1), pHitList); // TODO - check -1
}
void itsaFindGivenSlot(int slot, struct itsa *itsa, DNA *qDna, int qSize, int remainingMismatches,
struct slInt **pHitList)
/* Given a slot (a 13-mer converted to binary that is the first 13 bases of qDna possibly with
* an induced mutation or two) check out appropriate part of suffix array for hits. */
{
bits32 searchStart = itsa->index13[slot];
if (searchStart != 0)
{
searchStart -= 1; /* Pesky thing to keep 0 meaning no data in slot. */
- uglyf("Going to look within. Cursor slot %d\n", itsa->cursors13[slot]);
+ // uglyf("Going to look within. Cursor slot %d\n", itsa->cursors13[slot]);
itsaFindWithin(itsa->allDna, itsa->array, itsa->traverse, qDna, qSize,
itsa->cursors13[slot], searchStart, searchStart + itsa->traverse[searchStart], pHitList);
}
}
void itsaExactFind(struct itsa *itsa, DNA *qDna, int qSize, struct slInt **pHitList)
/* Search indexed traversable suffix tree for exact matches. */
{
int slot = itsaDnaToBinary(qDna, 13);
itsaFindGivenSlot(slot, itsa, qDna, qSize, 0, pHitList);
}
int itsaCountIdenticalPrefix(DNA *dna, int prefixSize, bits32 *array, bits32 arraySize)
/* Count up number of places in a row in array, where the referenced DNA is the
* same up to prefixSize bases. You do this a lot since generally the suffix tree
* search just gives you the first place in the array that matches. */
{
bits32 i;
DNA *first = dna + array[0];
for (i=1; i<arraySize; ++i)
{
if (memcmp(first, dna+array[i], prefixSize) != 0)
break;
}
return i;
}
void itsaFuzzyFind(struct itsa *itsa,
bits32 sliceStart, bits32 sliceEnd, int cursor,
char *qDna, int qSize, int subsLeft,
struct slInt **pHitList)
{
// uglyf("itsaFuzzyFind %s\n", qDna);
int slot = itsaDnaToBinary(qDna, 13);
char *qMutant = cloneStringZ(qDna, qSize);
itsaFindGivenSlot(slot, itsa, qDna, qSize, maxMismatch, pHitList);
if (subsLeft > 0)
{
subsLeft -= 1;
int mutantSlot = slot;
static int mutMasks[13] = {0x3FFFFFC, 0x3FFFFF3, 0x3FFFFCF, 0x3FFFF3F, 0x3FFFCFF, 0x3FFF3FF,
0x3FFCFFF, 0x3FF3FFF, 0x3FCFFFF, 0x3F3FFFF, 0x3CFFFFF, 0x33FFFFF,
0x0FFFFFF, };
int toggle1 = 1;
int baseIx;
for (baseIx = 0; baseIx<13; ++baseIx)
{
int baseOffset = 12-baseIx;
DNA qOrig = qDna[baseOffset];
mutantSlot = slot & mutMasks[baseIx]; /* A */
if (qOrig != 'A')
{
qMutant[baseOffset] = 'A';
itsaFindGivenSlot(mutantSlot, itsa, qMutant, qSize, subsLeft, pHitList);
}
mutantSlot += toggle1;
if (qOrig != 'C')
{
qMutant[baseOffset] = 'C';
itsaFindGivenSlot(mutantSlot, itsa, qMutant, qSize, subsLeft, pHitList);
}
mutantSlot += toggle1;
if (qOrig != 'G')
{
qMutant[baseOffset] = 'G';
itsaFindGivenSlot(mutantSlot, itsa, qMutant, qSize, subsLeft, pHitList);
}
mutantSlot += toggle1;
if (qOrig != 'T')
{
qMutant[baseOffset] = 'T';
itsaFindGivenSlot(mutantSlot, itsa, qMutant, qSize, subsLeft, pHitList);
}
qMutant[baseOffset] = qOrig;
toggle1 <<= 2; /* Move on to next base. */
}
}
freeMem(qMutant);
}
void itsaFind(char *itsaFile, char *queryFile, char *outputFile)
/* itsaFind - Find sequence by searching indexed traversable suffix array.. */
{
struct itsa *itsa = itsaRead(itsaFile, mmap);
verboseTime(1, "Loaded %s", itsaFile);
struct dnaLoad *qLoad = dnaLoadOpen(queryFile);
bits32 arraySize = itsa->header->arraySize;
FILE *f = mustOpen(outputFile, "w");
struct dnaSeq *qSeq;
int queryCount = 0, hitCount = 0, missCount=0;
while ((qSeq = dnaLoadNext(qLoad)) != NULL)
{
struct slInt *hit, *hitList = NULL;
verbose(2, "Processing %s\n", qSeq->name);
toUpperN(qSeq->dna, qSeq->size);
itsaFuzzyFind(itsa, 0, arraySize, 0, qSeq->dna, qSeq->size, maxMismatch, &hitList);
if (hitList != NULL)
++hitCount;
else
++missCount;
for (hit = hitList; hit != NULL; hit = hit->next)
{
bits32 hitIx = hit->val;
bits32 count = itsaCountIdenticalPrefix(itsa->allDna, qSeq->size,
itsa->array + hitIx, arraySize - hitIx);
bits32 i;
if (count > maxRepeat)
{
}
else
{
for (i=0; i<count; ++i)
{
fprintf(f, "%s hits offset %u\n", qSeq->name, itsa->array[hitIx+i]);
fprintf(f, "q %s\n", qSeq->dna);
fprintf(f, "t ");
mustWrite(f, itsa->allDna + itsa->array[hitIx+i], qSeq->size);
fprintf(f, "\n");
}
}
}
++queryCount;
dnaSeqFree(&qSeq);
}
verboseTime(1, "Alignment");
verbose(1, "%d queries. %d hits (%5.2f%%). %d misses (%5.2f%%).\n", queryCount,
hitCount, 100.0*hitCount/queryCount, missCount, 100.0*missCount/queryCount);
carefulClose(&f);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 4)
usage();
maxRepeat = optionInt("maxRepeat", maxRepeat);
maxMismatch = optionInt("maxMismatch", maxMismatch);
mmap = optionExists("mmap");
verboseTime(1, NULL);
dnaUtilOpen();
itsaBaseToValInit();
itsaFind(argv[1], argv[2], argv[3]);
return 0;
}