src/jkOwnLib/gfBlatLib.c 1.28
1.28 2009/09/23 18:42:27 angie
Fixed compiler warnings from gcc 4.3.3, mostly about system calls whose return values weren't checked and non-literal format strings with no args.
Index: src/jkOwnLib/gfBlatLib.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/jkOwnLib/gfBlatLib.c,v
retrieving revision 1.27
retrieving revision 1.28
diff -b -B -U 1000000 -r1.27 -r1.28
--- src/jkOwnLib/gfBlatLib.c 9 Dec 2008 08:06:20 -0000 1.27
+++ src/jkOwnLib/gfBlatLib.c 23 Sep 2009 18:42:27 -0000 1.28
@@ -1,1613 +1,1614 @@
/* gfBlatLib - stuff that blat-related clients of genoFind library or
* gfServer on the web use. */
/* Copyright 2001-2005 Jim Kent. All rights reserved. */
#include "common.h"
#include "net.h"
#include "linefile.h"
#include "sqlNum.h"
#include "dnaseq.h"
#include "fa.h"
#include "fuzzyFind.h"
#include "supStitch.h"
#include "genoFind.h"
#include "gfInternal.h"
#include "errabort.h"
#include "nib.h"
#include "twoBit.h"
#include "trans3.h"
static char const rcsid[] = "$Id$";
static int ssAliCount = 16; /* Number of alignments returned by ssStitch. */
#ifdef DEBUG
void dumpRange(struct gfRange *r, FILE *f)
/* Dump range to file. */
{
fprintf(f, "%d-%d %s %d-%d, hits %d\n", r->qStart, r->qEnd, r->tName, r->tStart, r->tEnd, r->hitCount);
}
void dumpRangeList(struct gfRange *rangeList, FILE *f)
/* Dump range list to file for debugging. */
{
struct gfRange *range;
for (range = rangeList; range != NULL; range = range->next)
dumpRange(range, f);
}
#endif /* DEBUG */
void gfRangeFree(struct gfRange **pEl)
/* Free a single dynamically allocated gfRange such as created
* with gfRangeLoad(). */
{
struct gfRange *el;
if ((el = *pEl) == NULL) return;
freeMem(el->tName);
if (el->components != NULL)
gfRangeFreeList(&el->components);
freez(pEl);
}
void gfRangeFreeList(struct gfRange **pList)
/* Free a list of dynamically allocated gfRange's */
{
struct gfRange *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
gfRangeFree(&el);
}
*pList = NULL;
}
static struct gfRange *gfRangeLoad(char **row)
/* Load a gfRange from array of strings parsed from
* server. Dispose of this with gfRangeFree(). */
{
struct gfRange *ret;
AllocVar(ret);
ret->qStart = atoi(row[0]);
ret->qEnd = atoi(row[1]);
ret->tName = cloneString(row[2]);
ret->tStart = atoi(row[3]);
ret->tEnd = atoi(row[4]);
ret->hitCount = atoi(row[5]);
return ret;
}
int gfRangeCmpTarget(const void *va, const void *vb)
/* Compare to sort based on target position. */
{
const struct gfRange *a = *((struct gfRange **)va);
const struct gfRange *b = *((struct gfRange **)vb);
int diff;
diff = strcmp(a->tName, b->tName);
if (diff == 0)
{
long lDiff = a->t3 - b->t3;
if (lDiff < 0)
diff = -1;
else if (lDiff > 0)
diff = 1;
else
diff = 0;
#ifdef SOLARIS_WORKAROUND_COMPILER_BUG_BUT_FAILS_IN_64_BIT
diff = (unsigned long)(a->t3) - (unsigned long)(b->t3); /* Casts needed for Solaris. Thanks Darren Platt! */
#endif /* SOLARIS_WORKAROUND_COMPILER_BUG_BUT_FAILS_IN_64_BIT */
}
if (diff == 0)
diff = a->tStart - b->tStart;
return diff;
}
static void startSeqQuery(int conn, bioSeq *seq, char *type)
/* Send a query that involves some sequence. */
{
char buf[256];
sprintf(buf, "%s%s %d", gfSignature(), type, seq->size);
-write(conn, buf, strlen(buf));
-read(conn, buf, 1);
+mustWriteFd(conn, buf, strlen(buf));
+if (read(conn, buf, 1) < 0)
+ errAbort("startSeqQuery: read failed: %s", strerror(errno));
if (buf[0] != 'Y')
errAbort("Expecting 'Y' from server, got %c", buf[0]);
-write(conn, seq->dna, seq->size);
+mustWriteFd(conn, seq->dna, seq->size);
}
static void gfServerWarn(bioSeq *seq, char *warning)
/* Write out warning. */
{
warn("couldn't process %s: %s", seq->name, warning);
}
static struct gfRange *gfQuerySeq(int conn, struct dnaSeq *seq)
/* Ask server for places sequence hits. */
{
struct gfRange *rangeList = NULL, *range;
char buf[256], *row[6];
int rowSize;
startSeqQuery(conn, seq, "query");
/* Read results line by line and save in list, and return. */
for (;;)
{
netRecieveString(conn, buf);
if (sameString(buf, "end"))
{
break;
}
else if (startsWith("Error:", buf))
{
gfServerWarn(seq, buf);
break;
}
else
{
rowSize = chopLine(buf, row);
if (rowSize < 6)
errAbort("Expecting 6 words from server got %d", rowSize);
range = gfRangeLoad(row);
slAddHead(&rangeList, range);
}
}
slReverse(&rangeList);
return rangeList;
}
static int findTileSize(char *line)
/* Parse through line/val pairs looking for tileSize. */
{
char *var, *val;
int tileSize = 4;
for (;;)
{
var = nextWord(&line);
if (var == NULL)
break;
val = nextWord(&line);
if (val == NULL)
{
internalErr();
break;
}
if (sameString("tileSize", var))
{
tileSize = atoi(val);
if (tileSize <= 0)
internalErr();
}
}
return tileSize;
}
struct gfHit *getHitsFromServer(int conn, struct lm *lm)
/* Read a lone line of hits from server. */
{
char *s, *line, *q, *t;
struct gfHit *hitList = NULL, *hit;
s = line = netRecieveLongString(conn);
for (;;)
{
if ((q = nextWord(&line)) == NULL)
break;
if ((t = nextWord(&line)) == NULL)
internalErr();
lmAllocVar(lm, hit);
hit->qStart = sqlUnsigned(q);
hit->tStart = sqlUnsigned(t);
slAddHead(&hitList, hit);
}
freez(&s);
slReverse(&hitList);
return hitList;
}
static void gfQuerySeqTrans(int conn, aaSeq *seq, struct gfClump *clumps[2][3],
struct lm *lm, struct gfSeqSource **retSsList, int *retTileSize)
/* Query server for clumps where aa sequence hits translated index. */
{
int frame, isRc, rowSize;
struct gfClump *clump;
int tileSize = 0;
char *line;
char buf[256], *row[12];
struct gfSeqSource *ssList = NULL, *ss;
for (isRc = 0; isRc <= 1; ++isRc)
for (frame = 0; frame<3; ++frame)
clumps[isRc][frame] = NULL;
/* Send sequence to server. */
startSeqQuery(conn, seq, "protQuery");
line = netRecieveString(conn, buf);
if (!startsWith("Error:", line))
{
tileSize = findTileSize(line);
/* Read results line by line and save in memory. */
for (;;)
{
/* Read and parse first line that describes clump overall. */
netRecieveString(conn, buf);
if (sameString(buf, "end"))
{
break;
}
else if (startsWith("Error:", buf))
{
gfServerWarn(seq, buf);
break;
}
rowSize = chopLine(buf, row);
if (rowSize < 8)
errAbort("Expecting 8 words from server got %d", rowSize);
AllocVar(clump);
clump->qStart = sqlUnsigned(row[0]);
clump->qEnd = sqlUnsigned(row[1]);
AllocVar(ss);
ss->fileName = cloneString(row[2]);
slAddHead(&ssList, ss);
clump->target = ss;
clump->tStart = sqlUnsigned(row[3]);
clump->tEnd = sqlUnsigned(row[4]);
clump->hitCount = sqlUnsigned(row[5]);
isRc = ((row[6][0] == '-') ? 1 : 0);
frame = sqlUnsigned(row[7]);
slAddHead(&clumps[isRc][frame], clump);
/* Read and parse next (long) line that describes hits. */
clump->hitList = getHitsFromServer(conn, lm);
assert(slCount(clump->hitList) == clump->hitCount);
}
for (isRc = 0; isRc <= 1; ++isRc)
for (frame = 0; frame<3; ++frame)
slReverse(&clumps[isRc][frame]);
}
else
{
gfServerWarn(seq, line);
}
*retSsList = ssList;
*retTileSize = tileSize;
}
static void gfQuerySeqTransTrans(int conn, struct dnaSeq *seq,
struct gfClump *clumps[2][3][3],
struct lm *lm, struct gfSeqSource **retSsList, int *retTileSize)
/* Query server for clumps where translated DNA sequence hits translated
* index. */
{
int qFrame, tFrame, isRc, rowSize;
struct gfClump *clump;
int tileSize = 0;
char *line;
char buf[256], *row[12];
struct gfSeqSource *ssList = NULL, *ss;
for (isRc = 0; isRc <= 1; ++isRc)
for (qFrame = 0; qFrame<3; ++qFrame)
for (tFrame = 0; tFrame<3; ++tFrame)
clumps[isRc][qFrame][tFrame] = NULL;
/* Send sequence to server. */
startSeqQuery(conn, seq, "transQuery");
line = netRecieveString(conn, buf);
if (!startsWith("Error:", line))
{
tileSize = findTileSize(line);
/* Read results line by line and save in memory. */
for (;;)
{
/* Read and parse first line that describes clump overall. */
netRecieveString(conn, buf);
if (sameString(buf, "end"))
{
break;
}
else if (startsWith("Error:", buf))
{
gfServerWarn(seq, buf);
break;
}
rowSize = chopLine(buf, row);
if (rowSize < 9)
errAbort("Expecting 9 words from server got %d", rowSize);
AllocVar(clump);
clump->qStart = sqlUnsigned(row[0]);
clump->qEnd = sqlUnsigned(row[1]);
AllocVar(ss);
ss->fileName = cloneString(row[2]);
slAddHead(&ssList, ss);
clump->target = ss;
clump->tStart = sqlUnsigned(row[3]);
clump->tEnd = sqlUnsigned(row[4]);
clump->hitCount = sqlUnsigned(row[5]);
isRc = ((row[6][0] == '-') ? 1 : 0);
qFrame = sqlUnsigned(row[7]);
tFrame = sqlUnsigned(row[8]);
slAddHead(&clumps[isRc][qFrame][tFrame], clump);
/* Read and parse next (long) line that describes hits. */
clump->hitList = getHitsFromServer(conn, lm);
assert(slCount(clump->hitList) == clump->hitCount);
}
for (isRc = 0; isRc <= 1; ++isRc)
for (qFrame = 0; qFrame<3; ++qFrame)
for (tFrame = 0; tFrame<3; ++tFrame)
slReverse(&clumps[isRc][qFrame][tFrame]);
}
else
{
gfServerWarn(seq, buf);
}
*retSsList = ssList;
*retTileSize = tileSize;
}
struct gfRange *gfRangesBundle(struct gfRange *exonList, int maxIntron)
/* Bundle a bunch of 'exons' into plausable 'genes'. It's
* not necessary to be precise here. The main thing is to
* group together exons that are close to each other in the
* same target sequence. */
{
struct gfRange *geneList = NULL, *gene = NULL, *lastExon = NULL, *exon, *nextExon;
for (exon = exonList; exon != NULL; exon = nextExon)
{
nextExon = exon->next;
if (lastExon == NULL || !sameString(lastExon->tName, exon->tName)
|| exon->t3 != lastExon->t3
|| exon->tStart - lastExon->tEnd > maxIntron)
{
AllocVar(gene);
gene->tStart = exon->tStart;
gene->tEnd = exon->tEnd;
gene->tName = cloneString(exon->tName);
gene->tSeq = exon->tSeq;
gene->qStart = exon->qStart;
gene->qEnd = exon->qEnd;
gene->hitCount = exon->hitCount;
gene->t3 = exon->t3;
slAddHead(&gene->components, exon);
slAddHead(&geneList, gene);
}
else
{
if (exon->qStart < gene->qStart) gene->qStart = exon->qStart;
if (exon->qEnd > gene->qEnd) gene->qEnd = exon->qEnd;
if (exon->tStart < gene->tStart) gene->tStart = exon->tStart;
if (exon->tEnd > gene->tEnd) gene->tEnd = exon->tEnd;
gene->hitCount += exon->hitCount;
slAddTail(&gene->components, exon);
}
lastExon = exon;
}
slReverse(&geneList);
return geneList;
}
static int usualExpansion = 500;
static boolean alignComponents(struct gfRange *combined, struct ssBundle *bun,
enum ffStringency stringency)
/* Align each piece of combined->components and put result in
* bun->ffList. */
{
struct gfRange *range;
struct dnaSeq *qSeq = bun->qSeq, *tSeq = bun->genoSeq;
struct ssFfItem *ffi;
struct ffAli *ali;
int qStart, qEnd, tStart, tEnd;
int extra = 250;
boolean gotAny = FALSE;
for (range = combined->components; range != NULL; range = range->next)
{
/* Expand to include some extra sequence around range. */
qStart = range->qStart - extra;
tStart = range->tStart - extra;
qEnd = range->qEnd + extra;
tEnd = range->tEnd + extra;
if (range == combined->components)
{
qStart -= extra;
tStart -= extra;
}
if (range->next == NULL)
{
qEnd += extra;
tEnd += extra;
}
if (qStart < combined->qStart) qStart = combined->qStart;
if (tStart < combined->tStart) tStart = combined->tStart;
if (qEnd > combined->qEnd) qEnd = combined->qEnd;
if (tEnd > combined->tEnd) tEnd = combined->tEnd;
ali = ffFind(qSeq->dna + qStart,
qSeq->dna + qEnd,
tSeq->dna + tStart - combined->tStart,
tSeq->dna + tEnd - combined->tStart,
stringency);
if (ali != NULL)
{
AllocVar(ffi);
ffi->ff = ali;
slAddHead(&bun->ffList, ffi);
gotAny = TRUE;
}
}
return gotAny;
}
static int scoreAli(struct ffAli *ali, boolean isProt,
enum ffStringency stringency,
struct dnaSeq *tSeq, struct trans3 *t3List)
/* Score alignment. */
{
int (*scoreFunc)(char *a, char *b, int size);
struct ffAli *ff, *nextFf;
int score = 0;
if (isProt)
scoreFunc = aaScoreMatch;
else
scoreFunc = dnaScoreMatch;
for (ff = ali; ff != NULL; ff = nextFf)
{
nextFf = ff->right;
score += scoreFunc(ff->nStart, ff->hStart, ff->nEnd-ff->nStart);
if (nextFf != NULL)
{
int nhStart = trans3GenoPos(nextFf->hStart, tSeq, t3List, FALSE);
int ohEnd = trans3GenoPos(ff->hEnd, tSeq, t3List, TRUE);
int hGap = nhStart - ohEnd;
int nGap = nextFf->nStart - ff->nEnd;
score -= ffCalcGapPenalty(hGap, nGap, stringency);
}
}
return score;
}
static void saveAlignments(char *chromName, int chromSize, int chromOffset,
struct ssBundle *bun, struct hash *t3Hash,
boolean qIsRc, boolean tIsRc,
enum ffStringency stringency, int minMatch, struct gfOutput *out)
/* Save significant alignments to file in .psl format. */
{
struct dnaSeq *tSeq = bun->genoSeq, *qSeq = bun->qSeq;
struct ssFfItem *ffi;
for (ffi = bun->ffList; ffi != NULL; ffi = ffi->next)
{
struct ffAli *ff = ffi->ff;
struct trans3 *t3List = NULL;
int score;
if (t3Hash != NULL)
t3List = hashMustFindVal(t3Hash, tSeq->name);
score = scoreAli(ff, bun->isProt, stringency, tSeq, t3List);
if (score >= minMatch)
{
out->out(chromName, chromSize, chromOffset, ff, tSeq, t3Hash, qSeq,
qIsRc, tIsRc, stringency, minMatch, out);
}
}
}
struct hash *gfFileCacheNew()
/* Create hash for storing info on .nib and .2bit files. */
{
return hashNew(0);
}
static void gfFileCacheFreeEl(struct hashEl *el)
/* Free up one file cache info. */
{
char *name = el->name;
if (nibIsFile(name))
{
struct nibInfo *nib = el->val;
nibInfoFree(&nib);
}
else
{
struct twoBitFile *tbf = el->val;
twoBitClose(&tbf);
}
el->val = NULL;
}
void gfFileCacheFree(struct hash **pCache)
/* Free up resources in cache. */
{
struct hash *cache = *pCache;
if (cache != NULL)
{
hashTraverseEls(cache, gfFileCacheFreeEl);
hashFree(pCache);
}
}
static void getTargetName(char *tSpec, boolean includeFile, char *targetName)
/* Put sequence name, optionally prefixed by file: in targetName. */
{
if (includeFile)
{
char seqName[128];
char fileName[PATH_LEN];
gfiGetSeqName(tSpec, seqName, fileName);
safef(targetName, PATH_LEN, "%s:%s", fileName, seqName);
}
else
gfiGetSeqName(tSpec, targetName, NULL);
}
void gfAlignStrand(int *pConn, char *tSeqDir, struct dnaSeq *seq,
boolean isRc, int minMatch, struct hash *tFileCache, struct gfOutput *out)
/* Search genome on server with one strand of other sequence to find homology.
* Then load homologous bits of genome locally and do detailed alignment.
* Call 'outFunction' with each alignment that is found. */
{
struct ssBundle *bun;
struct gfRange *rangeList = NULL, *range;
struct dnaSeq *targetSeq;
char targetName[PATH_LEN];
rangeList = gfQuerySeq(*pConn, seq);
close(*pConn);
*pConn = -1;
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, ffIntronMax);
for (range = rangeList; range != NULL; range = range->next)
{
getTargetName(range->tName, out->includeTargetFile, targetName);
targetSeq = gfiExpandAndLoadCached(range, tFileCache, tSeqDir,
seq->size, &range->tTotalSize, FALSE, FALSE, usualExpansion);
AllocVar(bun);
bun->qSeq = seq;
bun->genoSeq = targetSeq;
alignComponents(range, bun, ffCdna);
ssStitch(bun, ffCdna, minMatch, ssAliCount);
saveAlignments(targetName, range->tTotalSize, range->tStart,
bun, NULL, isRc, FALSE, ffCdna, minMatch, out);
ssBundleFree(&bun);
freeDnaSeq(&targetSeq);
}
gfRangeFreeList(&rangeList);
}
char *clumpTargetName(struct gfClump *clump)
/* Return target name of clump - whether it is in memory or on disk. */
{
struct gfSeqSource *target = clump->target;
char *name;
if (target->seq != NULL)
name = target->seq->name;
else
name = target->fileName;
if (name == NULL)
internalErr();
return name;
}
static struct gfRange *seqClumpToRangeList(struct gfClump *clumpList, int frame)
/* Convert from clump list to range list. */
{
struct gfRange *rangeList = NULL, *range;
struct gfClump *clump;
char *name;
int tOff;
for (clump = clumpList; clump != NULL; clump = clump->next)
{
tOff = clump->target->start;
AllocVar(range);
range->qStart = clump->qStart;
range->qEnd = clump->qEnd;
name = clumpTargetName(clump);
range->tName = cloneString(name);
range->tStart = clump->tStart - tOff;
range->tEnd = clump->tEnd - tOff;
range->tSeq = clump->target->seq;
range->frame = frame;
slAddHead(&rangeList, range);
}
slReverse(&rangeList);
return rangeList;
}
static struct ssBundle *gfClumpsToBundles(struct gfClump *clumpList,
boolean isRc, struct dnaSeq *seq, int minScore,
struct gfRange **retRangeList)
/* Convert gfClumps to an actual alignments (ssBundles) */
{
struct ssBundle *bun, *bunList = NULL;
struct gfRange *rangeList = NULL, *range;
struct dnaSeq *targetSeq;
rangeList = seqClumpToRangeList(clumpList, 0);
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, 2000);
for (range = rangeList; range != NULL; range = range->next)
{
targetSeq = range->tSeq;
gfiExpandRange(range, seq->size, targetSeq->size, FALSE, isRc,
usualExpansion);
range->tStart = 0;
range->tEnd = targetSeq->size;
AllocVar(bun);
bun->qSeq = seq;
bun->genoSeq = targetSeq;
alignComponents(range, bun, ffCdna);
ssStitch(bun, ffCdna, minScore, ssAliCount);
slAddHead(&bunList, bun);
}
slReverse(&bunList);
*retRangeList = rangeList;
return bunList;
}
static void extendHitRight(int qMax, int tMax,
char **pEndQ, char **pEndT, int (*scoreMatch)(char a, char b),
int maxDown)
/* Extend endQ/endT as much to the right as possible. */
{
int maxScore = 0;
int score = 0;
int maxPos = -1;
int last = min(qMax, tMax);
int i;
char *q = *pEndQ, *t = *pEndT;
for (i=0; i<last; ++i)
{
score += scoreMatch(q[i], t[i]);
if (score > maxScore)
{
maxScore = score;
maxPos = i;
}
else if (i > maxPos + maxDown)
{
break;
}
}
*pEndQ = q+maxPos+1;
*pEndT = t+maxPos+1;
}
static void extendHitLeft(int qMax, int tMax,
char **pStartQ, char **pStartT, int (*scoreMatch)(char a, char b),
int maxDown)
/* Extend startQ/startT as much to the left as possible. */
{
int maxScore = 0;
int score = 0;
int maxPos = 0;
int last = -min(qMax, tMax);
int i;
char *q = *pStartQ, *t = *pStartT;
for (i=-1; i>=last; --i)
{
score += scoreMatch(q[i], t[i]);
if (score > maxScore)
{
maxScore = score;
maxPos = i;
}
else if (i < maxPos - maxDown)
{
break;
}
}
*pStartQ = q+maxPos;
*pStartT = t+maxPos;
}
static void clumpToHspRange(struct gfClump *clump, bioSeq *qSeq, int tileSize,
int frame, struct trans3 *t3, struct gfRange **pRangeList,
boolean isProt, boolean fastMap)
/* Covert clump->hitList to HSPs (high scoring local sequence pair,
* that is longest alignment without gaps) and add resulting HSPs to
* rangeList. */
{
struct gfSeqSource *target = clump->target;
aaSeq *tSeq = target->seq;
BIOPOL *qs, *ts, *qe, *te;
struct gfHit *hit;
int qStart = 0, tStart = 0, qEnd = 0, tEnd = 0, newQ = 0, newT = 0;
boolean outOfIt = TRUE; /* Logically outside of a clump. */
struct gfRange *range;
BIOPOL *lastQs = NULL, *lastQe = NULL, *lastTs = NULL, *lastTe = NULL;
int (*scoreMatch)(char a, char b) = (isProt ? aaScore2 : dnaScore2);
int maxDown, minSpan;
if (fastMap)
{
maxDown = 1;
minSpan = 50;
}
else
{
maxDown = 10;
minSpan = 0;
}
if (tSeq == NULL)
internalErr();
/* The termination condition of this loop is a little complicated.
* We want to output something either when the next hit can't be
* merged into the previous, or at the end of the list. To avoid
* duplicating the output code we're forced to complicate the loop
* termination logic. Hence the check for hit == NULL to break
* the loop is not until near the end of the loop. */
for (hit = clump->hitList; ; hit = hit->next)
{
if (hit != NULL)
{
newQ = hit->qStart;
newT = hit->tStart - target->start;
}
/* See if it's time to output merged (diagonally adjacent) hits. */
if (!outOfIt) /* Not first time through. */
{
/* As a micro-optimization handle strings of adjacent hits
* specially. Don't do the extensions until we've merged
* all adjacent hits. */
if (hit == NULL || newQ != qEnd || newT != tEnd)
{
qs = qSeq->dna + qStart;
ts = tSeq->dna + tStart;
qe = qSeq->dna + qEnd;
te = tSeq->dna + tEnd;
extendHitRight(qSeq->size - qEnd, tSeq->size - tEnd,
&qe, &te, scoreMatch, maxDown);
extendHitLeft(qStart, tStart, &qs, &ts, scoreMatch, maxDown);
if (qs != lastQs || ts != lastTs || qe != lastQe || qs != lastQs)
{
lastQs = qs;
lastTs = ts;
lastQe = qe;
lastTe = te;
if (qe - qs >= minSpan)
{
AllocVar(range);
range->qStart = qs - qSeq->dna;
range->qEnd = qe - qSeq->dna;
range->tName = cloneString(tSeq->name);
range->tSeq = tSeq;
range->tStart = ts - tSeq->dna;
range->tEnd = te - tSeq->dna;
range->hitCount = qe - qs;
range->frame = frame;
range->t3 = t3;
assert(range->tEnd <= tSeq->size);
slAddHead(pRangeList, range);
}
}
outOfIt = TRUE;
}
}
if (hit == NULL)
break;
if (outOfIt)
{
qStart = newQ;
qEnd = qStart + tileSize;
tStart = newT;
tEnd = tStart + tileSize;
outOfIt = FALSE;
}
else
{
qEnd = newQ + tileSize;
tEnd = newT + tileSize;
}
}
}
struct ssFfItem *gfRangesToFfItem(struct gfRange *rangeList, aaSeq *qSeq)
/* Convert ranges to ssFfItem's. */
{
AA *q = qSeq->dna;
struct ffAli *ffList = NULL, *ff;
struct gfRange *range;
struct ssFfItem *ffi;
for (range = rangeList; range != NULL; range = range->next)
{
aaSeq *tSeq = range->tSeq;
AA *t = tSeq->dna;
AllocVar(ff);
ff->nStart = q + range->qStart;
ff->nEnd = q + range->qEnd;
ff->hStart = t + range->tStart;
ff->hEnd = t + range->tEnd;
ff->left = ffList;
ffList = ff;
}
AllocVar(ffi);
ffi->ff = ffMakeRightLinks(ffList);
return ffi;
}
static struct ssBundle *fastMapClumpsToBundles(struct genoFind *gf, struct gfClump *clumpList, bioSeq *qSeq)
/* Convert gfClumps ffAlis. */
{
struct gfClump *clump;
struct gfRange *rangeList = NULL, *range;
bioSeq *targetSeq;
struct ssBundle *bunList = NULL, *bun;
for (clump = clumpList; clump != NULL; clump = clump->next)
clumpToHspRange(clump, qSeq, gf->tileSize, 0, NULL, &rangeList, FALSE, TRUE);
slReverse(&rangeList);
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, 256);
for (range = rangeList; range != NULL; range = range->next)
{
targetSeq = range->tSeq;
AllocVar(bun);
bun->qSeq = qSeq;
bun->genoSeq = targetSeq;
bun->ffList = gfRangesToFfItem(range->components, qSeq);
bun->isProt = FALSE;
slAddHead(&bunList, bun);
}
gfRangeFreeList(&rangeList);
return bunList;
}
static void gfAlignSomeClumps(struct genoFind *gf, struct gfClump *clumpList,
bioSeq *seq, boolean isRc, int minMatch,
struct gfOutput *out, boolean isProt, enum ffStringency stringency)
/* Convert gfClumps to an actual alignment that gets saved via
* outFunction/outData. */
{
struct gfClump *clump;
struct gfRange *rangeList = NULL, *range;
bioSeq *targetSeq;
struct ssBundle *bun;
int intronMax = ffIntronMax;
if (isProt)
intronMax /= 3;
for (clump = clumpList; clump != NULL; clump = clump->next)
{
clumpToHspRange(clump, seq, gf->tileSize, 0, NULL, &rangeList, isProt, FALSE);
}
slReverse(&rangeList);
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, intronMax);
for (range = rangeList; range != NULL; range = range->next)
{
targetSeq = range->tSeq;
AllocVar(bun);
bun->qSeq = seq;
bun->genoSeq = targetSeq;
bun->ffList = gfRangesToFfItem(range->components, seq);
bun->isProt = isProt;
ssStitch(bun, stringency, minMatch, ssAliCount);
saveAlignments(targetSeq->name, targetSeq->size, 0,
bun, NULL, isRc, FALSE, stringency, minMatch, out);
ssBundleFree(&bun);
}
gfRangeFreeList(&rangeList);
}
void gfAlignAaClumps(struct genoFind *gf, struct gfClump *clumpList,
aaSeq *seq, boolean isRc, int minMatch,
struct gfOutput *out)
{
gfAlignSomeClumps(gf, clumpList, seq, isRc, minMatch, out, TRUE, ffTight);
}
int rangeScore(struct gfRange *range, struct dnaSeq *qSeq)
/* Return score associated with range. */
{
struct gfRange *comp;
struct dnaSeq *tSeq;
int score = 0;
for (comp = range->components; comp != NULL; comp = comp->next)
{
tSeq = comp->tSeq;
score += dnaScoreMatch(tSeq->dna + range->tStart, qSeq->dna + range->qStart,
range->tEnd - range->tStart);
if (comp->next != NULL)
score -= 4;
}
return score;
}
void gfFindAlignAaTrans(struct genoFind *gfs[3], aaSeq *qSeq, struct hash *t3Hash,
boolean tIsRc, int minMatch, struct gfOutput *out)
/* Look for qSeq alignment in three translated reading frames. Save alignment
* via outFunction/outData. */
{
struct gfClump *clumps[3];
int frame;
struct gfClump *clump;
struct gfRange *rangeList = NULL, *range;
aaSeq *targetSeq;
struct ssBundle *bun;
int tileSize = gfs[0]->tileSize;
struct trans3 *t3;
int hitCount;
struct lm *lm = lmInit(0);
gfTransFindClumps(gfs, qSeq, clumps, lm, &hitCount);
for (frame=0; frame<3; ++frame)
{
for (clump = clumps[frame]; clump != NULL; clump = clump->next)
{
clumpToHspRange(clump, qSeq, tileSize, frame, NULL, &rangeList, TRUE, FALSE);
}
}
slReverse(&rangeList);
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, ffIntronMax/3);
for (range = rangeList; range != NULL; range = range->next)
{
targetSeq = range->tSeq;
t3 = hashMustFindVal(t3Hash, targetSeq->name);
AllocVar(bun);
bun->qSeq = qSeq;
bun->genoSeq = targetSeq;
bun->ffList = gfRangesToFfItem(range->components, qSeq);
bun->isProt = TRUE;
bun->t3List = t3;
ssStitch(bun, ffCdna, minMatch, ssAliCount);
saveAlignments(targetSeq->name, t3->seq->size, 0,
bun, t3Hash, FALSE, tIsRc, ffCdna, minMatch, out);
ssBundleFree(&bun);
}
gfRangeFreeList(&rangeList);
for (frame=0; frame<3; ++frame)
gfClumpFreeList(&clumps[frame]);
lmCleanup(&lm);
}
void rangeCoorTimes3(struct gfRange *rangeList)
/* Multiply coordinates on range list times three (to go from
* amino acid to nucleotide. */
{
struct gfRange *range;
for (range = rangeList; range != NULL; range = range->next)
{
range->qStart *= 3;
range->qEnd *= 3;
range->tStart = range->tStart*3;
range->tEnd = range->tEnd*3;
}
}
static void loadHashT3Ranges(struct gfRange *rangeList,
char *tSeqDir, struct hash *tFileCache, int qSeqSize, boolean isRc,
struct hash **retT3Hash, struct dnaSeq **retSeqList,
struct slRef **retT3RefList)
/* Load DNA in ranges into memory, and put translation in a hash
* that gets returned. */
{
struct hash *t3Hash = newHash(10);
struct dnaSeq *targetSeq, *tSeqList = NULL;
struct slRef *t3RefList = NULL;
struct gfRange *range;
for (range = rangeList; range != NULL; range = range->next)
{
struct trans3 *t3, *oldT3;
targetSeq = gfiExpandAndLoadCached(range, tFileCache,
tSeqDir, qSeqSize*3, &range->tTotalSize, TRUE, isRc, usualExpansion);
slAddHead(&tSeqList, targetSeq);
freez(&targetSeq->name);
targetSeq->name = cloneString(range->tName);
t3 = trans3New(targetSeq);
refAdd(&t3RefList, t3);
t3->start = range->tStart;
t3->end = range->tEnd;
t3->nibSize = range->tTotalSize;
t3->isRc = isRc;
if ((oldT3 = hashFindVal(t3Hash, range->tName)) != NULL)
{
slAddTail(&oldT3->next, t3);
}
else
{
hashAdd(t3Hash, range->tName, t3);
}
}
*retT3Hash = t3Hash;
*retSeqList = tSeqList;
*retT3RefList = t3RefList;
}
void gfAlignTrans(int *pConn, char *tSeqDir, aaSeq *seq, int minMatch,
struct hash *tFileCache, struct gfOutput *out)
/* Search indexed translated genome on server with an amino acid sequence.
* Then load homologous bits of genome locally and do detailed alignment.
* Call 'outFunction' with each alignment that is found. */
{
struct ssBundle *bun;
struct gfClump *clumps[2][3], *clump;
struct gfRange *rangeList = NULL, *range, *rl;
struct dnaSeq *targetSeq, *tSeqList = NULL;
char targetName[PATH_LEN];
int tileSize;
int frame, isRc = 0;
struct hash *t3Hash = NULL;
struct slRef *t3RefList = NULL, *ref;
struct gfSeqSource *ssList = NULL, *ss;
struct trans3 *t3;
struct lm *lm = lmInit(0);
/* Get clumps from server. */
gfQuerySeqTrans(*pConn, seq, clumps, lm, &ssList, &tileSize);
close(*pConn);
*pConn = -1;
for (isRc = 0; isRc <= 1; ++isRc)
{
/* Figure out which parts of sequence we need to load. */
for (frame = 0; frame < 3; ++frame)
{
rl = seqClumpToRangeList(clumps[isRc][frame], frame);
rangeList = slCat(rangeList, rl);
}
/* Convert from amino acid to nucleotide coordinates. */
rangeCoorTimes3(rangeList);
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, ffIntronMax);
loadHashT3Ranges(rangeList, tSeqDir, tFileCache, seq->size,
isRc, &t3Hash, &tSeqList, &t3RefList);
/* The old range list was not very precise - it was just to get
* the DNA loaded. */
gfRangeFreeList(&rangeList);
/* Patch up clump list and associated sequence source to refer
* to bits of genome loaded into memory. Create new range list
* by extending hits in clumps. */
for (frame = 0; frame < 3; ++frame)
{
for (clump = clumps[isRc][frame]; clump != NULL; clump = clump->next)
{
struct gfSeqSource *ss = clump->target;
t3 = trans3Find(t3Hash, clumpTargetName(clump), clump->tStart*3, clump->tEnd*3);
ss->seq = t3->trans[frame];
ss->start = t3->start/3;
ss->end = t3->end/3;
clumpToHspRange(clump, seq, tileSize, frame, t3, &rangeList, TRUE, FALSE);
}
}
slReverse(&rangeList);
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, ffIntronMax/3);
/* Do detailed alignment of each of the clustered ranges. */
for (range = rangeList; range != NULL; range = range->next)
{
targetSeq = range->tSeq;
AllocVar(bun);
bun->qSeq = seq;
bun->genoSeq = targetSeq;
bun->ffList = gfRangesToFfItem(range->components, seq);
bun->isProt = TRUE;
t3 = hashMustFindVal(t3Hash, range->tName);
bun->t3List = t3;
ssStitch(bun, ffCdna, minMatch, ssAliCount);
getTargetName(range->tName, out->includeTargetFile, targetName);
saveAlignments(targetName, t3->nibSize, 0,
bun, t3Hash, FALSE, isRc, ffCdna, minMatch, out);
ssBundleFree(&bun);
}
/* Cleanup for this strand of database. */
gfRangeFreeList(&rangeList);
freeHash(&t3Hash);
for (ref = t3RefList; ref != NULL; ref = ref->next)
{
struct trans3 *t3 = ref->val;
trans3Free(&t3);
}
slFreeList(&t3RefList);
freeDnaSeqList(&tSeqList);
}
/* Final cleanup. */
for (isRc=0; isRc<=1; ++isRc)
for (frame=0; frame<3; ++frame)
gfClumpFreeList(&clumps[isRc][frame]);
for (ss = ssList; ss != NULL; ss = ss->next)
freeMem(ss->fileName);
slFreeList(&ssList);
lmCleanup(&lm);
}
void untranslateRangeList(struct gfRange *rangeList, int qFrame, int tFrame,
struct hash *t3Hash, struct trans3 *t3, int tOffset)
/* Translate coordinates from protein to dna. */
{
struct gfRange *range;
for (range = rangeList; range != NULL; range = range->next)
{
range->qStart = 3*range->qStart + qFrame;
range->qEnd = 3*range->qEnd + qFrame;
range->tStart = 3*range->tStart + tFrame;
range->tEnd = 3*range->tEnd + tFrame;
if (t3Hash)
t3 = trans3Find(t3Hash, range->tSeq->name, range->tStart + tOffset, range->tEnd + tOffset);
range->tSeq = t3->seq;
range->t3 = t3;
}
}
void gfAlignTransTrans(int *pConn, char *tSeqDir, struct dnaSeq *qSeq,
boolean qIsRc, int minMatch, struct hash *tFileCache,
struct gfOutput *out, boolean isRna)
/* Search indexed translated genome on server with an dna sequence. Translate
* this sequence in three frames. Load homologous bits of genome locally
* and do detailed alignment. Call 'outFunction' with each alignment
* that is found. */
{
struct gfClump *clumps[2][3][3], *clump;
char targetName[PATH_LEN];
int qFrame, tFrame, tIsRc;
struct gfSeqSource *ssList = NULL, *ss;
struct lm *lm = lmInit(0);
int tileSize;
struct gfRange *rangeList = NULL, *rl, *range;
struct trans3 *qTrans = trans3New(qSeq), *t3;
struct slRef *t3RefList = NULL, *t3Ref;
struct hash *t3Hash = NULL;
struct dnaSeq *tSeqList = NULL;
enum ffStringency stringency = (isRna ? ffCdna : ffLoose);
/* Query server for clumps. */
gfQuerySeqTransTrans(*pConn, qSeq, clumps, lm, &ssList, &tileSize);
close(*pConn);
*pConn = -1;
for (tIsRc=0; tIsRc <= 1; ++tIsRc)
{
/* Figure out which ranges need to be loaded and load them. */
for (qFrame = 0; qFrame < 3; ++qFrame)
{
for (tFrame = 0; tFrame < 3; ++tFrame)
{
rl = seqClumpToRangeList(clumps[tIsRc][qFrame][tFrame], tFrame);
rangeList = slCat(rangeList, rl);
}
}
rangeCoorTimes3(rangeList);
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, ffIntronMax);
loadHashT3Ranges(rangeList, tSeqDir, tFileCache,
qSeq->size/3, tIsRc, &t3Hash, &tSeqList, &t3RefList);
/* The old range list was not very precise - it was just to get
* the DNA loaded. */
gfRangeFreeList(&rangeList);
/* Patch up clump list and associated sequence source to refer
* to bits of genome loaded into memory. Create new range list
* by extending hits in clumps. */
for (qFrame = 0; qFrame < 3; ++qFrame)
{
for (tFrame = 0; tFrame < 3; ++tFrame)
{
for (clump = clumps[tIsRc][qFrame][tFrame]; clump != NULL; clump = clump->next)
{
struct gfSeqSource *ss = clump->target;
struct gfRange *rangeSet = NULL;
t3 = trans3Find(t3Hash, clumpTargetName(clump), clump->tStart*3, clump->tEnd*3);
ss->seq = t3->trans[tFrame];
ss->start = t3->start/3;
ss->end = t3->end/3;
clumpToHspRange(clump, qTrans->trans[qFrame], tileSize, tFrame, t3, &rangeSet, TRUE, FALSE);
untranslateRangeList(rangeSet, qFrame, tFrame, NULL, t3, t3->start);
rangeList = slCat(rangeSet, rangeList);
}
}
}
slReverse(&rangeList);
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, ffIntronMax);
for (range = rangeList; range != NULL; range = range->next)
{
struct dnaSeq *targetSeq = range->tSeq;
struct ssBundle *bun;
AllocVar(bun);
bun->qSeq = qSeq;
bun->genoSeq = targetSeq;
bun->ffList = gfRangesToFfItem(range->components, qSeq);
ssStitch(bun, stringency, minMatch, ssAliCount);
getTargetName(range->tName, out->includeTargetFile, targetName);
t3 = range->t3;
saveAlignments(targetName, t3->nibSize, t3->start,
bun, NULL, qIsRc, tIsRc, stringency, minMatch, out);
ssBundleFree(&bun);
}
/* Cleanup for this strand of database. */
gfRangeFreeList(&rangeList);
freeHash(&t3Hash);
for (t3Ref = t3RefList; t3Ref != NULL; t3Ref = t3Ref->next)
{
struct trans3 *t3 = t3Ref->val;
trans3Free(&t3);
}
slFreeList(&t3RefList);
freeDnaSeqList(&tSeqList);
}
trans3Free(&qTrans);
for (ss = ssList; ss != NULL; ss = ss->next)
freeMem(ss->fileName);
slFreeList(&ssList);
lmCleanup(&lm);
}
static struct ssBundle *gfTransTransFindBundles(struct genoFind *gfs[3], struct dnaSeq *qSeq,
struct hash *t3Hash, boolean isRc, int minMatch, boolean isRna)
/* Look for alignment to three translations of qSeq in three translated reading frames.
* Save alignment via outFunction/outData. */
{
struct trans3 *qTrans = trans3New(qSeq);
int qFrame, tFrame;
struct gfClump *clumps[3][3], *clump;
struct gfRange *rangeList = NULL, *range;
int tileSize = gfs[0]->tileSize;
bioSeq *targetSeq;
struct ssBundle *bun, *bunList = NULL;
int hitCount;
struct lm *lm = lmInit(0);
enum ffStringency stringency = (isRna ? ffCdna : ffLoose);
gfTransTransFindClumps(gfs, qTrans->trans, clumps, lm, &hitCount);
for (qFrame = 0; qFrame<3; ++qFrame)
{
for (tFrame=0; tFrame<3; ++tFrame)
{
for (clump = clumps[qFrame][tFrame]; clump != NULL; clump = clump->next)
{
struct gfRange *rangeSet = NULL;
clumpToHspRange(clump, qTrans->trans[qFrame], tileSize, tFrame, NULL, &rangeSet, TRUE, FALSE);
untranslateRangeList(rangeSet, qFrame, tFrame, t3Hash, NULL, 0);
rangeList = slCat(rangeSet, rangeList);
}
}
}
slSort(&rangeList, gfRangeCmpTarget);
rangeList = gfRangesBundle(rangeList, 2000);
for (range = rangeList; range != NULL; range = range->next)
{
targetSeq = range->tSeq;
AllocVar(bun);
bun->qSeq = qSeq;
bun->genoSeq = targetSeq;
bun->ffList = gfRangesToFfItem(range->components, qSeq);
ssStitch(bun, stringency, minMatch, ssAliCount);
slAddHead(&bunList, bun);
}
for (qFrame = 0; qFrame<3; ++qFrame)
for (tFrame=0; tFrame<3; ++tFrame)
gfClumpFreeList(&clumps[qFrame][tFrame]);
gfRangeFreeList(&rangeList);
trans3Free(&qTrans);
lmCleanup(&lm);
slReverse(&bunList);
return bunList;
}
static void addToBigBundleList(struct ssBundle **pOneList, struct hash *bunHash,
struct ssBundle **pBigList, struct dnaSeq *query)
/* Add bundles in one list to bigList, consolidating bundles that refer
* to the same target sequence. This will destroy oneList in the process. */
{
struct ssBundle *oneBun, *bigBun;
for (oneBun = *pOneList; oneBun != NULL; oneBun = oneBun->next)
{
char *name = oneBun->genoSeq->name;
if ((bigBun = hashFindVal(bunHash, name)) == NULL)
{
AllocVar(bigBun);
slAddHead(pBigList, bigBun);
hashAdd(bunHash, name, bigBun);
bigBun->qSeq = query;
bigBun->genoSeq = oneBun->genoSeq;
bigBun->isProt = oneBun->isProt;
bigBun->avoidFuzzyFindKludge = oneBun->avoidFuzzyFindKludge;
}
bigBun->ffList = slCat(bigBun->ffList, oneBun->ffList);
oneBun->ffList = NULL;
}
ssBundleFreeList(pOneList);
}
static boolean jiggleSmallExons(struct ffAli *ali, struct dnaSeq *nSeq, struct dnaSeq *hSeq)
/* See if can jiggle small exons to match splice sites a little
* better. */
{
struct ffAli *left, *mid, *right;
int orient;
boolean creeped = FALSE;
if (ffAliCount(ali) < 3)
return FALSE;
orient = ffIntronOrientation(ali);
left = ali;
mid = left->right;
right = mid->right;
while (right != NULL)
{
int midSizeN = mid->nEnd - mid->nStart;
if (midSizeN < 10 && mid->hStart - left->hEnd > 1 && right->hStart - mid->hEnd > 1)
{
DNA *spLeft, *spRight; /* Splice sites on either side of exon. */
DNA exonX[10+2+2]; /* Storage for exon with splice sites. */
DNA *match;
static int creeps[4][2] = { {2, 2}, {2, 1}, {1, 2}, {1, 1},};
int creepIx, creepL, creepR;
DNA *hs = mid->hStart, *he = mid->hEnd;
DNA *hMin = left->hEnd, *hMax = right->hStart;
if (orient >= 0)
{
spLeft = "ag";
spRight = "gt";
}
else
{
spLeft = "ac";
spRight = "ct";
}
for (creepIx=0; creepIx<4; ++creepIx)
{
creepL = creeps[creepIx][0];
creepR = creeps[creepIx][1];
/* Check to see if we already match consensus, and if so just bail. */
if (hs[-1] == spLeft[1] && he[0] == spRight[0])
{
if ((creepL == 1 || hs[-2] == spLeft[0])
&& (creepR == 1 || he[1] == spRight[1]))
{
break;
}
}
memcpy(exonX, spLeft + 2 - creepL, creepL);
memcpy(exonX + creepL, mid->nStart, midSizeN);
memcpy(exonX + creepL + midSizeN, spRight, creepR);
match = memMatch(exonX, midSizeN + creepR + creepL, hMin, hMax - hMin);
if (match != NULL)
{
mid->hStart = match + creepL;
mid->hEnd = mid->hStart + (he - hs);
creeped = TRUE;
break;
}
}
}
left = mid;
mid = right;
right = right->right;
}
if (creeped)
ffSlideIntrons(ali);
return creeped;
}
static struct ffAli *refineSmallExons(struct ffAli *ff,
struct dnaSeq *nSeq, struct dnaSeq *hSeq)
/* Tweak small exons slightly - refining positions to match splice
* sites if possible and looking a little harder for small first
* and last exons. */
{
if (jiggleSmallExons(ff, nSeq, hSeq))
ff = ffRemoveEmptyAlis(ff, TRUE);
return ff;
}
static void refineSmallExonsInBundle(struct ssBundle *bun)
/* Tweak small exons slightly - refining positions to match splice
* sites if possible and looking a little harder for small first
* and last exons. */
{
struct ssFfItem *fi; /* Item list - memory owned by bundle. */
for (fi = bun->ffList; fi != NULL; fi = fi->next)
{
fi->ff = refineSmallExons(fi->ff, bun->qSeq, bun->genoSeq);
}
}
#ifdef DEBUG
void dumpBunList(struct ssBundle *bunList)
/* Write out summary info on bundle list. */
{
int totalItems = 0;
int totalBlocks = 0;
int totalBases = 0;
struct ssBundle *bun;
for (bun = bunList; bun != NULL; bun = bun->next)
{
struct ssFfItem *item;
for (item = bun->ffList; item != NULL; item = item->next)
{
printf("item: ");
struct ffAli *ff;
for (ff = item->ff; ff != NULL; ff = ff->right)
{
totalBases += ff->hEnd - ff->hStart;
totalBlocks += 1;
printf("%d,", ff->hEnd - ff->hStart);
}
printf("\n");
totalItems += 1;
}
}
printf("total bundles %d, alignments %d, blocks %d, bases %d\n",
slCount(bunList), totalItems, totalBlocks, totalBases);
}
#endif /* DEBUG */
static void gfAlignSomeClumps(struct genoFind *gf, struct gfClump *clumpList,
bioSeq *seq, boolean isRc, int minMatch,
struct gfOutput *out, boolean isProt, enum ffStringency stringency);
void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf,
boolean isRc, int minScore, Bits *qMaskBits,
struct gfOutput *out, boolean fastMap, boolean band)
/* Chop up query into pieces, align each, and stitch back
* together again. */
{
int hitCount;
int maxSize = 5000;
int preferredSize = 4500;
int overlapSize = 250;
struct dnaSeq subQuery = *query;
struct lm *lm = lmInit(0);
int subOffset, subSize, nextOffset;
DNA saveEnd, *endPos;
struct ssBundle *oneBunList = NULL, *bigBunList = NULL, *bun;
struct hash *bunHash = newHash(8);
for (subOffset = 0; subOffset<query->size; subOffset = nextOffset)
{
struct gfClump *clumpList;
struct gfRange *rangeList = NULL;
/* Figure out size of this piece. If query is
* maxSize or less do it all. Otherwise just
* do prefered size, and set it up to overlap
* with surrounding pieces by overlapSize. */
if (subOffset == 0 && query->size <= maxSize)
nextOffset = subSize = query->size;
else
{
subSize = preferredSize;
if (subSize + subOffset >= query->size)
{
subSize = query->size - subOffset;
nextOffset = query->size;
}
else
{
nextOffset = subOffset + preferredSize - overlapSize;
}
}
subQuery.dna = query->dna + subOffset;
subQuery.size = subSize;
endPos = &subQuery.dna[subSize];
saveEnd = *endPos;
*endPos = 0;
if (band)
{
oneBunList = ffSeedExtInMem(gf, &subQuery, qMaskBits, subOffset, lm, minScore, isRc);
}
else
{
clumpList = gfFindClumpsWithQmask(gf, &subQuery, qMaskBits, subOffset, lm, &hitCount);
if (fastMap)
{
oneBunList = fastMapClumpsToBundles(gf, clumpList, &subQuery);
}
else
{
oneBunList = gfClumpsToBundles(clumpList, isRc, &subQuery, minScore, &rangeList);
gfRangeFreeList(&rangeList);
}
gfClumpFreeList(&clumpList);
}
addToBigBundleList(&oneBunList, bunHash, &bigBunList, query);
*endPos = saveEnd;
}
#ifdef DEBUG
dumpBunList(bigBunList);
#endif /* DEBUG */
for (bun = bigBunList; bun != NULL; bun = bun->next)
{
ssStitch(bun, ffCdna, minScore, ssAliCount);
if (!fastMap && !band)
refineSmallExonsInBundle(bun);
saveAlignments(bun->genoSeq->name, bun->genoSeq->size, 0,
bun, NULL, isRc, FALSE, ffCdna, minScore, out);
}
ssBundleFreeList(&bigBunList);
freeHash(&bunHash);
lmCleanup(&lm);
}
void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3],
struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna,
int minScore, struct gfOutput *out)
/* Chop up query into pieces, align each in translated space, and stitch back
* together again as nucleotides. */
{
enum ffStringency stringency = (qIsRna ? ffCdna : ffLoose);
int maxSize = 1500;
int preferredSize = 1200; /* PreferredSize - overlapSize might need to be multiple of 3. */
int overlapSize = 270;
struct dnaSeq subQuery = *query;
int subOffset, subSize, nextOffset;
DNA saveEnd, *endPos;
struct ssBundle *oneBunList = NULL, *bigBunList = NULL, *bun;
struct hash *bunHash = newHash(8);
for (subOffset = 0; subOffset<query->size; subOffset = nextOffset)
{
/* Figure out size of this piece. If query is
* maxSize or less do it all. Otherwise just
* do prefered size, and set it up to overlap
* with surrounding pieces by overlapSize. */
if (subOffset == 0 && query->size <= maxSize)
nextOffset = subSize = query->size;
else
{
subSize = preferredSize;
if (subSize + subOffset >= query->size)
{
subSize = query->size - subOffset;
nextOffset = query->size;
}
else
{
nextOffset = subOffset + preferredSize - overlapSize;
}
}
subQuery.dna = query->dna + subOffset;
subQuery.size = subSize;
endPos = &subQuery.dna[subSize];
saveEnd = *endPos;
*endPos = 0;
oneBunList = gfTransTransFindBundles(gfs, &subQuery, t3Hash, qIsRc, minScore, qIsRna);
addToBigBundleList(&oneBunList, bunHash, &bigBunList, query);
*endPos = saveEnd;
}
for (bun = bigBunList; bun != NULL; bun = bun->next)
{
ssStitch(bun, ffCdna, minScore, ssAliCount);
saveAlignments(bun->genoSeq->name, bun->genoSeq->size, 0,
bun, NULL, qIsRc, tIsRc, stringency, minScore, out);
}
hashFree(&bunHash);
ssBundleFreeList(&bigBunList);
}