src/hg/blastToPsl/blastToPsl.c 1.23
1.23 2009/04/12 03:47:20 markd
added support for PSI BLAST madness
Index: src/hg/blastToPsl/blastToPsl.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/blastToPsl/blastToPsl.c,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/blastToPsl/blastToPsl.c 22 Aug 2007 02:49:47 -0000 1.22
+++ src/hg/blastToPsl/blastToPsl.c 12 Apr 2009 03:47:20 -0000 1.23
@@ -1,413 +1,419 @@
/* blastToPsl - convert blast textual output to PSLs */
#include "common.h"
#include "linefile.h"
#include "options.h"
#include "psl.h"
#include "blastParse.h"
#include "dnautil.h"
static char const rcsid[] = "$Id$";
double eVal = -1; /* default Expect value signifying no filtering */
boolean pslxFmt = FALSE; /* output in pslx format */
struct block
/* coordinates of a block */
{
int qStart; /* Query start/end positions. */
int qEnd;
int tStart; /* Target start/end position. */
int tEnd;
int qLetMult; /* each letter counts as this many units */
int tLetMult;
int qSizeMult; /* user to convert coordinates */
int tSizeMult;
int countMult; /* column count for match/mismatch */
int alnStart; /* start/end in alignment */
int alnEnd;
};
/* Bit set of flags */
enum {
// blast algorithm
BLASTN = 0x01,
BLASTP = 0x02,
BLASTX = 0x04,
TBLASTN = 0x10,
TBLASTX = 0x20
};
/* score file header */
static char *scoreHdr = "#strand\tqName\tqStart\tqEnd\ttName\ttStart\ttEnd\tbitScore\teVal\n";
/* command line option specifications */
static struct optionSpec optionSpecs[] = {
{"scores", OPTION_STRING},
{"eVal", OPTION_DOUBLE},
{"pslx", OPTION_BOOLEAN},
{NULL, 0}
};
static void usage()
/* Explain usage and exit. */
{
errAbort(
"blastToPsl - Convert blast alignments to PSLs.\n"
"\n"
"usage:\n"
" blastToPsl [options] blastOutput psl\n"
"\n"
"Options:\n"
" -scores=file - Write score information to this file. Format is:\n"
" strands qName qStart qEnd tName tStart tEnd bitscore eVal\n"
" -verbose=n - n >= 3 prints each line of file after parsing.\n"
" n >= 4 dumps the result of each query\n"
" -eVal=n n is e-value threshold to filter results. Format can be either\n"
" an integer, double or 1e-10. Default is no filter.\n"
" -pslx - create PSLX output (includes sequences for blocks)\n"
+ "\n"
+ "Output only results of last round from PSI BLAST\n"
);
}
static boolean isProteinSeqs(unsigned flags)
/* do the flags indicate a protein sequences in alignments */
{
return (flags & (BLASTP|BLASTX|TBLASTN|TBLASTX)) != 0;
}
static char *wackAfterWhite(char *s)
/* end string at first whitespare */
{
char *w = skipToSpaces(s);
if (w != NULL)
*w = '\0';
return s;
}
static struct psl* createPsl(struct block *blk, struct blastBlock *bb, int pslSpace)
/* create PSL for a blast block */
{
struct blastGappedAli *ba = bb->gappedAli;
char strand[3];
strand[0] = (bb->qStrand > 0) ? '+' : '-';
strand[1] = (bb->tStrand > 0) ? '+' : '-';
strand[2] = '\0';
/* white space in query or target name breaks some psl software (like pslOpen()),
* so only keep up to first whitespace.
*/
int qSize = blk->qSizeMult*ba->query->queryBaseCount;
return pslNew(wackAfterWhite(ba->query->query), qSize, 0, 0,
wackAfterWhite(ba->targetName), ba->targetSize, 0, 0,
strand, pslSpace, (pslxFmt ? PSL_XA_FORMAT : 0));
}
static void makeUntranslated(struct psl* psl)
/* convert a PSL so it is in the untranslated form produced by blat */
{
if (psl->strand[1] == '-')
{
/* swap around blocks so it's query that is reversed */
int i;
for (i = 0; i < psl->blockCount; i++)
{
psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
}
reverseUnsigned(psl->tStarts, psl->blockCount);
reverseUnsigned(psl->qStarts, psl->blockCount);
reverseUnsigned(psl->blockSizes, psl->blockCount);
/* fix strand, +- now -, -- now + */
psl->strand[0] = (psl->strand[0] == '+') ? '-' : '+';
}
psl->strand[1] = '\0';
}
static void finishPsl(struct psl* psl, unsigned flags)
/* put finishing touches on a psl */
{
if ((flags & TBLASTN) == 0)
makeUntranslated(psl);
}
static void addPslBlock(struct psl* psl, struct blastBlock *bb, struct block* blk,
int* pslSpace)
/* add a block to a psl */
{
unsigned newIBlk = psl->blockCount;
unsigned blkSize = blk->qSizeMult * (blk->qEnd - blk->qStart);
if (newIBlk >= *pslSpace)
pslGrow(psl, pslSpace);
psl->qStarts[newIBlk] = blk->qStart;
psl->tStarts[newIBlk] = blk->tStart;
/* uses query size so protein psl is right */
psl->blockSizes[newIBlk] = blkSize;
/* keep bounds current */
psl->qStart = psl->qStarts[0];
psl->qEnd = psl->qStarts[newIBlk]
+ (blk->qSizeMult * psl->blockSizes[newIBlk]);
if (psl->strand[0] == '-')
reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
psl->tStart = psl->tStarts[0];
psl->tEnd = psl->tStarts[newIBlk]
+ (blk->tSizeMult * psl->blockSizes[newIBlk]);
if (psl->strand[1] == '-')
reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
if (pslxFmt)
{
psl->qSequence[newIBlk] = cloneStringZ(bb->qSym + blk->alnStart, blkSize);
psl->tSequence[newIBlk] = cloneStringZ(bb->tSym + blk->alnStart, blkSize);
}
psl->blockCount++;
}
static boolean nextUngappedBlk(struct blastBlock* bb, struct block* blk, unsigned flags)
/* Find the next ungapped block in a blast alignment, in [0..n) coords in mrna
* space. On first call, block should be zero, subsequence calls should be
* parsed the result of the previous call.
*/
{
char *qPtr, *tPtr;
if (blk->tEnd == 0)
{
/* first call */
blk->qStart = bb->qStart;
blk->qEnd = blk->qStart+1;
blk->tStart = bb->tStart;
blk->tEnd = blk->tStart+1;
}
else
{
/* subsequence calls */
blk->qStart = blk->qEnd;
blk->tStart = blk->tEnd;
blk->alnStart = blk->alnEnd;
}
/* find start of next aligned block */
qPtr = bb->qSym + blk->alnStart;
tPtr = bb->tSym + blk->alnStart;
while ((*qPtr != '\0') && (*tPtr != '\0')
&& ((*qPtr == '-') || (*tPtr == '-')))
{
if (*qPtr != '-')
blk->qStart += blk->qLetMult;
if (*tPtr != '-')
blk->tStart += blk->tLetMult;
qPtr++;
tPtr++;
blk->alnStart++;
}
blk->qEnd = blk->qStart;
blk->tEnd = blk->tStart;
blk->alnEnd = blk->alnStart;
if ((*qPtr == '\0') || (*tPtr == '\0'))
{
assert((*qPtr == '\0') && (*tPtr == '\0'));
assert(blk->qStart == bb->qEnd);
assert(blk->tStart == bb->tEnd);
return FALSE; /* no more */
}
/* find end of aligned block */
while ((*qPtr != '\0') && (*tPtr != '\0')
&& (*qPtr != '-') && (*tPtr != '-'))
{
blk->qEnd += blk->qLetMult;
blk->tEnd += blk->tLetMult;
qPtr++;
tPtr++;
blk->alnEnd++;
}
assert((blk->tSizeMult * (blk->qEnd - blk->qStart))
== (blk->qSizeMult * (blk->tEnd - blk->tStart)));
return TRUE;
}
static void countBlock(struct blastBlock* bb, struct block* blk, struct block* prevBlk, struct psl* psl,
unsigned flags)
/* update the PSL counts between for a block and previous insert. */
{
if (prevBlk->tEnd != 0)
{
/* count insert */
if (prevBlk->qEnd != blk->qStart)
{
/* insert in query */
psl->qNumInsert++;
psl->qBaseInsert += (blk->qStart - prevBlk->qEnd);
}
if (prevBlk->tEnd != blk->tStart)
{
/* insert in target */
psl->tNumInsert++;
psl->tBaseInsert += (blk->tStart - prevBlk->tEnd);
}
}
int i, alnLen = (blk->alnEnd - blk->alnStart);
char *qPtr = bb->qSym + blk->alnStart;
char *tPtr = bb->tSym + blk->alnStart;
boolean isProt = isProteinSeqs(flags);
for (i = 0; i < alnLen; i++, qPtr++, tPtr++)
{
if ((!isProt && ((*qPtr == 'N') || (*tPtr == 'N')))
|| (isProt && ((*qPtr == 'X') || (*tPtr == 'X'))))
psl->repMatch += blk->countMult;
else if (*qPtr == *tPtr)
psl->match += blk->countMult;
else
psl->misMatch += blk->countMult;
}
}
static void outputPsl(struct blastBlock *bb, unsigned flags, struct psl *psl,
FILE* pslFh, FILE* scoreFh)
/* output a psl and optional score */
{
pslTabOut(psl, pslFh);
if (scoreFh != NULL)
fprintf(scoreFh, "%s\t%s\t%d\t%d\t%s\t%d\t%d\t%g\t%g\n", psl->strand,
psl->qName, psl->qStart, psl->qEnd,
psl->tName, psl->tStart, psl->tEnd, bb->bitScore, bb->eVal);
}
static void initBlk(unsigned flags, struct block *blk)
/* initialize a block object */
{
ZeroVar(blk);
if (flags & TBLASTN)
{
blk->qLetMult = 1;
blk->qSizeMult = 1;
blk->tLetMult = 3;
blk->tSizeMult = 3;
blk->countMult = 1;
}
else if (flags & TBLASTX)
{
blk->qLetMult = 3;
blk->qSizeMult = 1;
blk->tLetMult = 3;
blk->tSizeMult = 1;
blk->countMult = 3;
}
else
{
blk->qLetMult = 1;
blk->qSizeMult = 1;
blk->tLetMult = 1;
blk->tSizeMult = 1;
blk->countMult = 1;
}
}
static void processBlock(struct blastBlock *bb, unsigned flags,
FILE* pslFh, FILE* scoreFh)
/* process one alignment block */
{
int pslSpace = 8;
struct block blk, prevBlk;
initBlk(flags, &blk);
initBlk(flags, &prevBlk);
struct psl *psl = createPsl(&blk, bb, pslSpace);
/* fill in ungapped blocks */
while (nextUngappedBlk(bb, &blk, flags))
{
countBlock(bb, &blk, &prevBlk, psl, flags);
addPslBlock(psl, bb, &blk, &pslSpace);
prevBlk = blk;
}
if (psl->blockCount > 0 && (bb->eVal <= eVal || eVal == -1))
{
finishPsl(psl, flags);
outputPsl(bb, flags, psl, pslFh, scoreFh);
}
pslFree(&psl);
}
void processQuery(struct blastQuery *bq, unsigned flags, FILE* pslFh, FILE* scoreFh)
-/* process one query */
+/* process one query. Each gaped block becomes an psl. Chaining is left
+ * to other programs. Only output last round from PSI BLAST */
{
struct blastGappedAli* ba;
struct blastBlock *bb;
for (ba = bq->gapped; ba != NULL; ba = ba->next)
{
+ if (ba->psiRound == bq->psiRounds)
+ {
for (bb = ba->blocks; bb != NULL; bb = bb->next)
processBlock(bb, flags, pslFh, scoreFh);
}
+ }
}
static unsigned getBlastAlgo(struct blastFile *bf)
/* determine blast algorithm */
{
if (sameString(bf->program, "BLASTN"))
return BLASTN;
if (sameString(bf->program, "BLASTP"))
return BLASTP;
if (sameString(bf->program, "BLASTX"))
return BLASTX;
if (sameString(bf->program, "TBLASTN"))
return TBLASTN;
if (sameString(bf->program, "TBLASTX"))
return TBLASTX;
errAbort("unknown BLAST program \"%s\", please update blastToPsl.c",
bf->program);
return 0;
}
static void blastToPsl(char *blastFile, char *pslFile, char* scoreFile)
/* process one query in */
{
struct blastFile *bf = blastFileOpenVerify(blastFile);
struct blastQuery *bq;
FILE *pslFh = mustOpen(pslFile, "w");
FILE *scoreFh = NULL;
if (scoreFile != NULL)
{
scoreFh = mustOpen(scoreFile, "w");
fputs(scoreHdr, scoreFh);
}
unsigned flags = getBlastAlgo(bf);
if ((flags & TBLASTX) & pslxFmt)
errAbort("-pslx not supported for TBLASTX alignments");
while ((bq = blastFileNextQuery(bf)) != NULL)
{
processQuery(bq, flags, pslFh, scoreFh);
blastQueryFree(&bq);
}
blastFileFree(&bf);
carefulClose(&scoreFh);
carefulClose(&pslFh);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, optionSpecs);
eVal = optionDouble("eVal", eVal);
pslxFmt = optionExists("pslx");
if (argc != 3)
usage();
blastToPsl(argv[1], argv[2], optionVal("scores", NULL));
return 0;
}
/*
* Local Variables:
* c-file-style: "jkent-c"
* End:
*/