src/hg/blastToPsl/blastToPsl.c 1.24

1.24 2010/02/08 03:04:22 markd
added blastXmlToPsl
Index: src/hg/blastToPsl/blastToPsl.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/blastToPsl/blastToPsl.c,v
retrieving revision 1.23
retrieving revision 1.24
diff -b -B -U 4 -r1.23 -r1.24
--- src/hg/blastToPsl/blastToPsl.c	12 Apr 2009 03:47:20 -0000	1.23
+++ src/hg/blastToPsl/blastToPsl.c	8 Feb 2010 03:04:22 -0000	1.24
@@ -3,44 +3,15 @@
 #include "linefile.h"
 #include "options.h"
 #include "psl.h"
 #include "blastParse.h"
-#include "dnautil.h"
+#include "pslBuild.h"
 
 static char const rcsid[] = "$Id$";
 
 double eVal = -1; /* default Expect value signifying no filtering */
 boolean pslxFmt = FALSE; /* output in pslx format */
 
-struct block
-/* coordinates of a block */
-{
-    int qStart;          /* Query start/end positions. */
-    int qEnd;
-    int tStart;          /* Target start/end position. */
-    int tEnd;
-    int qLetMult;        /* each letter counts as this many units */
-    int tLetMult;
-    int qSizeMult;       /* user to convert coordinates */
-    int tSizeMult;
-    int countMult;       /* column count for match/mismatch */
-    int alnStart;        /* start/end in alignment */
-    int alnEnd;
-};
-
-/* Bit set of flags */
-enum {
-    // blast algorithm
-    BLASTN   = 0x01,
-    BLASTP   = 0x02,
-    BLASTX   = 0x04,
-    TBLASTN  = 0x10,
-    TBLASTX  = 0x20
-};
-
-/* score file header */
-static char *scoreHdr = "#strand\tqName\tqStart\tqEnd\ttName\ttStart\ttEnd\tbitScore\teVal\n";
-
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"scores", OPTION_STRING},
     {"eVal", OPTION_DOUBLE},
@@ -69,273 +40,29 @@
   "Output only results of last round from PSI BLAST\n"
   );
 }
 
-static boolean isProteinSeqs(unsigned flags)
-/* do the flags indicate a protein sequences in alignments */
-{
-return (flags & (BLASTP|BLASTX|TBLASTN|TBLASTX)) != 0;
-}
-
-static char *wackAfterWhite(char *s)
-/* end string at first whitespare */
-{
-char *w = skipToSpaces(s);
-if (w != NULL)
-    *w = '\0';
-return s;
-}
-
-static struct psl* createPsl(struct block *blk, struct blastBlock *bb, int pslSpace)
-/* create PSL for a blast block */
-{
-struct blastGappedAli *ba = bb->gappedAli;
-char strand[3];
-
-strand[0] = (bb->qStrand > 0) ? '+' : '-';
-strand[1] = (bb->tStrand > 0) ? '+' : '-';
-strand[2] = '\0';
-
-/* white space in query or target name breaks some psl software (like pslOpen()),
- * so only keep up to first whitespace.
- */
-int qSize = blk->qSizeMult*ba->query->queryBaseCount;
-return pslNew(wackAfterWhite(ba->query->query), qSize, 0, 0,
-              wackAfterWhite(ba->targetName), ba->targetSize, 0, 0,
-              strand, pslSpace, (pslxFmt ? PSL_XA_FORMAT : 0));
-}
-
-static void makeUntranslated(struct psl* psl)
-/* convert a PSL so it is in the untranslated form produced by blat */
-{
-if (psl->strand[1] == '-')
-    {
-    /* swap around blocks so it's query that is reversed */
-    int i;
-    for (i = 0; i < psl->blockCount; i++)
-        {
-        psl->tStarts[i] = psl->tSize - (psl->tStarts[i] + psl->blockSizes[i]);
-        psl->qStarts[i] = psl->qSize - (psl->qStarts[i] + psl->blockSizes[i]);
-        }
-    reverseUnsigned(psl->tStarts, psl->blockCount);
-    reverseUnsigned(psl->qStarts, psl->blockCount);
-    reverseUnsigned(psl->blockSizes, psl->blockCount);
-
-    /* fix strand, +- now -, -- now + */
-    psl->strand[0] = (psl->strand[0] == '+') ? '-' : '+';
-    }
-psl->strand[1] = '\0';
-}
-
-static void finishPsl(struct psl* psl, unsigned flags)
-/* put finishing touches on a psl */
-{
-if ((flags & TBLASTN) == 0)
-    makeUntranslated(psl);
-}
-
-static void addPslBlock(struct psl* psl, struct blastBlock *bb, struct block* blk,
-                        int* pslSpace)
-/* add a block to a psl */
-{
-unsigned newIBlk = psl->blockCount;
-unsigned blkSize = blk->qSizeMult * (blk->qEnd - blk->qStart);
-if (newIBlk >= *pslSpace)
-    pslGrow(psl, pslSpace);
-psl->qStarts[newIBlk] = blk->qStart;
-psl->tStarts[newIBlk] = blk->tStart;
-/* uses query size so protein psl is right */
-psl->blockSizes[newIBlk] = blkSize;
-
-/* keep bounds current */
-psl->qStart = psl->qStarts[0];
-psl->qEnd = psl->qStarts[newIBlk]
-    + (blk->qSizeMult * psl->blockSizes[newIBlk]);
-if (psl->strand[0] == '-')
-    reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
-psl->tStart = psl->tStarts[0];
-psl->tEnd = psl->tStarts[newIBlk]
-    + (blk->tSizeMult * psl->blockSizes[newIBlk]);
-if (psl->strand[1] == '-')
-    reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
-
-if (pslxFmt)
-    {
-    psl->qSequence[newIBlk] = cloneStringZ(bb->qSym + blk->alnStart, blkSize);
-    psl->tSequence[newIBlk] = cloneStringZ(bb->tSym + blk->alnStart, blkSize);
-    }
-psl->blockCount++;
-}
-
-static boolean nextUngappedBlk(struct blastBlock* bb, struct block* blk, unsigned flags)
-/* Find the next ungapped block in a blast alignment, in [0..n) coords in mrna
- * space.  On first call, block should be zero, subsequence calls should be
- * parsed the result of the previous call.
- */
-{
-char *qPtr, *tPtr;
-
-if (blk->tEnd == 0)
-    {
-    /* first call */
-    blk->qStart = bb->qStart;
-    blk->qEnd = blk->qStart+1;
-    blk->tStart = bb->tStart;
-    blk->tEnd = blk->tStart+1;
-    }
-else
-    {
-    /* subsequence calls */
-    blk->qStart = blk->qEnd;
-    blk->tStart = blk->tEnd;
-    blk->alnStart = blk->alnEnd;
-    }
-
-/* find start of next aligned block */
-qPtr = bb->qSym + blk->alnStart;
-tPtr = bb->tSym + blk->alnStart;
-
-while ((*qPtr != '\0') && (*tPtr != '\0')
-       && ((*qPtr == '-') || (*tPtr == '-')))
-    {
-    if (*qPtr != '-')
-        blk->qStart += blk->qLetMult;
-    if (*tPtr != '-')
-        blk->tStart += blk->tLetMult;
-    qPtr++;
-    tPtr++;
-    blk->alnStart++;
-    }
-blk->qEnd = blk->qStart;
-blk->tEnd = blk->tStart;
-blk->alnEnd = blk->alnStart;
-
-if ((*qPtr == '\0') || (*tPtr == '\0'))
-    {
-    assert((*qPtr == '\0') && (*tPtr == '\0'));
-    assert(blk->qStart == bb->qEnd);
-    assert(blk->tStart == bb->tEnd);
-    return FALSE;  /* no more */
-    }
-
-
-/* find end of aligned block */
-while ((*qPtr != '\0') && (*tPtr != '\0')
-       && (*qPtr != '-') && (*tPtr != '-'))
-    {
-    blk->qEnd += blk->qLetMult;
-    blk->tEnd += blk->tLetMult;
-    qPtr++;
-    tPtr++;
-    blk->alnEnd++;
-    }
-
-assert((blk->tSizeMult * (blk->qEnd - blk->qStart))
-       == (blk->qSizeMult * (blk->tEnd - blk->tStart)));
-return TRUE;
-}
-
-static void countBlock(struct blastBlock* bb, struct block* blk, struct block* prevBlk, struct psl* psl,
-                       unsigned flags)
-/* update the PSL counts between for a block and previous insert. */
-{
-if (prevBlk->tEnd != 0)
-    {
-    /* count insert */
-    if (prevBlk->qEnd != blk->qStart)
-        {
-        /* insert in query */
-        psl->qNumInsert++;
-        psl->qBaseInsert += (blk->qStart - prevBlk->qEnd);
-    }
-    if (prevBlk->tEnd != blk->tStart)
-        {
-        /* insert in target */
-        psl->tNumInsert++;
-        psl->tBaseInsert += (blk->tStart - prevBlk->tEnd);
-        }
-    }
-
-int i, alnLen = (blk->alnEnd - blk->alnStart);
-char *qPtr = bb->qSym + blk->alnStart;
-char *tPtr = bb->tSym + blk->alnStart;
-boolean isProt = isProteinSeqs(flags);
-for (i = 0; i < alnLen; i++, qPtr++, tPtr++)
-    {
-    if ((!isProt && ((*qPtr == 'N') || (*tPtr == 'N')))
-        || (isProt && ((*qPtr == 'X') || (*tPtr == 'X'))))
-        psl->repMatch += blk->countMult;
-    else if (*qPtr == *tPtr)
-        psl->match += blk->countMult;
-    else
-        psl->misMatch += blk->countMult;
-    }
-}
-
 static void outputPsl(struct blastBlock *bb, unsigned flags, struct psl *psl,
                       FILE* pslFh, FILE* scoreFh)
 /* output a psl and optional score */
 {
 pslTabOut(psl, pslFh);
 if (scoreFh != NULL)
-    fprintf(scoreFh, "%s\t%s\t%d\t%d\t%s\t%d\t%d\t%g\t%g\n", psl->strand,
-            psl->qName, psl->qStart, psl->qEnd,
-            psl->tName, psl->tStart, psl->tEnd, bb->bitScore, bb->eVal);
-}
-
-static void initBlk(unsigned flags, struct block *blk)
-/* initialize a block object */
-{
-ZeroVar(blk);
-if (flags & TBLASTN)
-    {
-    blk->qLetMult = 1;
-    blk->qSizeMult = 1;
-    blk->tLetMult = 3;
-    blk->tSizeMult = 3;
-    blk->countMult = 1;
-    }
-else if (flags & TBLASTX)
-    {
-    blk->qLetMult = 3;
-    blk->qSizeMult = 1;
-    blk->tLetMult = 3;
-    blk->tSizeMult = 1;
-    blk->countMult = 3;
-    }
-else
-    {
-    blk->qLetMult = 1;
-    blk->qSizeMult = 1;
-    blk->tLetMult = 1;
-    blk->tSizeMult = 1;
-    blk->countMult = 1;
-    }
+    pslBuildWriteScores(scoreFh, psl, bb->bitScore, bb->eVal);
+pslCheck("blastToPsl", stderr, psl);
 }
 
 static void processBlock(struct blastBlock *bb, unsigned flags,
                          FILE* pslFh, FILE* scoreFh)
-/* process one alignment block  */
+/* process one gapped alignment block */
 {
-int pslSpace = 8;
-struct block blk, prevBlk;
-initBlk(flags, &blk);
-initBlk(flags, &prevBlk);
-
-struct psl *psl = createPsl(&blk, bb, pslSpace);
-
-/* fill in ungapped blocks */
-while (nextUngappedBlk(bb, &blk, flags))
-    {
-    countBlock(bb, &blk, &prevBlk, psl, flags);
-    addPslBlock(psl, bb, &blk, &pslSpace);
-    prevBlk = blk;
-    }
+struct blastGappedAli* ba = bb->gappedAli;
+struct blastQuery *bq = ba->query;
+struct psl *psl = pslBuildFromHsp(firstWordInLine(bq->query), bq->queryBaseCount, bb->qStart, bb->qEnd, ((bb->qStrand > 0) ? '+' : '-'), bb->qSym,
+                                  firstWordInLine(ba->targetName), ba->targetSize, bb->tStart, bb->tEnd, ((bb->tStrand > 0) ? '+' : '-'), bb->tSym,
+                                  flags);
 if (psl->blockCount > 0 && (bb->eVal <= eVal || eVal == -1))
-    {
-    finishPsl(psl, flags);
     outputPsl(bb, flags, psl, pslFh, scoreFh);
-    }
 pslFree(&psl);
 }
 
 void processQuery(struct blastQuery *bq, unsigned flags, FILE* pslFh, FILE* scoreFh)
@@ -353,26 +80,8 @@
         }
     }
 }
 
-static unsigned getBlastAlgo(struct blastFile *bf)
-/* determine blast algorithm */
-{
-if (sameString(bf->program, "BLASTN"))
-    return BLASTN;
-if (sameString(bf->program, "BLASTP"))
-    return BLASTP;
-if (sameString(bf->program, "BLASTX"))
-    return BLASTX;
-if (sameString(bf->program, "TBLASTN"))
-    return TBLASTN;
-if (sameString(bf->program, "TBLASTX"))
-    return TBLASTX;
-errAbort("unknown BLAST program \"%s\", please update blastToPsl.c",
-         bf->program);
-return 0;
-}
-
 static void blastToPsl(char *blastFile, char *pslFile, char* scoreFile)
 /* process one query in */
 {
 struct blastFile *bf = blastFileOpenVerify(blastFile);
@@ -381,13 +90,11 @@
 FILE *scoreFh = NULL;
 if (scoreFile != NULL)
     {
     scoreFh = mustOpen(scoreFile, "w");
-    fputs(scoreHdr, scoreFh);
+    fputs(pslBuildScoreHdr, scoreFh);
     }
-unsigned flags = getBlastAlgo(bf);
-if ((flags & TBLASTX) & pslxFmt)
-    errAbort("-pslx not supported for TBLASTX alignments");
+unsigned flags =  pslBuildGetBlastAlgo(bf->program) |  (pslxFmt ? bldPslx : 0);
 
 while ((bq = blastFileNextQuery(bf)) != NULL)
     {
     processQuery(bq, flags, pslFh, scoreFh);