src/hg/blastToPsl/blastXmlToPsl.c 1.1

1.1 2010/02/08 03:04:22 markd
added blastXmlToPsl
Index: src/hg/blastToPsl/blastXmlToPsl.c
===================================================================
RCS file: src/hg/blastToPsl/blastXmlToPsl.c
diff -N src/hg/blastToPsl/blastXmlToPsl.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/blastToPsl/blastXmlToPsl.c	8 Feb 2010 03:04:22 -0000	1.1
@@ -0,0 +1,203 @@
+/* blastXmlToPsl - convert blast XML output to PSLs. */
+#include "common.h"
+#include "linefile.h"
+#include "options.h"
+#include "psl.h"
+#include "ncbiBlast.h"
+#include "pslBuild.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "blastXmlToPsl - convert blast XML output to PSLs\n"
+  "usage:\n"
+  "   blastXmlToPsl [options] blastXml psl\n"
+  "\n"
+  "options:\n"
+  "  -scores=file - Write score information to this file.  Format is:\n"
+  "       strands qName qStart qEnd tName tStart tEnd bitscore eVal\n"
+  "  -verbose=n - n >= 3 prints each line of file after parsing.\n"
+  "               n >= 4 dumps the result of each query\n"
+  "  -eVal=n n is e-value threshold to filter results. Format can be either\n"
+  "          an integer, double or 1e-10. Default is no filter.\n"
+  "  -pslx - create PSLX output (includes sequences for blocks)\n"
+  "\n"
+  "Output only results of last round from PSI BLAST\n");
+}
+
+static struct optionSpec options[] = {
+    {"scores", OPTION_STRING},
+    {"eVal", OPTION_DOUBLE},
+    {"pslx", OPTION_BOOLEAN},
+    {NULL, 0},
+};
+
+static double eVal = -1; /* default Expect value signifying no filtering */
+static boolean pslxFmt = FALSE; /* output in pslx format */
+static int errCount = 0; /* count of  PSLs failing checks */
+
+struct coords
+/* structure to return converted coordinates */
+{
+    int start;
+    int end;
+    int size;
+    char strand;
+};
+
+static struct coords blastToUcsc(int blastStart, int blastEnd, int size, int blastFrame)
+/* convert coordinates from blast to UCSC convention. */
+{
+// blastStart >= blastEnd for queries with blastFrame < 0
+// blastStart <= blastEnd for hits with blastFrame < 0
+struct coords ucsc;
+ucsc.start = (blastStart <= blastEnd) ? blastStart-1 : blastEnd-1;
+ucsc.end = (blastStart <= blastEnd) ? blastEnd : blastStart;
+ucsc.size = size;
+ucsc.strand = (blastFrame >= 0) ? '+' : '-';
+if (ucsc.strand == '-')
+    reverseIntRange(&ucsc.start, &ucsc.end, size);
+assert(ucsc.start < ucsc.end);
+return ucsc;
+}
+
+static unsigned getFlags(struct ncbiBlastBlastOutput *outputRec)
+/* determine blast algorithm and other flags */
+{
+return pslBuildGetBlastAlgo(outputRec->ncbiBlastBlastOutputProgram->text) | (pslxFmt ? bldPslx : 0);
+}
+
+static void outputPsl(struct psl *psl, struct ncbiBlastHsp *hspRec, FILE* pslFh, FILE* scoreFh)
+/* output a psl and optional score */
+{
+pslTabOut(psl, pslFh);
+if (scoreFh != NULL)
+    pslBuildWriteScores(scoreFh, psl, hspRec->ncbiBlastHspBitScore->text, hspRec->ncbiBlastHspEvalue->text);
+pslCheck("blastXmlToPsl", stderr, psl);
+}
+
+static void processHspRec(struct ncbiBlastIteration *iterRec, struct ncbiBlastHit *hitRec,
+                          struct ncbiBlastHsp *hspRec, unsigned flags, FILE *pslFh, FILE *scoreFh)
+/* process one HSP record, converting to a PSL */
+{
+struct coords qUcsc = blastToUcsc(hspRec->ncbiBlastHspQueryFrom->text, hspRec->ncbiBlastHspQueryTo->text, iterRec->ncbiBlastIterationQueryLen->text,
+                                  ((hspRec->ncbiBlastHspQueryFrame == NULL) ? 0 : hspRec->ncbiBlastHspQueryFrame->text));
+struct coords tUcsc = blastToUcsc(hspRec->ncbiBlastHspHitFrom->text, hspRec->ncbiBlastHspHitTo->text, hitRec->ncbiBlastHitLen->text,
+                                  ((hspRec->ncbiBlastHspHitFrame == NULL) ? 0 : hspRec->ncbiBlastHspHitFrame->text));
+struct psl *psl = pslBuildFromHsp(firstWordInLine(iterRec->ncbiBlastIterationQueryDef->text), qUcsc.size, qUcsc.start, qUcsc.end, qUcsc.strand, hspRec->ncbiBlastHspQseq->text,
+                                  firstWordInLine(hitRec->ncbiBlastHitDef->text), tUcsc.size, tUcsc.start, tUcsc.end, tUcsc.strand, hspRec->ncbiBlastHspHseq->text,
+                                  flags);
+if  ((psl->blockCount > 0) && ((hspRec->ncbiBlastHspEvalue->text <= eVal) || (eVal == -1)))
+    outputPsl(psl, hspRec, pslFh, scoreFh);
+pslFree(&psl);
+}
+
+static void processIterRec(struct ncbiBlastIteration *iterRec, unsigned flags, FILE *pslFh, FILE *scoreFh)
+/* process one iteration record, converting all HSPs to PSLs */
+{
+struct ncbiBlastIterationHits *hitsRec;
+for (hitsRec = iterRec->ncbiBlastIterationHits; hitsRec != NULL; hitsRec = hitsRec->next)
+    {
+    struct ncbiBlastHit *hitRec;
+    for (hitRec = hitsRec->ncbiBlastHit; hitRec != NULL; hitRec = hitRec->next)
+        {
+        struct ncbiBlastHitHsps *hspsRec;
+        for (hspsRec = hitRec->ncbiBlastHitHsps; hspsRec != NULL; hspsRec = hspsRec->next)
+            {
+            struct ncbiBlastHsp *hspRec;
+            for (hspRec = hspsRec->ncbiBlastHsp; hspRec != NULL; hspRec = hspRec->next)
+                {
+                processHspRec(iterRec, hitRec, hspRec, flags, pslFh, scoreFh);
+                }
+            }
+        }
+    }
+}
+
+static void convertOnePassBlast(struct ncbiBlastBlastOutput *outputRec, unsigned flags, FILE *pslFh, FILE *scoreFh)
+/* convert standard single-pass blast */
+{
+struct ncbiBlastBlastOutputIterations *itersRec;
+for (itersRec = outputRec->ncbiBlastBlastOutputIterations; itersRec != NULL; itersRec = itersRec->next)
+    {
+    struct ncbiBlastIteration *iterRec;
+    for (iterRec = itersRec->ncbiBlastIteration; iterRec != NULL; iterRec = iterRec->next)
+        processIterRec(iterRec, flags, pslFh, scoreFh);
+    }
+}
+
+static struct ncbiBlastIteration *findLastIterForQuery(struct ncbiBlastIteration *iterRec)
+/* find the last iteration record for the query in the specified record */
+{
+struct ncbiBlastIteration *nextRec;
+for (nextRec = iterRec->next; nextRec != NULL ; iterRec = nextRec, nextRec = nextRec->next)
+    {
+    if (!sameString(nextRec->ncbiBlastIterationQueryDef->text, iterRec->ncbiBlastIterationQueryDef->text))
+        break;
+    }
+return iterRec;
+}
+
+static void convertPsiBlast(struct ncbiBlastBlastOutput *outputRec, unsigned flags, FILE *pslFh, FILE *scoreFh)
+/* convert psi-blast */
+{
+struct ncbiBlastBlastOutputIterations *itersRec;
+for (itersRec = outputRec->ncbiBlastBlastOutputIterations; itersRec != NULL; itersRec = itersRec->next)
+    {
+    struct ncbiBlastIteration *iterRec;
+    for (iterRec = itersRec->ncbiBlastIteration; iterRec != NULL; iterRec = iterRec->next)
+        {
+        iterRec = findLastIterForQuery(iterRec);
+        processIterRec(iterRec, flags, pslFh, scoreFh);
+        }
+    }
+}
+
+static void blastXmlToPsl(char *blastXmlFile, char *pslFile, char *scoreFile)
+/* blastXmlToPsl - convert blast XML output to PSLs. */
+{
+struct xap *xap = xapNew(ncbiBlastStartHandler, ncbiBlastEndHandler, blastXmlFile);
+xapParseFile(xap, blastXmlFile);
+FILE *pslFh = mustOpen(pslFile, "w");
+FILE *scoreFh = NULL;
+if (scoreFile != NULL)
+    {
+    scoreFh = mustOpen(scoreFile, "w");
+    fputs(pslBuildScoreHdr, scoreFh);
+    }
+
+if (xap->topObject == NULL)
+    errAbort("empty BLAST XML file: %s", blastXmlFile);
+char *expectType = "BlastOutput";
+if (!sameString(xap->topType, expectType))
+    errAbort("expected top XML element of type \"%s\", got \"%s\"", expectType, xap->topType);
+struct ncbiBlastBlastOutput *outputRec = xap->topObject;
+unsigned flags = getFlags(outputRec);
+
+if (flags & psiblast)
+    convertPsiBlast(outputRec, flags, pslFh, scoreFh);
+else
+    convertOnePassBlast(outputRec, flags, pslFh, scoreFh);
+
+carefulClose(&scoreFh);
+carefulClose(&pslFh);
+ncbiBlastBlastOutputFree(&outputRec);
+xapFree(&xap);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 3)
+    usage();
+eVal = optionDouble("eVal", eVal);
+pslxFmt = optionExists("pslx");
+blastXmlToPsl(argv[1], argv[2], optionVal("scores", NULL));
+if (errCount > 0)
+    errAbort("%d invalid PSLs created", errCount);
+return 0;
+}