src/lib/blastParse.c 1.24
1.24 2009/04/12 18:45:54 markd
skip bogus blocks in PSI BLAST output
Index: src/lib/blastParse.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/blastParse.c,v
retrieving revision 1.23
retrieving revision 1.24
diff -b -B -U 4 -r1.23 -r1.24
--- src/lib/blastParse.c 12 Apr 2009 05:48:42 -0000 1.23
+++ src/lib/blastParse.c 12 Apr 2009 18:45:54 -0000 1.24
@@ -9,8 +9,9 @@
#include "verbose.h"
static char const rcsid[] = "$Id$";
+#define WARN_LEVEL 1 /* verbose level to enable warnings */
#define TRACE_LEVEL 3 /* verbose level to enable tracing of files */
#define DUMP_LEVEL 4 /* verbose level to enable dumping of parsed */
struct blastFile *blastFileReadAll(char *fileName)
@@ -31,10 +32,15 @@
static void bfError(struct blastFile *bf, char *message)
/* Print blast file error message. */
{
-errAbort("%s\nLine %d of %s", message,
- bf->lf->lineIx, bf->fileName);
+errAbort("%s:%d: %s", bf->fileName, bf->lf->lineIx, message);
+}
+
+static void bfWarn(struct blastFile *bf, char *message)
+/* Print blast file warning message. */
+{
+verbose(WARN_LEVEL, "Warning: %s:%d: %s\n", bf->fileName, bf->lf->lineIx, message);
}
static void bfUnexpectedEof(struct blastFile *bf)
/* generate error on unexpected EOF */
@@ -47,8 +53,30 @@
{
bfError(bf, "Can't cope with BLAST output syntax");
}
+static boolean isAllDigits(char *s)
+/* test if a string is all digits */
+{
+for (; *s != '\0'; s++)
+ {
+ if (!isdigit(*s))
+ return FALSE;
+ }
+return TRUE;
+}
+
+static boolean isAllDashes(char *s)
+/* test if a string is all dashes */
+{
+for (; *s != '\0'; s++)
+ {
+ if (*s != '-')
+ return FALSE;
+ }
+return TRUE;
+}
+
static char *bfNextLine(struct blastFile *bf)
/* Fetch next line of input trying, or NULL if not found */
{
char *line = NULL;
@@ -435,25 +463,40 @@
return atof(buf);
}
}
-static void parseBlockLine(struct blastFile *bf, int *startRet, int *endRet,
+static boolean parseBlockLine(struct blastFile *bf, int *startRet, int *endRet,
struct dyString *seq)
/* read and parse the next target or query line, like:
* Query: 26429 taccttgacattcctcagtgtgtcatcatcgttctctcctccaaacggcgagagtccgga 26488
*
* also handle broken NCBI tblastn output like:
* Sbjct: 1181YYGEQRSTNGQTIQLKTQVFRRFPDDDDESEDHDDPDNAHESPEQEGAEGHFDLHYYENQ 1360
+ *
+ * Ignores and returns FALSE on bogus records generated by PSI BLAST, such as
+ * Query: 0 --------------------------
+ * Sbjct: 38 PPGPPGVAGGNQTTVVVIYGPPGPPG 63
+ * Query: 0
+ * Sbjct: 63 63
+ * If FALSE is returned, the output parameters will be unchanged.
*/
{
char* line = bfNeedNextLine(bf);
int a, b, s, e;
char *words[16];
int wordCount = chopLine(line, words);
-if ((wordCount < 3) || (wordCount > 4)
- || !(sameString("Query:", words[0]) || sameString("Sbjct:", words[0])))
+if ((wordCount < 2) || (wordCount > 4) || !(sameString("Query:", words[0]) || sameString("Sbjct:", words[0])))
bfSyntax(bf);
+/* look for one of the bad formats to ignore, as described above */
+if (((wordCount == 2) && isAllDigits(words[1]))
+ || ((wordCount == 3) && isAllDigits(words[1]) && isAllDigits(words[2]))
+ || ((wordCount == 3) && isAllDigits(words[1]) && isAllDashes(words[2])))
+ {
+ bfWarn(bf, "Ignored invalid alignment format for aligned sequence pair");
+ return FALSE;
+ }
+
/* special handling for broken output with no space between start and
* sequence */
if (wordCount == 3)
{
@@ -478,10 +521,65 @@
s = min(a,b);
e = max(a,b);
*startRet = min(s, *startRet);
*endRet = max(e, *endRet);
+return TRUE;
}
+static boolean findBlockSeqPair(struct blastFile *bf, struct blastQuery *bq)
+/* scan forward for the next pair of Query:/Sbjct: sequences */
+{
+char *line;
+for (;;)
+ {
+ if (!nextBlockLine(bf, bq, &line))
+ return FALSE;
+ if (startsWith(" Score", line))
+ {
+ lineFileReuse(bf->lf);
+ return FALSE;
+ }
+ if (startsWith("Query:", line))
+ {
+ lineFileReuse(bf->lf);
+ return TRUE;
+ }
+ }
+}
+
+static void clearBlastBlock(struct blastBlock *bb, struct dyString *qString, struct dyString *tString)
+/* reset the contents of a blast block being accummulated */
+{
+bb->qStart = bb->tStart = 0x3fffffff;
+bb->qEnd = bb->tEnd = -bb->qStart;
+dyStringClear(qString);
+dyStringClear(tString);
+}
+
+static void parseBlockSeqPair(struct blastFile *bf, struct blastBlock *bb,
+ struct dyString *qString, struct dyString *tString)
+/* parse the current pair Query:/Sbjct: sequences */
+{
+boolean isOk = TRUE;
+
+/* Query line */
+if (!parseBlockLine(bf, &bb->qStart, &bb->qEnd, qString))
+ isOk = FALSE;
+
+/* Skip next line. */
+bfNeedNextLine(bf);
+
+/* Fetch target sequence line. */
+if (!parseBlockLine(bf, &bb->tStart, &bb->tEnd, tString))
+ isOk = FALSE;
+if (!isOk)
+ {
+ // reset accumulated data so we discard everything before bogus pair
+ clearBlastBlock(bb, qString, tString);
+ }
+}
+
+
static struct blastBlock *nextBlock(struct blastFile *bf, struct blastQuery *bq,
struct blastGappedAli *bga, boolean *skipRet)
/* Read in next blast block. Return NULL at EOF or end of gapped
* alignment. If an unparsable block is found, set skipRet to TRUE and return
@@ -616,45 +714,20 @@
* Query: 26429 taccttgacattcctcagtgtgtcatcatcgttctctcctccaaacggcgagagtccgga 26488
* |||||| |||||||||| ||| ||||||||||||||||||||||| || || ||||||||
* Sbjct: 62966 taccttaacattcctcaatgtttcatcatcgttctctcctccaaatggtgaaagtccgga 63025
*/
-bb->qStart = bb->tStart = 0x3fffffff;
-bb->qEnd = bb->tEnd = -bb->qStart;
if (qString == NULL)
{
qString = newDyString(50000);
tString = newDyString(50000);
}
-else
- {
- dyStringClear(qString);
- dyStringClear(tString);
- }
+clearBlastBlock(bb, qString, tString);
for (;;)
{
- /* Fetch next first line. */
- for (;;)
- {
- if (!nextBlockLine(bf, bq, &line))
- goto DONE;
- if (startsWith(" Score", line))
- {
- lineFileReuse(bf->lf);
- goto DONE;
- }
- if (startsWith("Query:", line))
+ if (!findBlockSeqPair(bf, bq))
break;
+ parseBlockSeqPair(bf, bb, qString, tString);
}
- lineFileReuse(bf->lf);
- parseBlockLine(bf, &bb->qStart, &bb->qEnd, qString);
-
- /* Skip next line. */
- line = bfNeedNextLine(bf);
-
- /* Fetch target sequence line. */
- parseBlockLine(bf, &bb->tStart, &bb->tEnd, tString);
- }
-DONE:
/* convert to [0..n) and move to strand coords if necessary */
bb->qStart--;
if (bb->qStrand < 0)