src/lib/blastParse.c 1.24

1.24 2009/04/12 18:45:54 markd
skip bogus blocks in PSI BLAST output
Index: src/lib/blastParse.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/blastParse.c,v
retrieving revision 1.23
retrieving revision 1.24
diff -b -B -U 4 -r1.23 -r1.24
--- src/lib/blastParse.c	12 Apr 2009 05:48:42 -0000	1.23
+++ src/lib/blastParse.c	12 Apr 2009 18:45:54 -0000	1.24
@@ -9,8 +9,9 @@
 #include "verbose.h"
 
 static char const rcsid[] = "$Id$";
 
+#define WARN_LEVEL 1   /* verbose level to enable warnings */
 #define TRACE_LEVEL 3  /* verbose level to enable tracing of files */
 #define DUMP_LEVEL 4    /* verbose level to enable dumping of parsed */
 
 struct blastFile *blastFileReadAll(char *fileName)
@@ -31,10 +32,15 @@
 
 static void bfError(struct blastFile *bf, char *message)
 /* Print blast file error message. */
 {
-errAbort("%s\nLine %d of %s", message, 
-	bf->lf->lineIx, bf->fileName);
+errAbort("%s:%d: %s", bf->fileName, bf->lf->lineIx, message);
+}
+
+static void bfWarn(struct blastFile *bf, char *message)
+/* Print blast file warning message. */
+{
+verbose(WARN_LEVEL, "Warning: %s:%d: %s\n", bf->fileName, bf->lf->lineIx, message);
 }
 
 static void bfUnexpectedEof(struct blastFile *bf)
 /* generate error on unexpected EOF */
@@ -47,8 +53,30 @@
 {
 bfError(bf, "Can't cope with BLAST output syntax");
 }
 
+static boolean isAllDigits(char *s)
+/* test if a string is all digits */
+{
+for (; *s != '\0'; s++)
+    {
+    if (!isdigit(*s))
+        return FALSE;
+    }
+return TRUE;
+}
+
+static boolean isAllDashes(char *s)
+/* test if a string is all dashes */
+{
+for (; *s != '\0'; s++)
+    {
+    if (*s != '-')
+        return FALSE;
+    }
+return TRUE;
+}
+
 static char *bfNextLine(struct blastFile *bf)
 /* Fetch next line of input trying, or NULL if not found */
 {
 char *line = NULL;
@@ -435,25 +463,40 @@
     return atof(buf);
     }
 }
 
-static void parseBlockLine(struct blastFile *bf, int *startRet, int *endRet,
+static boolean parseBlockLine(struct blastFile *bf, int *startRet, int *endRet,
                            struct dyString *seq)
 /* read and parse the next target or query line, like:
  *   Query: 26429 taccttgacattcctcagtgtgtcatcatcgttctctcctccaaacggcgagagtccgga 26488
  *
  * also handle broken NCBI tblastn output like:
  *   Sbjct: 1181YYGEQRSTNGQTIQLKTQVFRRFPDDDDESEDHDDPDNAHESPEQEGAEGHFDLHYYENQ 1360
+ *
+ * Ignores and returns FALSE on bogus records generated by PSI BLAST, such as
+ *   Query: 0   --------------------------                                  
+ *   Sbjct: 38  PPGPPGVAGGNQTTVVVIYGPPGPPG                                   63
+ *   Query: 0                                                               
+ *   Sbjct: 63                                                               63
+ * If FALSE is returned, the output parameters will be unchanged.
  */
 {
 char* line = bfNeedNextLine(bf);
 int a, b, s, e;
 char *words[16];
 int wordCount = chopLine(line, words);
-if ((wordCount < 3) || (wordCount > 4)
-    || !(sameString("Query:", words[0]) || sameString("Sbjct:", words[0])))
+if ((wordCount < 2) || (wordCount > 4) || !(sameString("Query:", words[0]) || sameString("Sbjct:", words[0])))
     bfSyntax(bf);
 
+/* look for one of the bad formats to ignore, as described above */
+if (((wordCount == 2) && isAllDigits(words[1]))
+    || ((wordCount == 3) && isAllDigits(words[1]) && isAllDigits(words[2]))
+    || ((wordCount == 3) && isAllDigits(words[1]) && isAllDashes(words[2])))
+    {
+    bfWarn(bf, "Ignored invalid alignment format for aligned sequence pair");
+    return FALSE;
+    }
+
 /* special handling for broken output with no space between start and
  * sequence */
 if (wordCount == 3)
     {
@@ -478,10 +521,65 @@
 s = min(a,b);
 e = max(a,b);
 *startRet = min(s, *startRet);
 *endRet = max(e, *endRet);
+return TRUE;
 }
 
+static boolean findBlockSeqPair(struct blastFile *bf, struct blastQuery *bq)
+/* scan forward for the next pair of Query:/Sbjct: sequences */
+{
+char *line;
+for (;;)
+    {
+    if (!nextBlockLine(bf, bq, &line))
+        return FALSE;
+    if (startsWith(" Score", line))
+        {
+        lineFileReuse(bf->lf);
+        return FALSE;
+        }
+    if (startsWith("Query:", line))
+        {
+        lineFileReuse(bf->lf);
+        return TRUE;
+        }
+    }
+}
+
+static void clearBlastBlock(struct blastBlock *bb, struct dyString *qString, struct dyString *tString)
+/* reset the contents of a blast block being accummulated */
+{
+bb->qStart = bb->tStart = 0x3fffffff;
+bb->qEnd = bb->tEnd = -bb->qStart;
+dyStringClear(qString);
+dyStringClear(tString);
+}
+
+static void parseBlockSeqPair(struct blastFile *bf, struct blastBlock *bb,
+                              struct dyString *qString, struct dyString *tString)
+/* parse the current pair Query:/Sbjct: sequences */
+{
+boolean isOk = TRUE;
+
+/* Query line */
+if (!parseBlockLine(bf, &bb->qStart, &bb->qEnd, qString))
+    isOk = FALSE;
+
+/* Skip next line. */
+bfNeedNextLine(bf);
+
+/* Fetch target sequence line. */
+if (!parseBlockLine(bf, &bb->tStart, &bb->tEnd, tString))
+    isOk = FALSE;
+if (!isOk)
+    {
+    // reset accumulated data so we discard everything before bogus pair
+    clearBlastBlock(bb, qString, tString);
+    }
+}
+
+
 static struct blastBlock *nextBlock(struct blastFile *bf, struct blastQuery *bq,
                                     struct blastGappedAli *bga, boolean *skipRet)
 /* Read in next blast block.  Return NULL at EOF or end of gapped
  * alignment. If an unparsable block is found, set skipRet to TRUE and return
@@ -616,45 +714,20 @@
  * Query: 26429 taccttgacattcctcagtgtgtcatcatcgttctctcctccaaacggcgagagtccgga 26488
  *              |||||| |||||||||| ||| ||||||||||||||||||||||| || || ||||||||
  * Sbjct: 62966 taccttaacattcctcaatgtttcatcatcgttctctcctccaaatggtgaaagtccgga 63025
  */
-bb->qStart = bb->tStart = 0x3fffffff;
-bb->qEnd = bb->tEnd = -bb->qStart;
 if (qString == NULL)
     {
     qString = newDyString(50000);
     tString = newDyString(50000);
     }
-else
-    {
-    dyStringClear(qString);
-    dyStringClear(tString);
-    }
+clearBlastBlock(bb, qString, tString);
 for (;;)
     {
-    /* Fetch next first line. */
-    for (;;)
-	{
-	if (!nextBlockLine(bf, bq, &line))
-	    goto DONE;
-	if (startsWith(" Score", line))
-	    {
-	    lineFileReuse(bf->lf);
-	    goto DONE;
-	    }
-	if (startsWith("Query:", line))
+    if (!findBlockSeqPair(bf, bq))
 	    break;
+    parseBlockSeqPair(bf, bb, qString, tString);
 	}
-    lineFileReuse(bf->lf);
-    parseBlockLine(bf, &bb->qStart, &bb->qEnd, qString);
-
-    /* Skip next line. */
-    line = bfNeedNextLine(bf);
-
-    /* Fetch target sequence line. */
-    parseBlockLine(bf, &bb->tStart, &bb->tEnd, tString);
-    }
-DONE:
 
 /* convert to [0..n) and move to strand coords if necessary */
 bb->qStart--;
 if (bb->qStrand < 0)