89dc43c1ed51a5f783b2e84485864aa5d17d57c5
markd
  Thu Oct 10 09:33:07 2019 -0700
Capture PSL parse errors and report them instead of aborting.  Add a -filter option so the program can act as a filter and exit with status 0 even when problems are found.

diff --git src/hg/pslCheck/pslCheck.c src/hg/pslCheck/pslCheck.c
index ab83345..921582a 100644
--- src/hg/pslCheck/pslCheck.c
+++ src/hg/pslCheck/pslCheck.c
@@ -1,73 +1,78 @@
 /* pslCheck - validate PSL files or tables. */
 #include "common.h"
 #include "options.h"
 #include "portable.h"
 #include "psl.h"
 #include "hash.h"
 #include "jksql.h"
 #include "sqlNum.h"
 #include "chromInfo.h"
+#include "errCatch.h"
 #include "verbose.h"
 
 
 /* command line options and values */
 static struct optionSpec optionSpecs[] =
 {
     {"db", OPTION_STRING},
     {"prot", OPTION_BOOLEAN},
     {"quiet", OPTION_BOOLEAN},
     {"noCountCheck", OPTION_BOOLEAN},
     {"targetSizes", OPTION_STRING},
     {"querySizes", OPTION_STRING},
     {"pass", OPTION_STRING},
     {"fail", OPTION_STRING},
+    {"filter", OPTION_BOOLEAN},
     {"ignoreQUniq", OPTION_BOOLEAN},
     {"skipInsertCounts", OPTION_BOOLEAN},
     {NULL, 0}
 };
 static char *db = NULL;
 static boolean protCheck = FALSE;
 static boolean quiet = FALSE;
 static boolean noCountCheck = FALSE;
+static boolean filter = FALSE;
 static char *passFile = NULL;
 static char *failFile = NULL;
 static boolean ignoreQUniq = FALSE;
 static boolean skipInsertCounts = FALSE;
 static struct hash *targetSizes = NULL;
 static struct hash *querySizes = NULL;
 
 /* global count of alignments checked and errors */
 static int chkCount = 0;
 static int failCount = 0;
 static int errCount = 0;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "pslCheck - validate PSL files\n"
   "usage:\n"
   "   pslCheck fileTbl(s)\n"
   "options:\n"
   "   -db=db - get targetSizes from this database, and if file doesn't exist,\n"
   "    look for a table in this database.\n"
   "   -prot - confirm psls are protein psls\n"
   "   -noCountCheck - don't validate that match/mismatch counts are match\n"
   "    the total size of the alignment blocks\n"
   "   -pass=pslFile - write PSLs without errors to this file\n"
   "   -fail=pslFile - write PSLs with errors to this file\n"
+  "   -filter - use program as a filter, with -pass and/or -fail, don't error exit\n"
+  "    on problems, but do report them.\n"
   "   -targetSizes=sizesFile - tab file with columns of target and size.\n"
   "    If specified, psl is check to have a valid target and target\n"
   "    coordinates.\n"
   "   -skipInsertCounts - Don't validate insert counts.  Useful for BLAT protein\n"
   "    PSLs where these are not computed consistently.\n"
   "   -querySizes=sizesFile - file with query sizes.\n"
   "   -ignoreQUniq - ignore everything after the last `-' in the qName field, that\n"
   "    is sometimes used to generate a unique identifier\n"
   "   -quiet - no write error message, just filter\n");
 }
 
 static struct hash *loadSizes(char *sizesFile)
 /* load a sizes file */
 {
 struct hash *sizes = hashNew(20);
@@ -113,30 +118,76 @@
 if (ignoreQUniq)
     {
     if (buf == NULL)
         buf = dyStringNew(2*strlen(qName));
     dyStringClear(buf);
     char *dash = strrchr(qName, '-');
     if (dash == NULL)
         return qName;
     dyStringAppendN(buf, qName, (dash-qName));
     return buf->string;
     }
 else
     return qName;
 }
 
+static void printErrRow(int numColumns, char** row)
+/* Write the numColumns entries of row to stderr on one line, each preceded by a tab; the row need not be a valid PSL. */
+{
+int i;
+for (i = 0; i < numColumns; i++)
+    fprintf(stderr, "\t%s", row[i]);  // NOTE(review): row[i] is passed to %s unchecked -- confirm it can never be NULL (e.g. SQL rows)
+fprintf(stderr, "\n");
+}
+
+static struct psl *parsePsl(char *fileTblName, int lineNum,
+                            int numColumns, char** row)
+/* Parse row as a PSL (PSL_NUM_COLS columns) or a PSLX (PSLX_NUM_COLS columns).
+ * On a bad column count or a load error, print a message naming fileTblName:lineNum
+ * and the row to stderr, bump errCount and failCount, and return NULL. */
+{
+if (!((numColumns == PSL_NUM_COLS) || (numColumns == PSLX_NUM_COLS)))
+    {
+    fprintf(stderr, "Error: wrong number of columns in PSL: %d, expected %d or %d: %s:%d\n",
+            numColumns, PSL_NUM_COLS, PSLX_NUM_COLS, fileTblName, lineNum);
+    printErrRow(numColumns, row);
+    errCount += 1;
+    failCount += 1;  // chkCount is not touched here; this function only updates errCount and failCount
+    return NULL;
+    }
+struct errCatch *errCatch = errCatchNew();
+struct psl *psl = NULL;  // remains NULL unless one of the loaders below returns a value
+if (errCatchStart(errCatch))
+    {
+    if (numColumns == PSL_NUM_COLS)
+        psl = pslLoad(row);
+    else
+        psl = pslxLoad(row);
+    }
+errCatchEnd(errCatch);
+if (errCatch->gotError)  // a loader reported an error; its text is in errCatch->message
+    {
+    fprintf(stderr, "Error: parsing of PSL failed: %s: %s:%d\n",
+            trimSpaces(errCatch->message->string), fileTblName, lineNum);
+    printErrRow(numColumns, row);
+    errCount += 1;
+    failCount += 1;
+    }
+errCatchFree(&errCatch);
+return psl;
+}
+
 static int checkSize(struct psl *psl, char *pslDesc, char *sizeDesc,
                      int numErrs, struct hash *sizeTbl, char *name, int size,
                      FILE *errFh)
 /* check a size, error count (0 or 1) */
 {
 int expectSz = hashIntValDefault(sizeTbl, name, -1);
 if (expectSz < 0)
     {
     if (numErrs == 0)
         prPslDesc(psl, pslDesc, errFh);
     fprintf(errFh, "\t%s \"%s\" does not exist\n", sizeDesc, name);
     return 1;
     }
 if (size != expectSz)
     {
@@ -198,55 +249,66 @@
 if ((passFh != NULL) && (numErrs == 0))
     pslTabOut(psl, passFh);
 if ((failFh != NULL) && (numErrs > 0))
     pslTabOut(psl, failFh);
 errCount += numErrs;
 chkCount++;
 if (numErrs > 0)
     failCount++;
 }
 
 static void checkPslFile(char *fileName, unsigned opts, FILE *errFh,
                          FILE *passFh, FILE *failFh)
 /* Check one psl file */
 {
 struct lineFile *lf = pslFileOpen(fileName);
-struct psl *psl;
+char *row[2*PSLX_NUM_COLS];   // room for rows with too many columns, so parsePsl can report them
+int numColumns;
 
-while ((psl = pslNext(lf)) != NULL)
+while ((numColumns = lineFileChopCharNext(lf, '\t', row, ArraySize(row))) > 0)
+    {
+    struct psl *psl = parsePsl(lf->fileName, lf->lineIx, numColumns, row);  // lineIx is the line number; lineEnd is a buffer offset
+    if (psl != NULL)  // NULL means parsePsl already reported and counted the bad row
         {
         checkPsl(lf, NULL, opts, psl, errFh, passFh, failFh);
         pslFree(&psl);
         }
+    }
 lineFileClose(&lf);
 }
 
 static void checkPslTbl(struct sqlConnection *conn, char *tbl, unsigned opts, FILE *errFh,
                          FILE *passFh, FILE *failFh)
 /* Check one psl table */
 {
 char query[1024], **row;
 sqlSafef(query, sizeof(query), "select * from %s", tbl);
 struct sqlResult *sr = sqlGetResult(conn, query);
+int numColumns = sqlCountColumns(sr);  // column count of the result; rowOff is subtracted below before parsing
+int rowNum = 0;  // 1-based count of rows fetched, passed to parsePsl for its error messages
 int rowOff = (sqlFieldColumn(sr, "bin") >= 0) ? 1 : 0;
 
 while ((row = sqlNextRow(sr)) != NULL)
     {
-    struct psl *psl = pslLoad(row+rowOff);
+    rowNum++;
+    struct psl *psl = parsePsl(tbl, rowNum, numColumns-rowOff, row+rowOff);  // skip the bin column when present
+    if (psl != NULL)  // NULL means parsePsl already reported and counted the bad row
+        {
         checkPsl(NULL, tbl, opts, psl, errFh, passFh, failFh);
         pslFree(&psl);
         }
+    }
 sqlFreeResult(&sr);
 }
 
 void checkFileTbl(struct sqlConnection *conn, char *fileTblName,
                   FILE *errFh, FILE *passFh, FILE *failFh)
 /* check a PSL file or table. */
 {
 unsigned opts = 0;
 if (skipInsertCounts)
     opts |= PSL_CHECK_IGNORE_INSERT_CNTS;
 if (fileExists(fileTblName))
     checkPslFile(fileTblName, opts, errFh, passFh, failFh);
 else if (conn == NULL)
     errAbort("file %s does not exist and no database specified", fileTblName);
 else
@@ -268,32 +330,33 @@
 carefulClose(&failFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 if (argc < 2)
     usage();
 db = optionVal("db", NULL);
 protCheck = optionExists("prot");
 noCountCheck = optionExists("noCountCheck");
 quiet = optionExists("quiet");
 passFile = optionVal("pass", NULL);
 failFile = optionVal("fail", NULL);
+filter = optionExists("filter");  // filter mode: consulted only when computing the exit status below
 ignoreQUniq = optionExists("ignoreQUniq");
 skipInsertCounts = optionExists("skipInsertCounts");
 struct sqlConnection *conn = NULL;
 if (db != NULL)
     conn = sqlConnect(db);
 
 if (optionExists("targetSizes"))
     targetSizes = loadSizes(optionVal("targetSizes", NULL));
 else if (db != NULL)
     targetSizes = loadChromInfoSizes(conn);
 if (optionExists("querySizes"))
     querySizes = loadSizes(optionVal("querySizes", NULL));
 checkFilesTbls(conn, argc-1, argv+1);
 sqlDisconnect(&conn);
 verbose(1, "checked: %d failed: %d errors: %d\n", chkCount, failCount, errCount);
-return ((errCount == 0) ? 0 : 1);
+return (((errCount == 0) || filter) ? 0 : 1);  // exit 0 when no errors were counted or -filter was given, otherwise 1
 }