89dc43c1ed51a5f783b2e84485864aa5d17d57c5 markd Thu Oct 10 09:33:07 2019 -0700 Capture PSL parse errors and report them instead of failing. Add option to act as a filter and not error exit diff --git src/hg/pslCheck/pslCheck.c src/hg/pslCheck/pslCheck.c index ab83345..921582a 100644 --- src/hg/pslCheck/pslCheck.c +++ src/hg/pslCheck/pslCheck.c @@ -1,299 +1,362 @@ /* pslCheck - validate PSL files or tables. */ #include "common.h" #include "options.h" #include "portable.h" #include "psl.h" #include "hash.h" #include "jksql.h" #include "sqlNum.h" #include "chromInfo.h" +#include "errCatch.h" #include "verbose.h" /* command line options and values */ static struct optionSpec optionSpecs[] = { {"db", OPTION_STRING}, {"prot", OPTION_BOOLEAN}, {"quiet", OPTION_BOOLEAN}, {"noCountCheck", OPTION_BOOLEAN}, {"targetSizes", OPTION_STRING}, {"querySizes", OPTION_STRING}, {"pass", OPTION_STRING}, {"fail", OPTION_STRING}, + {"filter", OPTION_BOOLEAN}, {"ignoreQUniq", OPTION_BOOLEAN}, {"skipInsertCounts", OPTION_BOOLEAN}, {NULL, 0} }; static char *db = NULL; static boolean protCheck = FALSE; static boolean quiet = FALSE; static boolean noCountCheck = FALSE; +static boolean filter = FALSE; static char *passFile = NULL; static char *failFile = NULL; static boolean ignoreQUniq = FALSE; static boolean skipInsertCounts = FALSE; static struct hash *targetSizes = NULL; static struct hash *querySizes = NULL; /* global count of alignments checked and errors */ static int chkCount = 0; static int failCount = 0; static int errCount = 0; void usage() /* Explain usage and exit. */ { errAbort( "pslCheck - validate PSL files\n" "usage:\n" " pslCheck fileTbl(s)\n" "options:\n" " -db=db - get targetSizes from this database, and if file doesn't exist,\n" " look for a table in this database.\n" " -prot - confirm psls are protein psls\n" " -noCountCheck - don't validate that match/mismatch counts are match\n" " the total size of the alignment blocks\n" " -pass=pslFile - write PSLs without errors to this file\n" " -fail=pslFile - write PSLs with errors to this file\n" + " -filter - use program as a filter, with -pass and/or -fail, don't error exit\n" + " on problems, but do report them.\n" " -targetSizes=sizesFile - tab file with columns of target and size.\n" " If specified, psl is check to have a valid target and target\n" " coordinates.\n" " -skipInsertCounts - Don't validate insert counts. Useful for BLAT protein\n" " PSLs where these are not computed consistently.\n" " -querySizes=sizesFile - file with query sizes.\n" " -ignoreQUniq - ignore everything after the last `-' in the qName field, that\n" " is sometimes used to generate a unique identifier\n" " -quiet - no write error message, just filter\n"); } static struct hash *loadSizes(char *sizesFile) /* load a sizes file */ { struct hash *sizes = hashNew(20); struct lineFile *lf = lineFileOpen(sizesFile, TRUE); char *cols[2]; while (lineFileNextRowTab(lf, cols, ArraySize(cols))) hashAddInt(sizes, cols[0], sqlUnsigned(cols[1])); lineFileClose(&lf); return sizes; } static struct hash *loadChromInfoSizes(struct sqlConnection *conn) /* chromInfo sizes */ { struct hash *sizes = hashNew(20); char **row; struct sqlResult *sr = sqlGetResult(conn, NOSQLINJ "select * from chromInfo"); while ((row = sqlNextRow(sr)) != NULL) { struct chromInfo *ci = chromInfoLoad(row); hashAddInt(sizes, ci->chrom, ci->size); chromInfoFree(&ci); } sqlFreeResult(&sr); return sizes; } static void prPslDesc(struct psl *psl, char *pslDesc,FILE *errFh) /* print a description of psl before the first error. */ { fprintf(errFh, "Error: invalid PSL: %s:%u-%u %s:%u-%u %s %s\n", psl->qName, psl->qStart, psl->qEnd, psl->tName, psl->tStart, psl->tEnd, psl->strand, pslDesc); } static char *getQName(char *qName) /* get query name, optionally dropping trailing unique identifier. * WARNING: static return */ { static struct dyString *buf = NULL; if (ignoreQUniq) { if (buf == NULL) buf = dyStringNew(2*strlen(qName)); dyStringClear(buf); char *dash = strrchr(qName, '-'); if (dash == NULL) return qName; dyStringAppendN(buf, qName, (dash-qName)); return buf->string; } else return qName; } +static void printErrRow(int numColumns, char** row) +/* print a row, which might not be a valid PSL, for error reporting */ +{ +int i; +for (i = 0; i < numColumns; i++) + fprintf(stderr, "\t%s", row[i]); +fprintf(stderr, "\n"); +} + +static struct psl *parsePsl(char *fileTblName, int lineNum, + int numColumns, char** row) +/* + * Parse a PSL. If an error occurs, report and count it and return NULL. + */ +{ +if (!((numColumns == PSL_NUM_COLS) || (numColumns == PSLX_NUM_COLS))) + { + fprintf(stderr, "Error: wrong number of columns in PSL: %d, expected %d or %d: %s:%d\n", + numColumns, PSL_NUM_COLS, PSLX_NUM_COLS, fileTblName, lineNum); + printErrRow(numColumns, row); + errCount += 1; + failCount += 1; + return NULL; + } +struct errCatch *errCatch = errCatchNew(); +struct psl *psl = NULL; +if (errCatchStart(errCatch)) + { + if (numColumns == PSL_NUM_COLS) + psl = pslLoad(row); + else + psl = pslxLoad(row); + } +errCatchEnd(errCatch); +if (errCatch->gotError) + { + fprintf(stderr, "Error: parsing of PSL failed: %s: %s:%d\n", + trimSpaces(errCatch->message->string), fileTblName, lineNum); + printErrRow(numColumns, row); + errCount += 1; + failCount += 1; + } +errCatchFree(&errCatch); +return psl; +} + static int checkSize(struct psl *psl, char *pslDesc, char *sizeDesc, int numErrs, struct hash *sizeTbl, char *name, int size, FILE *errFh) /* check a size, error count (0 or 1) */ { int expectSz = hashIntValDefault(sizeTbl, name, -1); if (expectSz < 0) { if (numErrs == 0) prPslDesc(psl, pslDesc, errFh); fprintf(errFh, "\t%s \"%s\" does not exist\n", sizeDesc, name); return 1; } if (size != expectSz) { if (numErrs == 0) prPslDesc(psl, pslDesc, errFh); fprintf(errFh, "\t%s \"%s\" size (%d) != expected (%d)\n", sizeDesc, name, size, expectSz); return 1; } return 0; } static int checkCounts(struct psl *psl, char *pslDesc, int numErrs, FILE *errFh) /* check the match/mismatch counts */ { unsigned matchCnts = (psl->match+psl->misMatch+psl->repMatch+psl->nCount); unsigned alnSize = 0; int iBlk; for (iBlk = 0; iBlk < psl->blockCount; iBlk++) alnSize += psl->blockSizes[iBlk]; if (alnSize != matchCnts) { if (numErrs == 0) prPslDesc(psl, pslDesc, errFh); fprintf(errFh, "alignment size (%d) doesn't match counts (%d)\n", alnSize, matchCnts); return 1; } else return 0; } static void checkPsl(struct lineFile *lf, char *tbl, unsigned opts, struct psl *psl, FILE *errFh, FILE *passFh, FILE *failFh) /* check a psl */ { char pslDesc[PATH_LEN+64]; int numErrs = 0; if (lf != NULL) safef(pslDesc, sizeof(pslDesc), "%s:%u", lf->fileName, lf->lineIx); else safef(pslDesc, sizeof(pslDesc), "%s", tbl); numErrs += pslCheck2(opts, pslDesc, errFh, psl); if (!noCountCheck) numErrs += checkCounts(psl, pslDesc, numErrs, errFh); if (protCheck && !pslIsProtein(psl)) { if (numErrs == 0) prPslDesc(psl, pslDesc, errFh); fprintf(errFh, "\tnot a protein psl\n"); numErrs++; } if (targetSizes != NULL) numErrs += checkSize(psl, pslDesc, "target", numErrs, targetSizes, psl->tName, psl->tSize, errFh); if (querySizes != NULL) numErrs += checkSize(psl, pslDesc, "query", numErrs, querySizes, getQName(psl->qName), psl->qSize, errFh); if ((passFh != NULL) && (numErrs == 0)) pslTabOut(psl, passFh); if ((failFh != NULL) && (numErrs > 0)) pslTabOut(psl, failFh); errCount += numErrs; chkCount++; if (numErrs > 0) failCount++; } static void checkPslFile(char *fileName, unsigned opts, FILE *errFh, FILE *passFh, FILE *failFh) /* Check one psl file */ { struct lineFile *lf = pslFileOpen(fileName); -struct psl *psl; +char *row[2*PSLX_NUM_COLS]; // allow extra +int numColumns; -while ((psl = pslNext(lf)) != NULL) +while ((numColumns = lineFileChopCharNext(lf, '\t', row, ArraySize(row))) > 0) + { + struct psl *psl = parsePsl(lf->fileName, lf->lineEnd, numColumns, row); + if (psl != NULL) { checkPsl(lf, NULL, opts, psl, errFh, passFh, failFh); pslFree(&psl); } + } lineFileClose(&lf); } static void checkPslTbl(struct sqlConnection *conn, char *tbl, unsigned opts, FILE *errFh, FILE *passFh, FILE *failFh) /* Check one psl table */ { char query[1024], **row; sqlSafef(query, sizeof(query), "select * from %s", tbl); struct sqlResult *sr = sqlGetResult(conn, query); +int numColumns = sqlCountColumns(sr); +int rowNum = 0; int rowOff = (sqlFieldColumn(sr, "bin") >= 0) ? 1 : 0; while ((row = sqlNextRow(sr)) != NULL) { - struct psl *psl = pslLoad(row+rowOff); + rowNum++; + struct psl *psl = parsePsl(tbl, rowNum, numColumns-rowOff, row+rowOff); + if (psl != NULL) + { checkPsl(NULL, tbl, opts, psl, errFh, passFh, failFh); pslFree(&psl); } + } sqlFreeResult(&sr); } void checkFileTbl(struct sqlConnection *conn, char *fileTblName, FILE *errFh, FILE *passFh, FILE *failFh) /* check a PSL file or table. */ { unsigned opts = 0; if (skipInsertCounts) opts |= PSL_CHECK_IGNORE_INSERT_CNTS; if (fileExists(fileTblName)) checkPslFile(fileTblName, opts, errFh, passFh, failFh); else if (conn == NULL) errAbort("file %s does not exist and no database specified", fileTblName); else checkPslTbl(conn, fileTblName, opts, errFh, passFh, failFh); } void checkFilesTbls(struct sqlConnection *conn, int fileTblCount, char *fileTblNames[]) /* check PSL files or tables. */ { int i; FILE *errFh = quiet ? mustOpen("/dev/null", "w") : stderr; FILE *passFh = passFile ? mustOpen(passFile, "w") : NULL; FILE *failFh = failFile ? mustOpen(failFile, "w") : NULL; for (i = 0; i< fileTblCount; i++) checkFileTbl(conn, fileTblNames[i], errFh, passFh, failFh); carefulClose(&passFh); carefulClose(&failFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc < 2) usage(); db = optionVal("db", NULL); protCheck = optionExists("prot"); noCountCheck = optionExists("noCountCheck"); quiet = optionExists("quiet"); passFile = optionVal("pass", NULL); failFile = optionVal("fail", NULL); +filter = optionExists("filter"); ignoreQUniq = optionExists("ignoreQUniq"); skipInsertCounts = optionExists("skipInsertCounts"); struct sqlConnection *conn = NULL; if (db != NULL) conn = sqlConnect(db); if (optionExists("targetSizes")) targetSizes = loadSizes(optionVal("targetSizes", NULL)); else if (db != NULL) targetSizes = loadChromInfoSizes(conn); if (optionExists("querySizes")) querySizes = loadSizes(optionVal("querySizes", NULL)); checkFilesTbls(conn, argc-1, argv+1); sqlDisconnect(&conn); verbose(1, "checked: %d failed: %d errors: %d\n", chkCount, failCount, errCount); -return ((errCount == 0) ? 0 : 1); +return (((errCount == 0) || filter) ? 0 : 1); }