87160c4c6abc63ea1732458d5b58ca8bd420528c
markd
Wed Aug 30 08:53:24 2017 -0700
added option to pslCheck to not check insert sizes

diff --git src/hg/pslCheck/pslCheck.c src/hg/pslCheck/pslCheck.c
index 8174eba..ab83345 100644
--- src/hg/pslCheck/pslCheck.c
+++ src/hg/pslCheck/pslCheck.c
@@ -10,65 +10,69 @@
 #include "verbose.h"
 
 
 /* command line options and values */
 static struct optionSpec optionSpecs[] =
 {
     {"db", OPTION_STRING},
     {"prot", OPTION_BOOLEAN},
     {"quiet", OPTION_BOOLEAN},
     {"noCountCheck", OPTION_BOOLEAN},
     {"targetSizes", OPTION_STRING},
     {"querySizes", OPTION_STRING},
     {"pass", OPTION_STRING},
     {"fail", OPTION_STRING},
     {"ignoreQUniq", OPTION_BOOLEAN},
+    {"skipInsertCounts", OPTION_BOOLEAN},
     {NULL, 0}
 };
 static char *db = NULL;
 static boolean protCheck = FALSE;
 static boolean quiet = FALSE;
 static boolean noCountCheck = FALSE;
 static char *passFile = NULL;
 static char *failFile = NULL;
 static boolean ignoreQUniq = FALSE;
+static boolean skipInsertCounts = FALSE;
 static struct hash *targetSizes = NULL;
 static struct hash *querySizes = NULL;
 
 /* global count of alignments checked and errors */
 static int chkCount = 0;
 static int failCount = 0;
 static int errCount = 0;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
     "pslCheck - validate PSL files\n"
     "usage:\n"
     " pslCheck fileTbl(s)\n"
     "options:\n"
     " -db=db - get targetSizes from this database, and if file doesn't exist,\n"
     " look for a table in this database.\n"
     " -prot - confirm psls are protein psls\n"
     " -noCountCheck - don't validate that match/mismatch counts are match\n"
     " the total size of the alignment blocks\n"
     " -pass=pslFile - write PSLs without errors to this file\n"
     " -fail=pslFile - write PSLs with errors to this file\n"
     " -targetSizes=sizesFile - tab file with columns of target and size.\n"
     " If specified, psl is check to have a valid target and target\n"
     " coordinates.\n"
+    " -skipInsertCounts - Don't validate insert counts. Useful for BLAT protein\n"
+    " PSLs where these are not computed consistently.\n"
     " -querySizes=sizesFile - file with query sizes.\n"
     " -ignoreQUniq - ignore everything after the last `-' in the qName field, that\n"
     " is sometimes used to generate a unique identifier\n"
     " -quiet - no write error message, just filter\n");
 }
 
 static struct hash *loadSizes(char *sizesFile)
 /* load a sizes file */
 {
 struct hash *sizes = hashNew(20);
 struct lineFile *lf = lineFileOpen(sizesFile, TRUE);
 char *cols[2];
 
 while (lineFileNextRowTab(lf, cols, ArraySize(cols)))
     hashAddInt(sizes, cols[0], sqlUnsigned(cols[1]));
@@ -155,137 +159,141 @@
 for (iBlk = 0; iBlk < psl->blockCount; iBlk++)
     alnSize += psl->blockSizes[iBlk];
 
 if (alnSize != matchCnts)
     {
     if (numErrs == 0)
         prPslDesc(psl, pslDesc, errFh);
     fprintf(errFh, "alignment size (%d) doesn't match counts (%d)\n",
             alnSize, matchCnts);
     return 1;
     }
 else
     return 0;
 }
 
-static void checkPsl(struct lineFile *lf, char *tbl, struct psl *psl,
+static void checkPsl(struct lineFile *lf, char *tbl, unsigned opts, struct psl *psl,
                      FILE *errFh, FILE *passFh, FILE *failFh)
 /* check a psl */
 {
 char pslDesc[PATH_LEN+64];
 int numErrs = 0;
 if (lf != NULL)
     safef(pslDesc, sizeof(pslDesc), "%s:%u", lf->fileName, lf->lineIx);
 else
     safef(pslDesc, sizeof(pslDesc), "%s", tbl);
-numErrs += pslCheck(pslDesc, errFh, psl);
+numErrs += pslCheck2(opts, pslDesc, errFh, psl);
 if (!noCountCheck)
     numErrs += checkCounts(psl, pslDesc, numErrs, errFh);
 if (protCheck && !pslIsProtein(psl))
     {
     if (numErrs == 0)
         prPslDesc(psl, pslDesc, errFh);
     fprintf(errFh, "\tnot a protein psl\n");
     numErrs++;
     }
 if (targetSizes != NULL)
     numErrs += checkSize(psl, pslDesc, "target", numErrs, targetSizes,
                          psl->tName, psl->tSize, errFh);
 if (querySizes != NULL)
     numErrs += checkSize(psl, pslDesc, "query", numErrs, querySizes,
                          getQName(psl->qName), psl->qSize, errFh);
 if ((passFh != NULL) && (numErrs == 0))
     pslTabOut(psl, passFh);
 if ((failFh != NULL) && (numErrs > 0))
     pslTabOut(psl, failFh);
 errCount += numErrs;
 chkCount++;
 if (numErrs > 0)
     failCount++;
 }
 
-static void checkPslFile(char *fileName, FILE *errFh,
+static void checkPslFile(char *fileName, unsigned opts, FILE *errFh,
                          FILE *passFh, FILE *failFh)
 /* Check one psl file */
 {
 struct lineFile *lf = pslFileOpen(fileName);
 struct psl *psl;
 while ((psl = pslNext(lf)) != NULL)
     {
-    checkPsl(lf, NULL, psl, errFh, passFh, failFh);
+    checkPsl(lf, NULL, opts, psl, errFh, passFh, failFh);
     pslFree(&psl);
     }
 lineFileClose(&lf);
 }
 
-static void checkPslTbl(struct sqlConnection *conn, char *tbl, FILE *errFh,
+static void checkPslTbl(struct sqlConnection *conn, char *tbl, unsigned opts, FILE *errFh,
                         FILE *passFh, FILE *failFh)
 /* Check one psl table */
 {
 char query[1024], **row;
 sqlSafef(query, sizeof(query), "select * from %s", tbl);
 struct sqlResult *sr = sqlGetResult(conn, query);
 int rowOff = (sqlFieldColumn(sr, "bin") >= 0) ? 1 : 0;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct psl *psl = pslLoad(row+rowOff);
-    checkPsl(NULL, tbl, psl, errFh, passFh, failFh);
+    checkPsl(NULL, tbl, opts, psl, errFh, passFh, failFh);
     pslFree(&psl);
     }
 sqlFreeResult(&sr);
 }
 
 void checkFileTbl(struct sqlConnection *conn, char *fileTblName, FILE *errFh,
                   FILE *passFh, FILE *failFh)
 /* check a PSL file or table. */
 {
+unsigned opts = 0;
+if (skipInsertCounts)
+    opts |= PSL_CHECK_IGNORE_INSERT_CNTS;
 if (fileExists(fileTblName))
-    checkPslFile(fileTblName, errFh, passFh, failFh);
+    checkPslFile(fileTblName, opts, errFh, passFh, failFh);
 else if (conn == NULL)
     errAbort("file %s does not exist and no database specified", fileTblName);
 else
-    checkPslTbl(conn, fileTblName, errFh, passFh, failFh);
+    checkPslTbl(conn, fileTblName, opts, errFh, passFh, failFh);
 }
 
 void checkFilesTbls(struct sqlConnection *conn,
                     int fileTblCount, char *fileTblNames[])
 /* check PSL files or tables. */
 {
 int i;
 FILE *errFh = quiet ? mustOpen("/dev/null", "w") : stderr;
 FILE *passFh = passFile ? mustOpen(passFile, "w") : NULL;
 FILE *failFh = failFile ? mustOpen(failFile, "w") : NULL;
 
 for (i = 0; i< fileTblCount; i++)
     checkFileTbl(conn, fileTblNames[i], errFh, passFh, failFh);
 carefulClose(&passFh);
 carefulClose(&failFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 if (argc < 2)
     usage();
 db = optionVal("db", NULL);
 protCheck = optionExists("prot");
 noCountCheck = optionExists("noCountCheck");
 quiet = optionExists("quiet");
 passFile = optionVal("pass", NULL);
 failFile = optionVal("fail", NULL);
 ignoreQUniq = optionExists("ignoreQUniq");
+skipInsertCounts = optionExists("skipInsertCounts");
 struct sqlConnection *conn = NULL;
 if (db != NULL)
     conn = sqlConnect(db);
 if (optionExists("targetSizes"))
     targetSizes = loadSizes(optionVal("targetSizes", NULL));
 else if (db != NULL)
     targetSizes = loadChromInfoSizes(conn);
 if (optionExists("querySizes"))
     querySizes = loadSizes(optionVal("querySizes", NULL));
 checkFilesTbls(conn, argc-1, argv+1);
 sqlDisconnect(&conn);
 verbose(1, "checked: %d failed: %d errors: %d\n",
         chkCount, failCount, errCount);
 return ((errCount == 0) ? 0 : 1);
 }