5197ebd63b54192a79e5f001a5cb18d89822b542 chmalee Wed May 29 13:42:32 2024 -0700 Improve genbank search performance, mostly by optimizing mysql queries down from one query per findSpec result into a single query that checks all the findSpec results diff --git src/hg/checkHgFindSpec/checkHgFindSpec.c src/hg/checkHgFindSpec/checkHgFindSpec.c index 0f52e46..8fc1f9d 100644 --- src/hg/checkHgFindSpec/checkHgFindSpec.c +++ src/hg/checkHgFindSpec/checkHgFindSpec.c @@ -1,436 +1,440 @@ /* checkHgFindSpec - test & describe search specs in hgFindSpec table. */ /* Copyright (C) 2013 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "options.h" #include "jksql.h" #include "hash.h" #include "dystring.h" #include "portable.h" #include "hdb.h" #include "hui.h" #include "cheapcgi.h" #include "cart.h" #include "hgFind.h" #include "hgFindSpec.h" #include "regexHelper.h" #include "genbank.h" #include "cartTrackDb.h" char *database = NULL; /* Need to get a cart in order to use hgFind. */ struct cart *cart = NULL; /* Caches for searching */ extern struct trackDb *hgFindTdbList; extern struct grp *hgFindGrpList; extern struct hash *hgFindGroupHash; extern struct hash *hgFindTrackHash; /* Command line option specifications */ static struct optionSpec optionSpecs[] = { {"showSearches", OPTION_BOOLEAN}, {"checkTermRegex", OPTION_BOOLEAN}, {"exampleFor", OPTION_STRING}, {"checkIndexes", OPTION_BOOLEAN}, {"makeExamples", OPTION_BOOLEAN}, + {"noHtml", OPTION_BOOLEAN}, {NULL, 0} }; void usage() { errAbort( "checkHgFindSpec - test and describe search specs in hgFindSpec tables.\n" "usage:\n" " checkHgFindSpec database [options | termToSearch]\n" "If given a termToSearch, displays the list of tables that will be searched\n" "and how long it took to figure that out; then performs the search and the\n" "time it took.\n" "options:\n" " -showSearches Show the order in which tables will be searched in\n" " general. [This will be done anyway if no\n" " termToSearch or options are specified.]\n" " -checkTermRegex For each search spec that includes a regular\n" " expression for terms, make sure that all values of\n" " the table field to be searched match the regex. (If\n" " not, some of them could be excluded from searches.)\n" " -checkIndexes Make sure that an index is defined on each field to\n" " be searched.\n" +" -noHtml Do not print the html results list, just figure out\n" +" the search results and print timing\n" /**#*** IMPLEMENT ME! " -exampleFor=search Randomly choose a term for the specified search (from\n" " the target table for the search). Search for it.\n" " -makeExamples Print out an HTML table of example positions\n" " (suitable for a gateway description.html).\n" */ ); } -boolean reportSearch(char *termToSearch) +boolean reportSearch(char *termToSearch, boolean noHtml) /* Show the list of tables that will be searched, and how long it took to * figure that out. Then do the search; show results and time required. */ //#*** this doesn't handle ; in termToSearch (until the actual search) { struct hgFindSpec *shortList = NULL, *longList = NULL; struct hgFindSpec *hfs = NULL; int startMs = 0, endMs = 0; boolean gotError = FALSE; char *chrom = NULL; int chromStart = 0, chromEnd = 0; hgFindSpecGetAllSpecs(database, &shortList, &longList); puts("\n"); startMs = clock1000(); for (hfs = shortList; hfs != NULL; hfs = hfs->next) { boolean matches = TRUE; boolean tablesExist = hTableOrSplitExists(database, hfs->searchTable); if (isNotEmpty(termToSearch) && isNotEmpty(hfs->termRegex)) matches = regexMatchNoCase(termToSearch, hfs->termRegex); if (isNotEmpty(hfs->xrefTable)) tablesExist |= hTableExists(database, hfs->xrefTable); if (matches && tablesExist) { verbose(1, "SHORT-CIRCUIT %s %f\n", hfs->searchName, hfs->searchPriority); } else if (matches) { verbose(2, "no table %s: %s%s%s\n", hfs->searchName, hfs->searchTable, isNotEmpty(hfs->xrefTable) ? " and/or " : "", isNotEmpty(hfs->xrefTable) ? hfs->xrefTable : ""); } else { verbose(2, "no match %s: %s\n", hfs->searchName, hfs->termRegex); } } endMs = clock1000(); printf("\nTook %dms to determine short-circuit searches.\n\n", endMs - startMs); startMs = clock1000(); for (hfs = longList; hfs != NULL; hfs = hfs->next) { boolean matches = TRUE; boolean tablesExist = hTableOrSplitExists(database, hfs->searchTable); if (isNotEmpty(termToSearch) && isNotEmpty(hfs->termRegex)) matches = regexMatchNoCase(termToSearch, hfs->termRegex); if (isNotEmpty(hfs->xrefTable)) tablesExist |= hTableExists(database, hfs->xrefTable); if (matches && tablesExist) { verbose(1, "ADDITIVE %s %f\n", hfs->searchName, hfs->searchPriority); } else if (matches) { verbose(2, "no table %s: %s%s%s\n", hfs->searchName, hfs->searchTable, isNotEmpty(hfs->xrefTable) ? " and/or " : "", isNotEmpty(hfs->xrefTable) ? hfs->xrefTable : ""); } else { verbose(2, "no match %s: %s\n", hfs->searchName, hfs->termRegex); } } endMs = clock1000(); printf("\nTook %dms to determine multiple/additive searches.\n" "(These won't happen if it short-circuits.)\n\n", endMs - startMs); if (isNotEmpty(termToSearch)) { cartSetString(cart, "db", database); char *position = cloneString(termToSearch); struct dyString *dyWarn = dyStringNew(0); startMs = clock1000(); hashTracksAndGroups(cart, database); struct searchCategory *allCategories = getAllCategories(cart, database, hgFindGroupHash); struct hgPositions *hgp = hgFindSearch(cart, &position, &chrom, &chromStart, &chromEnd, "checkHgFindSpec", dyWarn, allCategories); endMs = clock1000(); if (isNotEmpty(dyWarn->string)) warn("%s", dyWarn->string); if (hgp->singlePos != NULL) { struct hgPos *pos = hgp->singlePos; char *table = "[No reported table!]"; char *name = pos->name ? pos->name : ""; char *browserName = pos->browserName ? pos->browserName : ""; char *description = pos->description ? pos->description : ""; if (hgp->tableList != NULL) table = hgp->tableList->name; printf("\nSingle result for %s from %s: %s:%d-%d [%s | %s | %s]\n", termToSearch, table, chrom, chromStart+1, chromEnd, name, browserName, description); - } - else + } else if (!noHtml) hgPositionsHtml(database, hgp, "checkHgFindSpec", cart); printf("\nTook %dms to search for %s.\n\n", endMs - startMs, termToSearch); } hgFindSpecFreeList(&shortList); hgFindSpecFreeList(&longList); return(gotError); } static char *getFieldFromQuery(char *query, char *searchName) /* Get the value of the field that's being searched in query. */ { char *ptr = strstr(query, " where "); char *field = NULL; if (ptr == NULL) errAbort("Can't find \" where \" in query \"%s\" for search %s", query, searchName); field = cloneString(ptr + strlen(" where ")); ptr = strchr(field, '='); if (ptr == NULL) ptr = strstr(field, " like "); if (ptr == NULL) ptr = strstr(field, " rlike "); if (ptr == NULL) errAbort("Can't find \"=\" or \" like \" after \" where %s\" in query " "\"%s\" for search %s", field, query, searchName); *ptr = 0; return(trimSpaces(field)); } static boolean checkRegexOnTableField(char *exp, char *altExp, char *table, char *field, char *searchName) /* Return TRUE and complain if any values of table.field do not match exp. */ { struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr = NULL; char **row = NULL; int errCount = 0; char buf[512]; sqlSafef(buf, sizeof(buf), "select %s from %s", field, table); sr = sqlGetResult(conn, buf); while ((row = sqlNextRow(sr)) != NULL) { if (isEmpty(row[0])) continue; if (! regexMatchNoCase(row[0], exp)) { if (isNotEmpty(altExp) && regexMatchNoCase(row[0], altExp)) continue; if (errCount < 1 || (errCount < 10 && verboseLevel() > 1)) { printf("Error: %s.%s.%s value \"%s\" doesn't match termRegex \"%s\"", database, table, field, row[0], exp); if (isNotEmpty(altExp)) printf(" or dontCheck \"%s\"", altExp); printf(" for search %s\n", searchName); } errCount++; } } if (errCount > 0) verbose(2, "Search %s: %d values of %s.%s overlooked.\n", searchName, errCount, table, field); sqlFreeResult(&sr); hFreeConn(&conn); return(errCount > 0); } boolean doCheckTermRegex() /* For each search that includes a regex, make sure that all values of the * target table field match the regex -- otherwise those values would be * invisible to a search. */ { struct hgFindSpec *shortList = NULL, *longList = NULL, *wholeList = NULL; struct hgFindSpec *hfs = NULL; boolean gotError = FALSE; hgFindSpecGetAllSpecs(database, &shortList, &longList); wholeList = slCat(shortList, longList); puts("\n"); for (hfs = wholeList; hfs != NULL; hfs = hfs->next) { if (isNotEmpty(hfs->termRegex)) { char *table = NULL, *query = NULL; if (isNotEmpty(hfs->xrefTable)) { table = hfs->xrefTable; query = hfs->xrefQuery; } else { table = hfs->searchTable; query = hfs->query; } if (isNotEmpty(query)) { struct slName *tableList = hSplitTableNames(database, table); struct slName *tPtr = NULL; char *termPrefix = hgFindSpecSetting(hfs, "termPrefix"); char *field = getFieldFromQuery(query, hfs->searchName); char *termRegex = hfs->termRegex; char *altRegex = hgFindSpecSetting(hfs, "dontCheck"); if (termPrefix != NULL && startsWith(termPrefix, termRegex+1)) termRegex += strlen(termPrefix)+1; verbose(2, "Checking termRegex \"%s\" for table %s (search %s).\n", termRegex, table, hfs->searchName); for (tPtr = tableList; tPtr != NULL; tPtr = tPtr->next) { gotError |= checkRegexOnTableField(termRegex, altRegex, tPtr->name, field, hfs->searchName); } } } } hgFindSpecFreeList(&wholeList); return(gotError); } boolean doCheckIndexes() /* For each search, make sure there's an index on the right table field(s). */ { struct hgFindSpec *shortList = NULL, *longList = NULL, *wholeList = NULL; struct hgFindSpec *hfs = NULL; struct slName *allChroms = hAllChromNames(database); boolean gotError = FALSE; hgFindSpecGetAllSpecs(database, &shortList, &longList); wholeList = slCat(shortList, longList); puts("\n"); for (hfs = wholeList; hfs != NULL; hfs = hfs->next) { if (isNotEmpty(hfs->query) && hTableOrSplitExists(database, hfs->searchTable)) { char *field = getFieldFromQuery(hfs->query, hfs->searchName); struct slName *tableList = hSplitTableNames(database, hfs->searchTable); struct slName *tPtr = NULL; for (tPtr = tableList; tPtr != NULL; tPtr = tPtr->next) { if (! hFieldHasIndex(database, tPtr->name, field)) { gotError = TRUE; printf("Error: No SQL index defined for %s.%s.%s (search %s)\n", database, tPtr->name, field, hfs->searchName); } else verbose(2, "Index exists for %s.%s (search %s)\n", tPtr->name, field, hfs->searchName); } } if (isNotEmpty(hfs->xrefQuery) && hTableOrSplitExists(database, hfs->xrefTable)) { char *field = getFieldFromQuery(hfs->xrefQuery, hfs->searchName); if (! hFieldHasIndex(database, hfs->xrefTable, field)) { gotError = TRUE; printf("Error: No SQL index defined for %s.%s.%s (search %s)\n", database, hfs->xrefTable, field, hfs->searchName); } else verbose(2, "Index exists for %s.%s.%s (search %s)\n", database, hfs->xrefTable, field, hfs->searchName); } } slFreeList(&allChroms); hgFindSpecFreeList(&wholeList); return(gotError); } char *getExampleFor(char *searchName) /* Randomly choose a search field value -- if it comes from the table, * we should be able to search for it and find our way back to the table. */ { char *example = NULL; errAbort("Sorry, -exampleFor=search not implemented yet."); return(example); } boolean doMakeExamples() /* Print out an HTML table of position examples for description.html. */ { boolean gotError = FALSE; errAbort("Sorry, -makeExamples not implemented yet."); return(gotError); } int checkHgFindSpec(char *db, char *termToSearch, boolean showSearches, boolean checkTermRegex, char *exampleFor, - boolean checkIndexes, boolean makeExamples) + boolean checkIndexes, boolean makeExamples, boolean noHtml) /* Perform searches/checks as specified, summarize errors, * return nonzero if there are errors. */ { boolean gotError = FALSE; database = db; initGenbankTableNames(db); if (isNotEmpty(termToSearch)) - gotError |= reportSearch(termToSearch); + gotError |= reportSearch(termToSearch, noHtml); if (showSearches) - gotError |= reportSearch(NULL); + gotError |= reportSearch(NULL, noHtml); if (checkTermRegex) gotError |= doCheckTermRegex(); if (isNotEmpty(exampleFor)) { termToSearch = getExampleFor(exampleFor); - gotError |= reportSearch(termToSearch); + gotError |= reportSearch(termToSearch, noHtml); } if (checkIndexes) gotError |= doCheckIndexes(); if (makeExamples) gotError |= doMakeExamples(); return gotError; } /* Just a placeholder -- we don't do anything with the cart. */ char *excludeVars[] = { NULL }; int main(int argc, char *argv[]) { char *termToSearch = NULL; boolean showSearches = FALSE; boolean checkTermRegex = FALSE; char *exampleFor = NULL; boolean checkIndexes = FALSE; boolean makeExamples = FALSE; +boolean noHtml = FALSE; optionInit(&argc, argv, optionSpecs); /* Allow "checkHgFindSpec db" or "checkHgFindSpec db termToSearch" usage: */ if (termToSearch == NULL && argc == 3) { termToSearch = argv[2]; argc--; } if (argc != 2) usage(); showSearches = optionExists("showSearches"); checkTermRegex = optionExists("checkTermRegex"); exampleFor = optionVal("exampleFor", exampleFor); checkIndexes = optionExists("checkIndexes"); makeExamples = optionExists("makeExamples"); +noHtml = optionExists("noHtml"); /* If no termToSearch or options are specified, do showSearches. */ if (termToSearch == NULL && exampleFor == NULL && !showSearches && !checkTermRegex && !checkIndexes && !makeExamples) showSearches = TRUE; cgiSpoof(&argc, argv); cart = cartAndCookie(hUserCookie(), excludeVars, NULL); return checkHgFindSpec(argv[1], termToSearch, showSearches, checkTermRegex, - exampleFor, checkIndexes, makeExamples); + exampleFor, checkIndexes, makeExamples, noHtml); }