c170b6e0f477bf61d70eab822ed89b9d4b5d0c45 tdreszer Thu Sep 5 14:25:19 2013 -0700 While hgFind exhaustive search may cause a CGI timeout (#11626), at least one app (hgGetAnn) requires an exhaustive search (#11665). So I have enforced a results limit for mRNA searches only, and have allowed hgGetAnn to override the limit. Will not be surprised if some other obscure case pops up where an hgFind call needs to be exhaustive, but have yet to code-browse discover one. diff --git src/hg/lib/hgFind.c src/hg/lib/hgFind.c index af9f23e..32e18e2 100644 --- src/hg/lib/hgFind.c +++ src/hg/lib/hgFind.c @@ -24,30 +24,35 @@ #include "refLink.h" #include "kgAlias.h" #include "kgProtAlias.h" #include "findKGAlias.h" #include "findKGProtAlias.h" #include "tigrCmrGene.h" #include "minGeneInfo.h" #include "pipeline.h" #include "hgConfig.h" #include "trix.h" #include "trackHub.h" #include "udc.h" #include "hubConnect.h" +// Exhaustive searches can lead to timeouts on CGIs (#11626). +// However, hgGetAnn requires exhaustive searches (#11665). +#define NONEXHAUSTIVE_SEARCH_LIMIT 500 +#define EXHAUSTIVE_SEARCH_REQUIRED -1 + extern struct cart *cart; char *hgAppName = ""; /* alignment tables to check when looking for mrna alignments */ static char *estTables[] = { "intronEst", "all_est", "xenoEst", NULL }; static char *estLabels[] = { "Spliced ESTs", "ESTs", "Other ESTs", NULL }; static char *mrnaTables[] = { "all_mrna", "xenoMrna", NULL }; static char *mrnaLabels[] = { "mRNAs", "Other mRNAs", NULL }; static struct dyString *hgpMatchNames = NULL; static void hgPosFree(struct hgPos **pEl) /* Free up hgPos. */ { struct hgPos *el; if ((el = *pEl) != NULL) @@ -572,31 +577,31 @@ /* This code works with just two SQL queries no matter how * big the search result list is. For cases where the search * result list is big (say 100 or 1000 items) this is noticably * faster than the simpler-to-code approach that would do two * queries for each search result. We pay for this speed tweak * by having to construct a more elaborate query, and by having * to maintain a hash to connect the query results back to the * individual positions. */ struct dyString *dy = dyStringNew(0); struct trixSearchResult *tsr; struct hash *hash = hashNew(16); struct hgPos *posList = NULL, *pos; struct tsrPos *tpList = NULL, *tp; struct sqlResult *sr; char **row; -int maxToReturn = 500; +int maxToReturn = NONEXHAUSTIVE_SEARCH_LIMIT; if (slCount(tsrList) > maxToReturn) { warn("Search terms are not very specific, only showing first %d matching UCSC Genes.", maxToReturn); tsr = slElementFromIx(tsrList, maxToReturn-1); tsr->next = NULL; } /* Make hash of all search results - one for each known gene ID. */ for (tsr = tsrList; tsr != NULL; tsr = tsr->next) { lmAllocVar(hash->lm, tp); tp->tsr = tsr; slAddHead(&tpList, tp); @@ -1427,121 +1432,134 @@ } return NULL; } static struct slName *genbankGrepQuery(char *indexFile, char *table, char *key) /* grep -i key indexFile, return a list of ids (first word of each line). */ { char *extraOptions = ""; if (sameString(table, "author")) extraOptions = "-w"; return doGrepQuery(indexFile, table, key, extraOptions); } static struct slName *genbankSqlFuzzyQuery(struct sqlConnection *conn, - char *table, char *key, int limit) + char *table, char *key, int limitResults) /* Perform a fuzzy sql search for %key% in table.name; return list of * corresponding table.id's. */ { struct slName *idList = NULL, *idEl = NULL; if (!isTooCommon(table, key)) { struct sqlResult *sr; char **row; char query[256]; + if (limitResults == EXHAUSTIVE_SEARCH_REQUIRED) + sqlSafef(query, sizeof(query), + "select id,name from %s where name like '%%%s%%'", table, key); + else // limit results to avoid CGI timeouts (#11626). sqlSafef(query, sizeof(query), - "select id,name from %s where name like '%%%s%%' limit %d", table, key, limit); + "select id,name from %s where name like '%%%s%%' limit %d", table, key, limitResults); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { touppers(row[1]); if (keyIsPrefix(key, row[1])) { idEl = newSlName(row[0]); slAddHead(&idList, idEl); } } sqlFreeResult(&sr); } return idList; } static boolean gotAllGenbankGrepIndexFiles(char *db, struct hgFindSpec *hfs, char *tables[], int tableCount) /* Return TRUE if all tables have a readable genbank index file. */ { int i; for (i=0; i < tableCount; i++) if (! getGenbankGrepIndex(db, hfs, tables[i], "idName")) return FALSE; return TRUE;; } static void findHitsToTables(char *db, struct hgFindSpec *hfs, - char *key, char *tables[], int tableCount, + char *key, int limitResults, char *tables[], int tableCount, struct hash **retHash, struct slName **retList) /* Return all unique accessions that match any table. */ // Modified to return only the first 500 hits because of CGI timeouts { struct slName *list = NULL, *el; struct hash *hash = newHash(0); struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr; char **row; char query[256]; char *field; int i; -int rowCount = 0, limit = 500; // Excessively broad searches were leading to CGI timeouts -for (i = 0; inext) + idList = genbankSqlFuzzyQuery(conn, field, key, limitResults); + for (idEl = idList; + idEl != NULL && (limitResults == EXHAUSTIVE_SEARCH_REQUIRED || rowCount < limitResults); + idEl = idEl->next) { /* don't check srcDb to exclude refseq for compat with older tables */ + if (limitResults == EXHAUSTIVE_SEARCH_REQUIRED) + sqlSafef(query, sizeof(query), + "select acc, organism from gbCdnaInfo where %s = %s " + " and type = 'mRNA'", field, idEl->name); + else // limit results to avoid CGI timeouts (#11626). sqlSafef(query, sizeof(query), "select acc, organism from gbCdnaInfo where %s = %s " - " and type = 'mRNA' limit %d", - field, idEl->name, limit); + " and type = 'mRNA' limit %d", field, idEl->name, limitResults); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { char *acc = row[0]; /* will use this later to distinguish xeno mrna */ int organismID = sqlUnsigned(row[1]); if (!isRefSeqAcc(acc) && !hashLookup(hash, acc)) { el = newSlName(acc); slAddHead(&list, el); hashAddInt(hash, acc, organismID); - } - if (rowCount++ > limit) + // limit results to avoid CGI timeouts (#11626). + if (rowCount++ > limitResults && limitResults != EXHAUSTIVE_SEARCH_REQUIRED) break; } + } sqlFreeResult(&sr); } slFreeList(&idList); } hFreeConn(&conn); slReverse(&list); *retList = list; *retHash = hash; } static void andHits(struct hash *aHash, struct slName *aList, struct hash *bHash, struct slName *bList, struct hash **retHash, struct slName **retList) /* Return hash/list that is intersection of lists a and b. */ @@ -1705,68 +1723,68 @@ slReverse(&table->posList); safef(title, sizeof(title), "%s%s %sligned mRNA Search Results", isXeno ? "Non-" : "", organism, aligns ? "A" : "Una"); freeMem(organism); table->description = cloneString(title); table->name = cloneString(mrnaTable); table->htmlOnePos = mrnaKeysHtmlOnePos; slAddHead(&hgp->tableList, table); } freeDyString(&dy); return alignCount; } static boolean findMrnaKeys(char *db, struct hgFindSpec *hfs, - char *keys, struct hgPositions *hgp) + char *keys, int limitResults, struct hgPositions *hgp) /* Find mRNA that has keyword in one of its fields. */ { int alignCount; static char *tables[] = { "productName", "geneName", "author", "tissue", "cell", "description", "development", }; struct hash *allKeysHash = NULL; struct slName *allKeysList = NULL; struct sqlConnection *conn = hAllocConn(db); boolean found = FALSE; /* If we can use grep to search all tables, then use piped grep to * implement implicit "AND" of multiple keys. */ if (gotAllGenbankGrepIndexFiles(db, hfs, tables, ArraySize(tables))) { - findHitsToTables(db, hfs, keys, tables, ArraySize(tables), + findHitsToTables(db, hfs, keys, limitResults, tables, ArraySize(tables), &allKeysHash, &allKeysList); } else { struct hash *oneKeyHash = NULL; struct slName *oneKeyList = NULL; struct hash *andedHash = NULL; struct slName *andedList = NULL; char *words[32]; char buf[512]; int wordCount; int i; safef(buf, sizeof(buf), "%s", keys); wordCount = chopLine(buf, words); if (wordCount == 0) return FALSE; found = TRUE; for (i=0; isearchType, "knownGene")) { if (gotFullText(db)) found = findKnownGeneFullText(db, term, hgp); else /* NOTE, in a few months (say by April 1 2006) get rid of else -JK */ { @@ -2635,31 +2653,31 @@ if (relativeFlag) { end = start + relEnd; start = start + relStart; } singlePos(hgp, hfs->searchDescription, NULL, hfs->searchTable, term, term, chrom, start, end); } } else if (sameString(hfs->searchType, "mrnaAcc")) { found = findMrnaPos(db, term, hgp); } else if (sameString(hfs->searchType, "mrnaKeyword")) { - found = findMrnaKeys(db, hfs, upcTerm, hgp); + found = findMrnaKeys(db, hfs, upcTerm, limitResults, hgp); } else if (sameString(hfs->searchType, "sgdGene")) { found = findYeastGenes(db, term, hgp); } else { isSpecial = FALSE; } *retFound = found; freeMem(upcTerm); return(isSpecial); } @@ -2814,54 +2832,54 @@ if (pos->chromEnd > chromSize) pos->chromEnd = chromSize; } slAddHead(&table->posList, pos); } } if (table != NULL) slReverse(&table->posList); sqlFreeResult(&sr); hFreeConn(&conn); slFreeList(&tableList); return(found); } -boolean hgFindUsingSpec(char *db, struct hgFindSpec *hfs, char *term, +boolean hgFindUsingSpec(char *db, struct hgFindSpec *hfs, char *term, int limitResults, struct hgPositions *hgp, boolean relativeFlag, int relStart, int relEnd, boolean multiTerm) /* Perform the search described by hfs on term. If successful, put results * in hgp and return TRUE. (If not, don't modify hgp.) */ { struct slPair *xrefList = NULL, *xrefPtr = NULL; boolean found = FALSE; if (hfs == NULL || term == NULL || hgp == NULL) errAbort("NULL passed to hgFindUsingSpec.\n"); if (strlen(term)<2 && ! (sameString(hfs->searchName, "knownGene") || sameString(hfs->searchName, "flyBaseGeneSymbolOneLetter"))) return FALSE; if (isNotEmpty(hfs->termRegex) && ! regexMatchNoCase(term, hfs->termRegex)) return(FALSE); if (! hTableOrSplitExists(db, hfs->searchTable)) return(FALSE); -if (isNotEmpty(hfs->searchType) && searchSpecial(db, hfs, term, hgp, relativeFlag, +if (isNotEmpty(hfs->searchType) && searchSpecial(db, hfs, term, limitResults, hgp, relativeFlag, relStart, relEnd, &found)) return(found); if (isNotEmpty(hfs->xrefTable)) { struct sqlConnection *conn = hAllocConn(db); // NOTE hfs->xrefTable can sometimes contain a comma-separated table list, // rather than just a single table. char *tables = replaceChars(hfs->xrefTable, ",", " "); boolean exists = sqlTablesExist(conn, tables); hFreeConn(&conn); freeMem(tables); if (! exists) return(FALSE); @@ -2983,92 +3001,100 @@ hFreeConn(&conn); return foundIt; } static struct hgFindSpec *hfsFind(struct hgFindSpec *list, char *name) /* Return first element of list that matches name. */ { struct hgFindSpec *el; for (el = list; el != NULL; el = el->next) if (sameString(name, el->searchName)) return el; return NULL; } -static boolean singleSearch(char *db, char *term, struct cart *cart, struct hgPositions *hgp) +static boolean singleSearch(char *db, char *term, int limitResults, struct cart *cart, + struct hgPositions *hgp) /* If a search type is specified in the CGI line (not cart), perform that search. * If the search is successful, fill in hgp as a single-pos result and return TRUE. */ { char *search = cgiOptionalString("singleSearch"); if (search == NULL) return FALSE; cartRemove(cart, "singleSearch"); boolean foundIt = FALSE; if (sameString(search, "knownCanonical")) foundIt = searchKnownCanonical(db, term, hgp); else { struct hgFindSpec *shortList = NULL, *longList = NULL; hgFindSpecGetAllSpecs(db, &shortList, &longList); struct hgFindSpec *hfs = hfsFind(shortList, search); if (hfs == NULL) hfs = hfsFind(longList, search); if (hfs != NULL) - foundIt = hgFindUsingSpec(db, hfs, term, hgp, FALSE, 0,0, FALSE); + foundIt = hgFindUsingSpec(db, hfs, term, limitResults, hgp, FALSE, 0,0, FALSE); else warn("Unrecognized singleSearch=%s in URL", search); } if (foundIt) { fixSinglePos(hgp); if (cart != NULL) cartSetString(cart, "hgFind.matches", hgp->tableList->posList->browserName); } return foundIt; } struct hgPositions *hgPositionsFind(char *db, char *term, char *extraCgi, char *hgAppNameIn, struct cart *cart, boolean multiTerm) /* Return table of positions that match term or NULL if none such. */ { struct hgPositions *hgp = NULL, *hgpItem = NULL; regmatch_t substrs[4]; boolean canonicalSpec = FALSE; boolean gbrowserSpec = FALSE; boolean lengthSpec = FALSE; boolean singleBaseSpec = FALSE; boolean relativeFlag = FALSE; int relStart = 0, relEnd = 0; hgAppName = hgAppNameIn; +// Exhaustive searches can lead to timeouts on CGIs (#11626). +// However, hgGetAnn requires exhaustive searches (#11665). +// So... set a non-exhaustive search limit on all except hgGetAnn. +// NOTE: currently non-exhaustive search limits are only applied to findMrnaKeys +int limitResults = NONEXHAUSTIVE_SEARCH_LIMIT; +if (sameString(hgAppNameIn,"hgGetAnn")) + limitResults = EXHAUSTIVE_SEARCH_REQUIRED; AllocVar(hgp); hgp->useAlias = FALSE; term = trimSpaces(term); if(isEmpty(term)) return hgp; hgp->query = cloneString(term); hgp->database = db; if (extraCgi == NULL) extraCgi = ""; hgp->extraCgi = cloneString(extraCgi); -if (singleSearch(db, term, cart, hgp)) +if (singleSearch(db, term, limitResults, cart, hgp)) return hgp; /* Allow any search term to end with a :Start-End range -- also support stuff * pasted in from BED (chrom start end) or SQL query (chrom | start | end). * If found, strip it off and remember the start and end. */ char *originalTerm = term; if ((canonicalSpec = regexMatchSubstrNoCase(term, canonicalRangeExp, substrs, ArraySize(substrs))) || (gbrowserSpec = regexMatchSubstrNoCase(term, gbrowserRangeExp, substrs, ArraySize(substrs))) || (lengthSpec = regexMatchSubstrNoCase(term, lengthRangeExp, substrs, ArraySize(substrs))) || regexMatchSubstrNoCase(term, bedRangeExp, substrs, ArraySize(substrs)) || (singleBaseSpec = regexMatchSubstrNoCase(term, singleBaseExp, substrs, ArraySize(substrs))) || @@ -3129,43 +3155,43 @@ boolean done = FALSE; // Disable singleBaseSpec for any term that is not hgOfficialChromName // because that mangles legitimate IDs that are [A-Z]:[0-9]+. if (singleBaseSpec) { singleBaseSpec = relativeFlag = FALSE; term = cloneString(originalTerm); // restore original term relStart = relEnd = 0; } if (!trackHubDatabase(db)) hgFindSpecGetAllSpecs(db, &shortList, &longList); for (hfs = shortList; hfs != NULL; hfs = hfs->next) { - if (hgFindUsingSpec(db, hfs, term, hgp, relativeFlag, relStart, relEnd, + if (hgFindUsingSpec(db, hfs, term, limitResults, hgp, relativeFlag, relStart, relEnd, multiTerm)) { done = TRUE; if (! hgFindSpecSetting(hfs, "semiShortCircuit")) break; } } if (! done) { for (hfs = longList; hfs != NULL; hfs = hfs->next) { - hgFindUsingSpec(db, hfs, term, hgp, relativeFlag, relStart, relEnd, + hgFindUsingSpec(db, hfs, term, limitResults, hgp, relativeFlag, relStart, relEnd, multiTerm); } /* Lowe lab additions -- would like to replace these with specs, but * will leave in for now. */ if (!trackHubDatabase(db)) findTigrGenes(db, term, hgp); trackHubFindPos(db, term, hgp); } hgFindSpecFreeList(&shortList); hgFindSpecFreeList(&longList); if(hgpMatchNames == NULL) hgpMatchNames = newDyString(256); for(hgpItem = hgp; hgpItem != NULL; hgpItem = hgpItem->next) {