5197ebd63b54192a79e5f001a5cb18d89822b542 chmalee Wed May 29 13:42:32 2024 -0700 Improve genbank search performance, mostly by optimizing mysql queries down from one query per findSpec result into a single query that checks all the findSpec results diff --git src/hg/lib/hgFind.c src/hg/lib/hgFind.c index 5003ff1..6b2d76e 100644 --- src/hg/lib/hgFind.c +++ src/hg/lib/hgFind.c @@ -1288,51 +1288,50 @@ // Modified to return only the first 500 hits because of CGI timeouts { struct slName *list = NULL, *el; struct hash *hash = newHash(0); struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr; char **row; char *field; int i; int rowCount = 0; // Excessively broad searches were leading to CGI timeouts (#11626). for (i = 0; inext) - { /* don't check srcDb to exclude refseq for compat with older tables */ - struct dyString *query = sqlDyStringCreate( - "select acc, organism from %s where %s = '%s' " - " and type = 'mRNA'", gbCdnaInfoTable, skipDb(field), idEl->name); + if (idList) + { + struct dyString *query = sqlDyStringCreate("select acc, organism from %s where %s in (", + gbCdnaInfoTable, skipDb(field)); + sqlDyStringPrintValuesList(query, idList); + sqlDyStringPrintf(query, ") and type = 'mRNA'"); // limit results to avoid CGI timeouts (#11626). if (limitResults != EXHAUSTIVE_SEARCH_REQUIRED) sqlDyStringPrintf(query, " limit %d", limitResults); sr = sqlGetResult(conn, dyStringContents(query)); dyStringFree(&query); while ((row = sqlNextRow(sr)) != NULL) { char *acc = row[0]; /* will use this later to distinguish xeno mrna */ int organismID = sqlUnsigned(row[1]); if (!isRefSeqAcc(acc) && !hashLookup(hash, acc)) { el = newSlName(acc); slAddHead(&list, el); hashAddInt(hash, acc, organismID); @@ -1727,77 +1726,80 @@ addRefLinkAccs(conn, accList, &rlList); } else { dyStringClear(ds); sqlDyStringPrintf(ds, "select * from %s where product like '%%%s%%' limit %d", refLinkTable, specNoVersion, NONEXHAUSTIVE_SEARCH_LIMIT); addRefLinks(conn, ds, &rlList); } } } if (rlList != NULL) { struct hgPosTable *table = NULL; struct hash *hash = newHash(8); - for (rl = rlList; rl != NULL; rl = rl->next) - { - char where[64]; struct genePredReader *gpr; struct genePred *gp; - + struct slName *values = NULL; + struct dyString *where = sqlDyStringCreate("name in ("); + // convert the rlList to slNames for sql comma sep strings + for (rl = rlList; rl != NULL; rl = rl->next) + { /* Don't return duplicate mrna accessions */ if (hashFindVal(hash, rl->mrnaAcc)) { - hashAdd(hash, rl->mrnaAcc, rl); continue; } - hashAdd(hash, rl->mrnaAcc, rl); - sqlSafef(where, sizeof where, "name = '%s'", rl->mrnaAcc); - gpr = genePredReaderQuery(conn, hfs->searchTable, where); + slAddHead(&values, slNameNew(rl->mrnaAcc)); + } + slReverse(&values); + sqlDyStringPrintValuesList(where, values); + sqlDyStringPrintf(where, ") order by find_in_set(name, '%s')", slNameListToString(values,',')); + gpr = genePredReaderQuery(conn, hfs->searchTable, dyStringCannibalize(&where)); while ((gp = genePredReaderNext(gpr)) != NULL) { struct hgPos *pos = NULL; AllocVar(pos); if (table == NULL) { char desc[256]; AllocVar(table); table->searchTime = -1; table->name = cloneString(hfs->searchTable); if (startsWith("xeno", hfs->searchTable)) safef(desc, sizeof(desc), "Non-%s RefSeq Genes", hOrganism(db)); else safef(desc, sizeof(desc), "RefSeq Genes"); table->description = cloneString(desc); slAddHead(&hgp->tableList, table); } + struct refLink *rl = (struct refLink *)hashFindVal(hash, gp->name); slAddHead(&table->posList, pos); pos->name = cloneString(rl->name); pos->browserName = cloneString(rl->mrnaAcc); dyStringClear(ds); dyStringPrintf(ds, "(%s) %s", rl->mrnaAcc, rl->product); pos->description = cloneString(ds->string); pos->chrom = hgOfficialChromName(db, gp->chrom); pos->chromStart = gp->txStart; pos->chromEnd = gp->txEnd; genePredFree(&gp); found = TRUE; } genePredReaderFree(&gpr); - } if (table != NULL && measureTiming) table->searchTime = clock1000() - startTime; refLinkFreeList(&rlList); freeHash(&hash); } dyStringFree(&ds); hFreeConn(&conn); return(found); } /* Lowe lab additions */ static void addTigrCmrGenes(struct sqlConnection *conn, struct dyString *query, struct tigrCmrGene **pList) /* Query database and add returned tigrCmrGenes to head of list. */ @@ -2490,31 +2492,31 @@ slReverse(&xrefList); if (xrefList == NULL && hgFindSpecSetting(hfs, "searchBoth") != NULL) xrefList = slPairNew(cloneString(""), cloneString(term)); return(xrefList); } char *addHighlight(char *db, char *chrom, unsigned start, unsigned end) /* Return a string that can be assigned to the cart var addHighlight, to add a yellow highlight * at db.chrom:start+1-end for search results. */ { char *color = "fcfcac"; struct dyString *dy = dyStringCreate("%s.%s:%u-%u#%s", db, chrom, start+1, end, color); return dyStringCannibalize(&dy); } -static boolean doQuery(char *db, struct hgFindSpec *hfs, char *xrefTerm, char *term, +static boolean doQuery(char *db, struct hgFindSpec *hfs, struct slPair *xrefList, char *term, struct hgPositions *hgp, boolean relativeFlag, int relStart, int relEnd, boolean multiTerm, int limitResults, boolean measureTiming) /* Perform a query as specified in hfs, assuming table existence has been * checked and xref'ing has been taken care of. */ { struct slName *tableList = hSplitTableNames(db, hfs->searchTable); struct slName *tPtr = NULL; struct hgPosTable *table = NULL; struct hgPos *pos = NULL; struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr = NULL; char **row = NULL; char *termPrefix = hgFindSpecSetting(hfs, "termPrefix"); char *paddingStr = hgFindSpecSetting(hfs, "padding"); @@ -2528,66 +2530,89 @@ term += strlen(termPrefix); if (isEmpty(term)) return(FALSE); if (isNotEmpty(hfs->searchDescription)) truncatef(buf, sizeof(buf), "%s", hfs->searchDescription); else safef(buf, sizeof(buf), "%s", hfs->searchTable); description = cloneString(buf); if (hgp->tableList != NULL && sameString(hgp->tableList->name, hfs->searchTable) && sameString(hgp->tableList->description, description)) table = hgp->tableList; +// this is taken from hgFindSpecCustom.c:checkQueryFormat() and changed to +// capture the name field for a table +static char *queryFormatRegexForIdField = + "^select [[:alnum:]]+, ?[[:alnum:]]+, ?[[:alnum:]]+, ?[[:alnum:]]+ " + "from %s where ([[:alnum:]]+) (r?like|=) ['\"]?.*%s.*['\"]?$"; +regmatch_t substrs[3]; for (tPtr = tableList; tPtr != NULL; tPtr = tPtr->next ) { - // we do not have control over the original sql since it comes from trackDb.ra or elsewhere? struct dyString *query = sqlDyStringCreate(hfs->query, tPtr->name, term); + if (xrefList) + { + // NOTE: hfsPolish guarantees the query format and allows the below regex to work. + // See hgFindSpecCustom.c for more details, specifically checkQueryFormat() + if (regexMatchSubstr(hfs->query, queryFormatRegexForIdField, substrs, ArraySize(substrs))) + { + char *idField = regexSubstringClone(hfs->query, substrs[1]); + sqlDyStringPrintf(query, " or %s in (", idField); + struct slName *vals = NULL; + struct slPair *ptr = xrefList; + for (; ptr != NULL; ptr = ptr->next) + { + if (ptr->name && isNotEmpty(ptr->name)) + { + // we got an actual xref term, otherwise the val + // just is the term clonestring'd, which we may have + // changed via termPrefix above + slAddHead(&vals, newSlName((char *)ptr->val)); + } + } + if (!vals) + slAddHead(&vals, newSlName(term)); + slReverse(&vals); + sqlDyStringPrintValuesList(query, vals); + sqlDyStringPrintf(query, ") order by find_in_set(%s, '%s')", idField, slNameListToString(vals, ',')); + } + } if (limitResults != EXHAUSTIVE_SEARCH_REQUIRED) sqlDyStringPrintf(query, " limit %d", limitResults); sr = sqlGetResult(conn, dyStringContents(query)); dyStringFree(&query); while ((row = sqlNextRow(sr)) != NULL) { if(table == NULL) { AllocVar(table); table->searchTime = -1; table->description = description; table->name = cloneString(hfs->searchTable); slAddHead(&hgp->tableList, table); } found = TRUE; AllocVar(pos); pos->chrom = cloneString(row[0]); pos->chromStart = atoi(row[1]); pos->chromEnd = atoi(row[2]); - if (isNotEmpty(xrefTerm)) - truncatef(buf, sizeof(buf), xrefTerm); - else safef(buf, sizeof(buf), "%s%s", termPrefix ? termPrefix : "", row[3]); pos->name = cloneString(buf); pos->browserName = cloneString(row[3]); - if (isNotEmpty(xrefTerm)) - { - safef(buf, sizeof(buf), "(%s%s)", - termPrefix ? termPrefix : "", row[3]); - pos->description = cloneString(buf); - } if (relativeFlag && (pos->chromStart + relEnd) <= pos->chromEnd) { pos->chromEnd = pos->chromStart + relEnd; pos->chromStart = pos->chromStart + relStart; } else if (padding > 0 && !multiTerm) { // highlight the item bases to distinguish from padding pos->highlight = addHighlight(db, pos->chrom, pos->chromStart, pos->chromEnd); int chromSize = hChromSize(db, pos->chrom); pos->chromStart -= padding; pos->chromEnd += padding; if (pos->chromStart < 0) pos->chromStart = 0; if (pos->chromEnd > chromSize) @@ -2602,31 +2627,31 @@ sqlFreeResult(&sr); hFreeConn(&conn); slFreeList(&tableList); if (measureTiming && table) table->searchTime += clock1000() - startTime; return(found); } static boolean hgFindUsingSpec(struct cart *cart, char *db, struct hgFindSpec *hfs, char *term, int limitResults, struct hgPositions *hgp, boolean relativeFlag, int relStart, int relEnd, boolean multiTerm, boolean measureTiming) /* Perform the search described by hfs on term. If successful, put results * in hgp and return TRUE. (If not, don't modify hgp.) */ { -struct slPair *xrefList = NULL, *xrefPtr = NULL; +struct slPair *xrefList = NULL; boolean found = FALSE; if (hfs == NULL || term == NULL || hgp == NULL) errAbort("NULL passed to hgFindUsingSpec.\n"); if (strlen(term)<2 && ! (sameString(hfs->searchName, "knownGene") || sameString(hfs->searchName, "flyBaseGeneSymbolOneLetter"))) return FALSE; if (isNotEmpty(hfs->termRegex) && ! regexMatchNoCase(term, hfs->termRegex)) return(FALSE); if ((!(sameString(hfs->searchType, "mrnaKeyword") || sameString(hfs->searchType, "mrnaAcc"))) && !isBigFileFind(hfs)) @@ -2644,35 +2669,32 @@ struct sqlConnection *conn = hAllocConn(db); // NOTE hfs->xrefTable can sometimes contain a comma-separated table list, // rather than just a single table. char *tables = replaceChars(hfs->xrefTable, ",", " "); boolean exists = sqlTablesExist(conn, tables); hFreeConn(&conn); freeMem(tables); if (! exists) return(FALSE); xrefList = getXrefTerms(db, hfs, term); } else xrefList = slPairNew(cloneString(""), cloneString(term)); -for (xrefPtr = xrefList; xrefPtr != NULL; xrefPtr = xrefPtr->next) - { - found |= doQuery(db, hfs, xrefPtr->name, (char *)xrefPtr->val, hgp, +found |= doQuery(db, hfs, xrefList, term, hgp, relativeFlag, relStart, relEnd, multiTerm, limitResults, measureTiming); - } slPairFreeValsAndList(&xrefList); return(found); } /* Support these formats for range specifiers. Note the ()'s around chrom, * start and end portions for substring retrieval: */ char *canonicalRangeExp = "^([[:alnum:]._#\\-]+)" "[[:space:]]*:[[:space:]]*" "([-0-9,]+)" "[[:space:]]*[-_][[:space:]]*" "([0-9,]+)$"; char *gbrowserRangeExp = "^([[:alnum:]._#\\-]+)" @@ -3451,40 +3473,45 @@ struct hgFindSpec *shortList = NULL, *longList = NULL; struct trackDb *hubCategoryList = NULL; // get all the lists of what to query: if (!trackHubDatabase(db)) { if (categories) myLoadFindSpecs(db, categories, &shortList, &longList); else hgFindSpecGetAllSpecs(db, &shortList, &longList); } // lastly search any included track hubs, or in the case of an assembly hub, any of the tracks hubCategoryList = hubCategoriesToTdbList(categories); struct hgFindSpec *hfs; for (hfs = shortList; hfs != NULL; hfs = hfs->next) { + if (hashFindVal(foundSpecHash, hfs->searchTable)) + continue; // we've already found a hit for this table, don't search it again boolean foundSpec = hgFindUsingSpec(cart, db, hfs, term, limitResults, hgp, FALSE, 0, 0, multiTerm, measureTiming); if (foundSpec) + { hashAdd(foundSpecHash, hfs->searchTable, hfs->searchTable); + } foundIt |= foundSpec; // for multiTerm searches (like '15q11;15q13'), each individual component // must resolve to a single position, so break once we find the first match if (multiTerm && foundSpec) break; } + if (!(multiTerm) || (multiTerm && !foundIt)) { for (hfs = longList; hfs != NULL; hfs = hfs->next) { if (hashFindVal(foundSpecHash, hfs->searchTable) != NULL) continue; foundIt |= hgFindUsingSpec(cart, db, hfs, term, limitResults, hgp, FALSE, 0, 0, multiTerm, measureTiming); } // lastly search any included track hubs, or in the case of an assembly hub, any of the tracks if (hubCategoryList) foundIt |= findBigBedPosInTdbList(cart, db, hubCategoryList, term, hgp, NULL, measureTiming); } // multiTerm searches must resolve to a single range on a chromosome, so don't // do these non positional searches if a multiTerm was requested