5197ebd63b54192a79e5f001a5cb18d89822b542
chmalee
  Wed May 29 13:42:32 2024 -0700
Improve genbank search performance, mostly by optimizing mysql queries
down from one query per findSpec result into a single query that checks
all the findSpec results

diff --git src/hg/lib/hgFind.c src/hg/lib/hgFind.c
index 5003ff1..6b2d76e 100644
--- src/hg/lib/hgFind.c
+++ src/hg/lib/hgFind.c
@@ -1288,51 +1288,50 @@
 // Modified to return only the first 500 hits because of CGI timeouts
 {
 struct slName *list = NULL, *el;
 struct hash *hash = newHash(0);
 struct sqlConnection *conn = hAllocConn(db);
 struct sqlResult *sr;
 char **row;
 char *field;
 int i;
 
 int rowCount = 0; // Excessively broad searches were leading to CGI timeouts (#11626).
 for (i = 0;
      i<tableCount && (limitResults == EXHAUSTIVE_SEARCH_REQUIRED || rowCount < limitResults);
      ++i)
     {
-    struct slName *idList = NULL, *idEl;
+    struct slName *idList = NULL;
     char *grepIndexFile = NULL;
     
     /* I'm doing this query in two steps in C rather than
      * in one step in SQL just because it somehow is much
      * faster this way (like 100x faster) when using mySQL. */
     field = tables[i];
     if (!sqlTableExists(conn, field))
 	continue;
     if ((grepIndexFile = getGenbankGrepIndex(db, hfs, field, "idName")) != NULL)
 	idList = genbankGrepQuery(grepIndexFile, field, key);
     else
         idList = genbankSqlFuzzyQuery(conn, field, key, limitResults);
-    for (idEl = idList;
-         idEl != NULL && (limitResults == EXHAUSTIVE_SEARCH_REQUIRED || rowCount < limitResults);
-         idEl = idEl->next)
-        {
     /* don't check srcDb to exclude refseq for compat with older tables */
-	struct dyString *query = sqlDyStringCreate(
-	      "select acc, organism from %s where %s = '%s' "
-	      " and type = 'mRNA'", gbCdnaInfoTable, skipDb(field), idEl->name);
+    if (idList)
+        {
+        struct dyString *query = sqlDyStringCreate("select acc, organism from %s where %s in (",
+                gbCdnaInfoTable, skipDb(field));
+        sqlDyStringPrintValuesList(query, idList);
+        sqlDyStringPrintf(query, ") and type = 'mRNA'");
         // limit results to avoid CGI timeouts (#11626).
         if (limitResults != EXHAUSTIVE_SEARCH_REQUIRED)
             sqlDyStringPrintf(query, " limit %d", limitResults);
         sr = sqlGetResult(conn, dyStringContents(query));
         dyStringFree(&query);
         while ((row = sqlNextRow(sr)) != NULL)
             {
             char *acc = row[0];
             /* will use this later to distinguish xeno mrna */
             int organismID = sqlUnsigned(row[1]);
             if (!isRefSeqAcc(acc) && !hashLookup(hash, acc))
                 {
                 el = newSlName(acc);
                 slAddHead(&list, el);
                 hashAddInt(hash, acc, organismID);
@@ -1727,77 +1726,80 @@
 	    addRefLinkAccs(conn, accList, &rlList);
 	    }
 	else
 	    {
 	    dyStringClear(ds);
 	    sqlDyStringPrintf(ds, "select * from %s where product like '%%%s%%' limit %d",
 			   refLinkTable, specNoVersion, NONEXHAUSTIVE_SEARCH_LIMIT);
 	    addRefLinks(conn, ds, &rlList);
 	    }
 	}
     }
 if (rlList != NULL)
     {
     struct hgPosTable *table = NULL;
     struct hash *hash = newHash(8);
-    for (rl = rlList; rl != NULL; rl = rl->next)
-        {
-        char where[64];
     struct genePredReader *gpr;
     struct genePred *gp;
-
+    struct slName *values = NULL;
+    struct dyString *where = sqlDyStringCreate("name in (");
+    // convert the rlList to slNames for sql comma sep strings
+    for (rl = rlList; rl != NULL; rl = rl->next)
+        {
         /* Don't return duplicate mrna accessions */
         if (hashFindVal(hash, rl->mrnaAcc))
             {            
-            hashAdd(hash, rl->mrnaAcc, rl);
             continue;
             }
-
         hashAdd(hash, rl->mrnaAcc, rl);
-        sqlSafef(where, sizeof where, "name = '%s'", rl->mrnaAcc);
-        gpr = genePredReaderQuery(conn, hfs->searchTable, where);
+        slAddHead(&values, slNameNew(rl->mrnaAcc));
+        }
+    slReverse(&values);
+    sqlDyStringPrintValuesList(where, values);
+    sqlDyStringPrintf(where, ") order by find_in_set(name, '%s')", slNameListToString(values,','));
+    gpr = genePredReaderQuery(conn, hfs->searchTable, dyStringCannibalize(&where));
     while ((gp = genePredReaderNext(gpr)) != NULL)
         {
         struct hgPos *pos = NULL;
         AllocVar(pos);
         if (table == NULL)
             {
             char desc[256];
             AllocVar(table);
             table->searchTime = -1;
             table->name = cloneString(hfs->searchTable);
             if (startsWith("xeno", hfs->searchTable))
                 safef(desc, sizeof(desc), "Non-%s RefSeq Genes", hOrganism(db));
             else
                 safef(desc, sizeof(desc), "RefSeq Genes");
             table->description = cloneString(desc);
             slAddHead(&hgp->tableList, table);
             }
+        struct refLink *rl = (struct refLink *)hashFindVal(hash, gp->name);
         slAddHead(&table->posList, pos);
         pos->name = cloneString(rl->name);
         pos->browserName = cloneString(rl->mrnaAcc);
         dyStringClear(ds);
         dyStringPrintf(ds, "(%s) %s", rl->mrnaAcc, rl->product);
         pos->description = cloneString(ds->string);
         pos->chrom = hgOfficialChromName(db, gp->chrom);
         pos->chromStart = gp->txStart;
         pos->chromEnd = gp->txEnd;
         genePredFree(&gp);
         found = TRUE;
         }
     genePredReaderFree(&gpr);
-	}
     if (table != NULL && measureTiming)
         table->searchTime = clock1000() - startTime;
     refLinkFreeList(&rlList);
     freeHash(&hash);
     }
 dyStringFree(&ds);
 hFreeConn(&conn);
 return(found);
 }
 
 /* Lowe lab additions */
 
 static void addTigrCmrGenes(struct sqlConnection *conn, struct dyString *query,
 	struct tigrCmrGene **pList)
 /* Query database and add returned tigrCmrGenes to head of list. */
@@ -2490,31 +2492,31 @@
 slReverse(&xrefList);
 if (xrefList == NULL && hgFindSpecSetting(hfs, "searchBoth") != NULL)
     xrefList = slPairNew(cloneString(""), cloneString(term));
 return(xrefList);
 }
 
 char *addHighlight(char *db, char *chrom, unsigned start, unsigned end)
 /* Return a string that can be assigned to the cart var addHighlight, to add a yellow highlight
  * at db.chrom:start+1-end for search results. */
 {
 char *color = "fcfcac";
 struct dyString *dy = dyStringCreate("%s.%s:%u-%u#%s", db, chrom, start+1, end, color);
 return dyStringCannibalize(&dy);
 }
 
-static boolean doQuery(char *db, struct hgFindSpec *hfs, char *xrefTerm, char *term,
+static boolean doQuery(char *db, struct hgFindSpec *hfs, struct slPair *xrefList, char *term,
 		       struct hgPositions *hgp,
 		       boolean relativeFlag, int relStart, int relEnd,
 		       boolean multiTerm, int limitResults, boolean measureTiming)
 /* Perform a query as specified in hfs, assuming table existence has been 
  * checked and xref'ing has been taken care of. */
 {
 struct slName *tableList = hSplitTableNames(db, hfs->searchTable);
 struct slName *tPtr = NULL;
 struct hgPosTable *table = NULL;
 struct hgPos *pos = NULL;
 struct sqlConnection *conn = hAllocConn(db);
 struct sqlResult *sr = NULL;
 char **row = NULL;
 char *termPrefix = hgFindSpecSetting(hfs, "termPrefix");
 char *paddingStr = hgFindSpecSetting(hfs, "padding");
@@ -2528,66 +2530,89 @@
     term += strlen(termPrefix);
 if (isEmpty(term))
     return(FALSE);
 
 if (isNotEmpty(hfs->searchDescription))
     truncatef(buf, sizeof(buf), "%s", hfs->searchDescription);
 else
     safef(buf, sizeof(buf), "%s", hfs->searchTable);
 description = cloneString(buf);
 
 if (hgp->tableList != NULL &&
     sameString(hgp->tableList->name, hfs->searchTable) &&
     sameString(hgp->tableList->description, description))
     table = hgp->tableList;
 
+// this is taken from hgFindSpecCustom.c:checkQueryFormat() and changed to
+// capture the name field for a table
+static char *queryFormatRegexForIdField =
+    "^select [[:alnum:]]+, ?[[:alnum:]]+, ?[[:alnum:]]+, ?[[:alnum:]]+ "
+    "from %s where ([[:alnum:]]+) (r?like|=) ['\"]?.*%s.*['\"]?$";
+regmatch_t substrs[3];
 for (tPtr = tableList;  tPtr != NULL;  tPtr = tPtr->next )
     {
-    // we do not have control over the original sql since it comes from trackDb.ra or elsewhere?
     struct dyString *query = sqlDyStringCreate(hfs->query, tPtr->name, term);
+    if (xrefList)
+        {
+        // NOTE: hfsPolish guarantees the query format and allows the below regex to work.
+        // See hgFindSpecCustom.c for more details, specifically checkQueryFormat()
+        if (regexMatchSubstr(hfs->query, queryFormatRegexForIdField, substrs, ArraySize(substrs)))
+            {
+            char *idField = regexSubstringClone(hfs->query, substrs[1]);
+            sqlDyStringPrintf(query, " or %s in (", idField);
+            struct slName *vals = NULL;
+            struct slPair *ptr = xrefList;
+            for (; ptr != NULL; ptr = ptr->next)
+                {
+                if (ptr->name && isNotEmpty(ptr->name))
+                    {
+                    // we got an actual xref term, otherwise the val
+                    // just is the term clonestring'd, which we may have
+                    // changed via termPrefix above
+                    slAddHead(&vals, newSlName((char *)ptr->val));
+                    }
+                }
+            if (!vals)
+                slAddHead(&vals, newSlName(term));
+            slReverse(&vals);
+            sqlDyStringPrintValuesList(query, vals);
+            sqlDyStringPrintf(query, ") order by find_in_set(%s, '%s')", idField, slNameListToString(vals, ','));
+            }
+        }
     if (limitResults != EXHAUSTIVE_SEARCH_REQUIRED)
         sqlDyStringPrintf(query, " limit %d", limitResults);
     sr = sqlGetResult(conn, dyStringContents(query));
     dyStringFree(&query);
     while ((row = sqlNextRow(sr)) != NULL)
         {
         if(table == NULL)
             {
             AllocVar(table);
             table->searchTime = -1;
             table->description = description;
             table->name = cloneString(hfs->searchTable);
             slAddHead(&hgp->tableList, table);
             }
         found = TRUE;
         AllocVar(pos);
         pos->chrom = cloneString(row[0]);
         pos->chromStart = atoi(row[1]);
         pos->chromEnd = atoi(row[2]);
-        if (isNotEmpty(xrefTerm))
-            truncatef(buf, sizeof(buf), xrefTerm);
-        else
         safef(buf, sizeof(buf), "%s%s",
               termPrefix ? termPrefix : "", row[3]);
         pos->name = cloneString(buf);
         pos->browserName = cloneString(row[3]);
-        if (isNotEmpty(xrefTerm))
-            {
-            safef(buf, sizeof(buf), "(%s%s)",
-                  termPrefix ? termPrefix : "", row[3]);
-            pos->description = cloneString(buf);
-            }
         if (relativeFlag && (pos->chromStart + relEnd) <= pos->chromEnd)
             {
             pos->chromEnd   = pos->chromStart + relEnd;
             pos->chromStart = pos->chromStart + relStart;
             }
         else if (padding > 0 && !multiTerm)
             {
             // highlight the item bases to distinguish from padding
             pos->highlight = addHighlight(db, pos->chrom, pos->chromStart, pos->chromEnd);
             int chromSize = hChromSize(db, pos->chrom);
             pos->chromStart -= padding;
             pos->chromEnd   += padding;
             if (pos->chromStart < 0)
                 pos->chromStart = 0;
             if (pos->chromEnd > chromSize)
@@ -2602,31 +2627,31 @@
 sqlFreeResult(&sr);
 hFreeConn(&conn);
 slFreeList(&tableList);
 if (measureTiming && table)
     table->searchTime += clock1000() - startTime;
 return(found);
 }
 
 static boolean hgFindUsingSpec(struct cart *cart,
                         char *db, struct hgFindSpec *hfs, char *term, int limitResults,
 			struct hgPositions *hgp, boolean relativeFlag,
 			int relStart, int relEnd, boolean multiTerm, boolean measureTiming)
 /* Perform the search described by hfs on term.  If successful, put results
  * in hgp and return TRUE.  (If not, don't modify hgp.) */
 {
-struct slPair *xrefList = NULL, *xrefPtr = NULL; 
+struct slPair *xrefList = NULL; 
 boolean found = FALSE;
 
 if (hfs == NULL || term == NULL || hgp == NULL)
     errAbort("NULL passed to hgFindUsingSpec.\n");
 
 if (strlen(term)<2 && !
     (sameString(hfs->searchName, "knownGene") ||
      sameString(hfs->searchName, "flyBaseGeneSymbolOneLetter")))
     return FALSE;
 
 if (isNotEmpty(hfs->termRegex) && ! regexMatchNoCase(term, hfs->termRegex))
     return(FALSE);
 
 if ((!(sameString(hfs->searchType, "mrnaKeyword") || sameString(hfs->searchType, "mrnaAcc")))
     && !isBigFileFind(hfs))
@@ -2644,35 +2669,32 @@
     struct sqlConnection *conn = hAllocConn(db);
     // NOTE hfs->xrefTable can sometimes contain a comma-separated table list, 
     // rather than just a single table. 
     char *tables = replaceChars(hfs->xrefTable, ",", " ");
     boolean exists = sqlTablesExist(conn, tables);
     hFreeConn(&conn);
     freeMem(tables);
     if (! exists)
 	return(FALSE);
     
     xrefList = getXrefTerms(db, hfs, term);
     }
 else
     xrefList = slPairNew(cloneString(""), cloneString(term));
 
-for (xrefPtr = xrefList;  xrefPtr != NULL;  xrefPtr = xrefPtr->next)
-    {
-    found |= doQuery(db, hfs, xrefPtr->name, (char *)xrefPtr->val, hgp,
+found |= doQuery(db, hfs, xrefList, term, hgp,
          relativeFlag, relStart, relEnd, multiTerm, limitResults, measureTiming);
-    }
 slPairFreeValsAndList(&xrefList);
 return(found);
 }
 
 
 /* Support these formats for range specifiers.  Note the ()'s around chrom,
  * start and end portions for substring retrieval: */
 char *canonicalRangeExp = 
 		     "^([[:alnum:]._#\\-]+)"
 		     "[[:space:]]*:[[:space:]]*"
 		     "([-0-9,]+)"
 		     "[[:space:]]*[-_][[:space:]]*"
 		     "([0-9,]+)$";
 char *gbrowserRangeExp = 
 		     "^([[:alnum:]._#\\-]+)"
@@ -3451,40 +3473,45 @@
 struct hgFindSpec *shortList = NULL, *longList = NULL;
 struct trackDb *hubCategoryList = NULL;
 // get all the lists of what to query:
 if (!trackHubDatabase(db))
     {
     if (categories)
         myLoadFindSpecs(db, categories, &shortList, &longList);
     else
         hgFindSpecGetAllSpecs(db, &shortList, &longList);
     }
 // lastly search any included track hubs, or in the case of an assembly hub, any of the tracks
 hubCategoryList = hubCategoriesToTdbList(categories);
 struct hgFindSpec *hfs;
 for (hfs = shortList; hfs != NULL; hfs = hfs->next)
     {
+    if (hashFindVal(foundSpecHash, hfs->searchTable))
+        continue; // we've already found a hit for this table, don't search it again
     boolean foundSpec = hgFindUsingSpec(cart, db, hfs, term, limitResults, hgp, FALSE, 0, 0, multiTerm, measureTiming);
     if (foundSpec)
+        {
         hashAdd(foundSpecHash, hfs->searchTable, hfs->searchTable);
+        }
     foundIt |= foundSpec;
 
     // for multiTerm searches (like '15q11;15q13'), each individual component
     // must resolve to a single position, so break once we find the first match
     if (multiTerm && foundSpec)
         break;
     }
+
 if (!(multiTerm) || (multiTerm && !foundIt))
     {
     for (hfs = longList; hfs != NULL; hfs = hfs->next)
         {
         if (hashFindVal(foundSpecHash, hfs->searchTable) != NULL)
             continue;
         foundIt |= hgFindUsingSpec(cart, db, hfs, term, limitResults, hgp, FALSE, 0, 0, multiTerm, measureTiming);
         }
     // lastly search any included track hubs, or in the case of an assembly hub, any of the tracks
     if (hubCategoryList)
         foundIt |= findBigBedPosInTdbList(cart, db, hubCategoryList, term, hgp, NULL, measureTiming);
     }
 
 // multiTerm searches must resolve to a single range on a chromosome, so don't
 // do these non positional searches if a multiTerm was requested