5a00dc1ab5eaaaf1776558e2fedc99997471eb73
hiram
  Tue Aug 13 14:38:20 2024 -0700
ready to start cleaning up refs #32897

diff --git src/hg/hubApi/findGenome.c src/hg/hubApi/findGenome.c
index 5bb88a8..123ffd1 100644
--- src/hg/hubApi/findGenome.c
+++ src/hg/hubApi/findGenome.c
@@ -1,36 +1,38 @@
 /* findGenome search functions */
 
 #include "dataApi.h"
 #include "hgFind.h"
 #include "cartTrackDb.h"
 #include "cartJson.h"
 #include "genark.h"
 #include "asmAlias.h"
 #include "asmSummary.h"
+#include "assemblyList.h"
 
 struct combinedSummary
 /* may have information from any of:  asmSummary, genark or dbDb */
     {
     struct combinedSummary *next;	/* Next in singly linked list */
     struct asmSummary *summary;		/* from asmSummary table */
     struct genark *genArk;		/* from genark table */
     struct dbDb *dbDb;			/* from dbDb table */
     };
 
 /* will be initialized as this function begins */
 static char *genarkTable = NULL;
+static char *asmListTable = NULL;
 static boolean statsOnly = FALSE;
 static boolean allowAll = FALSE;	/* default only show existing browsers*/
 
 static void startGenomes(struct jsonWrite *jw)
 /* begin the list output */
 {
 if (jsonOutputArrays)
     jsonWriteListStart(jw, "genomes");
 else
     jsonWriteObjectStart(jw, "genomes");
 }
 
 static void endGenomes(struct jsonWrite *jw)
 /* end the list output */
 {
@@ -285,124 +287,120 @@
 	asmIdFound = genBank;
     }
 if (asmIdFound)
     {
     char *words[3];
     int wordCount = chopString(asmIdFound, "_", words, ArraySize(words));
     if (wordCount > 2)
 	{
 	safef(query, sizeof(query), "%s_%s", words[0], words[1]);
         itemReturn = checkAsmSummary(centConn, query, TRUE, 1);
 	}
     }
 return itemReturn;
 }
 
-static int singleWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw)
-/* conn is a connection to hgcentral, searching for one word,
- *   might be a UCSC database or a GenArk hub accession, or just some word.
- */
+static int sqlJsonOut(struct jsonWrite *jw, struct sqlResult *sr)
+/* given a sqlResult, walk through the rows and output the json */
 {
-/* the input word might be a database alias
- * asmAliasFind returns the searchWord if no alias found
- */
-char *perhapsAlias = asmAliasFind(searchWord);
-
 int itemCount = 0;
-
-if (startsWith("GC", perhapsAlias))
-    {
-    struct asmSummary *asmSumFound = NULL;
-    struct hash *asmSummaryHash = NULL;
-    asmSumFound = checkAsmSummary(conn, perhapsAlias, FALSE, 0);
-    if (asmSumFound)
-        {
-        struct asmSummary *el;
-        asmSummaryHash = newHash(0);
-        for (el = asmSumFound; el != NULL; el = el->next)
-            hashAdd(asmSummaryHash, el->assemblyAccession, (void *)el);
-        }
-    char query[4096];
-    sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE gcAccession LIKE \"%s%%\"", genarkTable, perhapsAlias);
-    struct sqlResult *sr = sqlGetResult(conn, query);
 char **row;
-    struct combinedSummary *comboOutput = NULL;
 while ((row = sqlNextRow(sr)) != NULL)
     {
-        struct genark *genome = genarkLoad(row);
-        struct combinedSummary *cs = NULL;
-        AllocVar(cs);
-        cs->genArk = genome;
-        cs->dbDb = NULL;
-	if (asmSummaryHash)
-	    {
-	    cs->summary = hashFindVal(asmSummaryHash, genome->gcAccession);
-	    if (cs->summary)
-		hashRemove(asmSummaryHash, genome->gcAccession);
+    struct assemblyList *el = assemblyListLoadWithNull(row);
+    jsonWriteObjectStart(jw, el->name);
+//    jsonWriteString(jw, "name", el->name);
+    jsonWriteNumber(jw, "priority", (long long)el->priority);
+    jsonWriteString(jw, "commonName", el->commonName);
+    jsonWriteString(jw, "scientificName", el->scientificName);
+    jsonWriteNumber(jw, "taxId", (long long)el->taxId);
+    jsonWriteString(jw, "clade", el->clade);
+    jsonWriteString(jw, "description", el->description);
+    if (1 == *el->browserExists)
+	jsonWriteBoolean(jw, "browserExists", TRUE);
+    else
+	jsonWriteBoolean(jw, "browserExists", FALSE);
+    if (isEmpty(el->hubUrl))
+        jsonWriteString(jw, "hubUrl", NULL);
+    else
+        jsonWriteString(jw, "hubUrl", el->hubUrl);
+    jsonWriteObjectEnd(jw);
+    ++itemCount;
     }
-	slAddHead(&comboOutput, cs);
+return (itemCount);
 }
-    sqlFreeResult(&sr);
-    if (allowAll && asmSummaryHash)
+
+static long long multipleWordSearch(struct sqlConnection *conn, char **words, int wordCount, struct jsonWrite *jw, long long *totalMatchCount)
+/* perform search on multiple words, prepare json and return number of matches */
 {
-	/* check if all asmSummaryHash has been used up */
-	struct hashCookie cookie = hashFirst(asmSummaryHash);
-	struct hashEl *hel;
-	while ((hel = hashNext(&cookie)) != NULL)
+long long itemCount = 0;
+*totalMatchCount = 0;
+if (wordCount < 0)
+    return itemCount;
+
+/* get the words[] into a single string */
+struct dyString *queryDy = dyStringNew(128);
+dyStringPrintf(queryDy, "%s", words[0]);
+for (int i = 1; i < wordCount; ++i)
+    dyStringPrintf(queryDy, " %s", words[i]);
+
+char query[4096];
+sqlSafef(query, sizeof(query), "SELECT count(*) FROM %s WHERE MATCH(name, commonName, scientificName, clade, description) AGAINST ('%s' IN BOOLEAN MODE);", asmListTable, queryDy->string);
+long long matchCount = sqlQuickLongLong(conn, query);
+if (matchCount > 0)
     {
-            struct combinedSummary *cs = NULL;
-            AllocVar(cs);
-            cs->genArk = NULL;
-            cs->dbDb = NULL;
-            cs->summary = hel->val;
-            slAddHead(&comboOutput, cs);
+    verbose(1, "DBG: matchCount: %lld from search '%s'\n", matchCount, query);
+    *totalMatchCount = matchCount;
+    sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE MATCH(name, commonName, scientificName, clade, description) AGAINST ('%s' IN BOOLEAN MODE) ORDER BY priority LIMIT %d;", asmListTable, queryDy->string, maxItemsOutput);
+    struct sqlResult *sr = sqlGetResult(conn, query);
+    itemCount = sqlJsonOut(jw, sr);
+    verbose(1, "DBG: itemCount: %lld from search '%s'\n", itemCount, query);
+    sqlFreeResult(&sr);
     }
+return itemCount;
 }
-    if (comboOutput)
+
+static long long oneWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw, long long *totalMatchCount)
+/* perform search on a single word, prepare json and return number of matches
+ *   and number of potential matches totalMatchCount
+ */
 {
-	slReverse(&comboOutput);
-	itemCount = outputCombo(jw, comboOutput);
-	}
-    }	/*	if (startsWith("GC", perhapsAlias))	*/
-else
-    {	/* not a GC genArk name, perhaps a UCSC database */
-    if (hDbIsActive(perhapsAlias))
-        {
-	struct combinedSummary *comboOutput = NULL;
-        AllocVar(comboOutput);
-        comboOutput->summary = NULL;
-        comboOutput->genArk = NULL;
-        struct dbDb *dbDbEntry = hDbDb(perhapsAlias);
-        comboOutput->dbDb = dbDbEntry;
-        struct asmSummary *sumList = dbDbAsmEquivalent(conn, perhapsAlias);
-        if (sumList)
-	   comboOutput->summary = sumList;
-	itemCount = outputCombo(jw, comboOutput);
-        }
-    }	/*	checked genArk and UCSC database */
-if (0 == itemCount)	/* not found in genark or ucsc db, check asmSummary */
+char query[4096];
+int itemCount = 0;
+*totalMatchCount = 0;
+
+sqlSafef(query, sizeof(query), "SELECT count(*) FROM %s WHERE MATCH(name, commonName, scientificName, clade, description) AGAINST ('%s' IN BOOLEAN MODE);", asmListTable, searchWord);
+long long matchCount = sqlQuickLongLong(conn, query);
+boolean prefixSearch = FALSE;
+if (matchCount < 1)	/* no match, add the * wild card match to make a prefix match */
     {
-    long long totalMatch = 0;
-    /* no more than 100 items result please */
-    struct asmSummary *summaryList = asmSummaryFullText(conn, perhapsAlias, (long long) 100, &totalMatch);
-    /* now check the genark table to see if there are any of those */
-    struct combinedSummary *comboOutput = checkForGenArk(conn, summaryList);
-    if (comboOutput)
-	itemCount = outputCombo(jw, comboOutput);
+    sqlSafef(query, sizeof(query), "SELECT count(*) FROM %s WHERE MATCH(name, commonName, scientificName, clade, description) AGAINST ('%s*' IN BOOLEAN MODE);", asmListTable, searchWord);
+    matchCount = sqlQuickLongLong(conn, query);
+    if (matchCount > 0)
+	prefixSearch = TRUE;
     }
+if (matchCount < 1)
+    return itemCount;
+*totalMatchCount = matchCount;
+
+sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE MATCH(name, commonName, scientificName, clade, description) AGAINST ('%s%s' IN BOOLEAN MODE) ORDER BY priority LIMIT %d;", asmListTable, searchWord, prefixSearch ? "*" : "", maxItemsOutput);
+struct sqlResult *sr = sqlGetResult(conn, query);
+itemCount = sqlJsonOut(jw, sr);
+sqlFreeResult(&sr);
+
 return itemCount;
-}	/*	static int singleWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw) */
+}	/*	static long long oneWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw) */
 
 static void asmSummaryGroup(struct sqlConnection *conn, struct jsonWrite *jw, char *field)
 /* show a grouping count for a field in asmSummary table */
 {
 char query[4096];
 jsonWriteObjectStart(jw, field);
 sqlSafef(query, sizeof(query), "SELECT %s, COUNT(*) FROM asmSummary GROUP by %s", field, field);
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     if (strlen(row[0]) < 1)
 	jsonWriteNumber(jw, "na", sqlLongLong(row[1]));
     else
 	jsonWriteNumber(jw, row[0], sqlLongLong(row[1]));
@@ -450,34 +448,39 @@
 
 static void elapsedTime(struct jsonWrite *jw)
 {
 long nowTime = clock1000();
 long elapsedTimeMs = nowTime - enteredMainTime;
 jsonWriteNumber(jw, "elapsedTimeMs", elapsedTimeMs);
 }
 
 void apiFindGenome(char *pathString[MAX_PATH_INFO])
 /* 'findGenome' function */
 {
 char *searchString = cgiOptionalString(argGenomeSearchTerm);
 char *inputSearchString = cloneString(searchString);
 char *extraArgs = verifyLegalArgs(argFindGenome);
 genarkTable = genarkTableName();
+asmListTable = assemblyListTableName();
 
 if (extraArgs)
     apiErrAbort(err400, err400Msg, "extraneous arguments found for function /findGenome'%s'", extraArgs);
 
+boolean asmListExists = hTableExists("hgcentraltest", asmListTable);
+if (!asmListExists)
+    apiErrAbort(err400, err400Msg, "table hgcentraltest.assemblyList does not exist for /findGenome");
+
 boolean asmSummaryExists = hTableExists("hgcentraltest", "asmSummary");
 if (!asmSummaryExists)
     apiErrAbort(err400, err400Msg, "table hgcentraltest.asmSummary does not exist for /findGenome");
 
 boolean genArkExists = hTableExists("hgcentraltest", genarkTable);
 if (!genArkExists)
     apiErrAbort(err400, err400Msg, "table hgcentraltest.%s does not exist for /findGenome", genarkTable);
 
 char *allowAllString = cgiOptionalString(argAllowAll);
 if (isNotEmpty(allowAllString))
     {
     if (sameString("1", allowAllString))
 	allowAll = TRUE;
     else if (sameString("0", allowAllString))
 	allowAll = FALSE;
@@ -486,90 +489,104 @@
     }
 
 char *statsOnlyString = cgiOptionalString(argStatsOnly);
 if (isNotEmpty(statsOnlyString))
     {
     if (sameString("1", statsOnlyString))
 	statsOnly = TRUE;
     else if (sameString("0", statsOnlyString))
 	statsOnly = FALSE;
     else
 	apiErrAbort(err400, err400Msg, "unrecognized '%s=%s' argument, can only be =1 or =0", argStatsOnly, statsOnlyString);
     }
 
 struct sqlConnection *conn = hConnectCentral();
 
-if (!sqlTableExists(conn, genarkTable))
-    apiErrAbort(err500, err500Msg, "missing central.genark table in function /findGenome'%s'", extraArgs);
+if (!sqlTableExists(conn, asmListTable))
+    apiErrAbort(err500, err500Msg, "missing central.assemblyList table in function /findGenome'%s'", extraArgs);
 
 int wordCount = 0;
 
 if (! statsOnly)
     {
     /* verify number of words in search string is legal */
     wordCount = chopByWhite(searchString, NULL, 0);
 
     if (wordCount < 1)
     apiErrAbort(err400, err400Msg, "search term '%s' does not contain a word ? for function /findGenome", argGenomeSearchTerm);
     if (wordCount > 5)
     apiErrAbort(err400, err400Msg, "search term '%s=%s' should not have more than 5 words for function /findGenome", argGenomeSearchTerm, searchString);
     }
 
-genarkTable = genarkTableName();
-
 struct jsonWrite *jw = apiStartOutput();
 if (statsOnly)
     {
     doStatsOnly(conn, jw);
     elapsedTime(jw);
     apiFinishOutput(0, NULL, jw);
     hDisconnectCentral(&conn);
     return;
     }
 
 jsonWriteString(jw, argGenomeSearchTerm, searchString);
 if (allowAll)
 	jsonWriteBoolean(jw, argAllowAll, allowAll);
 
-int itemCount = 0;
+long long itemCount = 0;
+long long totalMatchCount = 0;
 /* save the search string before it is chopped */
 char *pristineSearchString = cloneString(searchString);
 
 char **words;
 AllocArray(words, wordCount);
 (void) chopByWhite(searchString, words, wordCount);
 if (1 == wordCount)
-    {
-    itemCount = singleWordSearch(conn, words[0], jw);
-    if (itemCount)
-	jsonWriteNumber(jw, "itemCount", itemCount);
-    else
-	verbose(0, "# DBG need to search this word %s somewhere else\n", words[0]);
-    }
+    itemCount = oneWordSearch(conn, words[0], jw, &totalMatchCount);
 else	/* multiple word search */
+/* rules about what can be in the search string:
+ *  + sign before a word indicates the word must be in the result
+ *  - sign before a word indicates it must not be in the result
+ *  * at end of word makes the word be a prefix search
+ *  "double quotes" to group words together as a phrase to match exactly
+ *  < or > adjust the words contribution to the relevance value
+ *          >moreImportant  <lessImportant
+ *  ~ negates the word's contribution to the relevance value without
+ *    excluding it from the results
+ *  (parens clauses) to groups words together for more complex queries
+ *  | OR operator 'thisWord | otherWord'
+ */
     {
+    itemCount = multipleWordSearch(conn, words, wordCount, jw, &totalMatchCount);
+    if ( 1 == 0 ) {
     long long totalMatch = 0;
     /* no more than 100 items result please */
     struct asmSummary *summaryList = asmSummaryFullText(conn, pristineSearchString, (long long) 100, &totalMatch);
     if (summaryList)
 	{
 	struct combinedSummary *comboOutput = checkForGenArk(conn, summaryList);
 	if (comboOutput)
 	    itemCount = outputCombo(jw, comboOutput);
 	if (itemCount)
 	    jsonWriteNumber(jw, "itemCount", itemCount);
 	else
 	    verbose(0, "# DBG need to search this string '%s' somewhere else\n", pristineSearchString);
 	}
     else
 	verbose(0, "# DBG need to search this string '%s' somewhere else\n", pristineSearchString);
+    }	/*	if ( 1 == 0 )	*/
     }
 
 elapsedTime(jw);
 if (itemCount)
+    {
+    jsonWriteNumber(jw, "itemCount", itemCount);
+    jsonWriteNumber(jw, "totalMatchCount", totalMatchCount);
+    if (totalMatchCount > itemCount)
+	jsonWriteBoolean(jw, "maxItemsLimit", TRUE);
     apiFinishOutput(0, NULL, jw);
+    }
 else
     apiErrAbort(err400, err400Msg, "no genomes found matching search term %s='%s' for endpoint: /findGenome", argGenomeSearchTerm, inputSearchString);
 
 hDisconnectCentral(&conn);
 
 }