1fb288d9c135df7d88a8accc0792f2986afe17bc
hiram
  Tue Sep 3 17:59:08 2024 -0700
correctly manage the special characters in a single word, to be done same for multiple words refs #32596

diff --git src/hg/hubApi/findGenome.c src/hg/hubApi/findGenome.c
index 2bc433c..9876e79 100644
--- src/hg/hubApi/findGenome.c
+++ src/hg/hubApi/findGenome.c
@@ -90,30 +90,83 @@
         jsonWriteString(jw, "refSeqCategory", el->refSeqCategory);
     if (isEmpty(el->versionStatus))
         jsonWriteString(jw, "versionStatus", NULL);
     else
         jsonWriteString(jw, "versionStatus", el->versionStatus);
     if (isEmpty(el->assemblyLevel))
         jsonWriteString(jw, "assemblyLevel", NULL);
     else
         jsonWriteString(jw, "assemblyLevel", el->assemblyLevel);
     jsonWriteObjectEnd(jw);
     ++itemCount;
     }
 return (itemCount);
 }
 
+/* MySQL FULLTEXT indexing has indexed 'words' as broken up by
+ *   word break characters, such as in the regular expression: '\W+'
+ *   Here a 'word' character is any character passing isalnum(),
+ *   plus the characters _ * + - which are never counted as breaks.
+ * Return: TRUE  - the string contains at least one word break
+ *         FALSE - the string is all one 'word' (also for empty string)
+ */
+static boolean hasWordBreaks(char *s)
+/* Return TRUE if there is any word break character in string s.
+ * The characters _ * + - are allowed since they are treated as
+ *   special characters to the MySQL FULLTEXT search.
+ * The caller has already checked s for the special prefix
+ *   characters of: " - +   or the special end character of: *
+ * Bytes go to isalnum() as unsigned char: negative char is undefined.
+ */
+{
+char c;
+while ((c = *s++) != 0)
+    {
+    if (c == '_' || c == '*' || c == '+' || c == '-')
+	continue;
+    if (! isalnum((unsigned char)c))
+        return TRUE;
+    }
+return FALSE;
+}
+
+static char *quoteWords(char *s)
+/* Given a string with word break characters, return a new double quoted
+ *  string with each run of word break characters turned to a single space.
+ *  Result is the buffer from dyStringCannibalize(), it is not freed here. */
+{
+struct dyString *quoteString = dyStringNew(128);
+dyStringAppendC(quoteString, '"');
+char c;
+boolean inBreak = FALSE;	/* TRUE while inside a run of word breaks */
+while ((c = *s++) != 0)
+    if (isalnum((unsigned char)c) || c == '_' || c == '*' || c == '+' || c == '-')
+	{
+	dyStringAppendC(quoteString, c);
+	inBreak = FALSE;
+	}
+    else
+	{
+	if (inBreak)	/* only the first break of a run emits a space */
+	    continue;
+	dyStringAppendC(quoteString, ' ');
+	inBreak = TRUE;
+	}
+dyStringAppendC(quoteString, '"');
+return dyStringCannibalize(&quoteString);
+}
+
 static void addBrowserExists(struct dyString *query)
 /* add the AND clauses for browserExist depending upon option */
 {
 if (browserMustExist)
     sqlDyStringPrintf(query, " AND browserExists=1");
 else if (browserNotExist)
     sqlDyStringPrintf(query, " AND browserExists=0");
 }
 
 static void addCategory(struct dyString *query)
 /* refSeqCategory = reference or representative */
 {
 if (isNotEmpty(refSeqCategory))
     sqlDyStringPrintf(query, " AND refSeqCategory='%s'", refSeqCategory);
 }
@@ -236,31 +289,31 @@
 }	/*	static long long oneWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw, boolean *prefixSearch) */
 
 static long elapsedTime(struct jsonWrite *jw)
 {
 long nowTime = clock1000();
 long elapsedTimeMs = nowTime - enteredMainTime;
 jsonWriteNumber(jw, "elapsedTimeMs", elapsedTimeMs);
 return elapsedTimeMs;
 }
 
 void apiFindGenome(char *pathString[MAX_PATH_INFO])
 /* 'findGenome' function */
 {
 char *searchString = cgiOptionalString(argQ);
 char *inputSearchString = cloneString(searchString);
-char *endResultSearchString = NULL;
+char *endResultSearchString = inputSearchString;
 boolean prefixSearch = FALSE;
 char *extraArgs = verifyLegalArgs(argFindGenome);
 genarkTable = genarkTableName();
 asmListTable = assemblyListTableName();
 
 if (extraArgs)
     apiErrAbort(err400, err400Msg, "extraneous arguments found for function /findGenome'%s'", extraArgs);
 
 boolean asmListExists = hTableExists("hgcentraltest", asmListTable);
 if (!asmListExists)
     apiErrAbort(err400, err400Msg, "table hgcentraltest.assemblyList does not exist for /findGenome");
 
 boolean asmSummaryExists = hTableExists("hgcentraltest", "asmSummary");
 if (!asmSummaryExists)
     apiErrAbort(err400, err400Msg, "table hgcentraltest.asmSummary does not exist for /findGenome");
@@ -386,45 +439,62 @@
 if (specificYear > 0)
     jsonWriteNumber(jw, argYear, specificYear);
 if (isNotEmpty(refSeqCategory))
     jsonWriteString(jw, argCategory, refSeqCategory);
 if (isNotEmpty(versionStatus))
     jsonWriteString(jw, argStatus, versionStatus);
 if (isNotEmpty(assemblyLevel))
     jsonWriteString(jw, argLevel, assemblyLevel);
 
 long long itemCount = 0;
 long long totalMatchCount = 0;
 char **words;
 AllocArray(words, wordCount);
 (void) chopByWhite(searchString, words, wordCount);
 if (1 == wordCount)
+    {
+    boolean doQuote = TRUE;
+    if (startsWith("\"", words[0]))
+	doQuote = FALSE;
+    if (startsWith("-", words[0]))
+	doQuote = FALSE;
+    if (startsWith("+", words[0]))
+	doQuote = FALSE;
+    if (endsWith(words[0], "*"))
+	doQuote = FALSE;
+    if (doQuote && hasWordBreaks(words[0]))
+	{
+	char *quotedWords = quoteWords(words[0]);
+	endResultSearchString = quotedWords;
+	itemCount = oneWordSearch(conn, quotedWords, jw, &totalMatchCount, &prefixSearch);
+	} else {
 	itemCount = oneWordSearch(conn, words[0], jw, &totalMatchCount, &prefixSearch);
+	}
+    }
 else	/* multiple word search */
     itemCount = multipleWordSearch(conn, words, wordCount, jw, &totalMatchCount);
 
 if (prefixSearch)
     {
     struct dyString *addedStar = dyStringNew(64);
     dyStringPrintf(addedStar, "%s*", inputSearchString);
     endResultSearchString = dyStringCannibalize(&addedStar);
     jsonWriteString(jw, argQ, endResultSearchString);
     }
 else
     {
-    endResultSearchString = inputSearchString;
-    jsonWriteString(jw, argQ, inputSearchString);
+    jsonWriteString(jw, argQ, endResultSearchString);
     }
 
 /* rules about what can be in the search string:
  *  + sign before a word indicates the word must be in the result
  *  - sign before a word indicates it must not be in the result
  *  * at end of word makes the word be a prefix search
  *  "double quotes" to group words together as a phrase to match exactly
  *  < or > adjust the words contribution to the relevance value
  *          >moreImportant  <lessImportant
  *  ~ negates the word's contribution to the relevance value without
  *    excluding it from the results
  *  (parens clauses) to groups words together for more complex queries
  *  | OR operator 'thisWord | otherWord'
  */