1fb288d9c135df7d88a8accc0792f2986afe17bc hiram Tue Sep 3 17:59:08 2024 -0700 correctly manage the special characters in a single word, to be done same for multiple words refs #32596 diff --git src/hg/hubApi/findGenome.c src/hg/hubApi/findGenome.c index 2bc433c..9876e79 100644 --- src/hg/hubApi/findGenome.c +++ src/hg/hubApi/findGenome.c @@ -90,30 +90,83 @@ jsonWriteString(jw, "refSeqCategory", el->refSeqCategory); if (isEmpty(el->versionStatus)) jsonWriteString(jw, "versionStatus", NULL); else jsonWriteString(jw, "versionStatus", el->versionStatus); if (isEmpty(el->assemblyLevel)) jsonWriteString(jw, "assemblyLevel", NULL); else jsonWriteString(jw, "assemblyLevel", el->assemblyLevel); jsonWriteObjectEnd(jw); ++itemCount; } return (itemCount); } +/* MySQL FULLTEXT indexing has indexed 'words' as broken up by + * word break characters, such as in the regular expression: '\W+' + * or, in this case, checking the string with isalnum() function, + * must all be isalnum() + * Return: TRUE when there are word breaks + * FALSE - the string is all one 'word' + */ +static boolean hasWordBreaks(char *s) +/* Return TRUE if there is any word break in string. + * allowing characters _ * + - as those are special characters + * to the MySQL FULLTEXT search + * The string has already been checked for the special prefix + * characters of: " - + + * or the special end character of: * + */ +{ +char c; +while ((c = *s++) != 0) + { + if (c == '_' || c == '*' || c == '+' || c == '-') + continue; + if (! 
isalnum(c)) + return TRUE; + } +return FALSE; +} + +static char *quoteWords(char *s) +/* given a string with word break characters, break it up into + * a quoted string with the word break characters turned to single space + */ +{ +struct dyString *quoteString = dyStringNew(128); +dyStringPrintf(quoteString, "\""); +char c; +int spaceCount = 0; +while ((c = *s++) != 0) + if (isalnum(c) || c == '_' || c == '*' || c == '+' || c == '-') + { + dyStringPrintf(quoteString, "%c", c); + spaceCount = 0; + } + else + { + if (spaceCount) + continue; + dyStringPrintf(quoteString, " "); + ++spaceCount; + } +dyStringPrintf(quoteString, "\""); +return dyStringCannibalize(&quoteString); +} + static void addBrowserExists(struct dyString *query) /* add the AND clauses for browserExist depending upon option */ { if (browserMustExist) sqlDyStringPrintf(query, " AND browserExists=1"); else if (browserNotExist) sqlDyStringPrintf(query, " AND browserExists=0"); } static void addCategory(struct dyString *query) /* refSeqCategory = reference or representative */ { if (isNotEmpty(refSeqCategory)) sqlDyStringPrintf(query, " AND refSeqCategory='%s'", refSeqCategory); } @@ -236,31 +289,31 @@ } /* static long long oneWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw, boolean *prefixSearch) */ static long elapsedTime(struct jsonWrite *jw) { long nowTime = clock1000(); long elapsedTimeMs = nowTime - enteredMainTime; jsonWriteNumber(jw, "elapsedTimeMs", elapsedTimeMs); return elapsedTimeMs; } void apiFindGenome(char *pathString[MAX_PATH_INFO]) /* 'findGenome' function */ { char *searchString = cgiOptionalString(argQ); char *inputSearchString = cloneString(searchString); -char *endResultSearchString = NULL; +char *endResultSearchString = inputSearchString; boolean prefixSearch = FALSE; char *extraArgs = verifyLegalArgs(argFindGenome); genarkTable = genarkTableName(); asmListTable = assemblyListTableName(); if (extraArgs) apiErrAbort(err400, err400Msg, "extraneous arguments 
found for function /findGenome'%s'", extraArgs); boolean asmListExists = hTableExists("hgcentraltest", asmListTable); if (!asmListExists) apiErrAbort(err400, err400Msg, "table hgcentraltest.assemblyList does not exist for /findGenome"); boolean asmSummaryExists = hTableExists("hgcentraltest", "asmSummary"); if (!asmSummaryExists) apiErrAbort(err400, err400Msg, "table hgcentraltest.asmSummary does not exist for /findGenome"); @@ -386,45 +439,62 @@ if (specificYear > 0) jsonWriteNumber(jw, argYear, specificYear); if (isNotEmpty(refSeqCategory)) jsonWriteString(jw, argCategory, refSeqCategory); if (isNotEmpty(versionStatus)) jsonWriteString(jw, argStatus, versionStatus); if (isNotEmpty(assemblyLevel)) jsonWriteString(jw, argLevel, assemblyLevel); long long itemCount = 0; long long totalMatchCount = 0; char **words; AllocArray(words, wordCount); (void) chopByWhite(searchString, words, wordCount); if (1 == wordCount) + { + boolean doQuote = TRUE; + if (startsWith("\"", words[0])) + doQuote = FALSE; + if (startsWith("-", words[0])) + doQuote = FALSE; + if (startsWith("+", words[0])) + doQuote = FALSE; + if (endsWith(words[0], "*")) + doQuote = FALSE; + if (doQuote && hasWordBreaks(words[0])) + { + char *quotedWords = quoteWords(words[0]); + endResultSearchString = quotedWords; + itemCount = oneWordSearch(conn, quotedWords, jw, &totalMatchCount, &prefixSearch); + } else { itemCount = oneWordSearch(conn, words[0], jw, &totalMatchCount, &prefixSearch); + } + } else /* multiple word search */ itemCount = multipleWordSearch(conn, words, wordCount, jw, &totalMatchCount); if (prefixSearch) { struct dyString *addedStar = dyStringNew(64); dyStringPrintf(addedStar, "%s*", inputSearchString); endResultSearchString = dyStringCannibalize(&addedStar); jsonWriteString(jw, argQ, endResultSearchString); } else { - endResultSearchString = inputSearchString; - jsonWriteString(jw, argQ, inputSearchString); + jsonWriteString(jw, argQ, endResultSearchString); } /* rules about what can be in 
the search string: * + sign before a word indicates the word must be in the result * - sign before a word indicates it must not be in the result * * at end of word makes the word be a prefix search * "double quotes" to group words together as a phrase to match exactly * < or > adjust the words contribution to the relevance value * >moreImportant <lessImportant * ~ negates the word's contribution to the relevance value without * excluding it from the results * (parens clauses) to groups words together for more complex queries * | OR operator 'thisWord | otherWord' */