15aca5da7562e7314e110729f6b5e85f3b77371b
jcasper
  Mon Jul 24 08:57:28 2017 -0700
hgHubConnect now requires all search terms and supports term expansion, refs #19758

diff --git src/hg/hgHubConnect/hgHubConnect.c src/hg/hgHubConnect/hgHubConnect.c
index 64fff60..41fa255 100644
--- src/hg/hgHubConnect/hgHubConnect.c
+++ src/hg/hgHubConnect/hgHubConnect.c
@@ -354,53 +354,106 @@
         "select %s.hubUrl from %s left join %s on %s.hubUrl = %s.hubUrl where %s.hubUrl is NULL",
         publicTable, publicTable, statusTable, publicTable, statusTable, statusTable); 
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     char *errorMessage = NULL;
     char *url = row[0];
 
     // add this url to the hubStatus table
     hubFindOrAddUrlInStatusTable(database, cart, url, &errorMessage);
     }
 }
 
 
+char *modifyTermsForHubSearch(char *hubSearchTerms, bool isStrictSearch)
+/* Rewrite free-text search terms for use in a MySQL boolean-mode fulltext query.
+ * Every character other than an alphanumeric, '_', '\'' or '-' is turned into a space.  Each
+ * remaining word is split on '-' and every piece is emitted as "+piece " (word required).
+ * Unless isStrictSearch is set, the last piece of each word also gets a trailing '*' (word
+ * expansion).  Words consisting only of hyphens contribute nothing.  For empty input no terms
+ * are appended.  The result is the string handed back by dyStringCannibalize(). */
+{
+char *cloneTerms = cloneString(hubSearchTerms);
+struct dyString *modifiedTerms = dyStringNew(0);
+if (isNotEmpty(cloneTerms))
+    {
+    int i;
+    for (i=0; cloneTerms[i] != '\0'; i++)
+        {
+        // cast keeps isalnum() well-defined for bytes >= 0x80 when plain char is signed
+        unsigned char c = (unsigned char) cloneTerms[i];
+        if (!isalnum(c) && c != '_' && c != '\'' && c != '-')
+            cloneTerms[i] = ' ';
+        }
+    // capacity must be an element count; sizeof(array) alone is a byte count and overstates it
+    char *splitTerms[1024];
+    int maxTerms = sizeof(splitTerms) / sizeof(splitTerms[0]);
+    int termCount = chopByWhite(cloneTerms, splitTerms, maxTerms);
+    for (i=0; i<termCount; i++)
+        {
+        char *hyphenatedTerms[1024];  // same capacity as splitTerms, so maxTerms applies here too
+        int hyphenTerms = chopString(splitTerms[i], "-", hyphenatedTerms, maxTerms);
+        if (hyphenTerms < 1)
+            continue;  // term was nothing but hyphens; hyphenatedTerms[] was never filled in
+        int j;
+        for (j=0; j<hyphenTerms-1; j++)
+            dyStringPrintf(modifiedTerms, "+%s ", hyphenatedTerms[j]);
+        // j now indexes the final piece, the only one eligible for word expansion
+        dyStringPrintf(modifiedTerms, "+%s%s ", hyphenatedTerms[j], isStrictSearch ? "" : "*");
+        }
+    }
+// NOTE(review): this traces every search to stderr; looks like leftover debugging - confirm
+fprintf(stderr, "Final search terms: %s\n", dyStringContents(modifiedTerms));
+return dyStringCannibalize(&modifiedTerms);
+}
+
+
 struct hubSearchText *getHubSearchResults(struct sqlConnection *conn, char *hubSearchTableName,
         char *hubSearchTerms, bool checkLongText, char *dbFilter, struct hash *hubLookup)
 /* Find hubs, genomes, and tracks that match the provided search terms.
  * Return all hits that satisfy the (optional) supplied assembly filter.
  * if checkLongText is FALSE, skip searching within the long description text entries */
 {
+char *cleanSearchTerms = cloneString(hubSearchTerms);
+if (isNotEmpty(cleanSearchTerms))
+    tolowers(cleanSearchTerms);
+bool isStrictSearch = FALSE;
+char *modifiedSearchTerms = modifyTermsForHubSearch(cleanSearchTerms, isStrictSearch);
 struct hubSearchText *hubSearchResultsList = NULL;
 struct dyString *query = dyStringNew(100);
 char *noLongText = NULL;
 
 if (!checkLongText)
     noLongText = cloneString("textLength = 'Short' and");
 else
     noLongText = cloneString("");
 
-sqlDyStringPrintf(query, "select * from %s where %s match(text) against ('%s' in natural language mode)",
-        hubSearchTableName, noLongText, hubSearchTerms);
+sqlDyStringPrintf(query, "select * from %s where %s match(text) against ('%s' in boolean mode)"
+        " order by match(text) against ('%s' in boolean mode)",
+        hubSearchTableName, noLongText, modifiedSearchTerms, modifiedSearchTerms);
 
 struct sqlResult *sr = sqlGetResult(conn, dyStringContents(query));
 char **row;
 while ((row = sqlNextRow(sr)) != NULL)
     {
-    struct hubSearchText *hst = hubSearchTextLoadWithNullGiveContext(row, hubSearchTerms);
+    struct hubSearchText *hst = hubSearchTextLoadWithNullGiveContext(row, cleanSearchTerms);
+    // Skip rows where the long text matched the more lax MySQL search (punctuation just
+    // splits terms into two words, so "rna-seq" finds "rna" and "seq" separately), but
+    // not the more strict rules used to find context for the search terms.
+    if ((hst->textLength == hubSearchTextLong) && isEmpty(hst->text))
+        continue;
     char *hubUrl = hst->hubUrl;
     struct hubEntry *hubInfo = hashFindVal(hubLookup, hubUrl);
     if (hubInfo == NULL)
         continue; // Search table evidently includes a hub that's not on this server.  Skip it.
     char *db = cloneString(hst->db);
     tolowers(db);
     if (isNotEmpty(dbFilter))
         {
         if (isNotEmpty(db))
             {
             if (stringIn(dbFilter, db) == NULL)
                 continue;
             }
         else
             {
@@ -849,31 +902,30 @@
 /* Given a hub's info and a structure listing the search hits within the hub, first print
  * a basic line of hub information with a "connect" button.  Then, if the search results
  * are non-NULL, write out information about the genomes and tracks from the search hits that
  * match the db filter.
  * If there are no search results to print, the basic hub lines are combined into a single HTML table
  * that is defined outside this function.
  * Otherwise, each hub line is printed in its own table followed by a <ul> containing details
  * about the search results. */
 {
 if (hubSearchResult != NULL)
     printf("<table class='hubList'><tbody>\n");
 outputPublicTableRow(hubInfo, count);
 if (hubSearchResult != NULL)
     {
     printf("</tbody></table>\n");
-
     struct trackHub *hub = fetchTrackHub(hubInfo);
     struct hubOutputStructure *hubOut = buildHubSearchOutputStructure(hub, hubSearchResult);
     if (dyStringIsEmpty(hubOut->descriptionMatch) && (hubOut->genomes == NULL))
         return; // no detailed search results; hit must have been to hub short label or something   
 
     printf("<div class=\"hubTdbTree\">\n");
     printf("<ul>\n");
     printf("<li>Search details ...\n<ul>\n");
     if (isNotEmpty(dyStringContents(hubOut->descriptionMatch)))
         printf("<li>Hub Description:&nbsp<span class='descriptionMatch'><em>%s</em></span></li>\n", dyStringContents(hubOut->descriptionMatch));
 
     struct genomeOutputStructure *genomeOut = hubOut->genomes;
     if (genomeOut != NULL)
         {
         printf("<li>%d Matching Assembl%s\n<ul>\n", hubOut->genomeCount, hubOut->genomeCount==1?"y":"ies");
@@ -940,89 +992,85 @@
     printf("<table class='hubList' id='hideThisTable'><tbody>\n");
     struct slName *thisHubName = NULL;
     for (thisHubName = hubsToPrint; thisHubName != NULL; thisHubName = thisHubName->next)
         {
         struct hubEntry *hubInfo = (struct hubEntry *) hashFindVal(hubLookup, thisHubName->name);
         if (hubInfo == NULL)
             {
             continue;
             }
         printOutputForHub(hubInfo, NULL, count);
         count++;
         }
     printf("</tbody></table>\n");
     printf("</div>\n");
     }
-
 jsInline(
         "function lineUpCols()\n"
         "    {\n"
         "    var tableList = $('table.hubList');\n"
         "    if (tableList.length == 0)\n"
         "        return;\n"
         "    var colWidths = new Array();\n"
         "    var combinedTrackTable = $('#hideThisTable');\n"
         "    for (i=0; i<combinedTrackTable[0].rows[0].cells.length; i++)\n"
         "        colWidths[i] = combinedTrackTable[0].rows[0].cells[i].clientWidth;\n"
         "    $('#hideThisDiv')[0].style.display = 'none';\n"
         "    for(i=0; i<tableList.length; i++)\n"
         "        {\n"
         "        for(j=0; j<tableList[i].rows[0].cells.length; j++)\n"
         "            tableList[i].rows[0].cells[j].style.width = colWidths[j]+'px';\n"
         "        }\n"
         "    }\n"
         "window.onload = lineUpCols();\n"
         );
 }
 
 
 static bool outputPublicTable(struct sqlConnection *conn, char *publicTable, char *statusTable,
         struct hash **pHash)
 /* Put up the list of public hubs and other controls for the page. */
 {
 char *hubSearchTerms = cartOptionalString(cart, hgHubSearchTerms);
-char *cleanSearchTerms = cloneString(hubSearchTerms); // only cleaned by tolowers() at the moment
 char *dbFilter = cartOptionalString(cart, hgHubDbFilter);
 char *lcDbFilter = cloneString(dbFilter);
 if (isNotEmpty(lcDbFilter))
     tolowers(lcDbFilter);
 
 // make sure all the public hubs are in the hubStatus table.
 addPublicHubsToHubStatus(conn, publicTable, statusTable);
 
 // build full public hub lookup hash, taking each URL to struct hubEntry * for that hub
 struct hash *hubLookup = buildPublicLookupHash(conn, publicTable, statusTable, pHash);
 
 printf("<div id=\"publicHubs\" class=\"hubList\"> \n");
 
 char *hubSearchTableName = cfgOptionDefault("hubSearchTextTable", "hubSearchText");
 int searchEnabled = sqlTableExists(conn, hubSearchTableName);
 
 printSearchAndFilterBoxes(searchEnabled, hubSearchTerms, dbFilter);
 
 struct hash *searchResultHash = NULL;
 struct slName *hubsToPrint = NULL;
 if (searchEnabled && !isEmpty(hubSearchTerms))
     {
     printSearchTerms(hubSearchTerms);
-    if (isNotEmpty(cleanSearchTerms))
-        tolowers(cleanSearchTerms);
     // Forcing checkDescriptions to TRUE right now, but we might want to add this as a
     // checkbox option for users in the near future.
     bool checkDescriptions = TRUE;
     struct hubSearchText *hubSearchResults = getHubSearchResults(conn, hubSearchTableName,
-            cleanSearchTerms, checkDescriptions, lcDbFilter, hubLookup);
+            hubSearchTerms, checkDescriptions, lcDbFilter, hubLookup);
     searchResultHash = newHash(5);
     struct hubSearchText *hst = hubSearchResults;
     while (hst != NULL)
         {
         struct hubSearchText *nextHst = hst->next;
         hst->next = NULL;
         struct hashEl *hubHashEnt = hashLookup(searchResultHash, hst->hubUrl);
         if (hubHashEnt == NULL)
             {
             slNameAddHead(&hubsToPrint, hst->hubUrl);
             hashAdd(searchResultHash, hst->hubUrl, hst);
             }
         else
             slAddTail(&(hubHashEnt->val), hst);
         hst = nextHst;