15aca5da7562e7314e110729f6b5e85f3b77371b jcasper Mon Jul 24 08:57:28 2017 -0700 hgHubConnect now requires all search terms and supports term expansion, refs #19758 diff --git src/hg/hgHubConnect/hgHubConnect.c src/hg/hgHubConnect/hgHubConnect.c index 64fff60..41fa255 100644 --- src/hg/hgHubConnect/hgHubConnect.c +++ src/hg/hgHubConnect/hgHubConnect.c @@ -354,53 +354,106 @@ "select %s.hubUrl from %s left join %s on %s.hubUrl = %s.hubUrl where %s.hubUrl is NULL", publicTable, publicTable, statusTable, publicTable, statusTable, statusTable); struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { char *errorMessage = NULL; char *url = row[0]; // add this url to the hubStatus table hubFindOrAddUrlInStatusTable(database, cart, url, &errorMessage); } } +char *modifyTermsForHubSearch(char *hubSearchTerms, bool isStrictSearch) +/* This won't exactly be pretty. MySQL treats any sequence of alphanumerics and underscores + * as a word, and single apostrophes are allowed as long as they don't come back-to-back. + * Cut down to those characters, then add initial + (for requiring word) and * (for word + * expansion) as appropriate. */ +{ +char *cloneTerms = cloneString(hubSearchTerms); +struct dyString *modifiedTerms = dyStringNew(0); +if (isNotEmpty(cloneTerms)) + { + int i; + for (i=0; i<strlen(cloneTerms); i++) + { + // allowed punctuation is underscore and apostrophe, and we'll do special handling for hyphen + if (!isalnum(cloneTerms[i]) && cloneTerms[i] != '_' && cloneTerms[i] != '\'' && + cloneTerms[i] != '-') + cloneTerms[i] = ' '; + } + char *splitTerms[1024]; + int termCount = chopByWhite(cloneTerms, splitTerms, sizeof(splitTerms)); + for (i=0; i<termCount; i++) + { + char *hyphenatedTerms[1024]; + int hyphenTerms = chopString(splitTerms[i], "-", hyphenatedTerms, sizeof(hyphenatedTerms)); + int j; + for (j=0; j<hyphenTerms-1; j++) + { + dyStringPrintf(modifiedTerms, "+%s ", hyphenatedTerms[j]); + } + if (isStrictSearch) + dyStringPrintf(modifiedTerms, "+%s ", hyphenatedTerms[j]); + else + { + dyStringPrintf(modifiedTerms, "+%s* ", hyphenatedTerms[j]); + } + } + } +fprintf(stderr, "Final search terms: %s\n", dyStringContents(modifiedTerms)); +return dyStringCannibalize(&modifiedTerms); +} + + struct hubSearchText *getHubSearchResults(struct sqlConnection *conn, char *hubSearchTableName, char *hubSearchTerms, bool checkLongText, char *dbFilter, struct hash *hubLookup) /* Find hubs, genomes, and tracks that match the provided search terms. * Return all hits that satisfy the (optional) supplied assembly filter. * if checkLongText is FALSE, skip searching within the long description text entries */ { +char *cleanSearchTerms = cloneString(hubSearchTerms); +if (isNotEmpty(cleanSearchTerms)) + tolowers(cleanSearchTerms); +bool isStrictSearch = FALSE; +char *modifiedSearchTerms = modifyTermsForHubSearch(cleanSearchTerms, isStrictSearch); struct hubSearchText *hubSearchResultsList = NULL; struct dyString *query = dyStringNew(100); char *noLongText = NULL; if (!checkLongText) noLongText = cloneString("textLength = 'Short' and"); else noLongText = cloneString(""); -sqlDyStringPrintf(query, "select * from %s where %s match(text) against ('%s' in natural language mode)", - hubSearchTableName, noLongText, hubSearchTerms); +sqlDyStringPrintf(query, "select * from %s where %s match(text) against ('%s' in boolean mode)" + " order by match(text) against ('%s' in boolean mode)", + hubSearchTableName, noLongText, modifiedSearchTerms, modifiedSearchTerms); struct sqlResult *sr = sqlGetResult(conn, dyStringContents(query)); char **row; while ((row = sqlNextRow(sr)) != NULL) { - struct hubSearchText *hst = hubSearchTextLoadWithNullGiveContext(row, hubSearchTerms); + struct hubSearchText *hst = hubSearchTextLoadWithNullGiveContext(row, cleanSearchTerms); + // Skip rows where the long text matched the more lax MySQL search (punctuation just + // splits terms into two words, so "rna-seq" finds "rna" and "seq" separately, but + // not the more strict rules used to find context for the search terms. + if ((hst->textLength == hubSearchTextLong) && isEmpty(hst->text)) + continue; char *hubUrl = hst->hubUrl; struct hubEntry *hubInfo = hashFindVal(hubLookup, hubUrl); if (hubInfo == NULL) continue; // Search table evidently includes a hub that's not on this server. Skip it. char *db = cloneString(hst->db); tolowers(db); if (isNotEmpty(dbFilter)) { if (isNotEmpty(db)) { if (stringIn(dbFilter, db) == NULL) continue; } else { @@ -849,31 +902,30 @@ /* Given a hub's info and a structure listing the search hits within the hub, first print * a basic line of hub information with a "connect" button. Then, if the search results * are non-NULL, write out information about the genomes and tracks from the search hits that * match the db filter. * If there are no search results to print, the basic hub lines are combined into a single HTML table * that is defined outside this function. * Otherwise, each hub line is printed in its own table followed by a <ul> containing details * about the search results. */ { if (hubSearchResult != NULL) printf("<table class='hubList'><tbody>\n"); outputPublicTableRow(hubInfo, count); if (hubSearchResult != NULL) { printf("</tbody></table>\n"); - struct trackHub *hub = fetchTrackHub(hubInfo); struct hubOutputStructure *hubOut = buildHubSearchOutputStructure(hub, hubSearchResult); if (dyStringIsEmpty(hubOut->descriptionMatch) && (hubOut->genomes == NULL)) return; // no detailed search results; hit must have been to hub short label or something printf("<div class=\"hubTdbTree\">\n"); printf("<ul>\n"); printf("<li>Search details ...\n<ul>\n"); if (isNotEmpty(dyStringContents(hubOut->descriptionMatch))) printf("<li>Hub Description: <span class='descriptionMatch'><em>%s</em></span></li>\n", dyStringContents(hubOut->descriptionMatch)); struct genomeOutputStructure *genomeOut = hubOut->genomes; if (genomeOut != NULL) { printf("<li>%d Matching Assembl%s\n<ul>\n", hubOut->genomeCount, hubOut->genomeCount==1?"y":"ies"); @@ -940,89 +992,85 @@ printf("<table class='hubList' id='hideThisTable'><tbody>\n"); struct slName *thisHubName = NULL; for (thisHubName = hubsToPrint; thisHubName != NULL; thisHubName = thisHubName->next) { struct hubEntry *hubInfo = (struct hubEntry *) hashFindVal(hubLookup, thisHubName->name); if (hubInfo == NULL) { continue; } printOutputForHub(hubInfo, NULL, count); count++; } printf("</tbody></table>\n"); printf("</div>\n"); } - jsInline( "function lineUpCols()\n" " {\n" " var tableList = $('table.hubList');\n" " if (tableList.length == 0)\n" " return;\n" " var colWidths = new Array();\n" " var combinedTrackTable = $('#hideThisTable');\n" " for (i=0; i<combinedTrackTable[0].rows[0].cells.length; i++)\n" " colWidths[i] = combinedTrackTable[0].rows[0].cells[i].clientWidth;\n" " $('#hideThisDiv')[0].style.display = 'none';\n" " for(i=0; i<tableList.length; i++)\n" " {\n" " for(j=0; j<tableList[i].rows[0].cells.length; j++)\n" " tableList[i].rows[0].cells[j].style.width = colWidths[j]+'px';\n" " }\n" " }\n" "window.onload = lineUpCols();\n" ); } static bool outputPublicTable(struct sqlConnection *conn, char *publicTable, char *statusTable, struct hash **pHash) /* Put up the list of public hubs and other controls for the page. */ { char *hubSearchTerms = cartOptionalString(cart, hgHubSearchTerms); -char *cleanSearchTerms = cloneString(hubSearchTerms); // only cleaned by tolowers() at the moment char *dbFilter = cartOptionalString(cart, hgHubDbFilter); char *lcDbFilter = cloneString(dbFilter); if (isNotEmpty(lcDbFilter)) tolowers(lcDbFilter); // make sure all the public hubs are in the hubStatus table. addPublicHubsToHubStatus(conn, publicTable, statusTable); // build full public hub lookup hash, taking each URL to struct hubEntry * for that hub struct hash *hubLookup = buildPublicLookupHash(conn, publicTable, statusTable, pHash); printf("<div id=\"publicHubs\" class=\"hubList\"> \n"); char *hubSearchTableName = cfgOptionDefault("hubSearchTextTable", "hubSearchText"); int searchEnabled = sqlTableExists(conn, hubSearchTableName); printSearchAndFilterBoxes(searchEnabled, hubSearchTerms, dbFilter); struct hash *searchResultHash = NULL; struct slName *hubsToPrint = NULL; if (searchEnabled && !isEmpty(hubSearchTerms)) { printSearchTerms(hubSearchTerms); - if (isNotEmpty(cleanSearchTerms)) - tolowers(cleanSearchTerms); // Forcing checkDescriptions to TRUE right now, but we might want to add this as a // checkbox option for users in the near future. bool checkDescriptions = TRUE; struct hubSearchText *hubSearchResults = getHubSearchResults(conn, hubSearchTableName, - cleanSearchTerms, checkDescriptions, lcDbFilter, hubLookup); + hubSearchTerms, checkDescriptions, lcDbFilter, hubLookup); searchResultHash = newHash(5); struct hubSearchText *hst = hubSearchResults; while (hst != NULL) { struct hubSearchText *nextHst = hst->next; hst->next = NULL; struct hashEl *hubHashEnt = hashLookup(searchResultHash, hst->hubUrl); if (hubHashEnt == NULL) { slNameAddHead(&hubsToPrint, hst->hubUrl); hashAdd(searchResultHash, hst->hubUrl, hst); } else slAddTail(&(hubHashEnt->val), hst); hst = nextHst;