67ce69b2d6be585fcad3cb0dd5fc927e95c327ff
chmalee
  Mon Oct 14 13:48:54 2019 -0700
Rework of hgHubConnect hub searching to not use trackDb or udc so that searches can be sped up. The hubSearchText table now has an extra column holding the parent track names (if any) for a search result that matches a track. hgHubConnect has been changed to use this column of the table instead of using trackDb. hubCrawl has been changed to generate this additional column, refs #23812

diff --git src/hg/utils/hubCrawl/hubCrawl.c src/hg/utils/hubCrawl/hubCrawl.c
index 9aab3a7..d29425c 100644
--- src/hg/utils/hubCrawl/hubCrawl.c
+++ src/hg/utils/hubCrawl/hubCrawl.c
@@ -71,52 +71,71 @@
 if (isEmpty(html))
     return NULL;
 char *stripHtml = htmlTextStripJavascriptCssAndTags(html);
 strSwapChar(stripHtml, '\n', ' ');
 strSwapChar(stripHtml, '\t', ' ');
 strSwapChar(stripHtml, '\015', ' ');
 strSwapChar(stripHtml, ')', ' ');
 strSwapChar(stripHtml, '(', ' ');
 strSwapChar(stripHtml, '[', ' ');
 strSwapChar(stripHtml, ']', ' ');
 char *withoutExtraSpaces = removeExtraSpaces(stripHtml);
 return withoutExtraSpaces;
 }
 
 
-void trackHubCrawlTrack(struct trackDb *tdbList, struct trackHubGenome *genome, char *hubUrl,
+void trackHubCrawlTrack(struct trackDb *tdb, struct trackHubGenome *genome, char *hubUrl,
         char *dbName, FILE *searchFp, struct hash *visitedTracks)
 /* Given a trackDb and the hub genome it comes from, write out hubSearchText lines for all of
  * the tracks in that trackDb */
 {
-struct trackDb *tdb = tdbList;
-for (tdb = tdbList; tdb != NULL; tdb = tdb->next)
-    {
 if (hashLookup(visitedTracks, tdb->track) == NULL)
     {
     // Visit parent first, so that any parent HTML description is loaded before handling this
     // track.  Otherwise we could write out the HTML description of this track without knowing
     // that it's identical to the parent's.
+    hashStore(visitedTracks, tdb->track);
     if (tdb->parent != NULL)
         trackHubCrawlTrack(tdb->parent, genome, hubUrl, dbName, searchFp, visitedTracks);
 
-        hashStore(visitedTracks, tdb->track);
     struct hubSearchText *trackHst = NULL;
+    struct dyString *csParents = dyStringNew(0);
     AllocVar(trackHst);
     trackHst->hubUrl = cloneString(hubUrl);
     trackHst->db = cloneString(dbName);
     trackHst->track = cloneString(trackHubSkipHubName(tdb->track));
+    struct trackDb *ptdb = tdb->parent;
+    while (ptdb != NULL)
+        {
+        // start with track
+        dyStringPrintf(csParents, "\"%s\",", htmlEncode(trackHubSkipHubName(ptdb->track)));
+
+        // now add the label, which may just be the track name again
+        dyStringPrintf(csParents, "\"");
+        if (isNotEmpty(tdb->longLabel))
+            dyStringPrintf(csParents, "%s", htmlEncode(ptdb->longLabel));
+        else if (isNotEmpty(tdb->shortLabel))
+            dyStringPrintf(csParents, "%s", htmlEncode(ptdb->shortLabel));
+        else
+            dyStringPrintf(csParents, "%s", htmlEncode(ptdb->track));
+        dyStringPrintf(csParents, "\"");
+
+        if (ptdb->parent != NULL)
+            dyStringPrintf(csParents, ",");
+        ptdb = ptdb->parent;
+        }
+    trackHst->parents = dyStringCannibalize(&csParents);
     if (isNotEmpty(tdb->longLabel))
         {
         trackHst->label = cloneString(tdb->longLabel);
         }
     else if (isNotEmpty(tdb->shortLabel))
         {
         trackHst->label = cloneString(tdb->shortLabel);
         }
     else
         trackHst->label = cloneString(trackHubSkipHubName(tdb->track));
 
     trackHst->textLength = hubSearchTextShort;
     trackHst->text = cloneString(trackHubSkipHubName(tdb->track));
     hubSearchTextTabOut(trackHst, searchFp);
 
@@ -145,54 +164,58 @@
             }
         }
 
     // memory leak ditching metadata pairs.  slPairFreeValsAndList would fix that.
     trackHst->textLength = hubSearchTextMeta;
     trackHst->text = (char *) needMem(4096);
     struct slPair *metaPairs = trackDbMetaPairs(tdb);
     while (metaPairs != NULL)
         {
         safef(trackHst->text, 4096, "%s: %s", metaPairs->name, (char *) metaPairs->val);
         hubSearchTextTabOut(trackHst, searchFp);
         metaPairs = metaPairs->next;
         }
 
     // Write out lines for child tracks
-        if (tdb->subtracks != NULL)
-            trackHubCrawlTrack(tdb->subtracks, genome, hubUrl, dbName, searchFp, visitedTracks);
+    struct trackDb *subtrack = NULL;
+    while (subtrack != NULL)
+        {
+        trackHubCrawlTrack(subtrack, genome, hubUrl, dbName, searchFp, visitedTracks);
+        subtrack = subtrack->next;
         }
     }
 }
 
 
 void trackHubCrawlGenome(struct trackHubGenome *genome, struct trackHub *hub, FILE *searchFp)
 /* Given a hub genome and the hub it came from, write out hubSearchText lines for that genome.
  * NB: Errors fetching particular trackDb files will not be reported to the calling function. */
 {
 struct hubSearchText *genomeHst = NULL;
 AllocVar(genomeHst);
 genomeHst->hubUrl = cloneString(hub->url);
 genomeHst->db = cloneString(trackHubSkipHubName(genome->name));
 genomeHst->track = cloneString("");
 char label[256];
 if (isNotEmpty(genome->description))
     safef(label, sizeof(label), "%s (%s)", genome->description, trackHubSkipHubName(genome->name));
 else if (isNotEmpty(genome->organism))
     safef(label, sizeof(label), "%s (%s)", trackHubSkipHubName(genome->organism), trackHubSkipHubName(genome->name));
 else
     safef(label, sizeof(label), "%s", trackHubSkipHubName(genome->name));
 genomeHst->label = cloneString(label);
+genomeHst->parents = cloneString("");
 genomeHst->textLength = hubSearchTextShort;
 genomeHst->text = cloneString(trackHubSkipHubName(genome->name));
 hubSearchTextTabOut(genomeHst, searchFp);
 
 if (isNotEmpty(genome->organism) && differentString(genome->organism, genome->name))
     {
     genomeHst->text = cloneString(trackHubSkipHubName(genome->organism));
     hubSearchTextTabOut(genomeHst, searchFp);
     }
 if (isNotEmpty(genome->description))
     {
     genomeHst->text = cloneString(genome->description);
     hubSearchTextTabOut(genomeHst, searchFp);
     }
 struct hashEl *hel = NULL;
@@ -210,31 +233,35 @@
     char *htmlPath = (char *)(hel->val);
     genomeHst->textLength = hubSearchTextLong;
     char *rawHtml = netReadTextFileIfExists(htmlPath);
     genomeHst->text = cleanHubHtml(rawHtml);
     if (isNotEmpty(genomeHst->text))
         hubSearchTextTabOut(genomeHst, searchFp);
     }
 
 /* Write out trackDb search text */
 struct trackDb *tdbList = trackHubTracksForGenome(hub, genome);
 tdbList = trackDbLinkUpGenerations(tdbList);
 tdbList = trackDbPolishAfterLinkup(tdbList, genome->name);
 trackHubPolishTrackNames(hub, tdbList);
 
 struct hash *visitedTracks = newHash(5);
-trackHubCrawlTrack(tdbList, genome, hub->url, genomeHst->db, searchFp, visitedTracks);
+struct trackDb *tdb = NULL;
+for (tdb = tdbList; tdb != NULL; tdb = tdb->next)
+    {
+    trackHubCrawlTrack(tdb, genome, hub->url, genomeHst->db, searchFp, visitedTracks);
+    }
 }
 
 
 int trackHubCrawl(char *hubUrl)
 /* Crawl a track data hub and output strings useful in a search */
 {
 struct errCatch *errCatch = errCatchNew();
 struct trackHub *hub = NULL;
 int retVal = 0;
 
 if (errCatchStart(errCatch))
     {
     hub = trackHubOpen(hubUrl, "hub_0");
     }
 errCatchEnd(errCatch);
@@ -244,30 +271,31 @@
     fprintf(stderr, "%s\n", errCatch->message->string);
     }
 errCatchFree(&errCatch);
 
 if (hub == NULL)
     return 1;
 
 FILE *searchFp =stdout;
 struct hubSearchText *hubHst;
 AllocVar(hubHst);
 
 hubHst->hubUrl = cloneString(hub->url);
 hubHst->db = cloneString("");
 hubHst->track = cloneString("");
 hubHst->label = cloneString("");
+hubHst->parents = cloneString("");
 hubHst->textLength = hubSearchTextShort;
 hubHst->text = cloneString(hub->shortLabel);
 hubSearchTextTabOut(hubHst, searchFp);
 
 hubHst->text = cloneString(hub->longLabel);
 hubSearchTextTabOut(hubHst, searchFp);
 
 if (hub->descriptionUrl != NULL)
     {
     hubHst->textLength = hubSearchTextLong;
     char *rawHtml = netReadTextFileIfExists(hub->descriptionUrl);
     hubHst->text = cleanHubHtml(rawHtml);
     if (isNotEmpty(hubHst->text))
         hubSearchTextTabOut(hubHst, searchFp);
     }