5f5b68942b3208f40494af94049234fbbb6f73b6 chmalee Tue Jul 13 09:08:17 2021 -0700 Add hubCrawl to list of utilities managed by the build, so it stays up to date for qateam to use it every week. diff --git src/hg/utils/hubCrawl/hubCrawl.c src/hg/utils/hubCrawl/hubCrawl.c index 650f286..adff58b 100644 --- src/hg/utils/hubCrawl/hubCrawl.c +++ src/hg/utils/hubCrawl/hubCrawl.c @@ -1,353 +1,353 @@ /* Copyright (C) 2014 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "axt.h" #include "common.h" #include "bigWig.h" #include "bigBed.h" #include "dystring.h" #include "errCatch.h" #include "hgBam.h" #include "htmshell.h" #include "htmlPage.h" #include "hui.h" #include "net.h" #include "options.h" #include "trackDb.h" #include "trackHub.h" #include "udc.h" #include "vcf.h" #include "hubSearchText.h" static int cacheTime = 1; void usage() /* Explain usage and exit. */ { errAbort( "hubCrawl - Crawl a track data hub and output search strings\n" "usage:\n" " hubCrawl http://yourHost/yourDir/hub.txt\n" "options:\n" " -udcDir=/dir/to/cache - place to put cache for hub, genome, and trackDb text files.\n" " -cacheTime=N - set cache refresh time in seconds, default %d\n" " -verbose=2 - output verbosely\n" , cacheTime ); } static struct optionSpec options[] = { {"udcDir", OPTION_STRING}, {"cacheTime", OPTION_INT}, {NULL, 0}, }; char *removeExtraSpaces(char *s) /* Replace long spans of whitespace in s with just the first whitespace * character in each span. */ { if (s == NULL) return NULL; char *scrubbed = needMem(strlen(s)); char *from=s; char *to=scrubbed; while (*from!='\0') { if (isspace(*from) && (*(from+1) == '\0' || isspace(*(from+1)))) from++; else *to++ = *from++; } return scrubbed; } char *cleanHubHtml(char *html) /* Clean up an HTML description page by removing all tags, javascript, and css, * and deleting a number of awkward special characters. */ { if (isEmpty(html)) return NULL; char *stripHtml = htmlTextStripJavascriptCssAndTags(html); strSwapChar(stripHtml, '\n', ' '); strSwapChar(stripHtml, '\t', ' '); strSwapChar(stripHtml, '\015', ' '); strSwapChar(stripHtml, ')', ' '); strSwapChar(stripHtml, '(', ' '); strSwapChar(stripHtml, '[', ' '); strSwapChar(stripHtml, ']', ' '); char *withoutExtraSpaces = removeExtraSpaces(stripHtml); return withoutExtraSpaces; } void trackHubCrawlTrack(struct trackDb *tdb, struct trackHubGenome *genome, char *hubUrl, char *dbName, FILE *searchFp, struct hash *visitedTracks) /* Given a trackDb and the hub genome it comes from, write out hubSearchText lines for all of * the tracks in that trackDb */ { if (hashLookup(visitedTracks, tdb->track) == NULL) { // Visit parent first, so that any parent HTML description is loaded before handling this // track. Otherwise we could write out the HTML description of this track without knowing // that it's identical to the parent's. hashStore(visitedTracks, tdb->track); if (tdb->parent != NULL) trackHubCrawlTrack(tdb->parent, genome, hubUrl, dbName, searchFp, visitedTracks); struct hubSearchText *trackHst = NULL; struct dyString *csParents = dyStringNew(0); struct dyString *csParentTypes = dyStringNew(0); AllocVar(trackHst); trackHst->hubUrl = cloneString(hubUrl); trackHst->db = cloneString(dbName); trackHst->track = cloneString(trackHubSkipHubName(tdb->track)); struct trackDb *ptdb = tdb->parent; while (ptdb != NULL) { // start with track dyStringPrintf(csParents, "\"%s\",", htmlEncode(trackHubSkipHubName(ptdb->track))); // now add the label, which may just be the track name again dyStringPrintf(csParents, "\""); if (isNotEmpty(tdb->longLabel)) dyStringPrintf(csParents, "%s", htmlEncode(ptdb->longLabel)); else if (isNotEmpty(tdb->shortLabel)) dyStringPrintf(csParents, "%s", htmlEncode(ptdb->shortLabel)); else dyStringPrintf(csParents, "%s", htmlEncode(ptdb->track)); dyStringPrintf(csParents, "\""); // now fill in the type of the parent, "comp" for composite/multiWig (ie valid trackUi page), // "super" for super track, and "view" for view, "other" for everything else. // these are used by hgHubConnect for printing the correct links to track ui pages // for search results if (tdbIsComposite(ptdb) || trackDbLocalSetting(ptdb, "container")) dyStringPrintf(csParentTypes, "comp"); else if (tdbIsSuper(ptdb)) dyStringPrintf(csParentTypes, "super"); else if (tdbIsCompositeView(ptdb)) dyStringPrintf(csParentTypes, "view"); else // handle any extra dyStringPrintf(csParentTypes, "other"); if (ptdb->parent != NULL) { dyStringPrintf(csParents, ","); dyStringPrintf(csParentTypes, ","); } ptdb = ptdb->parent; } trackHst->parents = dyStringCannibalize(&csParents); trackHst->parentTypes = dyStringCannibalize(&csParentTypes); if (isNotEmpty(tdb->longLabel)) { trackHst->label = cloneString(tdb->longLabel); } else if (isNotEmpty(tdb->shortLabel)) { trackHst->label = cloneString(tdb->shortLabel); } else trackHst->label = cloneString(trackHubSkipHubName(tdb->track)); trackHst->textLength = hubSearchTextShort; trackHst->text = cloneString(trackHubSkipHubName(tdb->track)); hubSearchTextTabOut(trackHst, searchFp); if (isNotEmpty(tdb->shortLabel)) { trackHst->text = cloneString(tdb->shortLabel); hubSearchTextTabOut(trackHst, searchFp); } if (isNotEmpty(tdb->longLabel)) { trackHst->text = cloneString(tdb->longLabel); hubSearchTextTabOut(trackHst, searchFp); } trackHubAddOneDescription(genome->trackDbFile, tdb); if (isNotEmpty(tdb->html)) { // In theory we could compare the html setting fields (the URLs for the track descriptions) // instead of the descriptions themselves, but that would falter if a remote server was set // up to return the same description page for a variety of URLs (it's happened). if (tdb->parent == NULL || tdb->parent->html == NULL || differentString(tdb->html, tdb->parent->html)) { trackHst->textLength = hubSearchTextLong; trackHst->text = cleanHubHtml(tdb->html); hubSearchTextTabOut(trackHst, searchFp); } } // memory leak ditching metadata pairs. slPairFreeValsAndList would fix that. trackHst->textLength = hubSearchTextMeta; trackHst->text = (char *) needMem(4096); struct slPair *metaPairs = trackDbMetaPairs(tdb); while (metaPairs != NULL) { safef(trackHst->text, 4096, "%s: %s", metaPairs->name, (char *) metaPairs->val); hubSearchTextTabOut(trackHst, searchFp); metaPairs = metaPairs->next; } // Write out lines for child tracks struct trackDb *subtrack = NULL; for (subtrack = tdb->subtracks; subtrack != NULL; subtrack = subtrack->next) { trackHubCrawlTrack(subtrack, genome, hubUrl, dbName, searchFp, visitedTracks); } } } void trackHubCrawlGenome(struct trackHubGenome *genome, struct trackHub *hub, FILE *searchFp) /* Given a hub genome and the hub it came from, write out hubSearchText lines for that genome. * NB: Errors fetching particular trackDb files will not be reported to the calling function. */ { struct hubSearchText *genomeHst = NULL; AllocVar(genomeHst); genomeHst->hubUrl = cloneString(hub->url); genomeHst->db = cloneString(trackHubSkipHubName(genome->name)); genomeHst->track = cloneString(""); char label[256]; if (isNotEmpty(genome->description)) safef(label, sizeof(label), "%s (%s)", genome->description, trackHubSkipHubName(genome->name)); else if (isNotEmpty(genome->organism)) safef(label, sizeof(label), "%s (%s)", trackHubSkipHubName(genome->organism), trackHubSkipHubName(genome->name)); else safef(label, sizeof(label), "%s", trackHubSkipHubName(genome->name)); genomeHst->label = cloneString(label); genomeHst->parents = cloneString(""); genomeHst->parentTypes = cloneString(""); genomeHst->textLength = hubSearchTextShort; genomeHst->text = cloneString(trackHubSkipHubName(genome->name)); hubSearchTextTabOut(genomeHst, searchFp); if (isNotEmpty(genome->organism) && differentString(genome->organism, genome->name)) { genomeHst->text = cloneString(trackHubSkipHubName(genome->organism)); hubSearchTextTabOut(genomeHst, searchFp); } if (isNotEmpty(genome->description)) { genomeHst->text = cloneString(genome->description); hubSearchTextTabOut(genomeHst, searchFp); } struct hashEl *hel = NULL; if (genome->settingsHash && (hel = hashLookup(genome->settingsHash, "scientificName")) != NULL) { char *sciName = (char *)(hel->val); if (differentString(sciName, genome->name)) { genomeHst->text = cloneString(sciName); hubSearchTextTabOut(genomeHst, searchFp); } } if (genome->settingsHash && (hel = hashLookup(genome->settingsHash, "htmlPath")) != NULL) { char *htmlPath = (char *)(hel->val); genomeHst->textLength = hubSearchTextLong; char *rawHtml = netReadTextFileIfExists(htmlPath); genomeHst->text = cleanHubHtml(rawHtml); if (isNotEmpty(genomeHst->text)) hubSearchTextTabOut(genomeHst, searchFp); } /* Write out trackDb search text */ -struct trackDb *tdbList = trackHubTracksForGenome(hub, genome); +struct trackDb *tdbList = trackHubTracksForGenome(hub, genome,NULL); tdbList = trackDbLinkUpGenerations(tdbList); tdbList = trackDbPolishAfterLinkup(tdbList, genome->name); trackHubPolishTrackNames(hub, tdbList); struct hash *visitedTracks = newHash(5); struct trackDb *tdb = NULL; for (tdb = tdbList; tdb != NULL; tdb = tdb->next) { trackHubCrawlTrack(tdb, genome, hub->url, genomeHst->db, searchFp, visitedTracks); } } int trackHubCrawl(char *hubUrl) /* Crawl a track data hub and output strings useful in a search */ { struct errCatch *errCatch = errCatchNew(); struct trackHub *hub = NULL; int retVal = 0; if (errCatchStart(errCatch)) { hub = trackHubOpen(hubUrl, "hub_0"); } errCatchEnd(errCatch); if (errCatch->gotError) { retVal = 1; fprintf(stderr, "%s\n", errCatch->message->string); } errCatchFree(&errCatch); if (hub == NULL) return 1; FILE *searchFp =stdout; struct hubSearchText *hubHst; AllocVar(hubHst); hubHst->hubUrl = cloneString(hub->url); hubHst->db = cloneString(""); hubHst->track = cloneString(""); hubHst->label = cloneString(""); hubHst->textLength = hubSearchTextShort; hubHst->text = cloneString(hub->shortLabel); hubHst->parents = cloneString(""); hubHst->parentTypes = cloneString(""); hubSearchTextTabOut(hubHst, searchFp); hubHst->text = cloneString(hub->longLabel); hubSearchTextTabOut(hubHst, searchFp); if (hub->descriptionUrl != NULL) { hubHst->textLength = hubSearchTextLong; char *rawHtml = netReadTextFileIfExists(hub->descriptionUrl); hubHst->text = cleanHubHtml(rawHtml); if (isNotEmpty(hubHst->text)) hubSearchTextTabOut(hubHst, searchFp); } struct trackHubGenome *genome; for (genome = hub->genomeList; genome != NULL; genome = genome->next) trackHubCrawlGenome(genome, hub, searchFp); trackHubClose(&hub); return retVal; } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 2) usage(); udcSetCacheTimeout(cacheTime); // UDC cache dir: first check for hg.conf setting, then override with command line option if given. setUdcCacheDir(); udcSetDefaultDir(optionVal("udcDir", udcDefaultDir())); if (trackHubCrawl(argv[1])) { // Maybe it's a transient failure; try again after a minute fprintf(stderr, "Error fetching %s - retrying after 60 seconds\n", argv[1]); sleep(60); return trackHubCrawl(argv[1]); } return 0; }