a345d17f7912a5f0e121951f6d261b496cce608b braney Fri May 10 09:25:54 2013 -0700 first cut at track item search in track hubs that have bigBed files with a searchIndex field refs #10426 diff --git src/hg/lib/trackHub.c src/hg/lib/trackHub.c index 3407631..de1b095 100644 --- src/hg/lib/trackHub.c +++ src/hg/lib/trackHub.c @@ -23,30 +23,34 @@ #include "ra.h" #include "filePath.h" #include "htmlPage.h" #include "trackDb.h" #include "trackHub.h" #include "errCatch.h" #include "hgBam.h" #include "bigWig.h" #include "bigBed.h" #include "hdb.h" #include "chromInfo.h" #include "grp.h" #include "twoBit.h" #include "dbDb.h" #include "net.h" +#include "bbiFile.h" +#include "bPlusTree.h" +#include "hgFind.h" +#include "hubConnect.h" static struct hash *hubCladeHash; // mapping of clade name to hub pointer static struct hash *hubAssemblyHash; // mapping of assembly name to genome struct static struct hash *hubOrgHash; // mapping from organism name to hub pointer struct trackHub *globalAssemblyHubList; // list of trackHubs in the user's cart char *trackHubRelativeUrl(char *hubUrl, char *path) /* Return full path (in URL form if it's a remote hub) given * path possibly relative to hubUrl. Do a freeMem of result * when done. */ { /* If path itself is a URL then just return a copy of it. */ if (hasProtocol(path)) return cloneString(path); @@ -70,82 +74,89 @@ /* Given a track hub clade(hub name) return the default genome. */ { if (hubCladeHash == NULL) return FALSE; struct hashEl *hel = hashLookup(hubCladeHash, clade); if (hel == NULL) return FALSE; struct trackHub *trackHub = hel->val; struct trackHubGenome *hubGenome = trackHub->genomeList; for(; hubGenome; hubGenome=hubGenome->next) if (hubGenome->twoBitPath != NULL) return hubGenome->organism ; return NULL; } -boolean trackHubDatabase(char *database) -/* Is this an assembly from an Assembly Data hub? */ +struct trackHubGenome *trackHubGetGenome(char *database) { if (hubAssemblyHash == NULL) - return FALSE; + errAbort("requesting hub genome with no hubs loaded"); + struct hashEl *hel = hashLookup(hubAssemblyHash, database); + if (hel == NULL) + return NULL; + +return (struct trackHubGenome *)hel->val; +} + +boolean trackHubDatabase(char *database) +/* Is this an assembly from an Assembly Data hub? */ +{ +if (hubAssemblyHash == NULL) return FALSE; -return TRUE; + +return trackHubGetGenome(database) != NULL; } char *trackHubAssemblyField(char *database, char *field) /* Get data field from a assembly data hub. */ { -if (hubAssemblyHash == NULL) - errAbort("requesting hub assembly field with no hubs loaded"); -struct hashEl *hel = hashLookup(hubAssemblyHash, database); -if (hel == NULL) +struct trackHubGenome *genome = trackHubGetGenome(database); + +if (genome == NULL) return NULL; -struct trackHubGenome *genome = hel->val; char *ret = hashFindVal(genome->settingsHash, field); return cloneString(ret); } static struct dbDb *makeDbDbFromAssemblyGenome(struct trackHubGenome *hubGenome) /* Make a dbdb struture from a single assembly hub database. */ { struct dbDb *db; AllocVar(db); db->genome = cloneString(hubGenome->organism); db->organism = cloneString(hubGenome->organism); db->name = cloneString(hubGenome->name); db->active = TRUE; db->description = cloneString(hubGenome->description); db->orderKey = sqlUnsigned(hashFindVal(hubGenome->settingsHash, "orderKey")); return db; } struct dbDb *trackHubDbDbFromAssemblyDb(char *database) /* Return a dbDb structure for just this database. */ { -if (hubAssemblyHash == NULL) - errAbort("requesting hub assembly dbDb with no hubs loaded"); -struct hashEl *hel = hashLookup(hubAssemblyHash, database); -if (hel == NULL) +struct trackHubGenome *genome = trackHubGetGenome(database); + +if (genome == NULL) return NULL; -struct trackHubGenome *genome = hel->val; return makeDbDbFromAssemblyGenome(genome); } struct slPair *trackHubGetCladeLabels() /* Get a list of labels describing the loaded assembly data hubs. */ { if (globalAssemblyHubList == NULL) return NULL; struct slPair *clade, *cladeList = NULL; struct trackHub *trackHub = globalAssemblyHubList; for(;trackHub; trackHub = trackHub->next) { @@ -177,106 +188,106 @@ { if (hubGenome->twoBitPath != NULL) { db = makeDbDbFromAssemblyGenome(hubGenome); slAddHead(&dbList, db); } } } } slSort(&dbList, hDbDbCmpOrderKey); slReverse(&dbList); return dbList; } -int trackHubChromCount(char *database) -/* Return number of chromosomes in a assembly data hub. */ -{ -struct hashEl *hel = hashLookup(hubAssemblyHash, database); -if (hel == NULL) - return 0; - -struct trackHubGenome *genome = hel->val; -struct slName *chromList = twoBitSeqNames(genome->twoBitPath); - -int num = slCount(chromList); -slFreeList(&chromList); -return num; -} - struct slName *trackHubAllChromNames(char *database) /* Return a list of all the chrom names in this assembly hub database. */ /* Free with slFreeList. */ { -struct hashEl *hel = hashLookup(hubAssemblyHash, database); -if (hel == NULL) - return 0; +struct trackHubGenome *genome = trackHubGetGenome(database); +if (genome == NULL) + return NULL; -struct trackHubGenome *genome = hel->val; struct slName *chromList = twoBitSeqNames(genome->twoBitPath); return chromList; } +int trackHubChromCount(char *database) +/* Return number of chromosomes in a assembly data hub. */ +{ +struct slName *chromList = trackHubAllChromNames(database); + +int num = slCount(chromList); +slFreeList(&chromList); +return num; +} + char *trackHubDefaultChrom(char *database) /* Return the default chromosome for this track hub assembly. */ { -struct hashEl *hel = hashLookup(hubAssemblyHash, database); -if (hel == NULL) - return NULL; +struct slName *chromList = trackHubAllChromNames(database); -struct trackHubGenome *genome = hel->val; -struct slName *chromList = twoBitSeqNames(genome->twoBitPath); +if (chromList == NULL) + return NULL; char *defaultName = cloneString( chromList->name); slFreeList(&chromList); return defaultName; } -struct chromInfo *trackHubChromInfo(char *database, char *chrom) -/* Return a chromInfo structure for just this chrom in this database. */ +struct chromInfo *trackHubMaybeChromInfo(char *database, char *chrom) +/* Return a chromInfo structure for just this chrom in this database. + * Return NULL if chrom doesn't exist. */ { -if (hubAssemblyHash == NULL) +struct trackHubGenome *genome = trackHubGetGenome(database); +if (genome == NULL) return NULL; -struct hashEl *hel = hashLookup(hubAssemblyHash, database); - -if (hel == NULL) +if (!twoBitIsSequence(genome->tbf, chrom)) return NULL; -struct trackHubGenome *genome = hel->val; struct chromInfo *ci; - AllocVar(ci); ci->chrom = cloneString(chrom); ci->fileName = genome->twoBitPath; ci->size = twoBitSeqSize(genome->tbf, chrom); return ci; } -struct chromInfo *trackHubAllChromInfo(char *db) -/* Return a chromInfo structure for all the chroms in this database. */ +struct chromInfo *trackHubChromInfo(char *database, char *chrom) +/* Return a chromInfo structure for just this chrom in this database. + * errAbort if chrom doesn't exist. */ { -struct hashEl *hel = hashLookup(hubAssemblyHash, db); +struct chromInfo *ci = trackHubMaybeChromInfo(database, chrom); -if (hel == NULL) +if (ci == NULL) + errAbort("%s is not in %s", chrom, database); + +return ci; +} + +struct chromInfo *trackHubAllChromInfo(char *database) +/* Return a chromInfo structure for all the chroms in this database. */ +{ +struct trackHubGenome *genome = trackHubGetGenome(database); +if (genome == NULL) return NULL; -struct trackHubGenome *genome = hel->val; struct chromInfo *ci, *ciList = NULL; struct slName *chromList = twoBitSeqNames(genome->twoBitPath); for(; chromList; chromList = chromList->next) { AllocVar(ci); ci->chrom = cloneString(chromList->name); ci->fileName = genome->twoBitPath; ci->size = twoBitSeqSize(genome->tbf, chromList->name); slAddHead(&ciList, ci); } slFreeList(&chromList); return ciList; } @@ -308,39 +319,33 @@ grp->label = cloneString(getRequiredGrpSetting(ra, "label", lf)); grp->priority = atof(getRequiredGrpSetting(ra, "priority", lf)); grp->defaultIsClosed = sqlUnsigned(getRequiredGrpSetting(ra,"defaultIsClosed",lf)); hashFree(&ra); } if (list) slReverse(&list); lineFileClose(&lf); return list; } struct grp *trackHubLoadGroups(char *database) /* Load the grp structures for this track hub database. */ { -if (hubAssemblyHash == NULL) - return NULL; - -struct hashEl *hel = hashLookup(hubAssemblyHash, database); - -if (hel == NULL) +struct trackHubGenome *genome = trackHubGetGenome(database); +if (genome == NULL) return NULL; - -struct trackHubGenome *genome = hel->val; struct grp *list = readGroupRa(genome->groups); return list; } char *trackHubGenomeNameToDb(char *genome) /* Return assembly name given a genome name if one exists, otherwise NULL. */ { struct hashEl *hel; if ((hubOrgHash != NULL) && (hel = hashLookup(hubOrgHash, genome)) != NULL) { struct trackHub *hub = hel->val; struct trackHubGenome *genomeList = hub->genomeList; for(; genomeList; genomeList=genomeList->next) if ((genomeList->organism != NULL ) && @@ -943,15 +948,99 @@ if (hub == NULL) return 1; verbose(2, "hub %s\nshortLabel %s\nlongLabel %s\n", hubUrl, hub->shortLabel, hub->longLabel); verbose(2, "%s has %d elements\n", hub->genomesFile, slCount(hub->genomeList)); struct trackHubGenome *genome; for (genome = hub->genomeList; genome != NULL; genome = genome->next) { retVal |= hubCheckGenome(hub, genome, errors, checkTracks); } trackHubClose(&hub); return retVal; } + + +static struct hgPos *bigBedIntervalListToHgPositions(struct bbiFile *bbi, char *term, struct bigBedInterval *intervalList) +/* Given an open bigBed file, and an interval list, return a pointer to a list of hgPos structures. */ +{ +struct hgPos *posList = NULL; +char chromName[bbi->chromBpt->keySize+1]; +int lastChromId = -1; +struct bigBedInterval *interval; + +for (interval = intervalList; interval != NULL; interval = interval->next) + { + struct hgPos *hgPos; + AllocVar(hgPos); + slAddHead(&posList, hgPos); + + bbiCachedChromLookup(bbi, interval->chromId, lastChromId, chromName, sizeof(chromName)); + lastChromId = interval->chromId; + + hgPos->chrom = cloneString(chromName); + hgPos->chromStart = interval->start; + hgPos->chromEnd = interval->end; + hgPos->name = term; + } + +return posList; +} + +static struct hgPos *getPosFromBigBed(char *bigDataUrl, char *indexField, char *term) +/* Given a bigBed file with a search index, check for term. */ +{ +struct bbiFile *bbi = bigBedFileOpen(bigDataUrl); +int fieldIx; +struct bptFile *bpt = bigBedOpenExtraIndex(bbi, indexField, &fieldIx); +struct lm *lm = lmInit(0); +struct bigBedInterval *intervalList; +intervalList = bigBedNameQuery(bbi, bpt, fieldIx, term, lm); + +return bigBedIntervalListToHgPositions(bbi, term, intervalList); +} + +static void findPosInTdbList(struct trackDb *tdbList, char *term, struct hgPositions *hgp) +/* Given a trackHub's trackDb entries, check each of them for a searchIndex */ +{ +struct trackDb *tdb; + +for(tdb=tdbList; tdb; tdb = tdb->next) + { + char *indexField = trackDbSetting(tdb, "searchIndex"); + char *bigDataUrl = trackDbSetting(tdb, "bigDataUrl"); + + if (indexField && bigDataUrl) + { + struct hgPos *posList = getPosFromBigBed(bigDataUrl, indexField, term); + + if (posList != NULL) + { + struct hgPosTable *table; + + AllocVar(table); + slAddHead(&hgp->tableList, table); + table->description = cloneString(tdb->table); + table->name = cloneString(tdb->table); + + table->posList = posList; + } + } + } +} + +void trackHubFindPos(char *db, char *term, struct hgPositions *hgp) +/* Look for term in track hubs. Update hgp if found */ +{ +struct trackDb *tdbList = NULL; +if (trackHubDatabase(db)) + { + struct trackHubGenome *genome = trackHubGetGenome(db); + tdbList = trackHubTracksForGenome(genome->trackHub, genome); + } +else + tdbList = hubCollectTracks(db, NULL); + +findPosInTdbList(tdbList, term, hgp); +}