7338fb10bb7ffdc0a75445a36cd128e6be593d36 braney Mon Feb 26 10:51:47 2024 -0800 fix a bunch of places that were printing out hub_#_db instead of just db diff --git src/hg/hgTables/bigBed.c src/hg/hgTables/bigBed.c index 68567b5..f0467fd 100644 --- src/hg/hgTables/bigBed.c +++ src/hg/hgTables/bigBed.c @@ -1,626 +1,626 @@ /* bigBed - stuff to handle bigBed in the Table Browser. */ /* Copyright (C) 2013 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "hash.h" #include "linefile.h" #include "dystring.h" #include "localmem.h" #include "jksql.h" #include "cheapcgi.h" #include "cart.h" #include "web.h" #include "bed.h" #include "hdb.h" #include "trackDb.h" #include "obscure.h" #include "hmmstats.h" #include "correlate.h" #include "asParse.h" #include "bbiFile.h" #include "bigBed.h" #include "hubConnect.h" #include "asFilter.h" #include "hgTables.h" #include "trackHub.h" #include "chromAlias.h" #include "bPlusTree.h" #include "errCatch.h" boolean isBigBed(char *database, char *table, struct trackDb *parent, struct customTrack *(*ctLookupName)(char *table)) /* Local test to see if something is big bed. Handles hub tracks unlike hIsBigBed. */ { struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table); // if "table" is explicitly listed, we're going to use that instead of any bigDataUrl if (tdb && hashLookup(tdb->settingsHash, "table")) return FALSE; if (tdb) return tdbIsBigBed(tdb); else return hIsBigBed(database, table, parent, ctLookupName); } static struct hash *asColumnHash(struct asObject *as) /* Return a hash full of the object's columns, keyed by colum name */ { struct hash *hash = hashNew(6); struct asColumn *col; for (col = as->columnList; col != NULL; col = col->next) hashAdd(hash, col->name, col); return hash; } static void fillField(struct hash *colHash, char *key, char output[HDB_MAX_FIELD_STRING]) /* If key is in colHash, then copy key to output. */ { if (hashLookup(colHash, key)) strncpy(output, key, HDB_MAX_FIELD_STRING-1); } struct hTableInfo *bigBedToHti(char *table, struct sqlConnection *conn) /* Get fields of bigBed into hti structure. */ { /* Get columns in asObject format. */ char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpenAlias(fileName, chromAliasFindAliases); struct asObject *as = bigBedAsOrDefault(bbi); /* Allocate hTableInfo structure and fill in info about bed fields. */ struct hash *colHash = asColumnHash(as); struct hTableInfo *hti; AllocVar(hti); hti->rootName = cloneString(table); hti->isPos= TRUE; fillField(colHash, "chrom", hti->chromField); fillField(colHash, "chromStart", hti->startField); fillField(colHash, "chromEnd", hti->endField); fillField(colHash, "name", hti->nameField); fillField(colHash, "score", hti->scoreField); fillField(colHash, "strand", hti->strandField); fillField(colHash, "thickStart", hti->cdsStartField); fillField(colHash, "thickEnd", hti->cdsEndField); fillField(colHash, "blockCount", hti->countField); fillField(colHash, "chromStarts", hti->startsField); fillField(colHash, "blockSizes", hti->endsSizesField); hti->hasCDS = (bbi->definedFieldCount >= 8); hti->hasBlocks = (bbi->definedFieldCount >= 12); char type[256]; safef(type, sizeof(type), "bed %d %c", bbi->definedFieldCount, (bbi->definedFieldCount == bbi->fieldCount ? '.' : '+')); hti->type = cloneString(type); freeMem(fileName); hashFree(&colHash); bbiFileClose(&bbi); return hti; } struct slName *bigBedGetFields(char *table, struct sqlConnection *conn) /* Get fields of bigBed as simple name list. */ { char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpenAlias(fileName, chromAliasFindAliases); struct asObject *as = bigBedAsOrDefault(bbi); struct slName *names = asColNames(as); freeMem(fileName); bbiFileClose(&bbi); return names; } struct sqlFieldType *bigBedListFieldsAndTypes(struct trackDb *tdb, struct sqlConnection *conn) /* Get fields of bigBed as list of sqlFieldType. */ { char *fileOrUrl = bigFileNameFromCtOrHub(tdb->table, conn); if (fileOrUrl == NULL) fileOrUrl = bbiNameFromSettingOrTable(tdb, conn, tdb->table); struct bbiFile *bbi = bigBedFileOpenAlias(fileOrUrl, chromAliasFindAliases); struct asObject *as = bigBedAsOrDefault(bbi); struct sqlFieldType *list = sqlFieldTypesFromAs(as); bbiFileClose(&bbi); return list; } static void addFilteredBedsOnRegion(struct bbiFile *bbi, struct region *region, char *table, struct asFilter *filter, struct lm *bedLm, struct bed **pBedList) /* Add relevant beds in reverse order to pBedList */ { struct lm *bbLm = lmInit(0); struct bigBedInterval *ivList = NULL, *iv; ivList = bigBedIntervalQuery(bbi, region->chrom, region->start, region->end, 0, bbLm); char *row[bbi->fieldCount]; char startBuf[16], endBuf[16]; struct hash *idHash = NULL; if (bbi->fieldCount >= 4) idHash = identifierHash(database, table); for (iv = ivList; iv != NULL; iv = iv->next) { bigBedIntervalToRow(iv, region->chrom, startBuf, endBuf, row, bbi->fieldCount); if (asFilterOnRow(filter, row)) { if ((idHash != NULL) && (hashLookup(idHash, row[3]) == NULL)) continue; struct bed *bed = bedLoadN(row, bbi->definedFieldCount); struct bed *lmBed = lmCloneBed(bed, bedLm); slAddHead(pBedList, lmBed); bedFree(&bed); } } lmCleanup(&bbLm); } static struct bptFile *getNameIndexOrDie(struct bbiFile *bbi, int *pFieldIndex) /* Return the index on the 'name' field in the passed bbi. errAbort on failure. */ { struct bptFile *bpt = NULL; struct errCatch *errCatch = errCatchNew(); if (errCatchStart(errCatch)) { bpt = bigBedOpenExtraIndex(bbi, "name", pFieldIndex); } errCatchEnd(errCatch); if (errCatch->gotError) { errAbort("Getting identifiers from whole genome regions requires an index on the name field of the bigBedFile %s", bbi->fileName); } errCatchFree(&errCatch); return bpt; } struct bed *bigBedGetFilteredBedsOnRegions(struct sqlConnection *conn, char *db, char *table, struct region *regionList, struct lm *lm, int *retFieldCount) /* Get list of beds from bigBed, in all regions, that pass filtering. */ { /* Connect to big bed and get metadata and filter. */ char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpenAlias(fileName, chromAliasFindAliases); struct asObject *as = bigBedAsOrDefault(bbi); struct asFilter *filter = asFilterFromCart(cart, db, table, as); struct bed *bedList = NULL; /* If we're doing a whole-genome query with a name index then use the name index to retrieve items * instead of iterating over regions. */ struct hash *idHash = NULL; if (bbi->definedFieldCount >= 4) idHash = identifierHash(db, table); int fieldIx; struct bptFile *bpt = NULL; struct lm *bbLm = NULL; struct bigBedInterval *ivList = NULL; if (idHash && isRegionWholeGenome()) bpt = getNameIndexOrDie(bbi, &fieldIx); if (bpt != NULL) { struct slName *nameList = hashSlNameFromHash(idHash), *name; int count = slCount(nameList); char *names[count]; int ii; for (ii=0, name = nameList; ii < count; ii++, name = name->next) { names[ii] = name->name; } bbLm = lmInit(0); ivList = bigBedMultiNameQuery(bbi, bpt, fieldIx, names, count, bbLm); slNameFreeList(&nameList); } struct region *region; for (region = regionList; region != NULL; region = region->next) { if (bpt) { /*** NOTE: it is inefficient to convert intervals from a name-index query to filtered bed * inside the loop on regionList. However, bigBedGetFilteredBedsOnRegions is called by * getFilteredBeds on a "regionList" that has been doctored to one region at a time, * so we can do intersection one region at a time. Since this is called once per region, * we really do need to restrict items to region->chrom, otherwise all items would be * returned for every region. It is still much more efficient for large bigBeds to do * name-index queries when names are pasted/uploaded than to fetch all intervals in all * regions and then check names. See MLQ #32625. */ char chromBuf[4096]; struct bigBedInterval *iv = NULL; char *displayChromName = NULL; int lastChromId = -1; for (iv = ivList; iv != NULL; iv = iv->next) { if (iv->chromId != lastChromId) { bptStringKeyAtPos(bbi->chromBpt, iv->chromId, chromBuf, sizeof chromBuf); displayChromName = chromAliasGetDisplayChrom(database, cart, hgOfficialChromName(database, chromBuf)); } if (sameString(displayChromName, region->chrom)) { char *row[bbi->fieldCount]; char startBuf[16], endBuf[16]; bigBedIntervalToRow(iv, displayChromName, startBuf, endBuf, row, bbi->fieldCount); if (asFilterOnRow(filter, row)) { struct bed *bed = bedLoadN(row, bbi->definedFieldCount); struct bed *lmBed = lmCloneBed(bed, lm); slAddHead(&bedList, lmBed); bedFree(&bed); } } lastChromId = iv->chromId; } } else { /* Get beds a region at a time. */ addFilteredBedsOnRegion(bbi, region, table, filter, lm, &bedList); } slReverse(&bedList); } /* Clean up and return. */ if (retFieldCount != NULL) *retFieldCount = bbi->definedFieldCount; lmCleanup(&bbLm); hashFree(&idHash); bptFileDetach(&bpt); bbiFileClose(&bbi); freeMem(fileName); return bedList; } void bigBedTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f, char outSep) /* Print out selected fields from Big Bed. If fields is NULL, then print out all fields. */ { if (f == NULL) f = stdout; /* Convert comma separated list of fields to array. */ int fieldCount = chopByChar(fields, ',', NULL, 0); char **fieldArray; AllocArray(fieldArray, fieldCount); chopByChar(fields, ',', fieldArray, fieldCount); /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by * column name. */ struct hash *fieldHash = hashNew(0); struct slName *bb, *bbList = bigBedGetFields(table, conn); int i; for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i) hashAddInt(fieldHash, bb->name, i); // If bigBed has name column, look up pasted/uploaded identifiers if any: struct hash *idHash = NULL; if (slCount(bbList) >= 4) idHash = identifierHash(db, table); /* Create an array of column indexes corresponding to the selected field list. */ int *columnArray; AllocArray(columnArray, fieldCount); for (i=0; i<fieldCount; ++i) { columnArray[i] = hashIntVal(fieldHash, fieldArray[i]); } /* Output row of labels */ fprintf(f, "#"); if (outSep == ',') fputc('"', f); fprintf(f, "%s", fieldArray[0]); if (outSep == ',') fputc('"', f); for (i=1; i<fieldCount; ++i) { fputc(outSep, f); if (outSep == ',') fputc('"', f); fprintf(f, "%s", fieldArray[i]); if (outSep == ',') fputc('"', f); } fprintf(f, "\n"); /* Open up bigBed file. */ char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpenAlias(fileName, chromAliasFindAliases); struct asObject *as = bigBedAsOrDefault(bbi); struct asFilter *filter = NULL; if (anyFilter()) { filter = asFilterFromCart(cart, db, table, as); if (filter) { fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList)); } } struct bptFile *bpt = NULL; int fieldIx; if (idHash && isRegionWholeGenome()) bpt = getNameIndexOrDie(bbi, &fieldIx); char *row[bbi->fieldCount]; char startBuf[16], endBuf[16]; if (bpt) // if we have an index it means we're whole genome and don't need to filter based on regions { struct slName *nameList = hashSlNameFromHash(idHash); int count = slCount(nameList); char *names[count]; int ii; for (ii=0; ii < count; ii++) { names[ii] = nameList->name; nameList = nameList->next; } struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedMultiNameQuery(bbi, bpt, fieldIx, names, count, lm); char chromBuf[4096]; struct bigBedInterval *interval, *prevInterval = NULL; char * displayChromName = NULL; for (interval = ivList; interval != NULL; prevInterval = interval, interval = interval->next) { int lastChromId = (prevInterval == NULL ? -1 : prevInterval->chromId); if (interval->chromId != lastChromId) { bptStringKeyAtPos(bbi->chromBpt, interval->chromId, chromBuf, sizeof chromBuf); displayChromName = chromAliasGetDisplayChrom(database, cart, hgOfficialChromName(database, chromBuf)); } bigBedIntervalToRow(interval, displayChromName, startBuf, endBuf, row, bbi->fieldCount); if (asFilterOnRow(filter, row)) { int i; if (outSep == ',') fputc('"', f); fprintf(f, "%s", row[columnArray[0]]); if (outSep == ',') fputc('"', f); for (i=1; i<fieldCount; ++i) { fputc(outSep, f); if (outSep == ',') fputc('"', f); fprintf(f, "%s", row[columnArray[i]]); if (outSep == ',') fputc('"', f); } fprintf(f, "\n"); } } } else { /* Loop through outputting each region */ struct region *region, *regionList = getRegions(); for (region = regionList; region != NULL; region = region->next) { struct lm *lm = lmInit(0); struct bigBedInterval *iv, *ivList = bigBedIntervalQuery(bbi, region->chrom, region->start, region->end, 0, lm); char * displayChromName = chromAliasGetDisplayChrom(database, cart, region->chrom); for (iv = ivList; iv != NULL; iv = iv->next) { bigBedIntervalToRow(iv, displayChromName, startBuf, endBuf, row, bbi->fieldCount); if (asFilterOnRow(filter, row)) { if ((idHash != NULL) && (hashLookup(idHash, row[3]) == NULL)) continue; int i; if (outSep == ',') fputc('"', f); fprintf(f, "%s", row[columnArray[0]]); if (outSep == ',') fputc('"', f); for (i=1; i<fieldCount; ++i) { fputc(outSep, f); if (outSep == ',') fputc('"', f); fprintf(f, "%s", row[columnArray[i]]); if (outSep == ',') fputc('"', f); } fprintf(f, "\n"); } } lmCleanup(&lm); } } /* Clean up and exit. */ bbiFileClose(&bbi); hashFree(&fieldHash); freeMem(fieldArray); freeMem(columnArray); } static unsigned slCountAtMost(const void *list, unsigned max) // return the length of the list, but only count up to max { struct slList *pt = (struct slList *)list; int len = 0; while (pt != NULL) { len += 1; pt = pt->next; if (len == max) break; } return len; } static struct bigBedInterval *getNElements(struct bbiFile *bbi, struct bbiChromInfo *chromList, struct lm *lm, int n) // get up to n sample rows from the first chrom listed in the bigBed. // will return less than n if there are less than n on the first chrom. { struct bigBedInterval *ivList = NULL; // start out requesting only 10k bp so we don't hang if the bigBed is huge int currentLen = 10000; // look about 2/3 of the way through the chrom to avoid the telomeres // and the centromere int startAddr = 2 * chromList->size / 3; int endAddr; while ((slCountAtMost(ivList, n)) < n) { endAddr = startAddr + currentLen; // if we're pointing beyond the end of the chromosome if (endAddr > chromList->size) { // move the start address back startAddr -= (endAddr - chromList->size); endAddr = chromList->size; } // if we're pointing to before the start of the chrom if (startAddr < 0) startAddr = 0; // ask for n items ivList = bigBedIntervalQuery(bbi, chromList->name, startAddr, endAddr, n, lm); currentLen *= 2; if ((startAddr == 0) && (endAddr == chromList->size)) break; } return ivList; } struct slName *randomBigBedIds(char *table, struct sqlConnection *conn, int count) /* Return some arbitrary IDs from a bigBed file. */ { /* Figure out bigBed file name and open it. Get contents for first chromosome as an example. */ struct slName *idList = NULL; char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpenAlias(fileName, chromAliasFindAliases); struct bbiChromInfo *chromList = bbiChromList(bbi); struct lm *lm = lmInit(0); int orderedCount = count * 4; if (orderedCount < 100) orderedCount = 100; struct bigBedInterval *iv, *ivList = getNElements(bbi, chromList, lm, orderedCount); shuffleList(&ivList); // Make a list of item names from intervals. int outCount = 0; for (iv = ivList; iv != NULL && outCount < count; iv = iv->next) { char *row[bbi->fieldCount]; char startBuf[16], endBuf[16]; bigBedIntervalToRow(iv, chromList->name, startBuf, endBuf, row, bbi->fieldCount); if (idList == NULL || differentString(row[3], idList->name)) { slAddHead(&idList, slNameNew(row[3])); outCount++; } } lmCleanup(&lm); bbiFileClose(&bbi); freeMem(fileName); return idList; } void showSchemaBigBed(char *table, struct trackDb *tdb) /* Show schema on bigBed. */ { /* Figure out bigBed file name and open it. Get contents for first chromosome as an example. */ struct sqlConnection *conn = NULL; if (!trackHubDatabase(database)) conn = hAllocConn(database); char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpenAlias(fileName, chromAliasFindAliases); struct bbiChromInfo *chromList = bbiChromList(bbi); struct lm *lm = lmInit(0); struct bigBedInterval *ivList = getNElements(bbi, chromList, lm, 10); time_t timep = bbiUpdateTime(bbi); /* Get description of columns, making it up from BED records if need be. */ struct asObject *as = bigBedAsOrDefault(bbi); -hPrintf("<B>Database:</B> %s", database); +hPrintf("<B>Database:</B> %s", trackHubSkipHubName(database)); hPrintf(" <B>Primary Table:</B> %s ", table); printf("<B>Data last updated: </B>%s<BR>\n", firstWordInLine(sqlUnixTimeToDate(&timep, FALSE))); printDownloadLink("Big Bed", fileName); if (bbi->version >= 2) { hPrintf("<BR><B>Item Count:</B> "); printLongWithCommas(stdout, bigBedItemCount(bbi)); } printTypeHelpDesc(tdb->type); hPrintf("<BR>\n"); hPrintf("<B>Format description:</B> %s<BR>", as->comment); /* Put up table that describes fields. */ hTableStart(); hPrintf("<TR><TH>field</TH>"); if (ivList != NULL) hPrintf("<TH>example</TH>"); hPrintf("<TH>description</TH> "); puts("</TR>\n"); struct asColumn *col; int colCount = 0; char *row[bbi->fieldCount]; char startBuf[16], endBuf[16]; if (ivList != NULL) { char *dupeRest = lmCloneString(lm, ivList->rest); /* Manage rest-stomping side-effect */ bigBedIntervalToRow(ivList, chromList->name, startBuf, endBuf, row, bbi->fieldCount); ivList->rest = dupeRest; } for (col = as->columnList; col != NULL; col = col->next) { hPrintf("<TR><TD><TT>%s</TT></TD>", col->name); if (ivList != NULL) hPrintf("<TD>%s</TD>", row[colCount]); hPrintf("<TD>%s</TD></TR>", col->comment); ++colCount; } /* If more fields than descriptions put up minimally helpful info (at least has example). */ for ( ; colCount < bbi->fieldCount; ++colCount) { hPrintf("<TR><TD><TT>column%d</TT></TD>", colCount+1); if (ivList != NULL) hPrintf("<TD>%s</TD>", row[colCount]); hPrintf("<TD>n/a</TD></TR>\n"); } hTableEnd(); if (ivList != NULL) { /* Put up another section with sample rows. */ webNewSection("Sample Rows"); hTableStart(); /* Print field names as column headers for example */ hPrintf("<TR>"); int colIx = 0; for (col = as->columnList; col != NULL; col = col->next) { hPrintf("<TH>%s</TH>", col->name); ++colIx; } for (; colIx < colCount; ++colIx) hPrintf("<TH>column%d</TH>", colIx+1); hPrintf("</TR>\n"); /* Print sample lines. */ struct bigBedInterval *iv; for (iv=ivList; iv != NULL; iv = iv->next) { bigBedIntervalToRow(iv, chromList->name, startBuf, endBuf, row, bbi->fieldCount); hPrintf("<TR>"); for (colIx=0; colIx<colCount; ++colIx) { writeHtmlCell(row[colIx]); } hPrintf("</TR>\n"); } hTableEnd(); } printTrackHtml(tdb); /* Clean up and go home. */ lmCleanup(&lm); bbiFileClose(&bbi); freeMem(fileName); hFreeConn(&conn); }