259c477dd82ab1de43ddcf36f809a3bfe99a54da angie Mon Nov 27 14:29:32 2023 -0800 In bigBedGetFilteredBedsOnRegions, use nameIndex when user pastes/uploads identifiers, so we don't time out on large bigBeds like dbSnp. refs #32625 diff --git src/hg/hgTables/bigBed.c src/hg/hgTables/bigBed.c index 4f1dfb4..2b63ccc 100644 --- src/hg/hgTables/bigBed.c +++ src/hg/hgTables/bigBed.c @@ -152,41 +152,110 @@ } lmCleanup(&bbLm); } struct bed *bigBedGetFilteredBedsOnRegions(struct sqlConnection *conn, char *db, char *table, struct region *regionList, struct lm *lm, int *retFieldCount) /* Get list of beds from bigBed, in all regions, that pass filtering. */ { /* Connect to big bed and get metadata and filter. */ char *fileName = bigBedFileName(table, conn); struct bbiFile *bbi = bigBedFileOpenAlias(fileName, chromAliasFindAliases); struct asObject *as = bigBedAsOrDefault(bbi); struct asFilter *filter = asFilterFromCart(cart, db, table, as); - -/* Get beds a region at a time. */ struct bed *bedList = NULL; + +/* If we're doing a whole-genome query with a name index then use the name index to retrieve items + * instead of iterating over regions. 
*/ /* The name-index path below is taken only when identifierHash() returned a hash (it is called only if the bigBed has at least 4 defined fields) and isRegionWholeGenome() is true. NOTE(review): nothing visible here checks that the file actually has a name extra index before bigBedOpenExtraIndex is called -- confirm how that helper behaves when the index is absent. */ +struct hash *idHash = NULL; +if (bbi->definedFieldCount >= 4) + idHash = identifierHash(db, table); +int fieldIx; +struct bptFile *bpt = NULL; +struct lm *bbLm = NULL; +struct bigBedInterval *ivList = NULL; +if (idHash && isRegionWholeGenome()) + { + bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx); + struct slName *nameList = hashSlNameFromHash(idHash); + int count = slCount(nameList); + char *names[count]; /* NOTE(review): variable-length array sized by the number of identifiers taken from idHash; presumably a very large pasted/uploaded list could exhaust the stack -- consider heap allocation. */ + int ii; + for (ii=0; ii < count; ii++) + { + names[ii] = nameList->name; + nameList = nameList->next; /* NOTE(review): the head of nameList is overwritten here and the list is not freed in the visible cleanup; names[] points into it, so it must outlive the query below -- confirm whether hashSlNameFromHash allocates and whether this leak matters. */ + } + bbLm = lmInit(0); + ivList = bigBedMultiNameQuery(bbi, bpt, fieldIx, names, count, bbLm); + } struct region *region; for (region = regionList; region != NULL; region = region->next) + { + if (bpt) + { + /*** NOTE: it is inefficient to convert intervals from a name-index query to filtered bed + * inside the loop on regionList. However, bigBedGetFilteredBedsOnRegions is called by + * getFilteredBeds on a "regionList" that has been doctored to one region at a time, + * so we can do intersection one region at a time. Since this is called once per region, + * we really do need to restrict items to region->chrom, otherwise all items would be + * returned for every region. It is still much more efficient for large bigBeds to do + * name-index queries when names are pasted/uploaded than to fetch all intervals in all + * regions and then check names. See MLQ #32625. 
*/ + char chromBuf[4096]; + struct bigBedInterval *iv = NULL; + char *displayChromName = NULL; + int lastChromId = -1; /* Walk every interval returned by the name query; the chrom name is looked up again only when chromId differs from the previous interval. */ + for (iv = ivList; iv != NULL; iv = iv->next) + { + if (iv->chromId != lastChromId) + { + bptStringKeyAtPos(bbi->chromBpt, iv->chromId, chromBuf, sizeof chromBuf); + displayChromName = chromAliasGetDisplayChrom(database, cart, hgOfficialChromName(database, chromBuf)); /* NOTE(review): uses database (not declared in this function) rather than the db parameter -- confirm the two always agree. */ + } + if (sameString(displayChromName, region->chrom)) + { + char *row[bbi->fieldCount]; + char startBuf[16], endBuf[16]; + bigBedIntervalToRow(iv, displayChromName, startBuf, endBuf, row, bbi->fieldCount); + if (asFilterOnRow(filter, row)) + { /* The bed from bedLoadN is cloned with lmCloneBed(bed, lm), the clone is pushed onto bedList, and the temporary is freed. */ + struct bed *bed = bedLoadN(row, bbi->definedFieldCount); + struct bed *lmBed = lmCloneBed(bed, lm); + slAddHead(&bedList, lmBed); + bedFree(&bed); + } + } + lastChromId = iv->chromId; + } + } + else + { + /* Get beds a region at a time. */ addFilteredBedsOnRegion(bbi, region, table, filter, lm, &bedList); + } /* NOTE(review): slReverse is now inside the region loop braces, so it runs once per region; with more than one region in regionList the beds gathered for earlier regions would be reversed again on each pass. The NOTE above says callers pass one region at a time -- confirm. */ slReverse(&bedList); + } /* Clean up and return. */ if (retFieldCount != NULL) *retFieldCount = bbi->definedFieldCount; /* bbLm and bpt are still NULL here when the name-index path was not taken, and idHash may be NULL too; presumably these helpers accept NULL -- TODO confirm. */ +lmCleanup(&bbLm); +hashFree(&idHash); +bptFileDetach(&bpt); bbiFileClose(&bbi); freeMem(fileName); return bedList; } void bigBedTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f, char outSep) /* Print out selected fields from Big Bed. If fields is NULL, then print out all fields. */ { if (f == NULL) f = stdout; /* Convert comma separated list of fields to array. */ int fieldCount = chopByChar(fields, ',', NULL, 0); char **fieldArray; AllocArray(fieldArray, fieldCount);