src/hg/hgTables/bigBed.c 259c477dd82ab1de43ddcf36f809a3bfe99a54da

259c477dd82ab1de43ddcf36f809a3bfe99a54da
angie
  Mon Nov 27 14:29:32 2023 -0800
In bigBedGetFilteredBedsOnRegions, use nameIndex when user pastes/uploads identifiers, so we don't time out on large bigBeds like dbSnp.  refs #32625

diff --git src/hg/hgTables/bigBed.c src/hg/hgTables/bigBed.c
index 4f1dfb4..2b63ccc 100644
--- src/hg/hgTables/bigBed.c
+++ src/hg/hgTables/bigBed.c
@@ -152,41 +152,110 @@
     }
 
 lmCleanup(&bbLm);
 }
 
 struct bed *bigBedGetFilteredBedsOnRegions(struct sqlConnection *conn,
 	char *db, char *table, struct region *regionList, struct lm *lm,
 	int *retFieldCount)
 /* Get list of beds from bigBed, in all regions, that pass filtering. */
 {
 /* Connect to big bed and get metadata and filter. */
 char *fileName = bigBedFileName(table, conn);
 struct bbiFile *bbi =  bigBedFileOpenAlias(fileName, chromAliasFindAliases);
 struct asObject *as = bigBedAsOrDefault(bbi);
 struct asFilter *filter = asFilterFromCart(cart, db, table, as);
-
-/* Get beds a region at a time. */
 struct bed *bedList = NULL;
+
+/* If the user supplied identifiers (idHash non-NULL) and the query is whole-genome, use the name
+ * index to retrieve items instead of iterating over regions; bpt != NULL marks that path below. */
+struct hash *idHash = NULL;
+if (bbi->definedFieldCount >= 4)  /* presumably because name is the 4th BED field -- confirm */
+    idHash = identifierHash(db, table);
+int fieldIx;  /* filled in by bigBedOpenExtraIndex; only read inside the block that opens bpt */
+struct bptFile *bpt = NULL;  /* handle returned by bigBedOpenExtraIndex */
+struct lm *bbLm = NULL;  /* passed to bigBedMultiNameQuery; released by lmCleanup at the end */
+struct bigBedInterval *ivList = NULL;  /* intervals returned by the name query */
+if (idHash && isRegionWholeGenome())
+    {
+    bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx);  /* NOTE(review): what if no name index? */
+    struct slName *nameList = hashSlNameFromHash(idHash);  /* NOTE(review): never freed here */
+    int count = slCount(nameList);
+    char *names[count];  /* variable-length array; NOTE(review): size follows identifier count */
+    int ii;
+    for (ii=0; ii < count; ii++)
+        {
+        names[ii] = nameList->name;  /* borrows the string pointer; no copy is made */
+        nameList = nameList->next;  /* advances the only pointer to the list */
+        }
+    bbLm = lmInit(0);
+    ivList = bigBedMultiNameQuery(bbi, bpt, fieldIx, names, count, bbLm);
+    }
 struct region *region;
 for (region = regionList; region != NULL; region = region->next)
+    {
+    if (bpt)
+        {
+        /*** NOTE: it is inefficient to convert intervals from a name-index query to filtered bed
+         * inside the loop on regionList.  However, bigBedGetFilteredBedsOnRegions is called by
+         * getFilteredBeds on a "regionList" that has been doctored to one region at a time,
+         * so we can do intersection one region at a time.  Since this is called once per region,
+         * we really do need to restrict items to region->chrom, otherwise all items would be
+         * returned for every region.  It is still much more efficient for large bigBeds to do
+         * name-index queries when names are pasted/uploaded than to fetch all intervals in all
+         * regions and then check names.  See MLQ #32625. */
+        char chromBuf[4096];
+        struct bigBedInterval *iv = NULL;
+        char *displayChromName = NULL;
+        int lastChromId = -1;  /* no interval seen yet, so the first one always triggers a lookup */
+        for (iv = ivList; iv != NULL; iv = iv->next)
+            {
+            if (iv->chromId != lastChromId)  /* redo the name lookup only when chromId changes */
+                {
+                bptStringKeyAtPos(bbi->chromBpt, iv->chromId, chromBuf, sizeof chromBuf);
+                displayChromName = chromAliasGetDisplayChrom(database, cart, hgOfficialChromName(database, chromBuf));
+                }  /* NOTE(review): database comes from outside this function, not the db parameter */
+            if (sameString(displayChromName, region->chrom))  /* keep only items on this chrom */
+                {
+                char *row[bbi->fieldCount];
+                char startBuf[16], endBuf[16];
+                bigBedIntervalToRow(iv, displayChromName, startBuf, endBuf, row, bbi->fieldCount);
+                if (asFilterOnRow(filter, row))
+                    {
+                    struct bed *bed = bedLoadN(row, bbi->definedFieldCount);  /* temporary */
+                    struct bed *lmBed = lmCloneBed(bed, lm);  /* clone made with the lm argument */
+                    slAddHead(&bedList, lmBed);
+                    bedFree(&bed);  /* only the clone stays on bedList */
+                    }
+                }
+            lastChromId = iv->chromId;
+            }
+        }
+    else
+        {
+        /* Get beds a region at a time. */
         addFilteredBedsOnRegion(bbi, region, table, filter, lm, &bedList);
+        }
     slReverse(&bedList);
+    }  /* NOTE(review): slReverse above is inside this loop, so it runs once per region -- confirm */
 
 /* Clean up and return. */
 if (retFieldCount != NULL)
     *retFieldCount = bbi->definedFieldCount;
+lmCleanup(&bbLm);  /* bbLm, idHash and bpt are still NULL here when the name path was not taken */
+hashFree(&idHash);
+bptFileDetach(&bpt);
 bbiFileClose(&bbi);
 freeMem(fileName);
 return bedList;
 }
 
 void bigBedTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f, char outSep)
 /* Print out selected fields from Big Bed.  If fields is NULL, then print out all fields. */
 {
 if (f == NULL)
     f = stdout;
 
 /* Convert comma separated list of fields to array. */
 int fieldCount = chopByChar(fields, ',', NULL, 0);
 char **fieldArray;
 AllocArray(fieldArray, fieldCount);