a27a778580784849da8b9026ef2e34a5c67a7367
kent
  Fri Jul 1 16:50:30 2022 -0700
Seems to work.  I'll fix some of the names, maybe polish it some more, before moving it out of oneShot.

diff --git src/oneShot/testTwoBitRangeCache/testTwoBitRangeCache.c src/oneShot/testTwoBitRangeCache/testTwoBitRangeCache.c
new file mode 100644
index 0000000..5ca03b0
--- /dev/null
+++ src/oneShot/testTwoBitRangeCache/testTwoBitRangeCache.c
@@ -0,0 +1,230 @@
+/* testTwoBitRangeCache - Test a caching capability for sequences from two bit files.. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "dnaseq.h"
+#include "twoBit.h"
+#include "rangeTree.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "testTwoBitRangeCache - Test a caching capability for sequences from two bit files.\n"
+  "usage:\n"
+  "   testTwoBitRangeCache input.tab\n"
+  "Where input.tab is a tab-separated-file with columns:\n"
+  "   file.2bit	seqName	strand	startBase   endBase\n"
+  "options:\n"
+  "   -xxx=XXX\n"
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {NULL, 0},
+};
+
+struct cacheSeqRanges
+/*  Cached ranges of single sequence*/
+    {
+    struct cacheSeqRanges *next;
+    char *seqName;	// Name of sequence
+    int seqSize;	// Size of sequence
+    struct twoBitFile *tbf;  // Associated twoBit file;
+    boolean doRc;	// If set, cache reverse complement
+    struct rbTree *rangeTree;	// Values of this range tree are DNA seqs 
+    long basesQueried;	// Total bases read from cache
+    long basesRead;	// Total bases read by cache
+    int queryCount;	// Number of queries made
+    };
+
+struct cacheTwoBitUrl
+/* An open two bit file and a hash of sequences in it */
+    {
+    struct cacheTwoBitUrl *next;    /* Next in list */
+    char *url;			    /* Name of URL hashed */
+    struct twoBitFile *tbf;	    /* Open two bit file */
+    struct hash *seqHash;	    /* Hash of cacheSeqRanges */
+    struct hash *rcSeqHash;	    /* Hash of reverse-complemented cacheSeqRanges */
+    struct cacheSeqRanges *seqList;   /* List of sequences accessed */
+    };
+
+struct cacheTwoBits
+/* Cache open two bit files and sequences */
+    {
+    struct cacheTwoBits *next;	/* Next in list of such caches if you need. */
+    struct hash *urlHash;	/* Hash of twoBit file URLs */
+    struct cacheTwoBitUrl *urlList;  /* List of values in above cache */
+    };
+
+struct cacheTwoBits *cacheTwoBitsNew()
+/* Create a new cache for two bits */
+{
+struct cacheTwoBits *cache;
+AllocVar(cache);
+cache->urlHash = hashNew(0);
+return cache;
+}
+
+struct cacheSeqRanges *cacheSeqRangesNew(struct cacheTwoBitUrl *cacheUrl, char *seqName, 
+    boolean doRc)
+/* Create a new cacheSeqRanges structure */
+{
+struct cacheSeqRanges *csRange;
+AllocVar(csRange);
+csRange->seqName = cloneString(seqName);
+csRange->seqSize = twoBitSeqSize(cacheUrl->tbf, seqName);
+csRange->doRc = doRc;
+csRange->tbf = cacheUrl->tbf;
+csRange->rangeTree = rangeTreeNew();
+return csRange;
+}
+
+
+
+struct dnaSeq *cacheSeqRangesFetch(struct cacheSeqRanges *csRange, int start, int end, int *retOffset)
+/* Get two bit file */
+{
+int size = end-start;
+if (size <= 0)
+   errAbort("start >= end in cacheSeqRangesFetch. start %d, end %d\n", start, end);
+long offset = 0;
+struct dnaSeq *seq = NULL;
+if (csRange->doRc)
+    reverseIntRange(&start, &end, csRange->seqSize);
+csRange->basesQueried += size;
+csRange->queryCount += 1;
+struct rbTree *rangeTree = csRange->rangeTree;
+struct range *enclosing = rangeTreeFindEnclosing(rangeTree, start, end);
+if (enclosing != NULL)
+    {
+    seq = enclosing->val;
+    offset = enclosing->start;
+    }
+else
+    {
+    struct range *overlappingList = rangeTreeAllOverlapping(rangeTree, start, end);
+
+    /* Expand start end so it includes range of all overlapping */
+    struct range *range;
+    for (range = overlappingList; range != NULL; range = range->next)
+        {
+	if (start > range->start) start = range->start;
+	if (end < range->end) end = range->end;
+	}
+    seq = twoBitReadSeqFrag(csRange->tbf, csRange->seqName, start, end);
+    if (seq != NULL)
+	rangeTreeAddVal(rangeTree, start, end, seq, NULL);
+    csRange->basesRead += size;
+    offset = start;
+    }
+*retOffset = offset;
+return seq;
+}
+
+
+struct dnaSeq *cacheTwoBitsFetchSeq(struct cacheTwoBits *cacheAll, char *url, char *seqName, 
+    int start, int end, boolean doRc, int *retOffset)
+/* fetch a sequence from a 2bit.  Caches open two bit files and sequence in 
+ * both forward and reverse strand */
+{
+/* Init static url hash */
+struct hash *urlHash = cacheAll->urlHash;  // hash of open files
+
+struct cacheTwoBitUrl *cacheUrl = hashFindVal(urlHash, url);
+if (cacheUrl == NULL)
+    {
+    AllocVar(cacheUrl);
+    cacheUrl->url = cloneString(url);
+    cacheUrl->tbf = twoBitOpen(url);
+    cacheUrl->seqHash = hashNew(0);
+    cacheUrl->rcSeqHash = hashNew(0);
+    hashAdd(urlHash, url, cacheUrl);
+    slAddHead(&cacheAll->urlList, cacheUrl);
+    }
+struct hash *seqHash = (doRc ? cacheUrl->rcSeqHash : cacheUrl->seqHash);
+struct cacheSeqRanges *csRange = hashFindVal(seqHash, seqName);
+
+if (csRange == NULL)
+    {
+    csRange = cacheSeqRangesNew(cacheUrl, seqName, doRc);
+    hashAdd(seqHash, seqName, csRange);
+    slAddHead(&cacheUrl->seqList, csRange);
+    }
+struct dnaSeq *seq = cacheSeqRangesFetch(csRange, start, end, retOffset);
+return seq;
+}
+
+void printCacheStats(struct cacheTwoBits *cache)
+{
+printf("caching %d twoBit files\n", slCount(cache->urlList));
+struct cacheTwoBitUrl *cachedUrl;
+int totalSeq = 0;
+int totalRanges = 0;
+long basesQueried = 0;	// Total bases read from cache
+long basesRead = 0;	// Total bases read by cache
+int queryCount = 0;
+for (cachedUrl = cache->urlList; cachedUrl != NULL; cachedUrl = cachedUrl->next)
+    {
+    struct cacheSeqRanges *csRange;
+    for (csRange = cachedUrl->seqList; csRange != NULL; csRange = csRange->next)
+	{
+	printf("  %s %c strand\n", csRange->seqName, csRange->doRc ? '=' : '+');
+	totalSeq += 1;
+	totalRanges += csRange->rangeTree->n;
+	basesQueried += csRange->basesQueried;
+	basesRead += csRange->basesRead;
+	queryCount += csRange->queryCount;
+	struct range *range = rangeTreeList(csRange->rangeTree);
+	for ( ; range != NULL; range = range->next)
+	    {
+	    printf("    %d %d\n", range->start, range->end);
+	    }
+	}
+    }
+printf("total sequences cached %d in %d ranges covering %d queries\n", 
+    totalSeq, totalRanges, queryCount);
+printf("basesRead %ld bases queried %ld\n", basesRead, basesQueried);
+}
+
+void testTwoBitRangeCache(char *input)
+/* testTwoBitRangeCache - Test a caching capability for sequences from two bit files.. */
+{
+struct cacheTwoBits *cache = cacheTwoBitsNew();
+/* Create a new cache for two bits */
+struct lineFile *lf = lineFileOpen(input, TRUE);
+char *row[5];
+int rowCount = 0;
+long totalBases = 0;
+while (lineFileRow(lf, row))
+    {
+    char *fileName = row[0];
+    char *seqName = row[1];
+    char *strand = row[2];
+    int startBase = lineFileNeedNum(lf, row, 3);
+    int endBase = lineFileNeedNum(lf, row, 4);
+    int baseSize = endBase - startBase;
+    boolean isRc = (strand[0] == '-');
+    int seqOffset=0;
+    struct dnaSeq *seq = cacheTwoBitsFetchSeq(cache, fileName, 
+	seqName, startBase, endBase, isRc, &seqOffset);
+    if (seq == NULL)
+        warn("Missing seq %s:%s\n", fileName, seqName);
+    totalBases += baseSize;
+    ++rowCount;
+    }
+printf("%ld total bases in %d rows, %d 2bit files\n", totalBases, rowCount, cache->urlHash->elCount);
+printCacheStats(cache);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 2)
+    usage();
+testTwoBitRangeCache(argv[1]);
+return 0;
+}