f8458ca93f0c0abe5456789dd7a9618d63e55e5b
kent
  Sun Jul 3 15:01:40 2022 -0700
Fixing bug on reverse strand and improving cache behavior.  Making it so you can ask for a cached sequence without aborting if it doesn't exist.

diff --git src/lib/cacheTwoBit.c src/lib/cacheTwoBit.c
index 1c2c8f8..d546751 100644
--- src/lib/cacheTwoBit.c
+++ src/lib/cacheTwoBit.c
@@ -4,151 +4,192 @@
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dnaseq.h"
 #include "twoBit.h"
 #include "rangeTree.h"
 #include "cacheTwoBit.h"
 
 struct cacheTwoBitSeq
 /*  Cached ranges of single sequence in a twoBit file */
     {
     struct cacheTwoBitSeq *next;
     char *seqName;	// Name of sequence
     int seqSize;	// Size of sequence
     struct twoBitFile *tbf;  // Associated twoBit file;
-    boolean doRc;	// If set, cache reverse complement
+    bool doRc;	// If set, cache reverse complement
+    bool doUpper; // If set sequence is upper cased before caching
     struct rbTree *rangeTree;	// Values of this range tree are DNA seqs 
     long basesQueried;	// Total bases read from cache
     long basesRead;	// Total bases read by cache
     int queryCount;	// Number of queries made
     };
 
 
 struct cacheTwoBitUrl
 /* An open two bit file from the internet and a hash of sequences in it */
     {
     struct cacheTwoBitUrl *next;    /* Next in list */
     char *url;			    /* Name of URL hashed */
     struct twoBitFile *tbf;	    /* Open two bit file */
     struct hash *seqHash;	    /* Hash of cacheTwoBitSeq */
     struct hash *rcSeqHash;	    /* Hash of reverse-complemented cacheTwoBitSeq */
     struct cacheTwoBitSeq *seqList;   /* List of sequences accessed */
     };
-struct cacheTwoBitRanges *cacheTwoBitRangesNew()
+
+struct cacheTwoBitRanges *cacheTwoBitRangesNew(boolean doUpper)
 /* Create a new cache for ranges or complete sequence in two bit files */
 {
 struct cacheTwoBitRanges *cache;
 AllocVar(cache);
 cache->urlHash = hashNew(0);
+cache->doUpper = doUpper;
 return cache;
 }
 
 struct cacheTwoBitSeq *cacheTwoBitSeqNew(struct cacheTwoBitUrl *cacheUrl, 
-    char *seqName, boolean doRc)
+    char *seqName, boolean doRc, boolean doUpper)
 /* Create a new cacheTwoBitSeq structure */
 {
 struct cacheTwoBitSeq *ctbSeq;
 AllocVar(ctbSeq);
 ctbSeq->seqName = cloneString(seqName);
 ctbSeq->seqSize = twoBitSeqSize(cacheUrl->tbf, seqName);
 ctbSeq->doRc = doRc;
+ctbSeq->doUpper = doUpper;
 ctbSeq->tbf = cacheUrl->tbf;
 ctbSeq->rangeTree = rangeTreeNew();
 return ctbSeq;
 }
 
 
 
 static struct dnaSeq *cacheTwoBitSeqFetch(struct cacheTwoBitSeq *ctbSeq, 
     int start, int end, int *retOffset)
 /* Get dna sequence from file */
 {
+if (start == 0 && end == 0)  /* Spacial case, we read whole seq */
+    end = twoBitSeqSize(ctbSeq->tbf, ctbSeq->seqName);
+
 int size = end-start;
 if (size <= 0)
    errAbort("start >= end in cacheTwoBitSeqFetch. start %d, end %d\n", start, end);
 long offset = 0;
 struct dnaSeq *seq = NULL;
 ctbSeq->basesQueried += size;
 ctbSeq->queryCount += 1;
-if (ctbSeq->doRc)
-    reverseIntRange(&start, &end, ctbSeq->seqSize);
 struct rbTree *rangeTree = ctbSeq->rangeTree;
 struct range *enclosing = rangeTreeFindEnclosing(rangeTree, start, end);
 if (enclosing != NULL)
     {
     seq = enclosing->val;
     offset = enclosing->start;
+    // fprintf(stderr,  "reusing %d %s:%d+%d\n", ctbSeq->doRc, ctbSeq->seqName, start, size);
     }
 else
     {
     struct range *overlappingList = rangeTreeAllOverlapping(rangeTree, start, end);
 
     /* Expand start end so it includes range of all overlapping */
     struct range *range;
     for (range = overlappingList; range != NULL; range = range->next)
         {
 	if (start > range->start) start = range->start;
 	if (end < range->end) end = range->end;
 	}
-    seq = twoBitReadSeqFrag(ctbSeq->tbf, ctbSeq->seqName, start, end);
+
+    // fprintf(stderr,  "fetching new %d %s:%d+%d\n", ctbSeq->doRc, ctbSeq->seqName, start, size); 
+    int rcStart = start, rcEnd = end;
+    if (ctbSeq->doRc)
+	{
+	reverseIntRange(&rcStart, &rcEnd, ctbSeq->seqSize);
+	}
+    seq = twoBitReadSeqFrag(ctbSeq->tbf, ctbSeq->seqName, rcStart, rcEnd);
     if (seq != NULL)
 	{
+	if (ctbSeq->doUpper)
+	    toUpperN(seq->dna, seq->size);
 	if (ctbSeq->doRc)
+	    {
 	    reverseComplement(seq->dna, seq->size);
-	rangeTreeAddVal(rangeTree, start, end, seq, NULL);
 	    }
+	rangeTreeAddVal(rangeTree, start, end, seq, NULL);
 	ctbSeq->basesRead += size;
 	offset = start;
 	}
+    }
 *retOffset = offset;
 return seq;
 }
 
 
-struct dnaSeq *cacheTwoBitRangesFetch(struct cacheTwoBitRanges *cacheAll, char *url, char *seqName, 
-    int start, int end, boolean doRc, int *retOffset)
+static struct dnaSeq *cacheTwoBitRangesFetchOrNot(struct cacheTwoBitRanges *cacheAll, 
+    char *url, char *seqName, int start, int end, boolean doRc, boolean missOk, int *retOffset)
 /* fetch a sequence from a 2bit.  Caches open two bit files and sequence in 
  * both forward and reverse strand */
 {
 /* Init static url hash */
 struct hash *urlHash = cacheAll->urlHash;  // hash of open files
 
 struct cacheTwoBitUrl *cacheUrl = hashFindVal(urlHash, url);
 if (cacheUrl == NULL)
     {
     AllocVar(cacheUrl);
     cacheUrl->url = cloneString(url);
     cacheUrl->tbf = twoBitOpen(url);
     cacheUrl->seqHash = hashNew(0);
     cacheUrl->rcSeqHash = hashNew(0);
     hashAdd(urlHash, url, cacheUrl);
     slAddHead(&cacheAll->urlList, cacheUrl);
     }
+if (missOk && !twoBitHasSeq(cacheUrl->tbf, seqName))
+   {
+   return NULL;
+   }
 struct hash *seqHash = (doRc ? cacheUrl->rcSeqHash : cacheUrl->seqHash);
 struct cacheTwoBitSeq *ctbSeq = hashFindVal(seqHash, seqName);
 
 if (ctbSeq == NULL)
     {
-    ctbSeq = cacheTwoBitSeqNew(cacheUrl, seqName, doRc);
+    ctbSeq = cacheTwoBitSeqNew(cacheUrl, seqName, doRc, cacheAll->doUpper);
     hashAdd(seqHash, seqName, ctbSeq);
     slAddHead(&cacheUrl->seqList, ctbSeq);
     }
-struct dnaSeq *seq = cacheTwoBitSeqFetch(ctbSeq, start, end, retOffset);
+struct dnaSeq *seq = NULL;
+if (start < end || !missOk)
+    seq = cacheTwoBitSeqFetch(ctbSeq, start, end, retOffset);
 return seq;
 }
 
+struct dnaSeq *cacheTwoBitRangesFetch(struct cacheTwoBitRanges *cacheAll, 
+    char *url, char *seqName, int start, int end, boolean doRc, int *retOffset)
+/* fetch a sequence from a 2bit.  Caches open two bit files and sequence in 
+ * both forward and reverse strand */
+{
+return cacheTwoBitRangesFetchOrNot(cacheAll, url, seqName, start, end, doRc, FALSE, retOffset);
+}
+
+struct dnaSeq *cacheTwoBitRangesMayFetch(struct cacheTwoBitRanges *cacheAll, 
+	char *url, char *seqName, int start, int end, boolean doRc, int *retOffset)
+/* Fetch a sequence from a twoBit cache. The result in retOffset is where the return dnaSeq
+ * sits within the named sequence, the whole of which is stored in the subtracted 
+ * associated twoBit file. Do not free the returned sequence. Returns NULL if sequence not
+ * found */
+{
+return cacheTwoBitRangesFetchOrNot(cacheAll, url, seqName, start, end, doRc, TRUE, retOffset);
+}
+
 
 void cacheTwoBitRangesPrintStats(struct cacheTwoBitRanges *cache)
 /* print cache statistics - Debugging routine */
 {
 printf("caching %d twoBit files\n", slCount(cache->urlList));
 struct cacheTwoBitUrl *cachedUrl;
 int totalSeq = 0;
 int totalRanges = 0;
 long basesQueried = 0;	// Total bases read from cache
 long basesRead = 0;	// Total bases read by cache
 int queryCount = 0;
 for (cachedUrl = cache->urlList; cachedUrl != NULL; cachedUrl = cachedUrl->next)
     {
     printf("%s has %d + strand, %d minus strand sequences cached\n",
 	cachedUrl->url, cachedUrl->seqHash->elCount, cachedUrl->rcSeqHash->elCount);