017f1ebbfad81dd9a4759d391debdc44b227b595
markd
  Fri Aug 19 23:25:17 2016 -0700
Cache twoBit sequence header information for the last sequence
accessed to avoid rereading N and mask blocks when sequentially
reader fragments of the genome.  This resulted in a 20% speedup
for pslRecalcMatch.

diff --git src/lib/twoBit.c src/lib/twoBit.c
index 077fb10..513b2b9 100644
--- src/lib/twoBit.c
+++ src/lib/twoBit.c
@@ -20,30 +20,35 @@
  * that read twoBit files.   All of these are to get around the C compiler
  * complaining about the automatic cast of a void * to FILE * or 
  * struct udcFile *.
  */
 /* first the UDC wrappers */
 static void udcSeekCurWrap(void *file, bits64 offset)
 {
 udcSeekCur((struct udcFile *)file, offset);
 }
 
 static void udcSeekWrap(void *file, bits64 offset)
 {
 udcSeek((struct udcFile *)file, offset);
 }
 
+static bits64 udcTellWrap(void *file)
+{
+return udcTell((struct udcFile *)file);
+}
+
 static void udcMustReadWrap(void *file, void *buf, size_t size)
 {
 udcMustRead((struct udcFile *)file, buf, size);
 }
 
 static void udcFileCloseWrap(void *pFile)
 {
 udcFileClose((struct udcFile **)pFile);
 }
 
 static bits32 udcReadBits32Wrap(void *f, boolean isSwapped)
 {
 return udcReadBits32((struct udcFile *)f, isSwapped);
 }
 
@@ -51,68 +56,75 @@
 {
 return udcFastReadString((struct udcFile *)f, buf);
 }
 
 /* now the stdio wrappers */
 static void seekCurWrap(void *file, bits64 offset)
 {
 fseek((FILE *)file, offset, SEEK_CUR);
 }
 
 static void seekWrap(void *file, bits64 offset)
 {
 fseek((FILE *)file, offset, SEEK_SET);
 }
 
+static bits64 tellWrap(void *file)
+{
+return ftell((FILE *)file);
+}
+
 static void mustReadWrap(void *file, void *buf, size_t size)
 {
 mustRead((FILE *)file, buf, size);
 }
 
 static void fileCloseWrap(void *pFile)
 {
 carefulClose((FILE **)pFile);
 }
 
 static bits32 readBits32Wrap(void *f, boolean isSwapped)
 {
 return readBits32((FILE *)f, isSwapped);
 }
 
 static boolean fastReadStringWrap(void *f, char buf[256])
 {
 return fastReadString((FILE *)f, buf);
 }
 
 static void setFileFuncs( struct twoBitFile *tbf, boolean useUdc)
 /* choose the proper function pointers depending on whether
  * this open twoBit is using stdio or UDC
  */
 {
 if (useUdc)
     {
     tbf->ourSeekCur = udcSeekCurWrap;
     tbf->ourSeek = udcSeekWrap;
+    tbf->ourTell = udcTellWrap;
     tbf->ourReadBits32 = udcReadBits32Wrap;
     tbf->ourFastReadString = udcFastReadStringWrap;
     tbf->ourClose = udcFileCloseWrap;
     tbf->ourMustRead = udcMustReadWrap;
     }
 else
     {
     tbf->ourSeekCur = seekCurWrap;
     tbf->ourSeek = seekWrap;
+    tbf->ourTell = tellWrap;
     tbf->ourReadBits32 = readBits32Wrap;
     tbf->ourFastReadString = fastReadStringWrap;
     tbf->ourClose = fileCloseWrap;
     tbf->ourMustRead = mustReadWrap;
     }
 }
 
 static int countBlocksOfN(char *s, int size)
 /* Count number of blocks of N's (or n's) in s. */
 {
 int i;
 boolean isN, lastIsN = FALSE;
 char c;
 int blockCount = 0;
 
@@ -351,30 +363,31 @@
     counter += (long long)size;
     if (counter > UINT_MAX )
         errAbort("Error in faToTwoBit, index overflow at %s. The 2bit format "
                 "does not support indexes larger than %dGb, \n"
                 "please split up into smaller files.\n", 
                 twoBit->name, UINT_MAX/1000000000);
     }
 }
 
 void twoBitClose(struct twoBitFile **pTbf)
 /* Free up resources associated with twoBitFile. */
 {
 struct twoBitFile *tbf = *pTbf;
 if (tbf != NULL)
     {
+    twoBitFree(&tbf->seqCache);
     freez(&tbf->fileName);
     (*tbf->ourClose)(&tbf->f);
     hashFree(&tbf->hash);
     /* The indexList is allocated out of the hash's memory pool. */
     bptFileClose(&tbf->bpt);
     freez(pTbf);
     }
 }
 
 boolean twoBitSigRead(struct twoBitFile *tbf, boolean *isSwapped)
 /* read twoBit signature, return FALSE if not good 
  * set isSwapped to TRUE if twoBit file is byte swapped */
 {
 bits32 sig;
 
@@ -542,57 +555,67 @@
     (*tbf->ourMustRead)(tbf->f, nSizes, sizeof(nSizes[0]) * blkCount);
     if (isSwapped)
 	{
 	int i;
 	for (i=0; i<blkCount; ++i)
 	    {
 	    nStarts[i] = byteSwap32(nStarts[i]);
 	    nSizes[i] = byteSwap32(nSizes[i]);
 	    }
 	}
     *retBlockStarts = nStarts;
     *retBlockSizes = nSizes;
     }
 }
 
-struct twoBit *twoBitOneFromFile(struct twoBitFile *tbf, char *name)
-/* Get single sequence as two bit. */
+static struct twoBit *readTwoBitSeqHeader(struct twoBitFile *tbf, char *name)
+/* read a sequence header, nBlocks and maskBlocks from a twoBit file,
+ * leaving file pointer at data block */
 {
-bits32 packByteCount;
 boolean isSwapped = tbf->isSwapped;
 struct twoBit *twoBit;
 AllocVar(twoBit);
 twoBit->name = cloneString(name);
 void *f = tbf->f;
 
 /* Find offset in index and seek to it */
 twoBitSeekTo(tbf, name);
 
 /* Read in seqSize. */
 twoBit->size = (*tbf->ourReadBits32)(f, isSwapped);
 
 /* Read in blocks of N. */
 readBlockCoords(tbf, isSwapped, &(twoBit->nBlockCount),
 		&(twoBit->nStarts), &(twoBit->nSizes));
 
 /* Read in masked blocks. */
 readBlockCoords(tbf, isSwapped, &(twoBit->maskBlockCount),
 		&(twoBit->maskStarts), &(twoBit->maskSizes));
 
 /* Reserved word. */
 twoBit->reserved = (*tbf->ourReadBits32)(f, isSwapped);
 
+return twoBit;
+}
+
+struct twoBit *twoBitOneFromFile(struct twoBitFile *tbf, char *name)
+/* Get single sequence as two bit. */
+{
+struct twoBit *twoBit = readTwoBitSeqHeader(tbf, name);
+bits32 packByteCount;
+void *f = tbf->f;
+
 /* Read in data. */
 packByteCount = packedSize(twoBit->size);
 twoBit->data = needLargeMem(packByteCount);
 (*tbf->ourMustRead)(f, twoBit->data, packByteCount);
 
 return twoBit;
 }
 
 struct twoBit *twoBitFromFile(char *fileName)
 /* Get twoBit list of all sequences in twoBit file. */
 {
 struct twoBitFile *tbf = twoBitOpen(fileName);
 struct twoBitIndex *index;
 struct twoBit *twoBitList = NULL;
 
@@ -623,77 +646,80 @@
 }
 
 void twoBitFreeList(struct twoBit **pList)
 /* Free a list of dynamically allocated twoBit's */
 {
 struct twoBit *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     twoBitFree(&el);
     }
 *pList = NULL;
 }
 
+static struct twoBit *getTwoBitSeqHeader(struct twoBitFile *tbf, char *name)
+/* get the sequence header information using the cache.  Position file
+ * right at data. */
+{
+if ((tbf->seqCache != NULL) && (sameString(tbf->seqCache->name, name)))
+    {
+    // use cached
+    (*tbf->ourSeek)(tbf->f, tbf->dataOffsetCache);
+    }
+else
+    {
+    // fetch new and cache
+    twoBitFree(&tbf->seqCache);
+    tbf->seqCache = readTwoBitSeqHeader(tbf, name);
+    tbf->dataOffsetCache = (*tbf->ourTell)(tbf->f);
+    }
+return tbf->seqCache;
+}
 
 struct dnaSeq *twoBitReadSeqFragExt(struct twoBitFile *tbf, char *name,
 	int fragStart, int fragEnd, boolean doMask, int *retFullSize)
 /* Read part of sequence from .2bit file.  To read full
  * sequence call with start=end=0.  Sequence will be lower
  * case if doMask is false, mixed case (repeats in lower)
  * if doMask is true. */
 {
 struct dnaSeq *seq;
-bits32 seqSize;
-bits32 nBlockCount, maskBlockCount;
-bits32 *nStarts = NULL, *nSizes = NULL;
-bits32 *maskStarts = NULL, *maskSizes = NULL;
-boolean isSwapped = tbf->isSwapped;
 void *f = tbf->f;
 int i;
 int packByteCount, packedStart, packedEnd, remainder, midStart, midEnd;
 int outSize;
 UBYTE *packed, *packedAlloc;
 DNA *dna;
 
-/* Find offset in index and seek to it */
+/* get sequence header information, which is cached */
 dnaUtilOpen();
-twoBitSeekTo(tbf, name);
+struct twoBit *twoBit = getTwoBitSeqHeader(tbf, name);
 
-/* Read in seqSize. */
-seqSize = (*tbf->ourReadBits32)(f, isSwapped);
+/* validate range. */
 if (fragEnd == 0)
-    fragEnd = seqSize;
-if (fragEnd > seqSize)
-    errAbort("twoBitReadSeqFrag in %s end (%d) >= seqSize (%d)", name, fragEnd, seqSize);
+    fragEnd = twoBit->size;
+if (fragEnd > twoBit->size)
+    errAbort("twoBitReadSeqFrag in %s end (%d) >= seqSize (%d)", name, fragEnd, twoBit->size);
 outSize = fragEnd - fragStart;
 if (outSize < 1)
     errAbort("twoBitReadSeqFrag in %s start (%d) >= end (%d)", name, fragStart, fragEnd);
 
-/* Read in blocks of N. */
-readBlockCoords(tbf, isSwapped, &nBlockCount, &nStarts, &nSizes);
-
-/* Read in masked blocks. */
-readBlockCoords(tbf, isSwapped, &maskBlockCount, &maskStarts, &maskSizes);
-
-/* Skip over reserved word. */
-(*tbf->ourReadBits32)(f, isSwapped);
-
 /* Allocate dnaSeq, and fill in zero tag at end of sequence. */
 AllocVar(seq);
-if (outSize == seqSize)
+if (outSize == twoBit->size)
     seq->name = cloneString(name);
 else
     {
     char buf[256*2];
     safef(buf, sizeof(buf), "%s:%d-%d", name, fragStart, fragEnd);
     seq->name = cloneString(buf);
     }
 seq->size = outSize;
 dna = seq->dna = needLargeMem(outSize+1);
 seq->dna[outSize] = 0;
 
 
 /* Skip to bits we need and read them in. */
 packedStart = (fragStart>>2);
 packedEnd = ((fragEnd+3)>>2);
@@ -749,76 +775,72 @@
 	}
 
     if (remainder >0)
 	{
 	UBYTE part = *packed;
 	part >>= (8-remainder-remainder);
 	for (i=remainder-1; i>=0; --i)
 	    {
 	    dna[i] = valToNt[part&3];
 	    part >>= 2;
 	    }
 	}
     }
 freez(&packedAlloc);
 
-if (nBlockCount > 0)
+if (twoBit->nBlockCount > 0)
     {
-    int startIx = findGreatestLowerBound(nBlockCount, nStarts, fragStart);
-    for (i=startIx; i<nBlockCount; ++i)
+    int startIx = findGreatestLowerBound(twoBit->nBlockCount, twoBit->nStarts, fragStart);
+    for (i=startIx; i<twoBit->nBlockCount; ++i)
         {
-	int s = nStarts[i];
-	int e = s + nSizes[i];
+	int s = twoBit->nStarts[i];
+	int e = s + twoBit->nSizes[i];
 	if (s >= fragEnd)
 	    break;
 	if (s < fragStart)
 	   s = fragStart;
 	if (e > fragEnd)
 	   e = fragEnd;
 	if (s < e)
 	    memset(seq->dna + s - fragStart, 'n', e - s);
 	}
     }
 
 if (doMask)
     {
     toUpperN(seq->dna, seq->size);
-    if (maskBlockCount > 0)
+    if (twoBit->maskBlockCount > 0)
 	{
-	int startIx = findGreatestLowerBound(maskBlockCount, maskStarts, 
+	int startIx = findGreatestLowerBound(twoBit->maskBlockCount, twoBit->maskStarts,
 		fragStart);
-	for (i=startIx; i<maskBlockCount; ++i)
+	for (i=startIx; i<twoBit->maskBlockCount; ++i)
 	    {
-	    int s = maskStarts[i];
-	    int e = s + maskSizes[i];
+	    int s = twoBit->maskStarts[i];
+	    int e = s + twoBit->maskSizes[i];
 	    if (s >= fragEnd)
 		break;
 	    if (s < fragStart)
 		s = fragStart;
 	    if (e > fragEnd)
 		e = fragEnd;
 	    if (s < e)
 		toLowerN(seq->dna + s - fragStart, e - s);
 	    }
 	}
     }
-freez(&nStarts);
-freez(&nSizes);
-freez(&maskStarts);
-freez(&maskSizes);
 if (retFullSize != NULL)
-    *retFullSize = seqSize;
+    *retFullSize = twoBit->size;
 return seq;
 }
 
 struct dnaSeq *twoBitReadSeqFrag(struct twoBitFile *tbf, char *name,
 	int fragStart, int fragEnd)
 /* Read part of sequence from .2bit file.  To read full
  * sequence call with start=end=0.  Note that sequence will
  * be mixed case, with repeats in lower case and rest in
  * upper case. */
 {
 return twoBitReadSeqFragExt(tbf, name, fragStart, fragEnd, TRUE, NULL);
 }
 
 struct dnaSeq *twoBitReadSeqFragLower(struct twoBitFile *tbf, char *name,
 	int fragStart, int fragEnd)