017f1ebbfad81dd9a4759d391debdc44b227b595 markd Fri Aug 19 23:25:17 2016 -0700 Cache twoBit sequence header information for the last sequence accessed to avoid rereading N and mask blocks when sequentially reader fragments of the genome. This resulted in a 20% speedup for pslRecalcMatch. diff --git src/lib/twoBit.c src/lib/twoBit.c index 077fb10..513b2b9 100644 --- src/lib/twoBit.c +++ src/lib/twoBit.c @@ -20,30 +20,35 @@ * that read twoBit files. All of these are to get around the C compiler * complaining about the automatic cast of a void * to FILE * or * struct udcFile *. */ /* first the UDC wrappers */ static void udcSeekCurWrap(void *file, bits64 offset) { udcSeekCur((struct udcFile *)file, offset); } static void udcSeekWrap(void *file, bits64 offset) { udcSeek((struct udcFile *)file, offset); } +static bits64 udcTellWrap(void *file) +{ +return udcTell((struct udcFile *)file); +} + static void udcMustReadWrap(void *file, void *buf, size_t size) { udcMustRead((struct udcFile *)file, buf, size); } static void udcFileCloseWrap(void *pFile) { udcFileClose((struct udcFile **)pFile); } static bits32 udcReadBits32Wrap(void *f, boolean isSwapped) { return udcReadBits32((struct udcFile *)f, isSwapped); } @@ -51,68 +56,75 @@ { return udcFastReadString((struct udcFile *)f, buf); } /* now the stdio wrappers */ static void seekCurWrap(void *file, bits64 offset) { fseek((FILE *)file, offset, SEEK_CUR); } static void seekWrap(void *file, bits64 offset) { fseek((FILE *)file, offset, SEEK_SET); } +static bits64 tellWrap(void *file) +{ +return ftell((FILE *)file); +} + static void mustReadWrap(void *file, void *buf, size_t size) { mustRead((FILE *)file, buf, size); } static void fileCloseWrap(void *pFile) { carefulClose((FILE **)pFile); } static bits32 readBits32Wrap(void *f, boolean isSwapped) { return readBits32((FILE *)f, isSwapped); } static boolean fastReadStringWrap(void *f, char buf[256]) { return fastReadString((FILE *)f, buf); } static void setFileFuncs( struct twoBitFile *tbf, boolean useUdc) /* choose the proper function pointers depending on whether * this open twoBit is using stdio or UDC */ { if (useUdc) { tbf->ourSeekCur = udcSeekCurWrap; tbf->ourSeek = udcSeekWrap; + tbf->ourTell = udcTellWrap; tbf->ourReadBits32 = udcReadBits32Wrap; tbf->ourFastReadString = udcFastReadStringWrap; tbf->ourClose = udcFileCloseWrap; tbf->ourMustRead = udcMustReadWrap; } else { tbf->ourSeekCur = seekCurWrap; tbf->ourSeek = seekWrap; + tbf->ourTell = tellWrap; tbf->ourReadBits32 = readBits32Wrap; tbf->ourFastReadString = fastReadStringWrap; tbf->ourClose = fileCloseWrap; tbf->ourMustRead = mustReadWrap; } } static int countBlocksOfN(char *s, int size) /* Count number of blocks of N's (or n's) in s. */ { int i; boolean isN, lastIsN = FALSE; char c; int blockCount = 0; @@ -351,30 +363,31 @@ counter += (long long)size; if (counter > UINT_MAX ) errAbort("Error in faToTwoBit, index overflow at %s. The 2bit format " "does not support indexes larger than %dGb, \n" "please split up into smaller files.\n", twoBit->name, UINT_MAX/1000000000); } } void twoBitClose(struct twoBitFile **pTbf) /* Free up resources associated with twoBitFile. */ { struct twoBitFile *tbf = *pTbf; if (tbf != NULL) { + twoBitFree(&tbf->seqCache); freez(&tbf->fileName); (*tbf->ourClose)(&tbf->f); hashFree(&tbf->hash); /* The indexList is allocated out of the hash's memory pool. */ bptFileClose(&tbf->bpt); freez(pTbf); } } boolean twoBitSigRead(struct twoBitFile *tbf, boolean *isSwapped) /* read twoBit signature, return FALSE if not good * set isSwapped to TRUE if twoBit file is byte swapped */ { bits32 sig; @@ -542,57 +555,67 @@ (*tbf->ourMustRead)(tbf->f, nSizes, sizeof(nSizes[0]) * blkCount); if (isSwapped) { int i; for (i=0; iisSwapped; struct twoBit *twoBit; AllocVar(twoBit); twoBit->name = cloneString(name); void *f = tbf->f; /* Find offset in index and seek to it */ twoBitSeekTo(tbf, name); /* Read in seqSize. */ twoBit->size = (*tbf->ourReadBits32)(f, isSwapped); /* Read in blocks of N. */ readBlockCoords(tbf, isSwapped, &(twoBit->nBlockCount), &(twoBit->nStarts), &(twoBit->nSizes)); /* Read in masked blocks. */ readBlockCoords(tbf, isSwapped, &(twoBit->maskBlockCount), &(twoBit->maskStarts), &(twoBit->maskSizes)); /* Reserved word. */ twoBit->reserved = (*tbf->ourReadBits32)(f, isSwapped); +return twoBit; +} + +struct twoBit *twoBitOneFromFile(struct twoBitFile *tbf, char *name) +/* Get single sequence as two bit. */ +{ +struct twoBit *twoBit = readTwoBitSeqHeader(tbf, name); +bits32 packByteCount; +void *f = tbf->f; + /* Read in data. */ packByteCount = packedSize(twoBit->size); twoBit->data = needLargeMem(packByteCount); (*tbf->ourMustRead)(f, twoBit->data, packByteCount); return twoBit; } struct twoBit *twoBitFromFile(char *fileName) /* Get twoBit list of all sequences in twoBit file. */ { struct twoBitFile *tbf = twoBitOpen(fileName); struct twoBitIndex *index; struct twoBit *twoBitList = NULL; @@ -623,77 +646,80 @@ } void twoBitFreeList(struct twoBit **pList) /* Free a list of dynamically allocated twoBit's */ { struct twoBit *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; twoBitFree(&el); } *pList = NULL; } +static struct twoBit *getTwoBitSeqHeader(struct twoBitFile *tbf, char *name) +/* get the sequence header information using the cache. Position file + * right at data. */ +{ +if ((tbf->seqCache != NULL) && (sameString(tbf->seqCache->name, name))) + { + // use cached + (*tbf->ourSeek)(tbf->f, tbf->dataOffsetCache); + } +else + { + // fetch new and cache + twoBitFree(&tbf->seqCache); + tbf->seqCache = readTwoBitSeqHeader(tbf, name); + tbf->dataOffsetCache = (*tbf->ourTell)(tbf->f); + } +return tbf->seqCache; +} struct dnaSeq *twoBitReadSeqFragExt(struct twoBitFile *tbf, char *name, int fragStart, int fragEnd, boolean doMask, int *retFullSize) /* Read part of sequence from .2bit file. To read full * sequence call with start=end=0. Sequence will be lower * case if doMask is false, mixed case (repeats in lower) * if doMask is true. */ { struct dnaSeq *seq; -bits32 seqSize; -bits32 nBlockCount, maskBlockCount; -bits32 *nStarts = NULL, *nSizes = NULL; -bits32 *maskStarts = NULL, *maskSizes = NULL; -boolean isSwapped = tbf->isSwapped; void *f = tbf->f; int i; int packByteCount, packedStart, packedEnd, remainder, midStart, midEnd; int outSize; UBYTE *packed, *packedAlloc; DNA *dna; -/* Find offset in index and seek to it */ +/* get sequence header information, which is cached */ dnaUtilOpen(); -twoBitSeekTo(tbf, name); +struct twoBit *twoBit = getTwoBitSeqHeader(tbf, name); -/* Read in seqSize. */ -seqSize = (*tbf->ourReadBits32)(f, isSwapped); +/* validate range. */ if (fragEnd == 0) - fragEnd = seqSize; -if (fragEnd > seqSize) - errAbort("twoBitReadSeqFrag in %s end (%d) >= seqSize (%d)", name, fragEnd, seqSize); + fragEnd = twoBit->size; +if (fragEnd > twoBit->size) + errAbort("twoBitReadSeqFrag in %s end (%d) >= seqSize (%d)", name, fragEnd, twoBit->size); outSize = fragEnd - fragStart; if (outSize < 1) errAbort("twoBitReadSeqFrag in %s start (%d) >= end (%d)", name, fragStart, fragEnd); -/* Read in blocks of N. */ -readBlockCoords(tbf, isSwapped, &nBlockCount, &nStarts, &nSizes); - -/* Read in masked blocks. */ -readBlockCoords(tbf, isSwapped, &maskBlockCount, &maskStarts, &maskSizes); - -/* Skip over reserved word. */ -(*tbf->ourReadBits32)(f, isSwapped); - /* Allocate dnaSeq, and fill in zero tag at end of sequence. */ AllocVar(seq); -if (outSize == seqSize) +if (outSize == twoBit->size) seq->name = cloneString(name); else { char buf[256*2]; safef(buf, sizeof(buf), "%s:%d-%d", name, fragStart, fragEnd); seq->name = cloneString(buf); } seq->size = outSize; dna = seq->dna = needLargeMem(outSize+1); seq->dna[outSize] = 0; /* Skip to bits we need and read them in. */ packedStart = (fragStart>>2); packedEnd = ((fragEnd+3)>>2); @@ -749,76 +775,72 @@ } if (remainder >0) { UBYTE part = *packed; part >>= (8-remainder-remainder); for (i=remainder-1; i>=0; --i) { dna[i] = valToNt[part&3]; part >>= 2; } } } freez(&packedAlloc); -if (nBlockCount > 0) +if (twoBit->nBlockCount > 0) { - int startIx = findGreatestLowerBound(nBlockCount, nStarts, fragStart); - for (i=startIx; inBlockCount, twoBit->nStarts, fragStart); + for (i=startIx; inBlockCount; ++i) { - int s = nStarts[i]; - int e = s + nSizes[i]; + int s = twoBit->nStarts[i]; + int e = s + twoBit->nSizes[i]; if (s >= fragEnd) break; if (s < fragStart) s = fragStart; if (e > fragEnd) e = fragEnd; if (s < e) memset(seq->dna + s - fragStart, 'n', e - s); } } if (doMask) { toUpperN(seq->dna, seq->size); - if (maskBlockCount > 0) + if (twoBit->maskBlockCount > 0) { - int startIx = findGreatestLowerBound(maskBlockCount, maskStarts, + int startIx = findGreatestLowerBound(twoBit->maskBlockCount, twoBit->maskStarts, fragStart); - for (i=startIx; imaskBlockCount; ++i) { - int s = maskStarts[i]; - int e = s + maskSizes[i]; + int s = twoBit->maskStarts[i]; + int e = s + twoBit->maskSizes[i]; if (s >= fragEnd) break; if (s < fragStart) s = fragStart; if (e > fragEnd) e = fragEnd; if (s < e) toLowerN(seq->dna + s - fragStart, e - s); } } } -freez(&nStarts); -freez(&nSizes); -freez(&maskStarts); -freez(&maskSizes); if (retFullSize != NULL) - *retFullSize = seqSize; + *retFullSize = twoBit->size; return seq; } struct dnaSeq *twoBitReadSeqFrag(struct twoBitFile *tbf, char *name, int fragStart, int fragEnd) /* Read part of sequence from .2bit file. To read full * sequence call with start=end=0. Note that sequence will * be mixed case, with repeats in lower case and rest in * upper case. */ { return twoBitReadSeqFragExt(tbf, name, fragStart, fragEnd, TRUE, NULL); } struct dnaSeq *twoBitReadSeqFragLower(struct twoBitFile *tbf, char *name, int fragStart, int fragEnd)