53c1a54fa21c9b90af300dbc83d3a64b3b869f72 braney Fri Feb 15 14:03:58 2013 -0800 change twoBit library to use either UDC or stdio. #8072 diff --git src/lib/twoBit.c src/lib/twoBit.c index 10ec2e2..49e65f5 100644 --- src/lib/twoBit.c +++ src/lib/twoBit.c @@ -1,27 +1,119 @@ #include "common.h" #include "hash.h" #include "dnaseq.h" #include "dnautil.h" #include "sig.h" #include "localmem.h" #include "linefile.h" #include "obscure.h" #include "bPlusTree.h" #include "twoBit.h" +#include "udc.h" #include <limits.h> +/* following are the wrap functions for the UDC and stdio functoins + * that read twoBit files. All of these are to get around the C compiler + * complaining about the automatic cast of a void * to FILE * or + * struct udcFile *. + */ +/* first the UDC wrappers */ +static void udcSeekCurWrap(void *file, bits64 offset) +{ +udcSeekCur((struct udcFile *)file, offset); +} + +static void udcSeekWrap(void *file, bits64 offset) +{ +udcSeek((struct udcFile *)file, offset); +} + +static void udcMustReadWrap(void *file, void *buf, size_t size) +{ +udcMustRead((struct udcFile *)file, buf, size); +} + +static inline void udcFileCloseWrap(void *pFile) +{ +udcFileClose((struct udcFile **)pFile); +} + +inline bits32 udcReadBits32Wrap(void *f, boolean isSwapped) +{ +return udcReadBits32((struct udcFile *)f, isSwapped); +} + +inline boolean udcFastReadStringWrap(void *f, char buf[256]) +{ +return udcFastReadString((struct udcFile *)f, buf); +} + +/* now the stdio wrappers */ +static void seekCurWrap(void *file, bits64 offset) +{ +fseek((FILE *)file, offset, SEEK_CUR); +} + +static void seekWrap(void *file, bits64 offset) +{ +fseek((FILE *)file, offset, SEEK_SET); +} + +static void mustReadWrap(void *file, void *buf, size_t size) +{ +mustRead((FILE *)file, buf, size); +} + +inline void fileCloseWrap(void *pFile) +{ +carefulClose((FILE **)pFile); +} + +inline bits32 readBits32Wrap(void *f, boolean isSwapped) +{ +return readBits32((FILE *)f, isSwapped); +} + +inline boolean fastReadStringWrap(void *f, char buf[256]) +{ +return fastReadString((FILE *)f, buf); +} + +static void setFileFuncs( struct twoBitFile *tbf, boolean useUdc) +/* choose the proper function pointers depending on whether + * this open twoBit is using stdio or UDC + */ +{ +if (useUdc) + { + tbf->ourSeekCur = udcSeekCurWrap; + tbf->ourSeek = udcSeekWrap; + tbf->ourReadBits32 = udcReadBits32Wrap; + tbf->ourFastReadString = udcFastReadStringWrap; + tbf->ourClose = udcFileCloseWrap; + tbf->ourMustRead = udcMustReadWrap; + } +else + { + tbf->ourSeekCur = seekCurWrap; + tbf->ourSeek = seekWrap; + tbf->ourReadBits32 = readBits32Wrap; + tbf->ourFastReadString = fastReadStringWrap; + tbf->ourClose = fileCloseWrap; + tbf->ourMustRead = mustReadWrap; + } +} static int countBlocksOfN(char *s, int size) /* Count number of blocks of N's (or n's) in s. */ { int i; boolean isN, lastIsN = FALSE; char c; int blockCount = 0; for (i=0; i<size; ++i) { c = s[i]; isN = (c == 'n' || c == 'N'); if (isN && !lastIsN) ++blockCount; @@ -255,116 +347,138 @@ if (counter > UINT_MAX ) errAbort("Error in faToTwoBit, index overflow at %s. The 2bit format " "does not support indexes larger than %dGb, \n" "please split up into smaller files.\n", twoBit->name, UINT_MAX/1000000000); } } void twoBitClose(struct twoBitFile **pTbf) /* Free up resources associated with twoBitFile. */ { struct twoBitFile *tbf = *pTbf; if (tbf != NULL) { freez(&tbf->fileName); - carefulClose(&tbf->f); + (*tbf->ourClose)(&tbf->f); hashFree(&tbf->hash); /* The indexList is allocated out of the hash's memory pool. */ bptFileClose(&tbf->bpt); freez(pTbf); } } -boolean twoBitSigRead(FILE *f, boolean *isSwapped) +boolean twoBitSigRead(struct twoBitFile *tbf, boolean *isSwapped) /* read twoBit signature, return FALSE if not good * set isSwapped to TRUE if twoBit file is byte swapped */ { bits32 sig; *isSwapped = FALSE; -mustReadOne(f, sig); +(*tbf->ourMustRead)(tbf->f, &sig, sizeof(sig)); if (sig == twoBitSwapSig) *isSwapped = TRUE; else if (sig != twoBitSig) return FALSE; return TRUE; } -static struct twoBitFile *twoBitOpenReadHeader(char *fileName) +static struct twoBitFile *getTbfAndOpen(char *fileName, boolean useUdc) +{ +struct twoBitFile *tbf; + +AllocVar(tbf); +setFileFuncs(tbf, useUdc); + +if (useUdc) + tbf->f = udcFileOpen(fileName, NULL); +else + tbf->f = mustOpen(fileName, "rb"); + +return tbf; +} + +static struct twoBitFile *twoBitOpenReadHeader(char *fileName, boolean useUdc) /* Open file, read in header but not index. * Squawk and die if there is a problem. */ { struct twoBitFile *tbf; boolean isSwapped = FALSE; -FILE *f = mustOpen(fileName, "rb"); + +tbf = getTbfAndOpen(fileName, useUdc); /* Allocate header verify signature, and read in * the constant-length bits. */ -AllocVar(tbf); -if (!twoBitSigRead(f, &isSwapped)) +if (!twoBitSigRead(tbf, &isSwapped)) errAbort("%s doesn't have a valid twoBitSig", fileName); tbf->isSwapped = isSwapped; tbf->fileName = cloneString(fileName); -tbf->f = f; -tbf->version = readBits32(f, isSwapped); +tbf->version = (*tbf->ourReadBits32)(tbf->f, isSwapped); if (tbf->version != 0) { errAbort("Can only handle version 0 of this file. This is version %d", (int)tbf->version); } -tbf->seqCount = readBits32(f, isSwapped); -tbf->reserved = readBits32(f, isSwapped); +tbf->seqCount = (*tbf->ourReadBits32)(tbf->f, isSwapped); +tbf->reserved = (*tbf->ourReadBits32)(tbf->f, isSwapped); return tbf; } -struct twoBitFile *twoBitOpen(char *fileName) + +struct twoBitFile *twoBitOpenExt(char *fileName, boolean useUdc) /* Open file, read in header and index. * Squawk and die if there is a problem. */ { -struct twoBitFile *tbf = twoBitOpenReadHeader(fileName); +struct twoBitFile *tbf = twoBitOpenReadHeader(fileName, useUdc); struct twoBitIndex *index; boolean isSwapped = tbf->isSwapped; int i; struct hash *hash; -FILE *f = tbf->f; +void *f = tbf->f; /* Read in index. */ hash = tbf->hash = hashNew(digitsBaseTwo(tbf->seqCount)); for (i=0; i<tbf->seqCount; ++i) { char name[256]; - if (!fastReadString(f, name)) + if (!(*tbf->ourFastReadString)(f, name)) errAbort("%s is truncated", fileName); lmAllocVar(hash->lm, index); - index->offset = readBits32(f, isSwapped); + index->offset = (*tbf->ourReadBits32)(f, isSwapped); hashAddSaveName(hash, name, index, &index->name); slAddHead(&tbf->indexList, index); } slReverse(&tbf->indexList); return tbf; } +struct twoBitFile *twoBitOpen(char *fileName) +/* Open stdio file, read in header and index. + * Squawk and die if there is a problem. */ +{ +return twoBitOpenExt(fileName, FALSE); +} + struct twoBitFile *twoBitOpenExternalBptIndex(char *twoBitName, char *bptName) /* Open file, read in header, but not regular index. Instead use * bpt index. Beware if you use this the indexList field will be NULL * as will the hash. */ { -struct twoBitFile *tbf = twoBitOpenReadHeader(twoBitName); +struct twoBitFile *tbf = twoBitOpenReadHeader(twoBitName, FALSE); tbf->bpt = bptFileOpen(bptName); if (tbf->seqCount != tbf->bpt->itemCount) errAbort("%s and %s don't have same number of sequences!", twoBitName, bptName); return tbf; } static int findGreatestLowerBound(int blockCount, bits32 *pos, int val) /* Find index of greatest element in posArray that is less * than or equal to val using a binary search. */ { int startIx=0, endIx=blockCount-1, midIx; int posVal; @@ -383,127 +497,132 @@ if (posVal < val) startIx = midIx+1; else endIx = midIx; } } static void twoBitSeekTo(struct twoBitFile *tbf, char *name) /* Seek to start of named record. Abort if can't find it. */ { if (tbf->bpt) { bits32 offset; if (!bptFileFind(tbf->bpt, name, strlen(name), &offset, sizeof(offset))) errAbort("%s is not in %s", name, tbf->bpt->fileName); - fseek(tbf->f, offset, SEEK_SET); + (*tbf->ourSeek)(tbf->f, offset); } else { struct twoBitIndex *index = hashFindVal(tbf->hash, name); if (index == NULL) errAbort("%s is not in %s", name, tbf->fileName); - fseek(tbf->f, index->offset, SEEK_SET); + (*tbf->ourSeek)(tbf->f, index->offset); } } -static void readBlockCoords(FILE *f, boolean isSwapped, bits32 *retBlockCount, +static void readBlockCoords(struct twoBitFile *tbf, boolean isSwapped, bits32 *retBlockCount, bits32 **retBlockStarts, bits32 **retBlockSizes) /* Read in blockCount, starts and sizes from file. (Same structure used for * both blocks of N's and masked blocks.) */ { -bits32 blkCount = readBits32(f, isSwapped); +bits32 blkCount = (*tbf->ourReadBits32)(tbf->f, isSwapped); *retBlockCount = blkCount; if (blkCount == 0) { *retBlockStarts = NULL; *retBlockSizes = NULL; } else { bits32 *nStarts, *nSizes; AllocArray(nStarts, blkCount); AllocArray(nSizes, blkCount); - mustRead(f, nStarts, sizeof(nStarts[0]) * blkCount); - mustRead(f, nSizes, sizeof(nSizes[0]) * blkCount); + (*tbf->ourMustRead)(tbf->f, nStarts, sizeof(nStarts[0]) * blkCount); + (*tbf->ourMustRead)(tbf->f, nSizes, sizeof(nSizes[0]) * blkCount); if (isSwapped) { int i; for (i=0; i<blkCount; ++i) { nStarts[i] = byteSwap32(nStarts[i]); nSizes[i] = byteSwap32(nSizes[i]); } } *retBlockStarts = nStarts; *retBlockSizes = nSizes; } } struct twoBit *twoBitOneFromFile(struct twoBitFile *tbf, char *name) /* Get single sequence as two bit. */ { bits32 packByteCount; boolean isSwapped = tbf->isSwapped; struct twoBit *twoBit; AllocVar(twoBit); twoBit->name = cloneString(name); -FILE *f = tbf->f; +void *f = tbf->f; /* Find offset in index and seek to it */ twoBitSeekTo(tbf, name); /* Read in seqSize. */ -twoBit->size = readBits32(f, isSwapped); +twoBit->size = (*tbf->ourReadBits32)(f, isSwapped); /* Read in blocks of N. */ -readBlockCoords(f, isSwapped, &(twoBit->nBlockCount), +readBlockCoords(tbf, isSwapped, &(twoBit->nBlockCount), &(twoBit->nStarts), &(twoBit->nSizes)); /* Read in masked blocks. */ -readBlockCoords(f, isSwapped, &(twoBit->maskBlockCount), +readBlockCoords(tbf, isSwapped, &(twoBit->maskBlockCount), &(twoBit->maskStarts), &(twoBit->maskSizes)); /* Reserved word. */ -twoBit->reserved = readBits32(f, isSwapped); +twoBit->reserved = (*tbf->ourReadBits32)(f, isSwapped); /* Read in data. */ packByteCount = packedSize(twoBit->size); twoBit->data = needLargeMem(packByteCount); -mustRead(f, twoBit->data, packByteCount); +(*tbf->ourMustRead)(f, twoBit->data, packByteCount); return twoBit; } -struct twoBit *twoBitFromFile(char *fileName) +struct twoBit *twoBitFromFileExt(char *fileName, boolean useUdc) /* Get twoBit list of all sequences in twoBit file. */ { -struct twoBitFile *tbf = twoBitOpen(fileName); +struct twoBitFile *tbf = twoBitOpenExt(fileName, useUdc); struct twoBitIndex *index; struct twoBit *twoBitList = NULL; for (index = tbf->indexList; index != NULL; index = index->next) { struct twoBit *twoBit = twoBitOneFromFile(tbf, index->name); slAddHead(&twoBitList, twoBit); } twoBitClose(&tbf); slReverse(&twoBitList); return twoBitList; } +struct twoBit *twoBitFromFile(char *fileName) +/* Get twoBit list of all sequences in twoBit file. */ +{ +return twoBitFromFileExt(fileName, FALSE); +} void twoBitFree(struct twoBit **pTwoBit) /* Free up a two bit structure. */ { struct twoBit *twoBit = *pTwoBit; if (twoBit != NULL) { freeMem(twoBit->nStarts); freeMem(twoBit->nSizes); freeMem(twoBit->maskStarts); freeMem(twoBit->maskSizes); freeMem(twoBit->data); freez(pTwoBit); } } @@ -523,82 +642,82 @@ struct dnaSeq *twoBitReadSeqFragExt(struct twoBitFile *tbf, char *name, int fragStart, int fragEnd, boolean doMask, int *retFullSize) /* Read part of sequence from .2bit file. To read full * sequence call with start=end=0. Sequence will be lower * case if doMask is false, mixed case (repeats in lower) * if doMask is true. */ { struct dnaSeq *seq; bits32 seqSize; bits32 nBlockCount, maskBlockCount; bits32 *nStarts = NULL, *nSizes = NULL; bits32 *maskStarts = NULL, *maskSizes = NULL; boolean isSwapped = tbf->isSwapped; -FILE *f = tbf->f; +void *f = tbf->f; int i; int packByteCount, packedStart, packedEnd, remainder, midStart, midEnd; int outSize; UBYTE *packed, *packedAlloc; DNA *dna; /* Find offset in index and seek to it */ dnaUtilOpen(); twoBitSeekTo(tbf, name); /* Read in seqSize. */ -seqSize = readBits32(f, isSwapped); +seqSize = (*tbf->ourReadBits32)(f, isSwapped); if (fragEnd == 0) fragEnd = seqSize; if (fragEnd > seqSize) errAbort("twoBitReadSeqFrag in %s end (%d) >= seqSize (%d)", name, fragEnd, seqSize); outSize = fragEnd - fragStart; if (outSize < 1) errAbort("twoBitReadSeqFrag in %s start (%d) >= end (%d)", name, fragStart, fragEnd); /* Read in blocks of N. */ -readBlockCoords(f, isSwapped, &nBlockCount, &nStarts, &nSizes); +readBlockCoords(tbf, isSwapped, &nBlockCount, &nStarts, &nSizes); /* Read in masked blocks. */ -readBlockCoords(f, isSwapped, &maskBlockCount, &maskStarts, &maskSizes); +readBlockCoords(tbf, isSwapped, &maskBlockCount, &maskStarts, &maskSizes); /* Skip over reserved word. */ -readBits32(f, isSwapped); +(*tbf->ourReadBits32)(f, isSwapped); /* Allocate dnaSeq, and fill in zero tag at end of sequence. */ AllocVar(seq); if (outSize == seqSize) seq->name = cloneString(name); else { char buf[256*2]; safef(buf, sizeof(buf), "%s:%d-%d", name, fragStart, fragEnd); seq->name = cloneString(buf); } seq->size = outSize; dna = seq->dna = needLargeMem(outSize+1); seq->dna[outSize] = 0; /* Skip to bits we need and read them in. */ packedStart = (fragStart>>2); packedEnd = ((fragEnd+3)>>2); packByteCount = packedEnd - packedStart; packed = packedAlloc = needLargeMem(packByteCount); -fseek(f, packedStart, SEEK_CUR); -mustRead(f, packed, packByteCount); +(*tbf->ourSeekCur)(f, packedStart); +(*tbf->ourMustRead)(f, packed, packByteCount); /* Handle case where everything is in one packed byte */ if (packByteCount == 1) { int pOff = (packedStart<<2); int pStart = fragStart - pOff; int pEnd = fragEnd - pOff; UBYTE partial = *packed; assert(pEnd <= 4); assert(pStart >= 0); for (i=pStart; i<pEnd; ++i) *dna++ = valToNt[(partial >> (6-i-i)) & 3]; } else { @@ -705,107 +824,124 @@ { return twoBitReadSeqFragExt(tbf, name, fragStart, fragEnd, TRUE, NULL); } struct dnaSeq *twoBitReadSeqFragLower(struct twoBitFile *tbf, char *name, int fragStart, int fragEnd) /* Same as twoBitReadSeqFrag, but sequence is returned in lower case. */ { return twoBitReadSeqFragExt(tbf, name, fragStart, fragEnd, FALSE, NULL); } int twoBitSeqSize(struct twoBitFile *tbf, char *name) /* Return size of sequence in two bit file in bases. */ { twoBitSeekTo(tbf, name); -return readBits32(tbf->f, tbf->isSwapped); +return (*tbf->ourReadBits32)(tbf->f, tbf->isSwapped); } long long twoBitTotalSize(struct twoBitFile *tbf) /* Return total size of all sequences in two bit file. */ { struct twoBitIndex *index; long long totalSize = 0; for (index = tbf->indexList; index != NULL; index = index->next) { - fseek(tbf->f, index->offset, SEEK_SET); - totalSize += readBits32(tbf->f, tbf->isSwapped); + (*tbf->ourSeek)(tbf->f, index->offset); + totalSize += (*tbf->ourReadBits32)(tbf->f, tbf->isSwapped); } return totalSize; } -struct dnaSeq *twoBitLoadAll(char *spec) +struct dnaSeq *twoBitLoadAllExt(char *spec, boolean useUdc) /* Return list of all sequences matching spec, which is in * the form: * * file/path/input.2bit[:seqSpec1][,seqSpec2,...] * * where seqSpec is either * seqName * or * seqName:start-end */ { struct twoBitSpec *tbs = twoBitSpecNew(spec); -struct twoBitFile *tbf = twoBitOpen(tbs->fileName); +struct twoBitFile *tbf = twoBitOpenExt(tbs->fileName, useUdc); struct dnaSeq *list = NULL; if (tbs->seqs != NULL) { struct twoBitSeqSpec *tbss; for (tbss = tbs->seqs; tbss != NULL; tbss = tbss->next) slSafeAddHead(&list, twoBitReadSeqFrag(tbf, tbss->name, tbss->start, tbss->end)); } else { struct twoBitIndex *index; for (index = tbf->indexList; index != NULL; index = index->next) slSafeAddHead(&list, twoBitReadSeqFrag(tbf, index->name, 0, 0)); } slReverse(&list); twoBitClose(&tbf); twoBitSpecFree(&tbs); return list; } -struct slName *twoBitSeqNames(char *fileName) +struct dnaSeq *twoBitLoadAll(char *spec) +{ +return twoBitLoadAllExt(spec, FALSE); +} + +struct slName *twoBitSeqNamesExt(char *fileName, boolean useUdc) /* Get list of all sequences in twoBit file. */ { -struct twoBitFile *tbf = twoBitOpen(fileName); +struct twoBitFile *tbf = twoBitOpenExt(fileName, useUdc); struct twoBitIndex *index; struct slName *name, *list = NULL; for (index = tbf->indexList; index != NULL; index = index->next) { name = slNameNew(index->name); slAddHead(&list, name); } twoBitClose(&tbf); slReverse(&list); return list; } -boolean twoBitIsFile(char *fileName) +struct slName *twoBitSeqNames(char *fileName) +/* Get list of all sequences in twoBit file. */ +{ +return twoBitSeqNamesExt(fileName, FALSE); +} + +boolean twoBitIsFileExt(char *fileName, boolean useUdc) /* Return TRUE if file is in .2bit format. */ { -FILE *f = mustOpen(fileName, "rb"); +struct twoBitFile *tbf = getTbfAndOpen(fileName, useUdc); boolean isSwapped; -boolean isTwoBit = twoBitSigRead(f, &isSwapped); +boolean isTwoBit = twoBitSigRead(tbf, &isSwapped); -fclose(f); +(*tbf->ourClose)(&tbf->f); return isTwoBit; } +boolean twoBitIsFile(char *fileName) +/* Return TRUE if file is in .2bit format. */ +{ +return twoBitIsFileExt(fileName, FALSE); +} + boolean twoBitParseRange(char *rangeSpec, char **retFile, char **retSeq, int *retStart, int *retEnd) /* Parse out something in format * file/path/name:seqName:start-end * or * file/path/name:seqName * or * file/path/name:seqName1,seqName2,seqName3,... * This will destroy the input 'rangeSpec' in the process. Returns FALSE if * it doesn't fit this format, setting retFile to rangeSpec, and retSet to * null. If it is the shorter form then start and end will both be returned * as zero, which is ok by twoBitReadSeqFrag. Any of the return arguments * maybe NULL. */ { @@ -1006,86 +1142,86 @@ freeMem(seq); } freeMem(spec->fileName); freeMem(spec); *specPtr = NULL; } } void twoBitOutNBeds(struct twoBitFile *tbf, char *seqName, FILE *outF) /* output a series of bed3's that enumerate the number of N's in a sequence*/ { int nBlockCount; twoBitSeekTo(tbf, seqName); -readBits32(tbf->f, tbf->isSwapped); +(*tbf->ourReadBits32)(tbf->f, tbf->isSwapped); /* Read in blocks of N. */ -nBlockCount = readBits32(tbf->f, tbf->isSwapped); +nBlockCount = (*tbf->ourReadBits32)(tbf->f, tbf->isSwapped); if (nBlockCount > 0) { bits32 *nStarts = NULL, *nSizes = NULL; int i; AllocArray(nStarts, nBlockCount); AllocArray(nSizes, nBlockCount); - mustRead(tbf->f, nStarts, sizeof(nStarts[0]) * nBlockCount); - mustRead(tbf->f, nSizes, sizeof(nSizes[0]) * nBlockCount); + (*tbf->ourMustRead)(tbf->f, nStarts, sizeof(nStarts[0]) * nBlockCount); + (*tbf->ourMustRead)(tbf->f, nSizes, sizeof(nSizes[0]) * nBlockCount); if (tbf->isSwapped) { for (i=0; i<nBlockCount; ++i) { nStarts[i] = byteSwap32(nStarts[i]); nSizes[i] = byteSwap32(nSizes[i]); } } for (i=0; i<nBlockCount; ++i) { fprintf(outF, "%s\t%d\t%d\n", seqName, nStarts[i], nStarts[i] + nSizes[i]); } freez(&nStarts); freez(&nSizes); } } int twoBitSeqSizeNoNs(struct twoBitFile *tbf, char *seqName) /* return the size of the sequence, not counting N's*/ { int nBlockCount; int size; twoBitSeekTo(tbf, seqName); -size = readBits32(tbf->f, tbf->isSwapped); +size = (*tbf->ourReadBits32)(tbf->f, tbf->isSwapped); /* Read in blocks of N. */ -nBlockCount = readBits32(tbf->f, tbf->isSwapped); +nBlockCount = (*tbf->ourReadBits32)(tbf->f, tbf->isSwapped); if (nBlockCount > 0) { bits32 *nStarts = NULL, *nSizes = NULL; int i; AllocArray(nStarts, nBlockCount); AllocArray(nSizes, nBlockCount); - mustRead(tbf->f, nStarts, sizeof(nStarts[0]) * nBlockCount); - mustRead(tbf->f, nSizes, sizeof(nSizes[0]) * nBlockCount); + (*tbf->ourMustRead)(tbf->f, nStarts, sizeof(nStarts[0]) * nBlockCount); + (*tbf->ourMustRead)(tbf->f, nSizes, sizeof(nSizes[0]) * nBlockCount); if (tbf->isSwapped) { for (i=0; i<nBlockCount; ++i) { nStarts[i] = byteSwap32(nStarts[i]); nSizes[i] = byteSwap32(nSizes[i]); } } for (i=0; i<nBlockCount; ++i) { size -= nSizes[i]; } freez(&nStarts);