965f6d70ac26f0fc37afe89305955c826edbe031 braney Mon Apr 10 15:03:30 2017 -0700 allow -long option to faToTwoBit which builds a twoBit file with 64-bit offsets in the index instead of 32-bit offsets. This allows the total amount of stored sequence to be greater than 2Gb diff --git src/lib/twoBit.c src/lib/twoBit.c index 06cc295..5db09d2 100644 --- src/lib/twoBit.c +++ src/lib/twoBit.c @@ -40,30 +40,35 @@ static void udcMustReadWrap(void *file, void *buf, size_t size) { udcMustRead((struct udcFile *)file, buf, size); } static void udcFileCloseWrap(void *pFile) { udcFileClose((struct udcFile **)pFile); } static bits32 udcReadBits32Wrap(void *f, boolean isSwapped) { return udcReadBits32((struct udcFile *)f, isSwapped); } +static bits64 udcReadBits64Wrap(void *f, boolean isSwapped) +{ +return udcReadBits64((struct udcFile *)f, isSwapped); +} + static boolean udcFastReadStringWrap(void *f, char buf[256]) { return udcFastReadString((struct udcFile *)f, buf); } /* now the stdio wrappers */ static void seekCurWrap(void *file, bits64 offset) { fseek((FILE *)file, offset, SEEK_CUR); } static void seekWrap(void *file, bits64 offset) { fseek((FILE *)file, offset, SEEK_SET); } @@ -76,56 +81,63 @@ static void mustReadWrap(void *file, void *buf, size_t size) { mustRead((FILE *)file, buf, size); } static void fileCloseWrap(void *pFile) { carefulClose((FILE **)pFile); } static bits32 readBits32Wrap(void *f, boolean isSwapped) { return readBits32((FILE *)f, isSwapped); } +static bits64 readBits64Wrap(void *f, boolean isSwapped) +{ +return readBits64((FILE *)f, isSwapped); +} + static boolean fastReadStringWrap(void *f, char buf[256]) { return fastReadString((FILE *)f, buf); } static void setFileFuncs( struct twoBitFile *tbf, boolean useUdc) /* choose the proper function pointers depending on whether * this open twoBit is using stdio or UDC */ { if (useUdc) { tbf->ourSeekCur = udcSeekCurWrap; tbf->ourSeek = udcSeekWrap; tbf->ourTell = udcTellWrap; tbf->ourReadBits32 = udcReadBits32Wrap; + tbf->ourReadBits64 = udcReadBits64Wrap; tbf->ourFastReadString = udcFastReadStringWrap; tbf->ourClose = udcFileCloseWrap; tbf->ourMustRead = udcMustReadWrap; } else { tbf->ourSeekCur = seekCurWrap; tbf->ourSeek = seekWrap; tbf->ourTell = tellWrap; tbf->ourReadBits32 = readBits32Wrap; + tbf->ourReadBits64 = readBits64Wrap; tbf->ourFastReadString = fastReadStringWrap; tbf->ourClose = fileCloseWrap; tbf->ourMustRead = mustReadWrap; } } static int countBlocksOfN(char *s, int size) /* Count number of blocks of N's (or n's) in s. */ { int i; boolean isN, lastIsN = FALSE; char c; int blockCount = 0; for (i=0; i<size; ++i) @@ -311,76 +323,98 @@ fwrite(twoBit->nSizes, sizeof(twoBit->nSizes[0]), twoBit->nBlockCount, f); } writeOne(f, twoBit->maskBlockCount); if (twoBit->maskBlockCount > 0) { fwrite(twoBit->maskStarts, sizeof(twoBit->maskStarts[0]), twoBit->maskBlockCount, f); fwrite(twoBit->maskSizes, sizeof(twoBit->maskSizes[0]), twoBit->maskBlockCount, f); } writeOne(f, twoBit->reserved); mustWrite(f, twoBit->data, packedSize(twoBit->size)); } -void twoBitWriteHeader(struct twoBit *twoBitList, FILE *f) +void twoBitWriteHeaderExt(struct twoBit *twoBitList, FILE *f, boolean useLong) /* Write out header portion of twoBit file, including initial - * index */ + * index. If useLong is True, use 64 bit quantities for the index offsets to support >4Gb assemblies */ { bits32 sig = twoBitSig; bits32 version = 0; +if (useLong) + version = 1; + bits32 seqCount = slCount(twoBitList); bits32 reserved = 0; bits32 offset = 0; +bits64 longOffset = 0; struct twoBit *twoBit; long long counter = 0; /* check for 32 bit overflow */ /* Write out fixed parts of header. */ writeOne(f, sig); writeOne(f, version); writeOne(f, seqCount); writeOne(f, reserved); /* Figure out location of first byte past index. * Each index entry contains 4 bytes of offset information * and the name of the sequence, which is variable length. */ -offset = sizeof(sig) + sizeof(version) + sizeof(seqCount) + sizeof(reserved); +longOffset = offset = sizeof(sig) + sizeof(version) + sizeof(seqCount) + sizeof(reserved); for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next) { int nameLen = strlen(twoBit->name); if (nameLen > 255) errAbort("name %s too long", twoBit->name); + if (useLong) + longOffset += nameLen + 1 + sizeof(bits64); + else offset += nameLen + 1 + sizeof(bits32); } /* Write out index. */ for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next) { int size = twoBitSizeInFile(twoBit); writeString(f, twoBit->name); + if (useLong) + { + writeOne(f, longOffset); + longOffset += size; + } + else + { writeOne(f, offset); offset += size; + } counter += (long long)size; - if (counter > UINT_MAX ) + if (!useLong && (counter > UINT_MAX )) errAbort("Error in faToTwoBit, index overflow at %s. The 2bit format " "does not support indexes larger than %dGb, \n" - "please split up into smaller files.\n", + "please split up into smaller files, or use -long option.\n", twoBit->name, UINT_MAX/1000000000); } } +void twoBitWriteHeader(struct twoBit *twoBitList, FILE *f) +/* Write out header portion of twoBit file, including initial + * index */ +{ +twoBitWriteHeaderExt(twoBitList, f, FALSE); +} + void twoBitClose(struct twoBitFile **pTbf) /* Free up resources associated with twoBitFile. */ { struct twoBitFile *tbf = *pTbf; if (tbf != NULL) { twoBitFree(&tbf->seqCache); freez(&tbf->fileName); (*tbf->ourClose)(&tbf->f); hashFree(&tbf->hash); /* The indexList is allocated out of the hash's memory pool. */ bptFileClose(&tbf->bpt); freez(pTbf); } } @@ -422,63 +456,66 @@ { struct twoBitFile *tbf; boolean isSwapped = FALSE; tbf = getTbfAndOpen(fileName, useUdc); /* Allocate header verify signature, and read in * the constant-length bits. */ if (!twoBitSigRead(tbf, &isSwapped)) errAbort("%s doesn't have a valid twoBitSig", fileName); tbf->isSwapped = isSwapped; tbf->fileName = cloneString(fileName); tbf->version = (*tbf->ourReadBits32)(tbf->f, isSwapped); -if (tbf->version != 0) +if ((tbf->version != 0) && (tbf->version != 1)) { - errAbort("Can only handle version 0 of this file. This is version %d", + errAbort("Can only handle version 0 or version 1 of this file. This is version %d", (int)tbf->version); } tbf->seqCount = (*tbf->ourReadBits32)(tbf->f, isSwapped); tbf->reserved = (*tbf->ourReadBits32)(tbf->f, isSwapped); return tbf; } struct twoBitFile *twoBitOpen(char *fileName) /* Open file, read in header and index. * Squawk and die if there is a problem. */ { boolean useUdc = FALSE; if (hasProtocol(fileName)) useUdc = TRUE; struct twoBitFile *tbf = twoBitOpenReadHeader(fileName, useUdc); struct twoBitIndex *index; boolean isSwapped = tbf->isSwapped; int i; struct hash *hash; void *f = tbf->f; /* Read in index. */ hash = tbf->hash = hashNew(digitsBaseTwo(tbf->seqCount)); for (i=0; i<tbf->seqCount; ++i) { char name[256]; if (!(*tbf->ourFastReadString)(f, name)) errAbort("%s is truncated", fileName); lmAllocVar(hash->lm, index); + if (tbf->version == 1) + index->offset = (*tbf->ourReadBits64)(f, isSwapped); + else index->offset = (*tbf->ourReadBits32)(f, isSwapped); hashAddSaveName(hash, name, index, &index->name); slAddHead(&tbf->indexList, index); } slReverse(&tbf->indexList); return tbf; } struct twoBitFile *twoBitOpenExternalBptIndex(char *twoBitName, char *bptName) /* Open file, read in header, but not regular index. Instead use * bpt index. Beware if you use this the indexList field will be NULL * as will the hash. */ { struct twoBitFile *tbf = twoBitOpenReadHeader(twoBitName, FALSE); tbf->bpt = bptFileOpen(bptName);