b6d7e4446453c3ab019b30024741247171a2f040 markd Mon Jun 29 20:04:40 2020 +0000 make sure data is 64-bit aligned, store offsets in headers to simplify diff --git src/jkOwnLib/genoFind.c src/jkOwnLib/genoFind.c index dc158ae..f475290 100644 --- src/jkOwnLib/genoFind.c +++ src/jkOwnLib/genoFind.c @@ -7,31 +7,32 @@ #include "portable.h" #include "obscure.h" #include "dnautil.h" #include "dnaseq.h" #include "nib.h" #include "twoBit.h" #include "fa.h" #include "dystring.h" #include "errAbort.h" #include "sig.h" #include "ooc.h" #include "genoFind.h" #include "trans3.h" #include "binRange.h" -static char indexMagic[] = "genoFind"; +static char indexFileMagic[] = "genoFind"; +static char indexFileVerison[] = "1.0"; char *gfSignature() /* Return signature that starts each command to gfServer. Helps defend * server from confused clients. */ { static char signature[] = "0ddf270562684f29"; return signature; } volatile boolean pipeBroke = FALSE; /* Flag broken pipes here. */ static void gfPipeHandler(int sigNum) /* Set broken pipe flag. */ { pipeBroke = TRUE; @@ -54,287 +55,431 @@ { oneRead = read(sd, buf + totalRead, size - totalRead); if (oneRead < 0) { perror("Couldn't finish large read"); return oneRead; } else if (oneRead == 0) /* Avoid an infinite loop when the client closed the socket. */ break; totalRead += oneRead; } return totalRead; } -static void genoFindWrite(struct genoFind *gf, FILE *f) -/* write one genoFind structure */ -{ -// write out the parameters -mustWrite(f, &gf->maxPat, sizeof(gf->maxPat)); -mustWrite(f, &gf->minMatch, sizeof(gf->minMatch)); -mustWrite(f, &gf->maxGap, sizeof(gf->maxGap)); -mustWrite(f, &gf->tileSize, sizeof(gf->tileSize)); -mustWrite(f, &gf->stepSize, sizeof(gf->stepSize)); -mustWrite(f, &gf->tileSpaceSize, sizeof(gf->tileSpaceSize)); -mustWrite(f, &gf->tileMask, sizeof(gf->tileMask)); -mustWrite(f, &gf->sourceCount, sizeof(gf->sourceCount)); -mustWrite(f, &gf->isPep, sizeof(gf->isPep)); -mustWrite(f, &gf->allowOneMismatch, sizeof(gf->allowOneMismatch)); -mustWrite(f, &gf->segSize, sizeof(gf->segSize)); -mustWrite(f, &gf->totalSeqSize, sizeof(gf->totalSeqSize)); -// now write out the variable-size arrays. The ones we need to -// keep are listSizes and allocated--endLists/lists are generated -// at load time, and in fact *must* be as they are -// pointer-to-pointers which cannot be mmapped properly. - -// sources: length = gf->sourceCount -int i; -for (i = 0; i < gf->sourceCount; i++) - { - struct gfSeqSource *ss = gf->sources + i; - char *fileName = ss->fileName; - if (fileName != NULL) - { - // don't include directories - char *s = strrchr(fileName, '/'); - if (s != NULL) - fileName = s + 1; - } - size_t fileNameLen = fileName ? strlen(fileName) + 1 : 0; - mustWrite(f, &fileNameLen, sizeof(fileNameLen)); - if (fileNameLen != 0) - mustWrite(f, fileName, fileNameLen); - mustWrite(f, &ss->start, sizeof(bits32)); - mustWrite(f, &ss->end, sizeof(bits32)); - // FIXME: no masking information written/read yet. - } -// listSizes: length = gf->tileSpaceSize -mustWrite(f, gf->listSizes, gf->tileSpaceSize * sizeof(gf->listSizes[0])); - -if (gf->segSize == 0) - { - // use lists - size_t count = 0; - for (i = 0; i < gf->tileSpaceSize; i++) - { - if (gf->listSizes[i] < gf->maxPat) - count += gf->listSizes[i]; - } - mustWrite(f, gf->allocated, count*sizeof(bits32)); - } -else - { - // use endLists - size_t count = 0; - for (i = 0; i < gf->tileSpaceSize; i++) - count += gf->listSizes[i]; - mustWrite(f, gf->allocated, 3*count*sizeof(bits16)); - } -} void genoFindFree(struct genoFind **pGenoFind) /* Free up a genoFind index. */ { struct genoFind *gf = *pGenoFind; int i; struct gfSeqSource *sources; if (gf != NULL) { freeMem(gf->lists); freeMem(gf->listSizes); freeMem(gf->allocated); if ((sources = gf->sources) != NULL) { for (i=0; i<gf->sourceCount; ++i) bitFree(&sources[i].maskedBits); freeMem(sources); } freez(pGenoFind); } } -static struct genoFind *genoFindLoad(FILE *f, void *memMapped) -/* construct one genoFind from mapped file */ +static off_t mustSeekAligned(FILE *f) +/* seek so that the current offset is 64-bit aligned */ { -struct genoFind *gf; -AllocVar(gf); +off_t off = ftell(f); +if ((off & 0x7) != 0) + { + off = (off & ~0x7) + 0x8; + mustSeek(f, off, SEEK_SET); + } +return off; +} -// read the parameters -mustRead(f, &gf->maxPat, sizeof(gf->maxPat)); -mustRead(f, &gf->minMatch, sizeof(gf->minMatch)); -mustRead(f, &gf->maxGap, sizeof(gf->maxGap)); -mustRead(f, &gf->tileSize, sizeof(gf->tileSize)); -mustRead(f, &gf->stepSize, sizeof(gf->stepSize)); -mustRead(f, &gf->tileSpaceSize, sizeof(gf->tileSpaceSize)); -mustRead(f, &gf->tileMask, sizeof(gf->tileMask)); -mustRead(f, &gf->sourceCount, sizeof(gf->sourceCount)); -mustRead(f, &gf->isPep, sizeof(gf->isPep)); -mustRead(f, &gf->allowOneMismatch, sizeof(gf->allowOneMismatch)); -mustRead(f, &gf->segSize, sizeof(gf->segSize)); -mustRead(f, &gf->totalSeqSize, sizeof(gf->totalSeqSize)); - -// sources: length = gf->sourceCount + +struct genoFindFileHdr +/* header for genoFind section in file */ +{ + int maxPat; + int minMatch; + int maxGap; + int tileSize; + int stepSize; + int tileSpaceSize; + int tileMask; + int sourceCount; + bool isPep; + bool allowOneMismatch; + bool noSimpRepMask; + int segSize; + + off_t sourcesOff; // offset of sequences sources + off_t listSizesOff; // offset of listSizes + off_t listsOff; // offset of lists or endLists + off_t endListsOff; + bits64 reserved[32]; // vesion 1.0: 32 +}; + +static void genoFindInitHdr(struct genoFind *gf, + struct genoFindFileHdr *hdr) +/* fill in the file header struct from in-memory struct */ +{ +zeroBytes(hdr, sizeof(struct genoFindFileHdr)); +hdr->maxPat = gf->maxPat; +hdr->minMatch = gf->minMatch; +hdr->maxGap = gf->maxGap; +hdr->tileSize = gf->tileSize; +hdr->stepSize = gf->stepSize; +hdr->tileSpaceSize = gf->tileSpaceSize; +hdr->tileMask = gf->tileMask; +hdr->sourceCount = gf->sourceCount; +hdr->isPep = gf->isPep; +hdr->allowOneMismatch = gf->allowOneMismatch; +hdr->noSimpRepMask = gf->noSimpRepMask; +hdr->segSize = gf->segSize; +} + +static void genoFindReadHdr(struct genoFindFileHdr *hdr, + struct genoFind *gf) +/* fill in the in-memory struct from file header struct */ +{ +gf->maxPat = hdr->maxPat; +gf->minMatch = hdr->minMatch; +gf->maxGap = hdr->maxGap; +gf->tileSize = hdr->tileSize; +gf->stepSize = hdr->stepSize; +gf->tileSpaceSize = hdr->tileSpaceSize; +gf->tileMask = hdr->tileMask; +gf->sourceCount = hdr->sourceCount; +gf->isPep = hdr->isPep; +gf->allowOneMismatch = hdr->allowOneMismatch; +gf->noSimpRepMask = hdr->noSimpRepMask; +gf->segSize = hdr->segSize; +} + +static void genoFindWriteSource(struct gfSeqSource *ss, FILE *f) +/* write a gfSeqSource object */ +{ +if ((ss->seq != NULL) || (ss->maskedBits != NULL)) + errAbort("can't write index contained sequences, must used external sequence file"); + +char *fileName = ss->fileName; +if (fileName != NULL) + { + // don't include directories + char *s = strrchr(fileName, '/'); + if (s != NULL) + fileName = s + 1; + } +writeStringSafe(f, fileName); +mustWrite(f, &ss->start, sizeof(bits32)); +mustWrite(f, &ss->end, sizeof(bits32)); +} + +static void genoFindReadSource(FILE *f, struct gfSeqSource *ss) +/* read a gfSeqSource from file */ +{ +ss->fileName = readString(f); +mustRead(f, &ss->start, sizeof(bits32)); +mustRead(f, &ss->end, sizeof(bits32)); +} + +static off_t genoFindWriteSources(struct genoFind *gf, FILE *f) +/* write the sources to the file */ +{ +off_t off = mustSeekAligned(f); +int i; +for (i = 0; i < gf->sourceCount; i++) + genoFindWriteSource(gf->sources + i, f); +return off; +} + +static void genoFindReadSources(FILE *f, off_t off, struct genoFind *gf) +/* read the sources from the file */ +{ +mustSeek(f, off, SEEK_SET); gf->sources = needLargeMem(gf->sourceCount * sizeof(struct gfSeqSource)); int i; for (i = 0; i < gf->sourceCount; i++) + genoFindReadSource(f, gf->sources + i); +} + +static off_t genoFindWriteListSizes(struct genoFind *gf, FILE *f) +/* write the list sizes */ { - struct gfSeqSource *ss = gf->sources + i; - size_t fileNameLen; - mustRead(f, &fileNameLen, sizeof(fileNameLen)); - if (fileNameLen != 0) +off_t off = mustSeekAligned(f); +// length = gf->tileSpaceSize +mustWrite(f, gf->listSizes, gf->tileSpaceSize * sizeof(gf->listSizes[0])); +return off; +} + +static void genoFindMapListSize(void *memMapped, off_t off, struct genoFind *gf) +/* map the list sizes into memory */ { - ss->fileName = malloc(fileNameLen); - mustRead(f, ss->fileName, fileNameLen); +gf->listSizes = memMapped + off; } - mustRead(f, &ss->start, sizeof(bits32)); - mustRead(f, &ss->end, sizeof(bits32)); - // no seq information written/read - // no masking information written/read + +static off_t genoFindWriteLists(struct genoFind *gf, FILE *f) +/* write the lists */ +{ +off_t off = mustSeekAligned(f); +size_t count = 0; +int i; +for (i = 0; i < gf->tileSpaceSize; i++) + if (gf->listSizes[i] < gf->maxPat) + count += gf->listSizes[i]; +mustWrite(f, gf->allocated, count*sizeof(bits32)); +return off; } -// listSizes: length = (gf->tileSpaceSize) -gf->listSizes = memMapped + ftell(f); -mustSeek(f, (gf->tileSpaceSize * sizeof(gf->listSizes[0])), SEEK_CUR); -gf->allocated = memMapped + ftell(f); -if (gf->segSize == 0) +static void genoFindMapLists(void *memMapped, off_t off, struct genoFind *gf) +/* maps the lists into memory */ { - // use lists +gf->allocated = memMapped + off; gf->lists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->lists[0])); bits32 *cur = gf->allocated; size_t count = 0; +int i; for (i = 0; i < gf->tileSpaceSize; i++) { if (gf->listSizes[i] < gf->maxPat) { gf->lists[i] = cur; cur += gf->listSizes[i]; count += gf->listSizes[i]; } } - mustSeek(f, count*sizeof(bits32), SEEK_CUR); } -else + +static off_t genoFindWriteEndLists(struct genoFind *gf, FILE *f) +/* write the endList */ +{ +off_t off = mustSeekAligned(f); +size_t count = 0; +int i; +for (i = 0; i < gf->tileSpaceSize; i++) + count += gf->listSizes[i]; +mustWrite(f, gf->allocated, 3*count*sizeof(bits16)); +return off; +} + +static void genoFindMapEndLists(void *memMapped, off_t off, struct genoFind *gf) +/* maps the endLists into memory */ { - // use endLists gf->endLists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->endLists[0])); bits16 *cur = gf->allocated; size_t count = 0; +int i; for (i = 0; i < gf->tileSpaceSize; i++) { gf->endLists[i] = cur; cur += 3 * gf->listSizes[i]; count += gf->listSizes[i]; } - mustSeek(f, 3*count*sizeof(bits16), SEEK_CUR); } + + +static off_t genoFindWrite(struct genoFind *gf, FILE *f) +/* write one genoFind structure, return offset */ +{ +off_t hdrOff = mustSeekAligned(f); + +struct genoFindFileHdr hdr; +genoFindInitHdr(gf, &hdr); +mustWrite(f, &hdr, sizeof(hdr)); + +// now write out the variable-size arrays. The ones we need to keep are +// sources, listSizes and allocated--endLists/lists are generated at load +// time, and in fact *must* be as they are pointer-to-pointers which cannot be +// mmapped properly. + +hdr.sourcesOff = genoFindWriteSources(gf, f); +hdr.listSizesOff = genoFindWriteListSizes(gf, f); + +if (gf->segSize == 0) + hdr.listsOff= genoFindWriteLists(gf, f); +else + hdr.endListsOff = genoFindWriteEndLists(gf, f); + +// rewrite header with offsets +off_t endOff = mustSeekAligned(f); +mustSeek(f, hdrOff, SEEK_SET); +mustWrite(f, &hdr, sizeof(hdr)); +mustSeek(f, endOff, SEEK_SET); +return hdrOff; +} + +static struct genoFind *genoFindLoad(FILE* f ,void *memMapped, off_t off) +/* construct one genoFind from mapped file */ +{ +struct genoFind *gf; +AllocVar(gf); +struct genoFindFileHdr *hdr = memMapped + off; +genoFindReadHdr(hdr, gf); + +genoFindReadSources(f, hdr->sourcesOff, gf); +genoFindMapListSize(memMapped, hdr->listSizesOff, gf); + +if (gf->segSize == 0) + genoFindMapLists(memMapped, hdr->listsOff, gf); +else + genoFindMapEndLists(memMapped, hdr->endListsOff, gf); return gf; } -static struct genoFindIndex* genoFindIndexNew() +static struct genoFindIndex* genoFindIndexNew(boolean isTrans) /* construct an empty genoFindIndex */ { struct genoFindIndex *gfIdx; AllocVar(gfIdx); +gfIdx->isTrans = isTrans; return gfIdx; } struct genoFindIndex* genoFindIndexBuild(int fileCount, char *seqFiles[], int minMatch, int maxGap, int tileSize, int repMatch, boolean doTrans, char *oocFile, boolean allowOneMismatch, boolean doMask, int stepSize, boolean noSimpRepMask) /* build a untranslated or translated index */ { -struct genoFindIndex* gfIdx = genoFindIndexNew(); +struct genoFindIndex* gfIdx = genoFindIndexNew(doTrans); gfIdx->isTrans = doTrans; if (doTrans) - { gfIndexTransNibsAndTwoBits(gfIdx->transGf, fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, oocFile, allowOneMismatch, doMask, stepSize, noSimpRepMask); - } else - { gfIdx->untransGf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, oocFile, allowOneMismatch, stepSize, noSimpRepMask); - } return gfIdx; } +struct genoFindIndexFileHdr +/* header for genoFind section in file */ +{ + char magic[32]; + char version[32]; + bits32 indexAddressSize; // 32 or 64 bit, compile time. + boolean isTrans; + // offsets to data, only one is filed in based on being translated or note + off_t untransOff; + off_t transOff[2][3]; + bits64 reserved[32]; // vesion 1.0: 32 +}; + +static void genoFindIndexInitHeader(struct genoFindIndex *gfIdx, + struct genoFindIndexFileHdr* hdr) +/* fill in the file header struct from in-memory struct */ +{ +zeroBytes(hdr, sizeof(struct genoFindIndexFileHdr)); +safecpy(hdr->magic, sizeof(hdr->magic), indexFileMagic); +safecpy(hdr->version, sizeof(hdr->version), indexFileVerison); +hdr->indexAddressSize = 32; +hdr->isTrans = gfIdx->isTrans; +} + +static void genoFindIndexReadHeader(void* memMapped, struct genoFindIndexFileHdr* hdr, + struct genoFindIndex* gfIdx) +/* fill in the file header from file and valodate */ +{ +*hdr = *((struct genoFindIndexFileHdr*)memMapped); +if (!sameString(hdr->magic, indexFileMagic)) + errAbort("wrong magic string for index file"); +if (!sameString(hdr->version, indexFileVerison)) + errAbort("unsupported version for index file: %s", hdr->version); +if (hdr->indexAddressSize != 32) + errAbort("not a 32-bit index: %d", hdr->indexAddressSize); +if (hdr->isTrans != gfIdx->isTrans) + errAbort("index file has isTrans=%d, isTrans=%d requested", hdr->isTrans, gfIdx->isTrans); +} + +static void genoFindIndexWriteTrans(struct genoFindIndex *gfIdx, + struct genoFindIndexFileHdr* hdr, + FILE *f) +/* write translated indexes */ +{ +int i, j; +for (i = 0; i < 2; i++) + for (j = 0; j < 3; j++) + hdr->transOff[i][j] = genoFindWrite(gfIdx->transGf[i][j], f); +} + +static void genoFindIndexMapTrans(FILE* f, void* memMapped, struct genoFindIndexFileHdr* hdr, + struct genoFindIndex *gfIdx) +/* mapped translated indexes into memory */ +{ +int i, j; +for (i = 0; i < 2; i++) + for (j = 0; j < 3; j++) + gfIdx->transGf[i][j] = genoFindLoad(f, memMapped, hdr->transOff[i][j]); +} + void genoFindIndexWrite(struct genoFindIndex *gfIdx, char *fileName) /* write index to file that can be mapped */ { // create in atomic matter so we don't end up with partial index char fileNameTmp[PATH_LEN]; safef(fileNameTmp, sizeof(fileNameTmp), "%s.%s.%d.tmp", fileName, getHost(), getpid()); unlink(fileNameTmp); FILE *f = mustOpen(fileNameTmp, "w"); -mustWrite(f, indexMagic, sizeof(indexMagic)); -mustWrite(f, &gfIdx->isTrans, sizeof(gfIdx->isTrans)); +struct genoFindIndexFileHdr hdr; +genoFindIndexInitHeader(gfIdx, &hdr); +mustWrite(f, &hdr, sizeof(hdr)); if (gfIdx->isTrans) - { - int i, j; - for (i = 0; i < 2; i++) - for (j = 0; j < 3; j++) - genoFindWrite(gfIdx->transGf[i][j], f); - } + genoFindIndexWriteTrans(gfIdx, &hdr, f); else - { - genoFindWrite(gfIdx->untransGf, f); - } + hdr.untransOff = genoFindWrite(gfIdx->untransGf, f); +// rewrite header to save offsets +mustSeek(f, 0, SEEK_SET); +mustWrite(f, &hdr, sizeof(hdr)); carefulClose(&f); mustRename(fileNameTmp, fileName); } struct genoFindIndex* genoFindIndexLoad(char *fileName, boolean isTrans) /* load indexes from file. */ { -struct genoFindIndex* gfIdx = genoFindIndexNew(); +struct genoFindIndex* gfIdx = genoFindIndexNew(isTrans); FILE *f = mustOpen(fileName, "r"); -char fileMagic[sizeof(indexMagic) + 1]; -mustRead(f, fileMagic, sizeof(indexMagic)); -fileMagic[sizeof(indexMagic)] = '\0'; -if (strcmp(fileMagic, indexMagic)) - errAbort("wrong magic string for index file"); -mustRead(f, &gfIdx->isTrans, sizeof(gfIdx->isTrans)); -if (isTrans != gfIdx->isTrans) - errAbort("index file isTrans==%d and -trans==%d", gfIdx->isTrans, isTrans); gfIdx->memLength = fileSize(fileName); gfIdx->memMapped = mmap(NULL, gfIdx->memLength, PROT_READ, MAP_SHARED, fileno(f), 0); if (gfIdx->memMapped == MAP_FAILED) errnoAbort("mmap of index file failed: %s", fileName); if (madvise(gfIdx->memMapped, gfIdx->memLength, MADV_RANDOM | MADV_WILLNEED) < 0) errnoAbort("madvise of index file failed: %s", fileName); +struct genoFindIndexFileHdr hdr; +genoFindIndexReadHeader(gfIdx->memMapped, &hdr, gfIdx); + if (isTrans) - { - int i, j; - for (i = 0; i < 2; i++) - for (j = 0; j < 3; j++) - gfIdx->transGf[i][j] = genoFindLoad(f, gfIdx->memMapped); - } + genoFindIndexMapTrans(f, gfIdx->memMapped, &hdr, gfIdx); else - { - gfIdx->untransGf = genoFindLoad(f, gfIdx->memMapped); - } + gfIdx->untransGf = genoFindLoad(f, gfIdx->memMapped, hdr.untransOff); + carefulClose(&f); return gfIdx; } int gfPowerOf20(int n) /* Return a 20 to the n */ { int res = 20; while (--n > 0) res *= 20; return res; } void gfCheckTileSize(int tileSize, boolean isPep) /* Check that tile size is legal. Abort if not. */