0a10e299786e9b470b55b2f02b9718ec6c4e4bb4 markd Sat Jun 27 15:33:26 2020 -0700 move genoFind build, save, and load to a library diff --git src/jkOwnLib/genoFind.c src/jkOwnLib/genoFind.c index ae05696..b359c0c 100644 --- src/jkOwnLib/genoFind.c +++ src/jkOwnLib/genoFind.c @@ -1,34 +1,37 @@ /* genoFind - Quickly find where DNA occurs in genome.. */ /* Copyright 2001-2005 Jim Kent. All rights reserved. */ #include "common.h" #include <signal.h> +#include <sys/mman.h> +#include "portable.h" #include "obscure.h" #include "dnautil.h" #include "dnaseq.h" #include "nib.h" #include "twoBit.h" #include "fa.h" #include "dystring.h" #include "errAbort.h" #include "sig.h" #include "ooc.h" #include "genoFind.h" #include "trans3.h" #include "binRange.h" +static char indexMagic[] = "genoFind"; char *gfSignature() /* Return signature that starts each command to gfServer. Helps defend * server from confused clients. */ { static char signature[] = "0ddf270562684f29"; return signature; } volatile boolean pipeBroke = FALSE; /* Flag broken pipes here. */ static void gfPipeHandler(int sigNum) /* Set broken pipe flag. */ { pipeBroke = TRUE; @@ -51,51 +54,283 @@ { oneRead = read(sd, buf + totalRead, size - totalRead); if (oneRead < 0) { perror("Couldn't finish large read"); return oneRead; } else if (oneRead == 0) /* Avoid an infinite loop when the client closed the socket. */ break; totalRead += oneRead; } return totalRead; } +static void genoFindWrite(struct genoFind *gf, FILE *f) +/* write one genoFind structure */ +{ +// write out the parameters +mustWrite(f, &gf->maxPat, sizeof(gf->maxPat)); +mustWrite(f, &gf->minMatch, sizeof(gf->minMatch)); +mustWrite(f, &gf->maxGap, sizeof(gf->maxGap)); +mustWrite(f, &gf->tileSize, sizeof(gf->tileSize)); +mustWrite(f, &gf->stepSize, sizeof(gf->stepSize)); +mustWrite(f, &gf->tileSpaceSize, sizeof(gf->tileSpaceSize)); +mustWrite(f, &gf->tileMask, sizeof(gf->tileMask)); +mustWrite(f, &gf->sourceCount, sizeof(gf->sourceCount)); +mustWrite(f, &gf->isPep, sizeof(gf->isPep)); +mustWrite(f, &gf->allowOneMismatch, sizeof(gf->allowOneMismatch)); +mustWrite(f, &gf->segSize, sizeof(gf->segSize)); +mustWrite(f, &gf->totalSeqSize, sizeof(gf->totalSeqSize)); +// now write out the variable-size arrays. The ones we need to +// keep are listSizes and allocated--endLists/lists are generated +// at load time, and in fact *must* be as they are +// pointer-to-pointers which cannot be mmapped properly. + +// sources: length = gf->sourceCount +int i; +for (i = 0; i < gf->sourceCount; i++) + { + struct gfSeqSource *ss = gf->sources + i; + size_t fileNameLen = ss->fileName ? strlen(ss->fileName) + 1 : 0; + mustWrite(f, &fileNameLen, sizeof(fileNameLen)); + if (fileNameLen != 0) + mustWrite(f, ss->fileName, fileNameLen); + mustWrite(f, &ss->start, sizeof(bits32)); + mustWrite(f, &ss->end, sizeof(bits32)); + // no masking information written/read yet. + } +// listSizes: length = gf->tileSpaceSize +mustWrite(f, gf->listSizes, gf->tileSpaceSize * sizeof(gf->listSizes[0])); + +if (gf->segSize == 0) + { + // use lists + size_t count = 0; + for (i = 0; i < gf->tileSpaceSize; i++) + { + if (gf->listSizes[i] < gf->maxPat) + count += gf->listSizes[i]; + } + mustWrite(f, gf->allocated, count*sizeof(bits32)); + } +else + { + // use endLists + size_t count = 0; + for (i = 0; i < gf->tileSpaceSize; i++) + count += gf->listSizes[i]; + mustWrite(f, gf->allocated, 3*count*sizeof(bits16)); + } +} + void genoFindFree(struct genoFind **pGenoFind) /* Free up a genoFind index. */ { struct genoFind *gf = *pGenoFind; int i; struct gfSeqSource *sources; if (gf != NULL) { freeMem(gf->lists); freeMem(gf->listSizes); freeMem(gf->allocated); if ((sources = gf->sources) != NULL) { for (i=0; i<gf->sourceCount; ++i) bitFree(&sources[i].maskedBits); freeMem(sources); } freez(pGenoFind); } } +static struct genoFind *loadGenoFind(FILE *f, void *memMapped) +/* construct one genoFind from mapped file */ +{ +struct genoFind *gf; +AllocVar(gf); + +// read the parameters +mustRead(f, &gf->maxPat, sizeof(gf->maxPat)); +mustRead(f, &gf->minMatch, sizeof(gf->minMatch)); +mustRead(f, &gf->maxGap, sizeof(gf->maxGap)); +mustRead(f, &gf->tileSize, sizeof(gf->tileSize)); +mustRead(f, &gf->stepSize, sizeof(gf->stepSize)); +mustRead(f, &gf->tileSpaceSize, sizeof(gf->tileSpaceSize)); +mustRead(f, &gf->tileMask, sizeof(gf->tileMask)); +mustRead(f, &gf->sourceCount, sizeof(gf->sourceCount)); +mustRead(f, &gf->isPep, sizeof(gf->isPep)); +mustRead(f, &gf->allowOneMismatch, sizeof(gf->allowOneMismatch)); +mustRead(f, &gf->segSize, sizeof(gf->segSize)); +mustRead(f, &gf->totalSeqSize, sizeof(gf->totalSeqSize)); + +// sources: length = gf->sourceCount +gf->sources = needLargeMem(gf->sourceCount * sizeof(struct gfSeqSource)); +int i; +for (i = 0; i < gf->sourceCount; i++) + { + struct gfSeqSource *ss = gf->sources + i; + size_t fileNameLen; + mustRead(f, &fileNameLen, sizeof(fileNameLen)); + if (fileNameLen != 0) + { + ss->fileName = malloc(fileNameLen); + mustRead(f, ss->fileName, fileNameLen); + } + mustRead(f, &ss->start, sizeof(bits32)); + mustRead(f, &ss->end, sizeof(bits32)); + // no seq information written/read + // no masking information written/read + } + +// listSizes: length = (gf->tileSpaceSize) +gf->listSizes = memMapped + ftell(f); +mustSeek(f, (gf->tileSpaceSize * sizeof(gf->listSizes[0])), SEEK_CUR); +gf->allocated = memMapped + ftell(f); +if (gf->segSize == 0) + { + // use lists + gf->lists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->lists[0])); + bits32 *cur = gf->allocated; + size_t count = 0; + for (i = 0; i < gf->tileSpaceSize; i++) + { + if (gf->listSizes[i] < gf->maxPat) + { + gf->lists[i] = cur; + cur += gf->listSizes[i]; + count += gf->listSizes[i]; + } + } + mustSeek(f, count*sizeof(bits32), SEEK_CUR); + } +else + { + // use endLists + gf->endLists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->endLists[0])); + bits16 *cur = gf->allocated; + size_t count = 0; + for (i = 0; i < gf->tileSpaceSize; i++) + { + gf->endLists[i] = cur; + cur += 3 * gf->listSizes[i]; + count += gf->listSizes[i]; + } + mustSeek(f, 3*count*sizeof(bits16), SEEK_CUR); + } +return gf; +} + +static struct genoFindIndex* genoFindIndexNew() +/* construct an empty genoFindIndex */ +{ +struct genoFindIndex *gfIdx; +AllocVar(gfIdx); +return gfIdx; +} + +struct genoFindIndex* genoFindIndexBuild(int fileCount, char *seqFiles[], + int minMatch, int maxGap, int tileSize, + int repMatch, boolean doTrans, char *oocFile, + boolean allowOneMismatch, boolean doMask, + int stepSize, boolean noSimpRepMask) +/* build a untranslated or translated index */ +{ +struct genoFindIndex* gfIdx = genoFindIndexNew(); +gfIdx->isTrans = doTrans; +if (doTrans) + { + gfIndexTransNibsAndTwoBits(gfIdx->transGf, fileCount, seqFiles, + minMatch, maxGap, tileSize, repMatch, oocFile, allowOneMismatch, + doMask, stepSize, noSimpRepMask); + } +else + { + gfIdx->untransGf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, + maxGap, tileSize, repMatch, oocFile, allowOneMismatch, + stepSize, noSimpRepMask); + } +return gfIdx; +} + +void genoFindIndexWrite(struct genoFindIndex *gfIdx, char *fileName) +/* write index to file that can be mapped */ +{ +// create in atomic matter so we don't end up with partial index +char fileNameTmp[PATH_LEN]; +safef(fileNameTmp, sizeof(fileNameTmp), "%s.%s.%d.tmp", fileName, getHost(), getpid()); +unlink(fileNameTmp); + +FILE *f = mustOpen(fileNameTmp, "w"); + +mustWrite(f, indexMagic, sizeof(indexMagic)); +mustWrite(f, &gfIdx->isTrans, sizeof(gfIdx->isTrans)); + +if (gfIdx->isTrans) + { + int i, j; + for (i = 0; i < 2; i++) + for (j = 0; j < 3; j++) + genoFindWrite(gfIdx->transGf[i][j], f); + } +else + { + genoFindWrite(gfIdx->untransGf, f); + } + +carefulClose(&f); +mustRename(fileNameTmp, fileName); +} + +struct genoFindIndex* genoFindIndexLoad(char *fileName, boolean isTrans) +/* load indexes from file. */ +{ +struct genoFindIndex* gfIdx = genoFindIndexNew(); + +FILE *f = mustOpen(fileName, "r"); +char fileMagic[sizeof(indexMagic) + 1]; +mustRead(f, fileMagic, sizeof(indexMagic)); +fileMagic[sizeof(indexMagic)] = '\0'; +if (strcmp(fileMagic, indexMagic)) + errAbort("wrong magic string for index file"); +mustRead(f, &gfIdx->isTrans, sizeof(gfIdx->isTrans)); +if (isTrans != gfIdx->isTrans) + errAbort("index file isTrans==%d and -trans==%d", gfIdx->isTrans, isTrans); +gfIdx->memLength = fileSize(fileName); +gfIdx->memMapped = mmap(NULL, gfIdx->memLength, PROT_READ, MAP_SHARED, fileno(f), 0); +if (gfIdx->memMapped == MAP_FAILED) + errnoAbort("mmap of index file failed: %s", fileName); +if (madvise(gfIdx->memMapped, gfIdx->memLength, MADV_RANDOM | MADV_WILLNEED) < 0) + errnoAbort("madvise of index file failed: %s", fileName); + +if (isTrans) + { + int i, j; + for (i = 0; i < 2; i++) + for (j = 0; j < 3; j++) + gfIdx->transGf[i][j] = loadGenoFind(f, gfIdx->memMapped); + } +else + { + gfIdx->untransGf = loadGenoFind(f, gfIdx->memMapped); + } +carefulClose(&f); +return gfIdx; +} + int gfPowerOf20(int n) /* Return a 20 to the n */ { int res = 20; while (--n > 0) res *= 20; return res; } void gfCheckTileSize(int tileSize, boolean isPep) /* Check that tile size is legal. Abort if not. */ { if (isPep) { if (tileSize < 3 || tileSize > 8)