0a10e299786e9b470b55b2f02b9718ec6c4e4bb4 markd Sat Jun 27 15:33:26 2020 -0700 move genoFind build, save, and load to a library diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c index b12a7fd..1723c85 100644 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@ -1,24 +1,23 @@ /* gfServer - set up an index of the genome in memory and * respond to search requests. */ /* Copyright 2001-2003 Jim Kent. All rights reserved. */ #include "common.h" #include <signal.h> #include <sys/socket.h> #include <netinet/in.h> #include <netdb.h> -#include <sys/mman.h> #include "portable.h" #include "net.h" #include "dnautil.h" #include "dnaseq.h" #include "nib.h" #include "twoBit.h" #include "fa.h" #include "dystring.h" #include "errAbort.h" #include "memalloc.h" #include "genoFind.h" #include "options.h" #include "trans3.h" #include "log.h" #include "internet.h" @@ -60,33 +59,30 @@ int stepSize = 0; /* Can be overridden from command line. */ boolean doTrans = FALSE; /* Do translation? */ boolean allowOneMismatch = FALSE; boolean noSimpRepMask = FALSE; int repMatch = 1024; /* Can be overridden from command line. */ int maxDnaHits = 100; /* Can be overridden from command line. */ int maxTransHits = 200; /* Can be overridden from command line. */ int maxGap = gfMaxGap; boolean seqLog = FALSE; boolean ipLog = FALSE; boolean doMask = FALSE; boolean canStop = FALSE; boolean writeIndex = FALSE; char *indexFile = NULL; -static char indexMagic[] = "genoFind"; - - void usage() /* Explain usage and exit. */ { errAbort( "gfServer v %s - Make a server to quickly find where DNA occurs in genome\n" " To set up a server:\n" " gfServer start host port file(s)\n" " where the files are .2bit or .nib format files specified relative to the current directory\n" " To remove a server:\n" " gfServer stop host port\n" " To query a server with DNA sequence:\n" " gfServer query host port probe.fa\n" " To query a server with protein sequence:\n" " gfServer protQuery host port probe.fa\n" " To query a server with translated DNA sequence:\n" @@ -459,49 +455,48 @@ /* Clean up and report problem. */ { memTrackerEnd(); popAbortHandler(); // must come after memTracker } static void errorSafeCleanupMess(int connectionHandle, char *message) /* Clean up and report problem. */ { errorSafeCleanup(); logError("Recovering from error via longjmp"); netSendString(connectionHandle, message); } static void errorSafeQuery(boolean doTrans, boolean queryIsProt, - struct dnaSeq *seq, struct genoFind *gf, struct genoFind *transGf[2][3], + struct dnaSeq *seq, struct genoFindIndex *gfIdx, int connectionHandle, char *buf, struct hash *perSeqMaxHash) /* Wrap error handling code around index query. */ { int status; errorSafeSetup(); status = setjmp(gfRecover); if (status == 0) /* Always true except after long jump. */ { if (doTrans) { if (queryIsProt) - transQuery(transGf, seq, connectionHandle, buf); + transQuery(gfIdx->transGf, seq, connectionHandle, buf); else - transTransQuery(transGf, seq, - connectionHandle, buf); + transTransQuery(gfIdx->transGf, seq, connectionHandle, buf); } else - dnaQuery(gf, seq, connectionHandle, buf, perSeqMaxHash); + dnaQuery(gfIdx->untransGf, seq, connectionHandle, buf, perSeqMaxHash); errorSafeCleanup(); } else /* They long jumped here because of an error. */ { errorSafeCleanupMess(connectionHandle, "Error: gfServer out of memory. Try reducing size of query."); } } static void errorSafePcr(struct genoFind *gf, char *fPrimer, char *rPrimer, int maxDistance, int connectionHandle) /* Wrap error handling around pcr index query. */ { int status; errorSafeSetup(); @@ -557,287 +552,78 @@ } lineFileClose(&lf); } return perSeqMaxHash; } static void hashZeroVals(struct hash *hash) /* Set the value of every element of hash to NULL (0 for ints). */ { struct hashEl *hel; struct hashCookie cookie = hashFirst(hash); while ((hel = hashNext(&cookie)) != NULL) hel->val = 0; } -static void writeGenoFind(struct genoFind *gf, FILE *f) -/* write one genoFind structure */ -{ -// write out the parameters -mustWrite(f, &gf->maxPat, sizeof(gf->maxPat)); -mustWrite(f, &gf->minMatch, sizeof(gf->minMatch)); -mustWrite(f, &gf->maxGap, sizeof(gf->maxGap)); -mustWrite(f, &gf->tileSize, sizeof(gf->tileSize)); -mustWrite(f, &gf->stepSize, sizeof(gf->stepSize)); -mustWrite(f, &gf->tileSpaceSize, sizeof(gf->tileSpaceSize)); -mustWrite(f, &gf->tileMask, sizeof(gf->tileMask)); -mustWrite(f, &gf->sourceCount, sizeof(gf->sourceCount)); -mustWrite(f, &gf->isPep, sizeof(gf->isPep)); -mustWrite(f, &gf->allowOneMismatch, sizeof(gf->allowOneMismatch)); -mustWrite(f, &gf->segSize, sizeof(gf->segSize)); -mustWrite(f, &gf->totalSeqSize, sizeof(gf->totalSeqSize)); -// now write out the variable-size arrays. The ones we need to -// keep are listSizes and allocated--endLists/lists are generated -// at load time, and in fact *must* be as they are -// pointer-to-pointers which cannot be mmapped properly. - -// sources: length = gf->sourceCount -int i; -for (i = 0; i < gf->sourceCount; i++) - { - struct gfSeqSource *ss = gf->sources + i; - size_t fileNameLen = ss->fileName ? strlen(ss->fileName) + 1 : 0; - mustWrite(f, &fileNameLen, sizeof(fileNameLen)); - if (fileNameLen != 0) - mustWrite(f, ss->fileName, fileNameLen); - mustWrite(f, &ss->start, sizeof(bits32)); - mustWrite(f, &ss->end, sizeof(bits32)); - // no masking information written/read yet. - } -// listSizes: length = gf->tileSpaceSize -mustWrite(f, gf->listSizes, gf->tileSpaceSize * sizeof(gf->listSizes[0])); - -if (gf->segSize == 0) - { - // use lists - size_t count = 0; - for (i = 0; i < gf->tileSpaceSize; i++) - { - if (gf->listSizes[i] < gf->maxPat) - count += gf->listSizes[i]; - } - mustWrite(f, gf->allocated, count*sizeof(bits32)); - } -else - { - // use endLists - size_t count = 0; - for (i = 0; i < gf->tileSpaceSize; i++) - count += gf->listSizes[i]; - mustWrite(f, gf->allocated, 3*count*sizeof(bits16)); - } -} - -static void writeGenoFindIndex(struct genoFind *gf, struct genoFind *transGf[2][3], char *fileName) -/* write index to file that can be mapped. Only one of gf or transGf is used. */ -{ -// create in atomic matter so we don't end up with partial index -char fileNameTmp[PATH_LEN]; -safef(fileNameTmp, sizeof(fileNameTmp), "%s.%s.%d.tmp", fileName, getHost(), getpid()); -unlink(fileNameTmp); - -FILE *f = mustOpen(fileNameTmp, "w"); - -mustWrite(f, indexMagic, sizeof(indexMagic)); -mustWrite(f, &doTrans, sizeof(doTrans)); - -if (doTrans) - { - int i, j; - for (i = 0; i < 2; i++) - for (j = 0; j < 3; j++) - writeGenoFind(transGf[i][j], f); - } -else - { - writeGenoFind(gf, f); - } - -carefulClose(&f); -mustRename(fileNameTmp, fileName); -} - -static struct genoFind *loadGenoFind(FILE *f, void *memMapped) -/* construct one genoFind, mapping file */ -{ -struct genoFind *gf; -AllocVar(gf); - -// read the parameters -mustRead(f, &gf->maxPat, sizeof(gf->maxPat)); -mustRead(f, &gf->minMatch, sizeof(gf->minMatch)); -mustRead(f, &gf->maxGap, sizeof(gf->maxGap)); -mustRead(f, &gf->tileSize, sizeof(gf->tileSize)); -mustRead(f, &gf->stepSize, sizeof(gf->stepSize)); -mustRead(f, &gf->tileSpaceSize, sizeof(gf->tileSpaceSize)); -mustRead(f, &gf->tileMask, sizeof(gf->tileMask)); -mustRead(f, &gf->sourceCount, sizeof(gf->sourceCount)); -mustRead(f, &gf->isPep, sizeof(gf->isPep)); -mustRead(f, &gf->allowOneMismatch, sizeof(gf->allowOneMismatch)); -mustRead(f, &gf->segSize, sizeof(gf->segSize)); -mustRead(f, &gf->totalSeqSize, sizeof(gf->totalSeqSize)); - -// sources: length = gf->sourceCount -gf->sources = needLargeMem(gf->sourceCount * sizeof(struct gfSeqSource)); -int i; -for (i = 0; i < gf->sourceCount; i++) - { - struct gfSeqSource *ss = gf->sources + i; - size_t fileNameLen; - mustRead(f, &fileNameLen, sizeof(fileNameLen)); - if (fileNameLen != 0) - { - ss->fileName = malloc(fileNameLen); - mustRead(f, ss->fileName, fileNameLen); - } - mustRead(f, &ss->start, sizeof(bits32)); - mustRead(f, &ss->end, sizeof(bits32)); - // no seq information written/read - // no masking information written/read - } - -// listSizes: length = (gf->tileSpaceSize) -gf->listSizes = memMapped + ftell(f); -mustSeek(f, (gf->tileSpaceSize * sizeof(gf->listSizes[0])), SEEK_CUR); -gf->allocated = memMapped + ftell(f); -if (gf->segSize == 0) - { - // use lists - gf->lists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->lists[0])); - bits32 *cur = gf->allocated; - size_t count = 0; - for (i = 0; i < gf->tileSpaceSize; i++) - { - if (gf->listSizes[i] < gf->maxPat) - { - gf->lists[i] = cur; - cur += gf->listSizes[i]; - count += gf->listSizes[i]; - } - } - mustSeek(f, count*sizeof(bits32), SEEK_CUR); - } -else - { - // use endLists - gf->endLists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->endLists[0])); - bits16 *cur = gf->allocated; - size_t count = 0; - for (i = 0; i < gf->tileSpaceSize; i++) - { - gf->endLists[i] = cur; - cur += 3 * gf->listSizes[i]; - count += gf->listSizes[i]; - } - mustSeek(f, 3*count*sizeof(bits16), SEEK_CUR); - } -return gf; -} - -void loadGenoFindIndex(char *fileName, struct genoFind **gfRet, struct genoFind *transGf[2][3]) -/* load indexes from file. Only one of gfRet or transGf is set. */ -{ -FILE *f = mustOpen(fileName, "r"); -char fileMagic[sizeof(indexMagic) + 1]; -mustRead(f, fileMagic, sizeof(indexMagic)); -fileMagic[sizeof(indexMagic)] = '\0'; -if (strcmp(fileMagic, indexMagic)) - errAbort("wrong magic string for index file"); -boolean isTrans; -mustRead(f, &isTrans, sizeof(isTrans)); -if (doTrans != isTrans) - errAbort("index file isTrans==%d and -trans==%d", isTrans, doTrans); -off_t fsize = fileSize(fileName); -void *memMapped = mmap(NULL, fsize, PROT_READ, MAP_SHARED, fileno(f), 0); -if (memMapped == MAP_FAILED) - errnoAbort("mmap of index file failed: %s", fileName); -if (madvise(memMapped, fsize, MADV_RANDOM | MADV_WILLNEED) < 0) - errnoAbort("madvise of index file failed: %s", fileName); - -if (doTrans) - { - int i, j; - for (i = 0; i < 2; i++) - for (j = 0; j < 3; j++) - transGf[i][j] = loadGenoFind(f, memMapped); - } -else - { - *gfRet = loadGenoFind(f, memMapped); - } -carefulClose(&f); -} - void startServer(char *hostName, char *portName, int fileCount, char *seqFiles[]) /* Load up index and hang out in RAM. */ { -struct genoFind *gf = NULL; -struct genoFind *transGf[2][3] = {{NULL, NULL, NULL}, {NULL, NULL, NULL}}; +struct genoFindIndex *gfIdx = NULL; char buf[256]; char *line, *command; struct sockaddr_in6 fromAddr; socklen_t fromLen; int readSize; int socketHandle = 0, connectionHandle = 0; int port = atoi(portName); time_t curtime; struct tm *loctime; char timestr[256]; netBlockBrokenPipes(); curtime = time (NULL); /* Get the current time. */ loctime = localtime (&curtime); /* Convert it to local time representation. */ strftime (timestr, sizeof(timestr), "%Y-%m-%d %H:%M", loctime); /* formate datetime as string */ logInfo("gfServer version %s on host %s, port %s (%s)", gfVersion, hostName, portName, timestr); struct hash *perSeqMaxHash = maybePerSeqMax(fileCount, seqFiles); time_t startIndexTime = clock1000(); if (writeIndex || (!writeIndex && (indexFile == NULL))) { - if (doTrans) - { - uglyf("starting translated server...\n"); - logInfo("setting up translated index"); - gfIndexTransNibsAndTwoBits(transGf, fileCount, seqFiles, - minMatch, maxGap, tileSize, repMatch, NULL, allowOneMismatch, - doMask, stepSize, noSimpRepMask); - } - else - { - uglyf("starting untranslated server...\n"); - logInfo("setting up untranslated index"); - gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, - maxGap, tileSize, repMatch, NULL, allowOneMismatch, - stepSize, noSimpRepMask); - } + char *desc = doTrans ? "translated" : "untranslated"; + uglyf("starting %s server...\n", desc); + logInfo("setting up %s index", desc); + gfIdx = genoFindIndexBuild(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, doTrans, NULL, + allowOneMismatch, doMask, stepSize, noSimpRepMask); logInfo("indexing building complete in %4.3f seconds", 0.001 * (clock1000() - startIndexTime)); if (writeIndex) { - writeGenoFindIndex(gf, transGf, indexFile); + genoFindIndexWrite(gfIdx, indexFile); logInfo("index file built, exiting: %s", indexFile); exit(0); } } else { - loadGenoFindIndex(indexFile, &gf, transGf); + gfIdx = genoFindIndexLoad(indexFile, doTrans); logInfo("indexing loading complete in %4.3f seconds", 0.001 * (clock1000() - startIndexTime)); } + /* Set up socket. Get ready to listen to it. */ socketHandle = netAcceptingSocket(port, 100); if (socketHandle < 0) errAbort("Fatal Error: Unable to open listening socket on port %d.", port); logInfo("Server ready for queries!"); printf("Server ready for queries!\n"); int connectFailCount = 0; for (;;) { ZeroVar(&fromAddr); fromLen = sizeof(fromAddr); connectionHandle = accept(socketHandle, (struct sockaddr*)&fromAddr, &fromLen); if (connectionHandle < 0) { @@ -987,32 +773,32 @@ { ++trimCount; seq.size = maxSize; seq.dna[maxSize] = 0; } if (queryIsProt) aaCount += seq.size; else baseCount += seq.size; if (seqLog && (logGetFile() != NULL)) { FILE *lf = logGetFile(); faWriteNext(lf, "query", seq.dna, seq.size); fflush(lf); } - errorSafeQuery(doTrans, queryIsProt, &seq, gf, - transGf, connectionHandle, buf, perSeqMaxHash); + errorSafeQuery(doTrans, queryIsProt, &seq, gfIdx, + connectionHandle, buf, perSeqMaxHash); if (perSeqMaxHash) hashZeroVals(perSeqMaxHash); } freez(&seq.dna); } netSendString(connectionHandle, "end"); } } } } else if (sameString("pcr", command)) { char *f = nextWord(&line); char *r = nextWord(&line); char *s = nextWord(&line); @@ -1024,31 +810,31 @@ ++warnCount; } else if (doTrans) { warn("Can't pcr on translated server"); ++warnCount; } else if (badPcrPrimerSeq(f) || badPcrPrimerSeq(r)) { warn("Can only handle ACGT in primer sequences."); ++warnCount; } else { maxDistance = atoi(s); - errorSafePcr(gf, f, r, maxDistance, connectionHandle); + errorSafePcr(gfIdx->untransGf, f, r, maxDistance, connectionHandle); } } else if (sameString("files", command)) { int i; sprintf(buf, "%d", fileCount); netSendString(connectionHandle, buf); for (i=0; i<fileCount; ++i) { sprintf(buf, "%s", seqFiles[i]); netSendString(connectionHandle, buf); } } else {