a1c950cde18a4b977dc47c4528dfa6a9357a1f0f jcarmstr Mon Jul 13 19:13:08 2015 -0700
Option for gfServer to save index as an mmap file on disk to speed startup.
diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c
index 45717e0..53a6f75 100644
--- src/gfServer/gfServer.c
+++ src/gfServer/gfServer.c
@@ -1,23 +1,24 @@
 /* gfServer - set up an index of the genome in memory and
  * respond to search requests. */
 /* Copyright 2001-2003 Jim Kent. All rights reserved. */
 #include "common.h"
 #include <signal.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <netdb.h>
+#include <sys/mman.h>
 #include "portable.h"
 #include "net.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "nib.h"
 #include "twoBit.h"
 #include "fa.h"
 #include "dystring.h"
 #include "errAbort.h"
 #include "memalloc.h"
 #include "genoFind.h"
 #include "options.h"
 #include "trans3.h"
 #include "log.h"
 #include "internet.h"
@@ -33,51 +34,58 @@
     {"maxDnaHits", OPTION_INT},
     {"maxGap", OPTION_INT},
     {"maxNtSize", OPTION_INT},
     {"maxTransHits", OPTION_INT},
     {"minMatch", OPTION_INT},
     {"repMatch", OPTION_INT},
     {"seqLog", OPTION_BOOLEAN},
     {"ipLog", OPTION_BOOLEAN},
     {"debugLog", OPTION_BOOLEAN},
     {"stepSize", OPTION_INT},
     {"tileSize", OPTION_INT},
     {"trans", OPTION_BOOLEAN},
     {"syslog", OPTION_BOOLEAN},
     {"perSeqMax", OPTION_STRING},
     {"noSimpRepMask", OPTION_BOOLEAN},
+    {"writeIndex", OPTION_BOOLEAN},
+    {"indexFile", OPTION_STRING},
     {NULL, 0}
 };
 
 int maxNtSize = 40000;
 int maxAaSize = 8000;
 
 int minMatch = gfMinMatch; /* Can be overridden from command line. */
 int tileSize = gfTileSize; /* Can be overridden from command line. */
 int stepSize = 0; /* Can be overridden from command line. */
 boolean doTrans = FALSE; /* Do translation? */
 boolean allowOneMismatch = FALSE;
 boolean noSimpRepMask = FALSE;
 int repMatch = 1024; /* Can be overridden from command line. */
 int maxDnaHits = 100; /* Can be overridden from command line. */
 int maxTransHits = 200; /* Can be overridden from command line. */
 int maxGap = gfMaxGap;
 boolean seqLog = FALSE;
 boolean ipLog = FALSE;
 boolean doMask = FALSE;
 boolean canStop = FALSE;
+boolean writeIndex = FALSE;
+char *indexFile = NULL;
+
+static char indexMagic[] = "genoFind";
+
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "gfServer v %s - Make a server to quickly find where DNA occurs in genome\n"
   " To set up a server:\n"
   " gfServer start host port file(s)\n"
   " where the files are .2bit or .nib format files specified relative to the current directory\n"
   " To remove a server:\n"
   " gfServer stop host port\n"
   " To query a server with DNA sequence:\n"
   " gfServer query host port probe.fa\n"
   " To query a server with protein sequence:\n"
   " gfServer protQuery host port probe.fa\n"
@@ -114,30 +122,36 @@
   " tile. Default is %d.\n"
   " -noSimpRepMask Suppresses simple repeat masking.\n"
   " -maxDnaHits=N Maximum number of hits for a DNA query that are sent from the server.\n"
   " Default is %d.\n"
   " -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n"
   " Default is %d.\n"
   " -maxNtSize=N Maximum size of untranslated DNA query sequence.\n"
   " Default is %d.\n"
   " -maxAaSize=N Maximum size of protein or translated DNA queries.\n"
   " Default is %d.\n"
   " -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n"
   " -maxDnaHits will be applied to each filename[:seq] separately: each may\n"
   " have at most maxDnaHits/2 hits.\n"
   " Useful for assemblies with many alternate/patch sequences.\n"
   " -canStop If set, a quit message will actually take down the server.\n"
+  " -writeIndex Write the in-memory index to indexFile after building and exit.\n"
+  " -indexFile File for index. If -writeIndex is specified, the file is created,\n"
+  " otherwise it is loaded from this file. Saving index can speed up\n"
+  " gfServer startup by two orders of magnitude. The parameters must\n"
+  " exactly match the parameters when the file is written or bad things\n"
+  " will happen.\n"
   , gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize
   );
 }
 
 /* Note about file(s) specified in the start command:
  The path(s) specified here are sent back exactly as-is
  to clients such as gfClient, hgBlat, webBlat.
  It is intended that relative paths are used.
  Absolute paths starting with '/' tend not to work
  unless the client is on the same machine as the server.
  For use with hgBlat and webBlat, cd to the directory where the file is
  and use the plain file name with no slashes.
  hgBlat will append the path(s) given to dbDb.nibPath.
  webBlat will append the path(s) given to path specified in webBlat.cfg.
@@ -543,73 +557,301 @@
         }
     lineFileClose(&lf);
     }
 return perSeqMaxHash;
 }
 
 static void hashZeroVals(struct hash *hash)
 /* Set the value of every element of hash to NULL (0 for ints). */
 {
 struct hashEl *hel;
 struct hashCookie cookie = hashFirst(hash);
 while ((hel = hashNext(&cookie)) != NULL)
     hel->val = 0;
 }
 
+static void writeGenoFind(struct genoFind *gf, FILE *f)
+/* Write one genoFind structure to f: the scalar parameters, then each
+ * source (file name length, file name, start, end), then listSizes, then
+ * the contents of gf->allocated.  The order and sizes written here must
+ * stay in step with loadGenoFind(), which reads them back the same way. */
+{
+// write out the parameters
+mustWrite(f, &gf->maxPat, sizeof(gf->maxPat));
+mustWrite(f, &gf->minMatch, sizeof(gf->minMatch));
+mustWrite(f, &gf->maxGap, sizeof(gf->maxGap));
+mustWrite(f, &gf->tileSize, sizeof(gf->tileSize));
+mustWrite(f, &gf->stepSize, sizeof(gf->stepSize));
+mustWrite(f, &gf->tileSpaceSize, sizeof(gf->tileSpaceSize));
+mustWrite(f, &gf->tileMask, sizeof(gf->tileMask));
+mustWrite(f, &gf->sourceCount, sizeof(gf->sourceCount));
+mustWrite(f, &gf->isPep, sizeof(gf->isPep));
+mustWrite(f, &gf->allowOneMismatch, sizeof(gf->allowOneMismatch));
+mustWrite(f, &gf->segSize, sizeof(gf->segSize));
+mustWrite(f, &gf->totalSeqSize, sizeof(gf->totalSeqSize));
+// now write out the variable-size arrays. The ones we need to
+// keep are listSizes and allocated--endLists/lists are generated
+// at load time, and in fact *must* be as they are
+// pointer-to-pointers which cannot be mmapped properly.
+
+// sources: length = gf->sourceCount
+int i;
+for (i = 0; i < gf->sourceCount; i++)
+    {
+    struct gfSeqSource *ss = gf->sources + i;
+    size_t fileNameLen = ss->fileName ? strlen(ss->fileName) + 1 : 0;
+    mustWrite(f, &fileNameLen, sizeof(fileNameLen));
+    if (fileNameLen != 0)
+        mustWrite(f, ss->fileName, fileNameLen);
+    mustWrite(f, &ss->start, sizeof(bits32));
+    mustWrite(f, &ss->end, sizeof(bits32));
+    // no masking information written/read yet.
+    }
+// listSizes: length = gf->tileSpaceSize
+mustWrite(f, gf->listSizes, gf->tileSpaceSize * sizeof(gf->listSizes[0]));
+
+if (gf->segSize == 0)
+    {
+    // use lists: only tiles below maxPat are counted, same test as loadGenoFind()
+    size_t count = 0;
+    for (i = 0; i < gf->tileSpaceSize; i++)
+        {
+        if (gf->listSizes[i] < gf->maxPat)
+            count += gf->listSizes[i];
+        }
+    mustWrite(f, gf->allocated, count*sizeof(bits32));
+    }
+else
+    {
+    // use endLists: three bits16 values are written per list entry
+    size_t count = 0;
+    for (i = 0; i < gf->tileSpaceSize; i++)
+        count += gf->listSizes[i];
+    mustWrite(f, gf->allocated, 3*count*sizeof(bits16));
+    }
+}
+
+static void writeGenoFindIndex(struct genoFind *gf, struct genoFind *transGf[2][3], char *fileName)
+/* Write index to a file that can later be mapped by loadGenoFindIndex().
+ * Only one of gf or transGf is used, selected by the global doTrans. */
+{
+// create in atomic manner so we don't end up with partial index
+char fileNameTmp[PATH_LEN];
+safef(fileNameTmp, sizeof(fileNameTmp), "%s.%s.%d.tmp", fileName, getHost(), getpid());
+unlink(fileNameTmp);
+
+FILE *f = mustOpen(fileNameTmp, "w");
+
+mustWrite(f, indexMagic, sizeof(indexMagic));
+mustWrite(f, &doTrans, sizeof(doTrans));
+
+if (doTrans)
+    {
+    int i, j;
+    for (i = 0; i < 2; i++)
+        for (j = 0; j < 3; j++)
+            writeGenoFind(transGf[i][j], f);
+    }
+else
+    {
+    writeGenoFind(gf, f);
+    }
+
+carefulClose(&f);
+mustRename(fileNameTmp, fileName);
+}
+
+static struct genoFind *loadGenoFind(FILE *f, void *memMapped)
+/* Construct one genoFind from the index file.  f is read from its current
+ * position and left just past this structure.  memMapped is the start of
+ * a mapping of the same file; listSizes and allocated are set to point
+ * into it rather than copied.  lists/endLists are rebuilt here. */
+{
+struct genoFind *gf;
+char *mapBase = memMapped;  /* byte pointer: arithmetic on void * is not standard C */
+AllocVar(gf);
+
+// read the parameters
+mustRead(f, &gf->maxPat, sizeof(gf->maxPat));
+mustRead(f, &gf->minMatch, sizeof(gf->minMatch));
+mustRead(f, &gf->maxGap, sizeof(gf->maxGap));
+mustRead(f, &gf->tileSize, sizeof(gf->tileSize));
+mustRead(f, &gf->stepSize, sizeof(gf->stepSize));
+mustRead(f, &gf->tileSpaceSize, sizeof(gf->tileSpaceSize));
+mustRead(f, &gf->tileMask, sizeof(gf->tileMask));
+mustRead(f, &gf->sourceCount, sizeof(gf->sourceCount));
+mustRead(f, &gf->isPep, sizeof(gf->isPep));
+mustRead(f, &gf->allowOneMismatch, sizeof(gf->allowOneMismatch));
+mustRead(f, &gf->segSize, sizeof(gf->segSize));
+mustRead(f, &gf->totalSeqSize, sizeof(gf->totalSeqSize));
+
+// sources: length = gf->sourceCount
+// zeroed so that fields not stored in the file (and fileName when its
+// stored length is 0) read back as NULL/0 instead of uninitialized memory
+gf->sources = needLargeZeroedMem(gf->sourceCount * sizeof(struct gfSeqSource));
+int i;
+for (i = 0; i < gf->sourceCount; i++)
+    {
+    struct gfSeqSource *ss = gf->sources + i;
+    size_t fileNameLen;
+    mustRead(f, &fileNameLen, sizeof(fileNameLen));
+    if (fileNameLen != 0)
+        {
+        ss->fileName = needMem(fileNameLen);  // checked allocation, unlike bare malloc
+        mustRead(f, ss->fileName, fileNameLen);
+        }
+    mustRead(f, &ss->start, sizeof(bits32));
+    mustRead(f, &ss->end, sizeof(bits32));
+    // no seq information written/read
+    // no masking information written/read
+    }
+
+// listSizes: length = (gf->tileSpaceSize)
+// NOTE(review): the file offset reached here depends on the header and
+// file name lengths, so these pointers into the mapping are not guaranteed
+// to be aligned for their element types -- confirm on target platforms.
+gf->listSizes = (void *)(mapBase + ftell(f));
+mustSeek(f, (gf->tileSpaceSize * sizeof(gf->listSizes[0])), SEEK_CUR);
+gf->allocated = (void *)(mapBase + ftell(f));
+if (gf->segSize == 0)
+    {
+    // use lists
+    gf->lists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->lists[0]));
+    bits32 *cur = gf->allocated;
+    size_t count = 0;
+    for (i = 0; i < gf->tileSpaceSize; i++)
+        {
+        if (gf->listSizes[i] < gf->maxPat)
+            {
+            gf->lists[i] = cur;
+            cur += gf->listSizes[i];
+            count += gf->listSizes[i];
+            }
+        }
+    mustSeek(f, count*sizeof(bits32), SEEK_CUR);
+    }
+else
+    {
+    // use endLists
+    gf->endLists = needHugeZeroedMem(gf->tileSpaceSize * sizeof(gf->endLists[0]));
+    bits16 *cur = gf->allocated;
+    size_t count = 0;
+    for (i = 0; i < gf->tileSpaceSize; i++)
+        {
+        gf->endLists[i] = cur;
+        cur += 3 * gf->listSizes[i];
+        count += gf->listSizes[i];
+        }
+    mustSeek(f, 3*count*sizeof(bits16), SEEK_CUR);
+    }
+return gf;
+}
+
+void loadGenoFindIndex(char *fileName, struct genoFind **gfRet, struct genoFind *transGf[2][3])
+/* Load indexes from a file written by writeGenoFindIndex().  Only one of
+ * gfRet or transGf is set, according to the global doTrans.  The file is
+ * mapped read-only and the mapping is not released here, since the
+ * structures built by loadGenoFind() point into it. */
+{
+FILE *f = mustOpen(fileName, "r");
+char fileMagic[sizeof(indexMagic) + 1];
+mustRead(f, fileMagic, sizeof(indexMagic));
+fileMagic[sizeof(indexMagic)] = '\0';
+if (strcmp(fileMagic, indexMagic))
+    errAbort("wrong magic string for index file");
+boolean isTrans;
+mustRead(f, &isTrans, sizeof(isTrans));
+if (doTrans != isTrans)
+    errAbort("index file isTrans==%d and -trans==%d", isTrans, doTrans);
+void *memMapped = mmap(NULL, fileSize(fileName), PROT_READ, MAP_SHARED, fileno(f), 0);
+if (memMapped == MAP_FAILED)
+    errnoAbort("mmap of index file failed");
+
+
+if (doTrans)
+    {
+    int i, j;
+    for (i = 0; i < 2; i++)
+        for (j = 0; j < 3; j++)
+            transGf[i][j] = loadGenoFind(f, memMapped);
+    }
+else
+    {
+    *gfRet = loadGenoFind(f, memMapped);
+    }
+carefulClose(&f);
+}
+
 void startServer(char *hostName, char *portName, int fileCount, char *seqFiles[])
 /* Load up index and hang out in RAM. */
 {
 struct genoFind *gf = NULL;
-static struct genoFind *transGf[2][3];
+struct genoFind *transGf[2][3] = {{NULL, NULL, NULL}, {NULL, NULL, NULL}};
 char buf[256];
 char *line, *command;
 struct sockaddr_in6 fromAddr;
 socklen_t fromLen;
 int readSize;
 int socketHandle = 0, connectionHandle = 0;
 int port = atoi(portName);
 time_t curtime;
 struct tm *loctime;
 char timestr[256];
 
 netBlockBrokenPipes();
 
 curtime = time (NULL); /* Get the current time. */
 loctime = localtime (&curtime); /* Convert it to local time representation. */
 strftime (timestr, sizeof(timestr), "%Y-%m-%d %H:%M", loctime); /* formate datetime as string */
 
 logInfo("gfServer version %s on host %s, port %s (%s)", gfVersion, hostName, portName, timestr);
 struct hash *perSeqMaxHash = maybePerSeqMax(fileCount, seqFiles);
+
+time_t startIndexTime = clock1000();
+if (writeIndex || indexFile == NULL)
+    {
 if (doTrans)
     {
     uglyf("starting translated server...\n");
     logInfo("setting up translated index");
     gfIndexTransNibsAndTwoBits(transGf, fileCount, seqFiles,
         minMatch, maxGap, tileSize, repMatch, NULL, allowOneMismatch,
         doMask, stepSize, noSimpRepMask);
     }
 else
     {
     uglyf("starting untranslated server...\n");
     logInfo("setting up untranslated index");
     gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch,
-        maxGap, tileSize, repMatch, NULL, allowOneMismatch, stepSize, noSimpRepMask);
+        maxGap, tileSize, repMatch, NULL, allowOneMismatch,
+        stepSize, noSimpRepMask);
+    }
+    logInfo("indexing building complete in %4.3f seconds", 0.001 * (clock1000() - startIndexTime));
+    if (writeIndex)
+        {
+        writeGenoFindIndex(gf, transGf, indexFile);
+        logInfo("index file built, exiting: %s", indexFile);
+        exit(0);
+        }
     }
+else
+    {
+    loadGenoFindIndex(indexFile, &gf, transGf);
+    logInfo("indexing loading complete in %4.3f seconds", 0.001 * (clock1000() - startIndexTime));
     }
-logInfo("indexing complete");
-
 /* Set up socket. Get ready to listen to it. */
 socketHandle = netAcceptingSocket(port, 100);
 if (socketHandle < 0)
     errAbort("Fatal Error: Unable to open listening socket on port %d.", port);
 
 logInfo("Server ready for queries!");
 printf("Server ready for queries!\n");
 int connectFailCount = 0;
 for (;;)
     {
     ZeroVar(&fromAddr);
     fromLen = sizeof(fromAddr);
     connectionHandle = accept(socketHandle, (struct sockaddr*)&fromAddr, &fromLen);
     if (connectionHandle < 0)
         {
@@ -1010,30 +1252,34 @@
     stepSize = optionInt("stepSize", tileSize);
 if (optionExists("repMatch"))
     repMatch = optionInt("repMatch", 0);
 else
     repMatch = gfDefaultRepMatch(tileSize, stepSize, doTrans);
 minMatch = optionInt("minMatch", minMatch);
 maxDnaHits = optionInt("maxDnaHits", maxDnaHits);
 maxTransHits = optionInt("maxTransHits", maxTransHits);
 maxNtSize = optionInt("maxNtSize", maxNtSize);
 maxAaSize = optionInt("maxAaSize", maxAaSize);
 seqLog = optionExists("seqLog");
 ipLog = optionExists("ipLog");
 doMask = optionExists("mask");
 canStop = optionExists("canStop");
 noSimpRepMask = optionExists("noSimpRepMask");
+writeIndex = optionExists("writeIndex");
+indexFile = optionVal("indexFile", NULL);
+if (writeIndex && (indexFile == NULL))
+    errAbort("-writeIndex options requires -indexFile");
 if (argc < 2)
     usage();
 if (optionExists("log"))
     logOpenFile(argv[0], optionVal("log", NULL));
 if (optionExists("syslog"))
     logOpenSyslog(argv[0], optionVal("logFacility", NULL));
 if (optionExists("debugLog"))
     logSetMinPriority("debug");
 
 if (sameWord(command, "direct"))
     {
     if (argc < 4)
         usage();
     genoFindDirect(argv[2], argc-3, argv+3);
     }