9186d6410c3266a7069c81e642c9ea22dc981191 Merge parents bff6869 98c0054 markd Sun Oct 25 20:12:47 2020 -0700 merged master diff --cc src/gfServer/gfServer.c index 8b11e35,32a299c..88a6a98 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@@ -34,54 -33,56 +34,58 @@@ {"maxDnaHits", OPTION_INT}, {"maxGap", OPTION_INT}, {"maxNtSize", OPTION_INT}, {"maxTransHits", OPTION_INT}, {"minMatch", OPTION_INT}, {"repMatch", OPTION_INT}, {"seqLog", OPTION_BOOLEAN}, {"ipLog", OPTION_BOOLEAN}, {"debugLog", OPTION_BOOLEAN}, {"stepSize", OPTION_INT}, {"tileSize", OPTION_INT}, {"trans", OPTION_BOOLEAN}, {"syslog", OPTION_BOOLEAN}, {"perSeqMax", OPTION_STRING}, {"noSimpRepMask", OPTION_BOOLEAN}, + {"indexFile", OPTION_STRING}, + {"timeout", OPTION_INT}, {NULL, 0} }; int maxNtSize = 40000; int maxAaSize = 8000; int minMatch = gfMinMatch; /* Can be overridden from command line. */ int tileSize = gfTileSize; /* Can be overridden from command line. */ int stepSize = 0; /* Can be overridden from command line. */ boolean doTrans = FALSE; /* Do translation? */ boolean allowOneMismatch = FALSE; boolean noSimpRepMask = FALSE; int repMatch = 1024; /* Can be overridden from command line. */ int maxDnaHits = 100; /* Can be overridden from command line. */ int maxTransHits = 200; /* Can be overridden from command line. */ int maxGap = gfMaxGap; boolean seqLog = FALSE; boolean ipLog = FALSE; boolean doMask = FALSE; boolean canStop = FALSE; +char *indexFile = NULL; + int timeout = 90; // default timeout in seconds + + void usage() /* Explain usage and exit. */ { errAbort( "gfServer v %s - Make a server to quickly find where DNA occurs in genome\n" " To set up a server:\n" " gfServer start host port file(s)\n" " where the files are .2bit or .nib format files specified relative to the current directory\n" " To remove a server:\n" " gfServer stop host port\n" " To query a server with DNA sequence:\n" " gfServer query host port probe.fa\n" " To query a server with protein sequence:\n" " gfServer protQuery host port probe.fa\n" " To query a server with translated DNA sequence:\n" @@@ -136,38 -115,36 +140,40 @@@ " -logFacility=facility Log to the specified syslog facility - default local0.\n" " -mask Use masking from .2bit file.\n" " -repMatch=N Number of occurrences of a tile (n-mer) that triggers repeat masking the\n" " tile. Default is %d.\n" " -noSimpRepMask Suppresses simple repeat masking.\n" " -maxDnaHits=N Maximum number of hits for a DNA query that are sent from the server.\n" " Default is %d.\n" " -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n" " Default is %d.\n" " -maxNtSize=N Maximum size of untranslated DNA query sequence.\n" " Default is %d.\n" " -maxAaSize=N Maximum size of protein or translated DNA queries.\n" " Default is %d.\n" " -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n" " -maxDnaHits will be applied to each filename[:seq] separately: each may\n" - " have at most maxDnaHits/2 hits.\n" + " have at most maxDnaHits/2 hits. The filename MUST not include the directory.\n" " Useful for assemblies with many alternate/patch sequences.\n" " -canStop If set, a quit message will actually take down the server.\n" + " -indexFile Index file create by `gfServer index'. Saving index can speed up\n" + " gfServer startup by two orders of magnitude. The parameters must\n" + " exactly match the parameters when the file is written or bad things\n" + " will happen.\n" - , gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize + " -timeout=N Timeout in seconds.\n" + " Default is %d.\n" + , gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize, timeout ); } /* Note about file(s) specified in the start command: The path(s) specified here are sent back exactly as-is to clients such as gfClient, hgBlat, webBlat. It is intended that relative paths are used. Absolute paths starting with '/' tend not to work unless the client is on the same machine as the server. For use with hgBlat and webBlat, cd to the directory where the file is and use the plain file name with no slashes. hgBlat will append the path(s) given to dbDb.nibPath. webBlat will append the path(s) given to path specified in webBlat.cfg. gfClient will append the path(s) given to the seqDir path specified. @@@ -297,44 -303,43 +333,44 @@@ ((perSeqCount = hashIntValDefault(perSeqMaxHash, ss->fileName, -1)) >= 0)) { if (perSeqCount >= (maxDnaHits / 2)) break; hashIncInt(perSeqMaxHash, ss->fileName); } else if (--limit < 0) break; } gfClumpFreeList(&clumpList); lmCleanup(&lm); logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount); } void transQuery(struct genoFind *transGf[2][3], aaSeq *seq, - int connectionHandle, char buf[256]) + int connectionHandle) /* Handle a query for protein/translated DNA match. */ { +char buf[256]; struct gfClump *clumps[3], *clump; int isRc, frame; char strand; struct dyString *dy = newDyString(1024); struct gfHit *hit; int clumpCount = 0, hitCount = 0, oneHit; struct lm *lm = lmInit(0); sprintf(buf, "tileSize %d", tileSize); - netSendString(connectionHandle, buf); + errSendString(connectionHandle, buf); for (frame = 0; frame < 3; ++frame) clumps[frame] = NULL; for (isRc = 0; isRc <= 1; ++isRc) { strand = (isRc ? '-' : '+'); gfTransFindClumps(transGf[isRc], seq, clumps, lm, &oneHit); hitCount += oneHit; for (frame = 0; frame < 3; ++frame) { int limit = maxTransHits; for (clump = clumps[frame]; clump != NULL; clump = clump->next) { struct gfSeqSource *ss = clump->target; sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d\t%c\t%d", clump->qStart, clump->qEnd, ss->fileName, @@@ -348,44 -353,43 +384,44 @@@ ++clumpCount; if (--limit < 0) break; } gfClumpFreeList(&clumps[frame]); } } if (clumpCount == 0) ++missCount; freeDyString(&dy); lmCleanup(&lm); logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount); } void transTransQuery(struct genoFind *transGf[2][3], struct dnaSeq *seq, - int connectionHandle, char buf[256]) + int connectionHandle) /* Handle a query for protein/translated DNA match. */ { +char buf[256]; struct gfClump *clumps[3][3], *clump; int isRc, qFrame, tFrame; char strand; struct trans3 *t3 = trans3New(seq); struct dyString *dy = newDyString(1024); struct gfHit *hit; int clumpCount = 0, hitCount = 0, oneCount; sprintf(buf, "tileSize %d", tileSize); - netSendString(connectionHandle, buf); + errSendString(connectionHandle, buf); for (qFrame = 0; qFrame<3; ++qFrame) for (tFrame=0; tFrame<3; ++tFrame) clumps[qFrame][tFrame] = NULL; for (isRc = 0; isRc <= 1; ++isRc) { struct lm *lm = lmInit(0); strand = (isRc ? '-' : '+'); gfTransTransFindClumps(transGf[isRc], t3->trans, clumps, lm, &oneCount); hitCount += oneCount; for (qFrame = 0; qFrame<3; ++qFrame) { for (tFrame=0; tFrame<3; ++tFrame) { int limit = maxTransHits; for (clump = clumps[qFrame][tFrame]; clump != NULL; clump = clump->next) @@@ -473,52 -477,53 +509,52 @@@ ripCord = needMem(64*1024); /* Memory for error recovery. memTrackerEnd frees */ } static void errorSafeCleanup() /* Clean up and report problem. */ { memTrackerEnd(); popAbortHandler(); // must come after memTracker } static void errorSafeCleanupMess(int connectionHandle, char *message) /* Clean up and report problem. */ { errorSafeCleanup(); logError("Recovering from error via longjmp"); - netSendString(connectionHandle, message); + errSendString(connectionHandle, message); } static void errorSafeQuery(boolean doTrans, boolean queryIsProt, - struct dnaSeq *seq, struct genoFind *gf, struct genoFind *transGf[2][3], + struct dnaSeq *seq, struct genoFindIndex *gfIdx, int connectionHandle, char *buf, struct hash *perSeqMaxHash) /* Wrap error handling code around index query. */ { int status; errorSafeSetup(); status = setjmp(gfRecover); if (status == 0) /* Always true except after long jump. */ { if (doTrans) { if (queryIsProt) - transQuery(transGf, seq, connectionHandle, buf); + transQuery(gfIdx->transGf, seq, connectionHandle); else - transTransQuery(transGf, seq, - connectionHandle, buf); + transTransQuery(gfIdx->transGf, seq, connectionHandle); } else - dnaQuery(gf, seq, connectionHandle, buf, perSeqMaxHash); + dnaQuery(gfIdx->untransGf, seq, connectionHandle, perSeqMaxHash); errorSafeCleanup(); } else /* They long jumped here because of an error. */ { errorSafeCleanupMess(connectionHandle, "Error: gfServer out of memory. Try reducing size of query."); } } static void errorSafePcr(struct genoFind *gf, char *fPrimer, char *rPrimer, int maxDistance, int connectionHandle) /* Wrap error handling around pcr index query. */ { int status; errorSafeSetup(); @@@ -616,73 -605,75 +652,75 @@@ int socketHandle = 0, connectionHandle = 0; int port = atoi(portName); time_t curtime; struct tm *loctime; char timestr[256]; netBlockBrokenPipes(); curtime = time (NULL); /* Get the current time. */ loctime = localtime (&curtime); /* Convert it to local time representation. */ strftime (timestr, sizeof(timestr), "%Y-%m-%d %H:%M", loctime); /* formate datetime as string */ logInfo("gfServer version %s on host %s, port %s (%s)", gfVersion, hostName, portName, timestr); struct hash *perSeqMaxHash = maybePerSeqMax(fileCount, seqFiles); -if (doTrans) + +time_t startIndexTime = clock1000(); +if (indexFile == NULL) { - uglyf("starting translated server...\n"); - logInfo("setting up translated index"); - gfIndexTransNibsAndTwoBits(transGf, fileCount, seqFiles, - minMatch, maxGap, tileSize, repMatch, NULL, allowOneMismatch, - doMask, stepSize, noSimpRepMask); + char *desc = doTrans ? "translated" : "untranslated"; + uglyf("starting %s server...\n", desc); + logInfo("setting up %s index", desc); + gfIdx = genoFindIndexBuild(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, doTrans, NULL, + allowOneMismatch, doMask, stepSize, noSimpRepMask); + logInfo("index building completed in %4.3f seconds", 0.001 * (clock1000() - startIndexTime)); } else { - uglyf("starting untranslated server...\n"); - logInfo("setting up untranslated index"); - gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, - maxGap, tileSize, repMatch, NULL, allowOneMismatch, stepSize, noSimpRepMask); + gfIdx = genoFindIndexLoad(indexFile, doTrans); + logInfo("index loading completed in %4.3f seconds", 0.001 * (clock1000() - startIndexTime)); } -logInfo("indexing complete"); /* Set up socket. Get ready to listen to it. */ socketHandle = netAcceptingSocket(port, 100); if (socketHandle < 0) errAbort("Fatal Error: Unable to open listening socket on port %d.", port); logInfo("Server ready for queries!"); printf("Server ready for queries!\n"); int connectFailCount = 0; for (;;) { ZeroVar(&fromAddr); fromLen = sizeof(fromAddr); connectionHandle = accept(socketHandle, (struct sockaddr*)&fromAddr, &fromLen); + setSendOk(); if (connectionHandle < 0) { warn("Error accepting the connection"); ++warnCount; ++connectFailCount; if (connectFailCount >= 100) errAbort("100 continuous connection failures, no point in filling up the log in an infinite loop."); continue; } else { connectFailCount = 0; } + setSocketTimeout(connectionHandle, timeout); if (ipLog) { struct sockaddr_in6 clientAddr; unsigned int addrlen=sizeof(clientAddr); getpeername(connectionHandle, (struct sockaddr *)&clientAddr, &addrlen); char ipStr[NI_MAXHOST]; getAddrAsString6n4((struct sockaddr_storage *)&clientAddr, ipStr, sizeof ipStr); logInfo("gfServer version %s on host %s, port %s connection from %s", gfVersion, hostName, portName, ipStr); } readSize = read(connectionHandle, buf, sizeof(buf)-1); if (readSize < 0) { warn("Error reading from socket: %s", strerror(errno)); ++warnCount; @@@ -806,79 -797,79 +844,79 @@@ { ++trimCount; seq.size = maxSize; seq.dna[maxSize] = 0; } if (queryIsProt) aaCount += seq.size; else baseCount += seq.size; if (seqLog && (logGetFile() != NULL)) { FILE *lf = logGetFile(); faWriteNext(lf, "query", seq.dna, seq.size); fflush(lf); } - errorSafeQuery(doTrans, queryIsProt, &seq, gf, - transGf, connectionHandle, buf, perSeqMaxHash); + errorSafeQuery(doTrans, queryIsProt, &seq, gfIdx, + connectionHandle, buf, perSeqMaxHash); if (perSeqMaxHash) hashZeroVals(perSeqMaxHash); } freez(&seq.dna); } - netSendString(connectionHandle, "end"); + errSendString(connectionHandle, "end"); } } } } else if (sameString("pcr", command)) { char *f = nextWord(&line); char *r = nextWord(&line); char *s = nextWord(&line); int maxDistance; ++pcrCount; if (s == NULL || !isdigit(s[0])) { warn("Badly formatted pcr command"); ++warnCount; } else if (doTrans) { warn("Can't pcr on translated server"); ++warnCount; } else if (badPcrPrimerSeq(f) || badPcrPrimerSeq(r)) { warn("Can only handle ACGT in primer sequences."); ++warnCount; } else { maxDistance = atoi(s); - errorSafePcr(gf, f, r, maxDistance, connectionHandle); + errorSafePcr(gfIdx->untransGf, f, r, maxDistance, connectionHandle); } } else if (sameString("files", command)) { int i; sprintf(buf, "%d", fileCount); - netSendString(connectionHandle, buf); + errSendString(connectionHandle, buf); for (i=0; i<fileCount; ++i) { sprintf(buf, "%s", seqFiles[i]); - netSendString(connectionHandle, buf); + errSendString(connectionHandle, buf); } } else { warn("Unknown command %s", command); ++warnCount; } close(connectionHandle); connectionHandle = 0; } close(socketHandle); } void stopServer(char *hostName, char *portName) /* Send stop message to server. */ @@@ -1256,32 -1048,31 +1294,32 @@@ stepSize = optionInt("stepSize", tileSize); if (optionExists("repMatch")) repMatch = optionInt("repMatch", 0); else repMatch = gfDefaultRepMatch(tileSize, stepSize, doTrans); minMatch = optionInt("minMatch", minMatch); maxDnaHits = optionInt("maxDnaHits", maxDnaHits); maxTransHits = optionInt("maxTransHits", maxTransHits); maxNtSize = optionInt("maxNtSize", maxNtSize); maxAaSize = optionInt("maxAaSize", maxAaSize); seqLog = optionExists("seqLog"); ipLog = optionExists("ipLog"); doMask = optionExists("mask"); canStop = optionExists("canStop"); noSimpRepMask = optionExists("noSimpRepMask"); +indexFile = optionVal("indexFile", NULL); - + timeout = optionInt("timeout", timeout); if (argc < 2) usage(); if (optionExists("log")) logOpenFile(argv[0], optionVal("log", NULL)); if (optionExists("syslog")) logOpenSyslog(argv[0], optionVal("logFacility", NULL)); if (optionExists("debugLog")) logSetMinPriority("debug"); if (sameWord(command, "direct")) { if (argc < 4) usage(); genoFindDirect(argv[2], argc-3, argv+3); }