ee8ef81c6d82157859f16548a27726f4b7421891 markd Mon Jul 6 19:22:14 2020 -0700 added support for perSeqMax in dynamic server diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c index 7c8a626..4462075 100644 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@ -1,1323 +1,1343 @@ /* gfServer - set up an index of the genome in memory and * respond to search requests. */ /* Copyright 2001-2003 Jim Kent. All rights reserved. */ #include "common.h" #include #include #include #include #include #include "portable.h" #include "net.h" #include "dnautil.h" #include "dnaseq.h" #include "nib.h" #include "twoBit.h" #include "fa.h" #include "dystring.h" #include "errAbort.h" #include "memalloc.h" #include "genoFind.h" #include "options.h" #include "trans3.h" #include "log.h" #include "internet.h" #include "hash.h" static struct optionSpec optionSpecs[] = { {"canStop", OPTION_BOOLEAN}, {"log", OPTION_STRING}, {"logFacility", OPTION_STRING}, {"mask", OPTION_BOOLEAN}, {"maxAaSize", OPTION_INT}, {"maxDnaHits", OPTION_INT}, {"maxGap", OPTION_INT}, {"maxNtSize", OPTION_INT}, {"maxTransHits", OPTION_INT}, {"minMatch", OPTION_INT}, {"repMatch", OPTION_INT}, {"seqLog", OPTION_BOOLEAN}, {"ipLog", OPTION_BOOLEAN}, {"debugLog", OPTION_BOOLEAN}, {"stepSize", OPTION_INT}, {"tileSize", OPTION_INT}, {"trans", OPTION_BOOLEAN}, {"syslog", OPTION_BOOLEAN}, {"perSeqMax", OPTION_STRING}, {"noSimpRepMask", OPTION_BOOLEAN}, {"indexFile", OPTION_STRING}, {NULL, 0} }; int maxNtSize = 40000; int maxAaSize = 8000; int minMatch = gfMinMatch; /* Can be overridden from command line. */ int tileSize = gfTileSize; /* Can be overridden from command line. */ int stepSize = 0; /* Can be overridden from command line. */ boolean doTrans = FALSE; /* Do translation? */ boolean allowOneMismatch = FALSE; boolean noSimpRepMask = FALSE; int repMatch = 1024; /* Can be overridden from command line. */ int maxDnaHits = 100; /* Can be overridden from command line. */ int maxTransHits = 200; /* Can be overridden from command line. */ int maxGap = gfMaxGap; boolean seqLog = FALSE; boolean ipLog = FALSE; boolean doMask = FALSE; boolean canStop = FALSE; char *indexFile = NULL; void usage() /* Explain usage and exit. */ { errAbort( "gfServer v %s - Make a server to quickly find where DNA occurs in genome\n" " To set up a server:\n" " gfServer start host port file(s)\n" " where the files are .2bit or .nib format files specified relative to the current directory\n" " To remove a server:\n" " gfServer stop host port\n" " To query a server with DNA sequence:\n" " gfServer query host port probe.fa\n" " To query a server with protein sequence:\n" " gfServer protQuery host port probe.fa\n" " To query a server with translated DNA sequence:\n" " gfServer transQuery host port probe.fa\n" " To query server with PCR primers:\n" " gfServer pcr host port fPrimer rPrimer maxDistance\n" " To process one probe fa file against a .2bit format genome (not starting server):\n" " gfServer direct probe.fa file(s).2bit\n" " To test PCR without starting server:\n" " gfServer pcrDirect fPrimer rPrimer file(s).2bit\n" " To figure out usage level:\n" " gfServer status host port\n" " To get input file list:\n" " gfServer files host port\n" " To generate a precomputed index:\n" " gfServer index gfidx file(s)\n" " where the files are .2bit or .nib format files. Separate indexes must be created\n" " for untranslated and translated queries. These can be used with a persistent server\n" " as with 'start -indexFile or a dynamic server. They must follow the naming convention for\n" " for dynamic servers.\n" " To run a dynamic server (usually called by xinet):\n" " gfServer dynserver rootdir\n" " The root directory must contain directories for each genome with the twobit and index\n" " files following the convention:\n" " $rootdir/$genome/$genome.2bit\n" " $rootdir/$genome/$genome.untrans.gfidx\n" " $rootdir/$genome/$genome.trans.gfidx\n" - " Both indexes must exist.\n" + " The -perSeqMax functionality can be implemented by creating a file\n" + " $rootdir/$genome/$genome.perseqmax\n" + " Note: the dynamic, the file names in the perseqmax file MUST NOT contain directories.\n" "\n" "options:\n" " -tileSize=N Size of n-mers to index. Default is 11 for nucleotides, 4 for\n" " proteins (or translated nucleotides).\n" " -stepSize=N Spacing between tiles. Default is tileSize.\n" " -minMatch=N Number of n-mer matches that trigger detailed alignment.\n" " Default is 2 for nucleotides, 3 for proteins.\n" " -maxGap=N Number of insertions or deletions allowed between n-mers.\n" " Default is 2 for nucleotides, 0 for proteins.\n" " -trans Translate database to protein in 6 frames. Note: it is best\n" " to run this on RepeatMasked data in this case.\n" " -log=logFile Keep a log file that records server requests.\n" " -seqLog Include sequences in log file (not logged with -syslog).\n" " -ipLog Include user's IP in log file (not logged with -syslog).\n" " -debugLog Include debugging info in log file.\n" " -syslog Log to syslog.\n" " -logFacility=facility Log to the specified syslog facility - default local0.\n" " -mask Use masking from .2bit file.\n" " -repMatch=N Number of occurrences of a tile (n-mer) that triggers repeat masking the\n" " tile. Default is %d.\n" " -noSimpRepMask Suppresses simple repeat masking.\n" " -maxDnaHits=N Maximum number of hits for a DNA query that are sent from the server.\n" " Default is %d.\n" " -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n" " Default is %d.\n" " -maxNtSize=N Maximum size of untranslated DNA query sequence.\n" " Default is %d.\n" " -maxAaSize=N Maximum size of protein or translated DNA queries.\n" " Default is %d.\n" " -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n" " -maxDnaHits will be applied to each filename[:seq] separately: each may\n" " have at most maxDnaHits/2 hits.\n" " Useful for assemblies with many alternate/patch sequences.\n" " -canStop If set, a quit message will actually take down the server.\n" " -indexFile Index file create by `gfServer index'. Saving index can speed up\n" " gfServer startup by two orders of magnitude. The parameters must\n" " exactly match the parameters when the file is written or bad things\n" " will happen.\n" , gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize ); } /* Note about file(s) specified in the start command: The path(s) specified here are sent back exactly as-is to clients such as gfClient, hgBlat, webBlat. It is intended that relative paths are used. Absolute paths starting with '/' tend not to work unless the client is on the same machine as the server. For use with hgBlat and webBlat, cd to the directory where the file is and use the plain file name with no slashes. hgBlat will append the path(s) given to dbDb.nibPath. webBlat will append the path(s) given to path specified in webBlat.cfg. gfClient will append the path(s) given to the seqDir path specified. */ void genoFindDirect(char *probeName, int fileCount, char *seqFiles[]) /* Don't set up server - just directly look for matches. */ { struct genoFind *gf = NULL; struct lineFile *lf = lineFileOpen(probeName, TRUE); struct dnaSeq seq; int hitCount = 0, clumpCount = 0, oneHit; ZeroVar(&seq); if (doTrans) errAbort("Don't support translated direct stuff currently, sorry"); gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, FALSE, allowOneMismatch, stepSize, noSimpRepMask); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { struct lm *lm = lmInit(0); struct gfClump *clumpList = gfFindClumps(gf, &seq, lm, &oneHit), *clump; hitCount += oneHit; for (clump = clumpList; clump != NULL; clump = clump->next) { ++clumpCount; printf("%s ", seq.name); gfClumpDump(gf, clump, stdout); } gfClumpFreeList(&clumpList); lmCleanup(&lm); } lineFileClose(&lf); genoFindFree(&gf); } void genoPcrDirect(char *fPrimer, char *rPrimer, int fileCount, char *seqFiles[]) /* Do direct PCR for testing purposes. */ { struct genoFind *gf = NULL; int fPrimerSize = strlen(fPrimer); int rPrimerSize = strlen(rPrimer); struct gfClump *clumpList, *clump; time_t startTime, endTime; startTime = clock1000(); gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, FALSE, allowOneMismatch, stepSize, noSimpRepMask); endTime = clock1000(); printf("Index built in %4.3f seconds\n", 0.001 * (endTime - startTime)); printf("plus strand:\n"); startTime = clock1000(); clumpList = gfPcrClumps(gf, fPrimer, fPrimerSize, rPrimer, rPrimerSize, 0, 4*1024); endTime = clock1000(); printf("Index searched in %4.3f seconds\n", 0.001 * (endTime - startTime)); for (clump = clumpList; clump != NULL; clump = clump->next) { /* Clumps from gfPcrClumps have already had target->start subtracted out * of their coords, but gfClumpDump assumes they have not and does the * subtraction; rather than write a new gfClumpDump, tweak here: */ clump->tStart += clump->target->start; clump->tEnd += clump->target->start; gfClumpDump(gf, clump, stdout); } printf("minus strand:\n"); startTime = clock1000(); clumpList = gfPcrClumps(gf, rPrimer, rPrimerSize, fPrimer, fPrimerSize, 0, 4*1024); endTime = clock1000(); printf("Index searched in %4.3f seconds\n", 0.001 * (endTime - startTime)); for (clump = clumpList; clump != NULL; clump = clump->next) { /* Same as above, tweak before gfClumpDump: */ clump->tStart += clump->target->start; clump->tEnd += clump->target->start; gfClumpDump(gf, clump, stdout); } genoFindFree(&gf); } int getPortIx(char *portName) /* Convert from ascii to integer. */ { if (!isdigit(portName[0])) errAbort("Expecting a port number got %s", portName); return atoi(portName); } /* Some variables to gather statistics on usage. */ long baseCount = 0, blatCount = 0, aaCount = 0, pcrCount = 0; int warnCount = 0; int noSigCount = 0; int missCount = 0; int trimCount = 0; void dnaQuery(struct genoFind *gf, struct dnaSeq *seq, int connectionHandle, struct hash *perSeqMaxHash) /* Handle a query for DNA/DNA match. */ { char buf[256]; struct gfClump *clumpList = NULL, *clump; int limit = 1000; int clumpCount = 0, hitCount = -1; struct lm *lm = lmInit(0); if (seq->size > gf->tileSize + gf->stepSize + gf->stepSize) limit = maxDnaHits; clumpList = gfFindClumps(gf, seq, lm, &hitCount); if (clumpList == NULL) ++missCount; for (clump = clumpList; clump != NULL; clump = clump->next) { struct gfSeqSource *ss = clump->target; sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d", clump->qStart, clump->qEnd, ss->fileName, clump->tStart-ss->start, clump->tEnd-ss->start, clump->hitCount); netSendString(connectionHandle, buf); ++clumpCount; int perSeqCount = -1; if (perSeqMaxHash && ((perSeqCount = hashIntValDefault(perSeqMaxHash, ss->fileName, -1)) >= 0)) { if (perSeqCount >= (maxDnaHits / 2)) break; hashIncInt(perSeqMaxHash, ss->fileName); } else if (--limit < 0) break; } gfClumpFreeList(&clumpList); lmCleanup(&lm); logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount); } void transQuery(struct genoFind *transGf[2][3], aaSeq *seq, int connectionHandle) /* Handle a query for protein/translated DNA match. */ { char buf[256]; struct gfClump *clumps[3], *clump; int isRc, frame; char strand; struct dyString *dy = newDyString(1024); struct gfHit *hit; int clumpCount = 0, hitCount = 0, oneHit; struct lm *lm = lmInit(0); sprintf(buf, "tileSize %d", tileSize); netSendString(connectionHandle, buf); for (frame = 0; frame < 3; ++frame) clumps[frame] = NULL; for (isRc = 0; isRc <= 1; ++isRc) { strand = (isRc ? '-' : '+'); gfTransFindClumps(transGf[isRc], seq, clumps, lm, &oneHit); hitCount += oneHit; for (frame = 0; frame < 3; ++frame) { int limit = maxTransHits; for (clump = clumps[frame]; clump != NULL; clump = clump->next) { struct gfSeqSource *ss = clump->target; sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d\t%c\t%d", clump->qStart, clump->qEnd, ss->fileName, clump->tStart-ss->start, clump->tEnd-ss->start, clump->hitCount, strand, frame); netSendString(connectionHandle, buf); dyStringClear(dy); for (hit = clump->hitList; hit != NULL; hit = hit->next) dyStringPrintf(dy, " %d %d", hit->qStart, hit->tStart - ss->start); netSendLongString(connectionHandle, dy->string); ++clumpCount; if (--limit < 0) break; } gfClumpFreeList(&clumps[frame]); } } if (clumpCount == 0) ++missCount; freeDyString(&dy); lmCleanup(&lm); logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount); } void transTransQuery(struct genoFind *transGf[2][3], struct dnaSeq *seq, int connectionHandle) /* Handle a query for protein/translated DNA match. */ { char buf[256]; struct gfClump *clumps[3][3], *clump; int isRc, qFrame, tFrame; char strand; struct trans3 *t3 = trans3New(seq); struct dyString *dy = newDyString(1024); struct gfHit *hit; int clumpCount = 0, hitCount = 0, oneCount; sprintf(buf, "tileSize %d", tileSize); netSendString(connectionHandle, buf); for (qFrame = 0; qFrame<3; ++qFrame) for (tFrame=0; tFrame<3; ++tFrame) clumps[qFrame][tFrame] = NULL; for (isRc = 0; isRc <= 1; ++isRc) { struct lm *lm = lmInit(0); strand = (isRc ? '-' : '+'); gfTransTransFindClumps(transGf[isRc], t3->trans, clumps, lm, &oneCount); hitCount += oneCount; for (qFrame = 0; qFrame<3; ++qFrame) { for (tFrame=0; tFrame<3; ++tFrame) { int limit = maxTransHits; for (clump = clumps[qFrame][tFrame]; clump != NULL; clump = clump->next) { struct gfSeqSource *ss = clump->target; sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d\t%c\t%d\t%d", clump->qStart, clump->qEnd, ss->fileName, clump->tStart-ss->start, clump->tEnd-ss->start, clump->hitCount, strand, qFrame, tFrame); netSendString(connectionHandle, buf); dyStringClear(dy); for (hit = clump->hitList; hit != NULL; hit = hit->next) { dyStringPrintf(dy, " %d %d", hit->qStart, hit->tStart - ss->start); } netSendLongString(connectionHandle, dy->string); ++clumpCount; if (--limit < 0) break; } gfClumpFreeList(&clumps[qFrame][tFrame]); } } lmCleanup(&lm); } trans3Free(&t3); if (clumpCount == 0) ++missCount; logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount); } static void pcrQuery(struct genoFind *gf, char *fPrimer, char *rPrimer, int maxDistance, int connectionHandle) /* Do PCR query and report results down socket. */ { int fPrimerSize = strlen(fPrimer); int rPrimerSize = strlen(rPrimer); struct gfClump *clumpList, *clump; int clumpCount = 0; char buf[256]; clumpList = gfPcrClumps(gf, fPrimer, fPrimerSize, rPrimer, rPrimerSize, 0, maxDistance); for (clump = clumpList; clump != NULL; clump = clump->next) { struct gfSeqSource *ss = clump->target; safef(buf, sizeof(buf), "%s\t%d\t%d\t+", ss->fileName, clump->tStart, clump->tEnd); netSendString(connectionHandle, buf); ++clumpCount; } gfClumpFreeList(&clumpList); clumpList = gfPcrClumps(gf, rPrimer, rPrimerSize, fPrimer, fPrimerSize, 0, maxDistance); for (clump = clumpList; clump != NULL; clump = clump->next) { struct gfSeqSource *ss = clump->target; safef(buf, sizeof(buf), "%s\t%d\t%d\t-", ss->fileName, clump->tStart, clump->tEnd); netSendString(connectionHandle, buf); ++clumpCount; } gfClumpFreeList(&clumpList); netSendString(connectionHandle, "end"); logDebug("%lu PCR %s %s %d clumps\n", clock1000(), fPrimer, rPrimer, clumpCount); } static jmp_buf gfRecover; static char *ripCord = NULL; /* A little memory to give back to system * during error recovery. */ static void gfAbort() /* Abort query. */ { freez(&ripCord); longjmp(gfRecover, -1); } static void errorSafeSetup() /* Start up error safe stuff. */ { pushAbortHandler(gfAbort); // must come before memTracker memTrackerStart(); ripCord = needMem(64*1024); /* Memory for error recovery. memTrackerEnd frees */ } static void errorSafeCleanup() /* Clean up and report problem. */ { memTrackerEnd(); popAbortHandler(); // must come after memTracker } static void errorSafeCleanupMess(int connectionHandle, char *message) /* Clean up and report problem. */ { errorSafeCleanup(); logError("Recovering from error via longjmp"); netSendString(connectionHandle, message); } static void errorSafeQuery(boolean doTrans, boolean queryIsProt, struct dnaSeq *seq, struct genoFindIndex *gfIdx, int connectionHandle, char *buf, struct hash *perSeqMaxHash) /* Wrap error handling code around index query. */ { int status; errorSafeSetup(); status = setjmp(gfRecover); if (status == 0) /* Always true except after long jump. */ { if (doTrans) { if (queryIsProt) transQuery(gfIdx->transGf, seq, connectionHandle); else transTransQuery(gfIdx->transGf, seq, connectionHandle); } else dnaQuery(gfIdx->untransGf, seq, connectionHandle, perSeqMaxHash); errorSafeCleanup(); } else /* They long jumped here because of an error. */ { errorSafeCleanupMess(connectionHandle, "Error: gfServer out of memory. Try reducing size of query."); } } static void errorSafePcr(struct genoFind *gf, char *fPrimer, char *rPrimer, int maxDistance, int connectionHandle) /* Wrap error handling around pcr index query. */ { int status; errorSafeSetup(); status = setjmp(gfRecover); if (status == 0) /* Always true except after long jump. */ { pcrQuery(gf, fPrimer, rPrimer, maxDistance, connectionHandle); errorSafeCleanup(); } else /* They long jumped here because of an error. */ { errorSafeCleanupMess(connectionHandle, "Error: gfServer out of memory."); } } boolean badPcrPrimerSeq(char *s) /* Return TRUE if have a character we can't handle in sequence. */ { unsigned char c; while ((c = *s++) != 0) { if (ntVal[c] < 0) return TRUE; } return FALSE; } -static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[]) -/* If options include -perSeqMax=file, then read the sequences named in the file into a hash - * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */ -{ -struct hash *perSeqMaxHash = NULL; -char *fileName = optionVal("perSeqMax", NULL); -if (isNotEmpty(fileName)) +static struct hash *buildPerSeqMax(int fileCount, char *seqFiles[], char* perSeqMaxFile) +/* do work of building perSeqMaxhash */ { - perSeqMaxHash = hashNew(0); - struct lineFile *lf = lineFileOpen(fileName, TRUE); +struct hash *perSeqMaxHash = hashNew(0); +struct lineFile *lf = lineFileOpen(perSeqMaxFile, TRUE); char *line; while (lineFileNextReal(lf, &line)) { // Make sure line contains a valid seq filename (before optional ':seq') char *seqFile = trimSpaces(line); char copy[strlen(seqFile)+1]; safecpy(copy, sizeof copy, seqFile); char *colon = strrchr(copy, ':'); if (colon) *colon = '\0'; if (stringArrayIx(copy, seqFiles, fileCount) < 0) lineFileAbort(lf, "'%s' does not appear to be a sequence file from the " "command line", copy); hashAddInt(perSeqMaxHash, seqFile, 0); } lineFileClose(&lf); - } return perSeqMaxHash; } + +static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[]) +/* If options include -perSeqMax=file, then read the sequences named in the file into a hash + * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */ +{ +char *fileName = optionVal("perSeqMax", NULL); +if (isNotEmpty(fileName)) + return buildPerSeqMax(fileCount, seqFiles, fileName); +else + return NULL; +} + static void hashZeroVals(struct hash *hash) /* Set the value of every element of hash to NULL (0 for ints). */ { struct hashEl *hel; struct hashCookie cookie = hashFirst(hash); while ((hel = hashNext(&cookie)) != NULL) hel->val = 0; } void startServer(char *hostName, char *portName, int fileCount, char *seqFiles[]) /* Load up index and hang out in RAM. */ { struct genoFindIndex *gfIdx = NULL; char buf[256]; char *line, *command; struct sockaddr_in6 fromAddr; socklen_t fromLen; int readSize; int socketHandle = 0, connectionHandle = 0; int port = atoi(portName); time_t curtime; struct tm *loctime; char timestr[256]; netBlockBrokenPipes(); curtime = time (NULL); /* Get the current time. */ loctime = localtime (&curtime); /* Convert it to local time representation. */ strftime (timestr, sizeof(timestr), "%Y-%m-%d %H:%M", loctime); /* formate datetime as string */ logInfo("gfServer version %s on host %s, port %s (%s)", gfVersion, hostName, portName, timestr); struct hash *perSeqMaxHash = maybePerSeqMax(fileCount, seqFiles); time_t startIndexTime = clock1000(); if (indexFile == NULL) { char *desc = doTrans ? "translated" : "untranslated"; uglyf("starting %s server...\n", desc); logInfo("setting up %s index", desc); gfIdx = genoFindIndexBuild(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, doTrans, NULL, allowOneMismatch, doMask, stepSize, noSimpRepMask); logInfo("index building completed in %4.3f seconds", 0.001 * (clock1000() - startIndexTime)); } else { gfIdx = genoFindIndexLoad(indexFile, doTrans); logInfo("index loading completed in %4.3f seconds", 0.001 * (clock1000() - startIndexTime)); } /* Set up socket. Get ready to listen to it. */ socketHandle = netAcceptingSocket(port, 100); if (socketHandle < 0) errAbort("Fatal Error: Unable to open listening socket on port %d.", port); logInfo("Server ready for queries!"); printf("Server ready for queries!\n"); int connectFailCount = 0; for (;;) { ZeroVar(&fromAddr); fromLen = sizeof(fromAddr); connectionHandle = accept(socketHandle, (struct sockaddr*)&fromAddr, &fromLen); if (connectionHandle < 0) { warn("Error accepting the connection"); ++warnCount; ++connectFailCount; if (connectFailCount >= 100) errAbort("100 continuous connection failures, no point in filling up the log in an infinite loop."); continue; } else { connectFailCount = 0; } if (ipLog) { struct sockaddr_in6 clientAddr; unsigned int addrlen=sizeof(clientAddr); getpeername(connectionHandle, (struct sockaddr *)&clientAddr, &addrlen); char ipStr[NI_MAXHOST]; getAddrAsString6n4((struct sockaddr_storage *)&clientAddr, ipStr, sizeof ipStr); logInfo("gfServer version %s on host %s, port %s connection from %s", gfVersion, hostName, portName, ipStr); } readSize = read(connectionHandle, buf, sizeof(buf)-1); if (readSize < 0) { warn("Error reading from socket: %s", strerror(errno)); ++warnCount; close(connectionHandle); continue; } if (readSize == 0) { warn("Zero sized query"); ++warnCount; close(connectionHandle); continue; } buf[readSize] = 0; logDebug("%s", buf); if (!startsWith(gfSignature(), buf)) { ++noSigCount; close(connectionHandle); continue; } line = buf + strlen(gfSignature()); command = nextWord(&line); if (sameString("quit", command)) { if (canStop) break; else logError("Ignoring quit message"); } else if (sameString("status", command)) { sprintf(buf, "version %s", gfVersion); netSendString(connectionHandle, buf); sprintf(buf, "type %s", (doTrans ? "translated" : "nucleotide")); netSendString(connectionHandle, buf); sprintf(buf, "host %s", hostName); netSendString(connectionHandle, buf); sprintf(buf, "port %s", portName); netSendString(connectionHandle, buf); sprintf(buf, "tileSize %d", tileSize); netSendString(connectionHandle, buf); sprintf(buf, "stepSize %d", stepSize); netSendString(connectionHandle, buf); sprintf(buf, "minMatch %d", minMatch); netSendString(connectionHandle, buf); sprintf(buf, "pcr requests %ld", pcrCount); netSendString(connectionHandle, buf); sprintf(buf, "blat requests %ld", blatCount); netSendString(connectionHandle, buf); sprintf(buf, "bases %ld", baseCount); netSendString(connectionHandle, buf); if (doTrans) { sprintf(buf, "aa %ld", aaCount); netSendString(connectionHandle, buf); } sprintf(buf, "misses %d", missCount); netSendString(connectionHandle, buf); sprintf(buf, "noSig %d", noSigCount); netSendString(connectionHandle, buf); sprintf(buf, "trimmed %d", trimCount); netSendString(connectionHandle, buf); sprintf(buf, "warnings %d", warnCount); netSendString(connectionHandle, buf); netSendString(connectionHandle, "end"); } else if (sameString("query", command) || sameString("protQuery", command) || sameString("transQuery", command)) { boolean queryIsProt = sameString(command, "protQuery"); char *s = nextWord(&line); if (s == NULL || !isdigit(s[0])) { warn("Expecting query size after query command"); ++warnCount; } else { struct dnaSeq seq; ZeroVar(&seq); if (queryIsProt && !doTrans) { warn("protein query sent to nucleotide server"); ++warnCount; queryIsProt = FALSE; } else { buf[0] = 'Y'; if (write(connectionHandle, buf, 1) == 1) { seq.size = atoi(s); seq.name = NULL; if (seq.size > 0) { ++blatCount; seq.dna = needLargeMem(seq.size+1); if (gfReadMulti(connectionHandle, seq.dna, seq.size) != seq.size) { warn("Didn't sockRecieveString all %d bytes of query sequence", seq.size); ++warnCount; } else { int maxSize = (doTrans ? maxAaSize : maxNtSize); seq.dna[seq.size] = 0; if (queryIsProt) { seq.size = aaFilteredSize(seq.dna); aaFilter(seq.dna, seq.dna); } else { seq.size = dnaFilteredSize(seq.dna); dnaFilter(seq.dna, seq.dna); } if (seq.size > maxSize) { ++trimCount; seq.size = maxSize; seq.dna[maxSize] = 0; } if (queryIsProt) aaCount += seq.size; else baseCount += seq.size; if (seqLog && (logGetFile() != NULL)) { FILE *lf = logGetFile(); faWriteNext(lf, "query", seq.dna, seq.size); fflush(lf); } errorSafeQuery(doTrans, queryIsProt, &seq, gfIdx, connectionHandle, buf, perSeqMaxHash); if (perSeqMaxHash) hashZeroVals(perSeqMaxHash); } freez(&seq.dna); } netSendString(connectionHandle, "end"); } } } } else if (sameString("pcr", command)) { char *f = nextWord(&line); char *r = nextWord(&line); char *s = nextWord(&line); int maxDistance; ++pcrCount; if (s == NULL || !isdigit(s[0])) { warn("Badly formatted pcr command"); ++warnCount; } else if (doTrans) { warn("Can't pcr on translated server"); ++warnCount; } else if (badPcrPrimerSeq(f) || badPcrPrimerSeq(r)) { warn("Can only handle ACGT in primer sequences."); ++warnCount; } else { maxDistance = atoi(s); errorSafePcr(gfIdx->untransGf, f, r, maxDistance, connectionHandle); } } else if (sameString("files", command)) { int i; sprintf(buf, "%d", fileCount); netSendString(connectionHandle, buf); for (i=0; isize); mustWriteFd(sd, buf, strlen(buf)); if (read(sd, buf, 1) < 0) errAbort("queryServer: read failed: %s", strerror(errno)); if (buf[0] != 'Y') errAbort("Expecting 'Y' from server, got %c", buf[0]); mustWriteFd(sd, seq->dna, seq->size); if (complex) { char *s = netRecieveString(sd, buf); printf("%s\n", s); } for (;;) { if (netGetString(sd, buf) == NULL) break; if (sameString(buf, "end")) { printf("%d matches\n", matchCount); break; } else if (startsWith("Error:", buf)) { errAbort("%s", buf); break; } else { printf("%s\n", buf); if (complex) { char *s = netGetLongString(sd); if (s == NULL) break; printf("%s\n", s); freeMem(s); } } ++matchCount; } close(sd); } void pcrServer(char *hostName, char *portName, char *fPrimer, char *rPrimer, int maxSize) /* Do a PCR query to server daemon. */ { char buf[256]; int sd = 0; /* Put together query command and send. */ sd = netMustConnectTo(hostName, portName); sprintf(buf, "%spcr %s %s %d", gfSignature(), fPrimer, rPrimer, maxSize); mustWriteFd(sd, buf, strlen(buf)); /* Fetch and display results. */ for (;;) { if (netGetString(sd, buf) == NULL) break; if (sameString(buf, "end")) break; else if (startsWith("Error:", buf)) { errAbort("%s", buf); break; } else { printf("%s\n", buf); } } close(sd); } void getFileList(char *hostName, char *portName) /* Get and display input file list. */ { char buf[256]; int sd = 0; int fileCount; int i; /* Put together command. */ sd = netMustConnectTo(hostName, portName); sprintf(buf, "%sfiles", gfSignature()); mustWriteFd(sd, buf, strlen(buf)); /* Get count of files, and then each file name. */ if (netGetString(sd, buf) != NULL) { fileCount = atoi(buf); for (i=0; isize = qSize; seq->dna = needLargeMem(qSize+1); if (gfReadMulti(STDIN_FILENO, seq->dna, qSize) != qSize) errAbort("read of %d bytes of query sequence failed", qSize); seq->dna[qSize] = '\0'; if (queryIsProt) { seq->size = aaFilteredSize(seq->dna); aaFilter(seq->dna, seq->dna); } else { seq->size = dnaFilteredSize(seq->dna); dnaFilter(seq->dna, seq->dna); } int maxSize = (isTrans ? maxAaSize : maxNtSize); if (seq->size > maxSize) { seq->size = maxSize; seq->dna[maxSize] = 0; } return seq; } -static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char seqFile[PATH_LEN], char gfIdxFile[PATH_LEN]) +static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char gfIdxFile[PATH_LEN], + struct hash **perSeqMaxHashRet) /* get paths for sequence files to handle requests and validate they exist */ { +char seqFile[PATH_LEN]; safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, genomeName, genomeName); if (!fileExists(seqFile)) errAbort("sequence file for %s does not exist: %s", genomeName, seqFile); + safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, genomeName, genomeName, isTrans ? "trans" : "untrans"); if (!fileExists(gfIdxFile)) errAbort("gf index file for %s does not exist: %s", genomeName, gfIdxFile); + +char perSeqMaxFile[PATH_LEN]; +safef(perSeqMaxFile, PATH_LEN, "%s/%s/%s.perseqmax", rootDir, genomeName, genomeName); +*perSeqMaxHashRet = NULL; +if (fileExists(perSeqMaxFile)) + { + /* only the basename of the file is saved in the index */ + char *slash = strrchr(seqFile, '/'); + char *seqFiles[1] = {(slash != NULL) ? slash + 1 : seqFile}; + *perSeqMaxHashRet = buildPerSeqMax(1, seqFiles, perSeqMaxFile); + } } static void dynamicServerQuery(char *command, int qSize, char *genomeName, - char *seqFiles[1], struct genoFindIndex *gfIdx) + struct genoFindIndex *gfIdx, struct hash *perSeqMaxHash) /* handle search queries */ { mustWriteFd(STDOUT_FILENO, "Y", 1); boolean queryIsProt = sameString(command, "protQuery"); struct dnaSeq* seq = dynReadQuerySeq(qSize, gfIdx->isTrans, queryIsProt); if (gfIdx->isTrans) { if (queryIsProt) transQuery(gfIdx->transGf, seq, STDOUT_FILENO); else transTransQuery(gfIdx->transGf, seq, STDOUT_FILENO); } else { - struct hash *perSeqMaxHash = maybePerSeqMax(1, seqFiles); dnaQuery(gfIdx->untransGf, seq, STDOUT_FILENO, perSeqMaxHash); } netSendString(STDOUT_FILENO, "end"); logDebug("query done"); } static void dynamicServerInfo(char *command, char *genomeName, struct genoFindIndex *gfIdx) /* handle one of the info commands */ { char buf[256]; struct genoFind *gf = gfIdx->isTrans ? gfIdx->transGf[0][0] : gfIdx->untransGf; sprintf(buf, "version %s", gfVersion); netSendString(STDOUT_FILENO, buf); sprintf(buf, "type %s", (gfIdx->isTrans ? "translated" : "nucleotide")); netSendString(STDOUT_FILENO, buf); sprintf(buf, "tileSize %d", gf->tileSize); netSendString(STDOUT_FILENO, buf); sprintf(buf, "stepSize %d", gf->stepSize); netSendString(STDOUT_FILENO, buf); sprintf(buf, "minMatch %d", gf->minMatch); netSendString(STDOUT_FILENO, buf); netSendString(STDOUT_FILENO, "end"); } static void dynamicServer(char* rootDir) /* dynamic server for inetd. Read query from stdin, open index, query, respond, exit. * only one query at a time */ { // make sure error is logged pushWarnHandler(dynWarnErrorVa); char *command, *genomeName; int qSize; boolean isTrans; dynReadCommand(&command, &qSize, &isTrans, &genomeName); logInfo("dynserver: %s %s %s size=%d ", command, genomeName, (isTrans ? "trans" : "untrans"), qSize); time_t startTime = clock1000(); -char seqFile[PATH_LEN]; -char *seqFiles[1] = {seqFile}; // functions expect list of files char gfIdxFile[PATH_LEN]; -dynGetDataFiles(rootDir, genomeName, isTrans, seqFiles[0], gfIdxFile); +struct hash *perSeqMaxHash = NULL; +dynGetDataFiles(rootDir, genomeName, isTrans, gfIdxFile, &perSeqMaxHash); logInfo("dynserver: index loading completed in %4.3f seconds", 0.001 * (clock1000() - startTime)); startTime = clock1000(); struct genoFindIndex *gfIdx = genoFindIndexLoad(gfIdxFile, isTrans); if (endsWith(command, "Info")) dynamicServerInfo(command, genomeName, gfIdx); else - dynamicServerQuery(command, qSize, genomeName, seqFiles, gfIdx); + dynamicServerQuery(command, qSize, genomeName, gfIdx, perSeqMaxHash); logInfo("dynserver: %s completed in %4.3f seconds", command, 0.001 * (clock1000() - startTime)); } int main(int argc, char *argv[]) /* Process command line. */ { char *command; gfCatchPipes(); dnaUtilOpen(); optionInit(&argc, argv, optionSpecs); command = argv[1]; if (optionExists("trans")) { doTrans = TRUE; tileSize = 4; minMatch = 3; maxGap = 0; repMatch = gfPepMaxTileUse; } tileSize = optionInt("tileSize", tileSize); stepSize = optionInt("stepSize", tileSize); if (optionExists("repMatch")) repMatch = optionInt("repMatch", 0); else repMatch = gfDefaultRepMatch(tileSize, stepSize, doTrans); minMatch = optionInt("minMatch", minMatch); maxDnaHits = optionInt("maxDnaHits", maxDnaHits); maxTransHits = optionInt("maxTransHits", maxTransHits); maxNtSize = optionInt("maxNtSize", maxNtSize); maxAaSize = optionInt("maxAaSize", maxAaSize); seqLog = optionExists("seqLog"); ipLog = optionExists("ipLog"); doMask = optionExists("mask"); canStop = optionExists("canStop"); noSimpRepMask = optionExists("noSimpRepMask"); indexFile = optionVal("indexFile", NULL); if (argc < 2) usage(); if (optionExists("log")) logOpenFile(argv[0], optionVal("log", NULL)); if (optionExists("syslog")) logOpenSyslog(argv[0], optionVal("logFacility", NULL)); if (optionExists("debugLog")) logSetMinPriority("debug"); if (sameWord(command, "direct")) { if (argc < 4) usage(); genoFindDirect(argv[2], argc-3, argv+3); } else if (sameWord(command, "pcrDirect")) { if (argc < 5) usage(); genoPcrDirect(argv[2], argv[3], argc-4, argv+4); } else if (sameWord(command, "start")) { if (argc < 5) usage(); startServer(argv[2], argv[3], argc-4, argv+4); } else if (sameWord(command, "stop")) { if (argc != 4) usage(); stopServer(argv[2], argv[3]); } else if (sameWord(command, "query")) { if (argc != 5) usage(); queryServer(command, argv[2], argv[3], argv[4], FALSE, FALSE); } else if (sameWord(command, "protQuery")) { if (argc != 5) usage(); queryServer(command, argv[2], argv[3], argv[4], TRUE, TRUE); } else if (sameWord(command, "transQuery")) { if (argc != 5) usage(); queryServer(command, argv[2], argv[3], argv[4], TRUE, FALSE); } else if (sameWord(command, "pcr")) { if (argc != 7) usage(); pcrServer(argv[2], argv[3], argv[4], argv[5], atoi(argv[6])); } else if (sameWord(command, "status")) { if (argc != 4) usage(); if (statusServer(argv[2], argv[3])) { exit(-1); } } else if (sameWord(command, "files")) { if (argc != 4) usage(); getFileList(argv[2], argv[3]); } else if (sameWord(command, "index")) { if (argc < 4) usage(); buildIndex(argv[2], argc-3, argv+3); } else if (sameWord(command, "dynserver")) { if (argc < 3) usage(); dynamicServer(argv[2]); } else { usage(); } return 0; }