ecacb0a15e8ddbe9b2e62381e30621ba70f82d8f markd Sun Jun 28 14:36:10 2020 -0700 dynamic server tests working diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c index 3403c72..d848d66 100644 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@ -1,20 +1,21 @@ /* gfServer - set up an index of the genome in memory and * respond to search requests. */ /* Copyright 2001-2003 Jim Kent. All rights reserved. */ #include "common.h" #include <signal.h> +#include <stdarg.h> #include <sys/socket.h> #include <netinet/in.h> #include <netdb.h> #include "portable.h" #include "net.h" #include "dnautil.h" #include "dnaseq.h" #include "nib.h" #include "twoBit.h" #include "fa.h" #include "dystring.h" #include "errAbort.h" #include "memalloc.h" #include "genoFind.h" #include "options.h" @@ -90,30 +91,38 @@ " gfServer pcr host port fPrimer rPrimer maxDistance\n" " To process one probe fa file against a .2bit format genome (not starting server):\n" " gfServer direct probe.fa file(s).2bit\n" " To test PCR without starting server:\n" " gfServer pcrDirect fPrimer rPrimer file(s).2bit\n" " To figure out usage level:\n" " gfServer status host port\n" " To get input file list:\n" " gfServer files host port\n" " To generate a precomputed index:\n" " gfServer index gfidx file(s)\n" " where the files are .2bit or .nib format files. Separate indexes must be created\n" " for untranslated and translated queries. These can be used with a persistent server\n" " as with 'start -indexFile or a dynamic server. They must follow the naming convention for\n" " for dynamic servers.\n" + " To run a dynamic server (usually called by xinet):\n" + " gfServer dynserver rootdir\n" + " The root directory must contain directories for each genome with the twobit and index\n" + " files following the convention:\n" + " $genome/$genome.2bit\n" + " $genome/$genome.untrans.gfidx\n" + " $genome/$genome.trans.gfidx\n" + "\n" "options:\n" " -tileSize=N Size of n-mers to index. Default is 11 for nucleotides, 4 for\n" " proteins (or translated nucleotides).\n" " -stepSize=N Spacing between tiles. Default is tileSize.\n" " -minMatch=N Number of n-mer matches that trigger detailed alignment.\n" " Default is 2 for nucleotides, 3 for proteins.\n" " -maxGap=N Number of insertions or deletions allowed between n-mers.\n" " Default is 2 for nucleotides, 0 for proteins.\n" " -trans Translate database to protein in 6 frames. Note: it is best\n" " to run this on RepeatMasked data in this case.\n" " -log=logFile Keep a log file that records server requests.\n" " -seqLog Include sequences in log file (not logged with -syslog).\n" " -ipLog Include user's IP in log file (not logged with -syslog).\n" " -debugLog Include debugging info in log file.\n" " -syslog Log to syslog.\n" @@ -243,33 +252,34 @@ { if (!isdigit(portName[0])) errAbort("Expecting a port number got %s", portName); return atoi(portName); } /* Some variables to gather statistics on usage. */ long baseCount = 0, blatCount = 0, aaCount = 0, pcrCount = 0; int warnCount = 0; int noSigCount = 0; int missCount = 0; int trimCount = 0; void dnaQuery(struct genoFind *gf, struct dnaSeq *seq, - int connectionHandle, char buf[256], struct hash *perSeqMaxHash) + int connectionHandle, struct hash *perSeqMaxHash) /* Handle a query for DNA/DNA match. */ { +char buf[256]; struct gfClump *clumpList = NULL, *clump; int limit = 1000; int clumpCount = 0, hitCount = -1; struct lm *lm = lmInit(0); if (seq->size > gf->tileSize + gf->stepSize + gf->stepSize) limit = maxDnaHits; clumpList = gfFindClumps(gf, seq, lm, &hitCount); if (clumpList == NULL) ++missCount; for (clump = clumpList; clump != NULL; clump = clump->next) { struct gfSeqSource *ss = clump->target; sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d", clump->qStart, clump->qEnd, ss->fileName, @@ -281,33 +291,34 @@ ((perSeqCount = hashIntValDefault(perSeqMaxHash, ss->fileName, -1)) >= 0)) { if (perSeqCount >= (maxDnaHits / 2)) break; hashIncInt(perSeqMaxHash, ss->fileName); } else if (--limit < 0) break; } gfClumpFreeList(&clumpList); lmCleanup(&lm); logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount); } void transQuery(struct genoFind *transGf[2][3], aaSeq *seq, - int connectionHandle, char buf[256]) + int connectionHandle) /* Handle a query for protein/translated DNA match. */ { +char buf[256]; struct gfClump *clumps[3], *clump; int isRc, frame; char strand; struct dyString *dy = newDyString(1024); struct gfHit *hit; int clumpCount = 0, hitCount = 0, oneHit; struct lm *lm = lmInit(0); sprintf(buf, "tileSize %d", tileSize); netSendString(connectionHandle, buf); for (frame = 0; frame < 3; ++frame) clumps[frame] = NULL; for (isRc = 0; isRc <= 1; ++isRc) { strand = (isRc ? '-' : '+'); @@ -331,33 +342,34 @@ ++clumpCount; if (--limit < 0) break; } gfClumpFreeList(&clumps[frame]); } } if (clumpCount == 0) ++missCount; freeDyString(&dy); lmCleanup(&lm); logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount); } void transTransQuery(struct genoFind *transGf[2][3], struct dnaSeq *seq, - int connectionHandle, char buf[256]) + int connectionHandle) /* Handle a query for protein/translated DNA match. */ { +char buf[256]; struct gfClump *clumps[3][3], *clump; int isRc, qFrame, tFrame; char strand; struct trans3 *t3 = trans3New(seq); struct dyString *dy = newDyString(1024); struct gfHit *hit; int clumpCount = 0, hitCount = 0, oneCount; sprintf(buf, "tileSize %d", tileSize); netSendString(connectionHandle, buf); for (qFrame = 0; qFrame<3; ++qFrame) for (tFrame=0; tFrame<3; ++tFrame) clumps[qFrame][tFrame] = NULL; for (isRc = 0; isRc <= 1; ++isRc) { @@ -471,36 +483,36 @@ } static void errorSafeQuery(boolean doTrans, boolean queryIsProt, struct dnaSeq *seq, struct genoFindIndex *gfIdx, int connectionHandle, char *buf, struct hash *perSeqMaxHash) /* Wrap error handling code around index query. */ { int status; errorSafeSetup(); status = setjmp(gfRecover); if (status == 0) /* Always true except after long jump. */ { if (doTrans) { if (queryIsProt) - transQuery(gfIdx->transGf, seq, connectionHandle, buf); + transQuery(gfIdx->transGf, seq, connectionHandle); else - transTransQuery(gfIdx->transGf, seq, connectionHandle, buf); + transTransQuery(gfIdx->transGf, seq, connectionHandle); } else - dnaQuery(gfIdx->untransGf, seq, connectionHandle, buf, perSeqMaxHash); + dnaQuery(gfIdx->untransGf, seq, connectionHandle, perSeqMaxHash); errorSafeCleanup(); } else /* They long jumped here because of an error. */ { errorSafeCleanupMess(connectionHandle, "Error: gfServer out of memory. Try reducing size of query."); } } static void errorSafePcr(struct genoFind *gf, char *fPrimer, char *rPrimer, int maxDistance, int connectionHandle) /* Wrap error handling around pcr index query. */ { int status; errorSafeSetup(); @@ -998,30 +1010,165 @@ printf("%s\n", netRecieveString(sd, buf)); } } close(sd); } static void buildIndex(char *gfxFile, int fileCount, char *seqFiles[]) /* build pre-computed index for seqFiles and write to gfxFile */ { struct genoFindIndex *gfIdx = genoFindIndexBuild(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, doTrans, NULL, allowOneMismatch, doMask, stepSize, noSimpRepMask); genoFindIndexWrite(gfIdx, gfxFile); } +static void dynErrorVa(char* msg, va_list args) +/* send back an error response and exit */ +{ +char buf[4096]; +int msgLen = vsnprintf(buf, sizeof(buf) - 1, msg, args); +buf[msgLen] = '\0'; +logDebug("Error: %s", buf); +printf("Error: %s\n", msg); +exit(1); +} + +static void dynError(char* msg, ...) +/* send back an error response and exit */ +{ +va_list args; +va_start(args, msg); +dynErrorVa(msg, args); +va_end(args); +exit(1); +} + +static void dynReadBytes(char *buf, int bufSize) +/* read pending bytes */ +{ +int readSize = read(STDIN_FILENO, buf, bufSize-1); +if (readSize < 0) + dynError("EOF from client"); +buf[readSize] = '\0'; +} + + +static void dynReadQuery(char **commandRet, int *qsizeRet, char **genomeNameRet) +/* read query request from stdin, same as server expect includes database + * Format is: + * signature command qsize genome + */ +{ +char buf[256]; +dynReadBytes(buf, sizeof(buf)); +logDebug("query: %s", buf); + +static int nwords = 4; +char *words[nwords]; +int numWords = chopByWhite(buf, words, nwords); +if (numWords != nwords) + dynError("expected %d words in request, got %d", nwords, numWords); +if (!sameString(words[0], gfSignature())) + dynError("query does not start with signature, got '%s'", words[0]); + +if (!(sameString("query", words[1]) || + sameString("protQuery", words[1]) || sameString("transQuery", words[1]))) + dynError("invalid command '%s'", words[1]); +*commandRet = cloneString(words[1]); +*qsizeRet = atoi(words[2]); +*genomeNameRet = cloneString(words[3]); +} + +static struct dnaSeq* dynReadQuerySeq(int qSize, boolean isTrans, boolean queryIsProt) +/* read the DNA sequence from the query, filtering junk */ +{ +struct dnaSeq *seq; +AllocVar(seq); +seq->size = qSize; +seq->dna = needLargeMem(qSize+1); +if (gfReadMulti(STDIN_FILENO, seq->dna, qSize) != qSize) + dynError("read of %d bytes of query sequence failed", qSize); + +if (queryIsProt) + { + seq->size = aaFilteredSize(seq->dna); + aaFilter(seq->dna, seq->dna); + } +else + { + seq->size = dnaFilteredSize(seq->dna); + dnaFilter(seq->dna, seq->dna); + } +int maxSize = (isTrans ? maxAaSize : maxNtSize); +if (seq->size > maxSize) + { + seq->size = maxSize; + seq->dna[maxSize] = 0; + } + +return seq; +} + +static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char seqFile[PATH_LEN], char gfIdxFile[PATH_LEN]) +/* get paths for sequence files to handle requests and validate they exist */ +{ +safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, genomeName, genomeName); +if (!fileExists(seqFile)) + dynError("sequence file for %s does not exist: %s", genomeName, seqFile); +safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, genomeName, genomeName, isTrans ? "trans" : "untrans"); +if (!fileExists(gfIdxFile)) + dynError("gf index file for %s does not exist: %s", genomeName, gfIdxFile); +} + + +void dynamicServer(char* rootDir) +/* dynamic server for inetd. Read query from stdin, open index, query, respond, exit. + * only one query at a time */ +{ +char *command, *genomeName; +int qSize; +dynReadQuery(&command, &qSize, &genomeName); + +boolean isTrans = sameString("protQuery", command) || sameString("transQuery", command); +boolean queryIsProt = sameString(command, "protQuery"); + +char seqFile[PATH_LEN]; +char *seqFiles[1] = {seqFile}; // functions expect list of files +char gfIdxFile[PATH_LEN]; +dynGetDataFiles(rootDir, genomeName, isTrans, seqFiles[0], gfIdxFile); + + +struct genoFindIndex *gfIdx = genoFindIndexLoad(gfIdxFile, isTrans); +mustWriteFd(STDOUT_FILENO, "Y", 1); + +struct dnaSeq* seq = dynReadQuerySeq(qSize, isTrans, queryIsProt); +if (isTrans) + { + if (queryIsProt) + transQuery(gfIdx->transGf, seq, STDOUT_FILENO); + else + transTransQuery(gfIdx->transGf, seq, STDOUT_FILENO); + } +else + { + struct hash *perSeqMaxHash = maybePerSeqMax(1, seqFiles); + dnaQuery(gfIdx->untransGf, seq, STDOUT_FILENO, perSeqMaxHash); + } +logDebug("query done"); +netSendString(STDOUT_FILENO, "end"); +} int main(int argc, char *argv[]) /* Process command line. */ { char *command; gfCatchPipes(); dnaUtilOpen(); optionInit(&argc, argv, optionSpecs); command = argv[1]; if (optionExists("trans")) { doTrans = TRUE; tileSize = 4; minMatch = 3; maxGap = 0; @@ -1113,21 +1260,27 @@ exit(-1); } } else if (sameWord(command, "files")) { if (argc != 4) usage(); getFileList(argv[2], argv[3]); } else if (sameWord(command, "index")) { if (argc < 4) usage(); buildIndex(argv[2], argc-3, argv+3); } +else if (sameWord(command, "dynserver")) + { + if (argc < 3) + usage(); + dynamicServer(argv[2]); + } else { usage(); } return 0; }