src/gfServer/gfServer.c ecacb0a15e8ddbe9b2e62381e30621ba70f82d8f

ecacb0a15e8ddbe9b2e62381e30621ba70f82d8f
markd
  Sun Jun 28 14:36:10 2020 -0700
dynamic server tests working

diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c
index 3403c72..d848d66 100644
--- src/gfServer/gfServer.c
+++ src/gfServer/gfServer.c
@@ -1,20 +1,21 @@
 /* gfServer - set up an index of the genome in memory and
  * respond to search requests. */
 /* Copyright 2001-2003 Jim Kent.  All rights reserved. */
 #include "common.h"
 #include <signal.h>
+#include <stdarg.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <netdb.h>
 #include "portable.h"
 #include "net.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "nib.h"
 #include "twoBit.h"
 #include "fa.h"
 #include "dystring.h"
 #include "errAbort.h"
 #include "memalloc.h"
 #include "genoFind.h"
 #include "options.h"
@@ -90,30 +91,38 @@
   "      gfServer pcr host port fPrimer rPrimer maxDistance\n"
   "   To process one probe fa file against a .2bit format genome (not starting server):\n"
   "      gfServer direct probe.fa file(s).2bit\n"
   "   To test PCR without starting server:\n"
   "      gfServer pcrDirect fPrimer rPrimer file(s).2bit\n"
   "   To figure out usage level:\n"
   "      gfServer status host port\n"
   "   To get input file list:\n"
   "      gfServer files host port\n"
   "   To generate a precomputed index:\n"
   "      gfServer index gfidx file(s)\n"
   "     where the files are .2bit or .nib format files.  Separate indexes must be created\n"
   "     for untranslated and translated queries.  These can be used with a persistent server\n"
   "     as with 'start -indexFile or a dynamic server. They must follow the naming convention for\n"
   "     for dynamic servers.\n"
+  "   To run a dynamic server (usually called by xinet):\n"
+  "      gfServer dynserver rootdir\n"
+  "     The root directory must contain directories for each genome with the twobit and index\n"
+  "     files following the convention:\n"
+  "         $genome/$genome.2bit\n"
+  "         $genome/$genome.untrans.gfidx\n"
+  "         $genome/$genome.trans.gfidx\n"
+  "\n"
   "options:\n"
   "   -tileSize=N     Size of n-mers to index.  Default is 11 for nucleotides, 4 for\n"
   "                   proteins (or translated nucleotides).\n"
   "   -stepSize=N     Spacing between tiles. Default is tileSize.\n"
   "   -minMatch=N     Number of n-mer matches that trigger detailed alignment.\n"
   "                   Default is 2 for nucleotides, 3 for proteins.\n"
   "   -maxGap=N       Number of insertions or deletions allowed between n-mers.\n"
   "                   Default is 2 for nucleotides, 0 for proteins.\n"
   "   -trans          Translate database to protein in 6 frames.  Note: it is best\n"
   "                   to run this on RepeatMasked data in this case.\n"
   "   -log=logFile    Keep a log file that records server requests.\n"
   "   -seqLog         Include sequences in log file (not logged with -syslog).\n"
   "   -ipLog          Include user's IP in log file (not logged with -syslog).\n"
   "   -debugLog       Include debugging info in log file.\n"
   "   -syslog         Log to syslog.\n"
@@ -243,33 +252,34 @@
 {
 if (!isdigit(portName[0]))
     errAbort("Expecting a port number got %s", portName);
 return atoi(portName);
 }
 
 
 /* Some variables to gather statistics on usage. */
 long baseCount = 0, blatCount = 0, aaCount = 0, pcrCount = 0;
 int warnCount = 0;
 int noSigCount = 0;
 int missCount = 0;
 int trimCount = 0;
 
 void dnaQuery(struct genoFind *gf, struct dnaSeq *seq, 
-              int connectionHandle, char buf[256], struct hash *perSeqMaxHash)
+              int connectionHandle, struct hash *perSeqMaxHash)
 /* Handle a query for DNA/DNA match. */
 {
+char buf[256];
 struct gfClump *clumpList = NULL, *clump;
 int limit = 1000;
 int clumpCount = 0, hitCount = -1;
 struct lm *lm = lmInit(0);
 
 if (seq->size > gf->tileSize + gf->stepSize + gf->stepSize)
      limit = maxDnaHits;
 clumpList = gfFindClumps(gf, seq, lm, &hitCount);
 if (clumpList == NULL)
     ++missCount;
 for (clump = clumpList; clump != NULL; clump = clump->next)
     {
     struct gfSeqSource *ss = clump->target;
     sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d", 
 	clump->qStart, clump->qEnd, ss->fileName,
@@ -281,33 +291,34 @@
         ((perSeqCount = hashIntValDefault(perSeqMaxHash, ss->fileName, -1)) >= 0))
         {
         if (perSeqCount >= (maxDnaHits / 2))
             break;
         hashIncInt(perSeqMaxHash, ss->fileName);
         }
     else if (--limit < 0)
 	break;
     }
 gfClumpFreeList(&clumpList);
 lmCleanup(&lm);
 logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount);
 }
 
 void transQuery(struct genoFind *transGf[2][3], aaSeq *seq, 
-	int connectionHandle, char buf[256])	
+	int connectionHandle)
 /* Handle a query for protein/translated DNA match. */
 {
+char buf[256];
 struct gfClump *clumps[3], *clump;
 int isRc, frame;
 char strand;
 struct dyString *dy  = newDyString(1024);
 struct gfHit *hit;
 int clumpCount = 0, hitCount = 0, oneHit;
 struct lm *lm = lmInit(0);
 
 sprintf(buf, "tileSize %d", tileSize);
 netSendString(connectionHandle, buf);
 for (frame = 0; frame < 3; ++frame)
     clumps[frame] = NULL;
 for (isRc = 0; isRc <= 1; ++isRc)
     {
     strand = (isRc ? '-' : '+');
@@ -331,33 +342,34 @@
 	    ++clumpCount;
 	    if (--limit < 0)
 		break;
 	    }
 	gfClumpFreeList(&clumps[frame]);
 	}
     }
 if (clumpCount == 0)
     ++missCount;
 freeDyString(&dy);
 lmCleanup(&lm);
 logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount);
 }
 
 void transTransQuery(struct genoFind *transGf[2][3], struct dnaSeq *seq, 
-	int connectionHandle, char buf[256])	
+	int connectionHandle)
 /* Handle a query for protein/translated DNA match. */
 {
+char buf[256];
 struct gfClump *clumps[3][3], *clump;
 int isRc, qFrame, tFrame;
 char strand;
 struct trans3 *t3 = trans3New(seq);
 struct dyString *dy  = newDyString(1024);
 struct gfHit *hit;
 int clumpCount = 0, hitCount = 0, oneCount;
 
 sprintf(buf, "tileSize %d", tileSize);
 netSendString(connectionHandle, buf);
 for (qFrame = 0; qFrame<3; ++qFrame)
     for (tFrame=0; tFrame<3; ++tFrame)
 	clumps[qFrame][tFrame] = NULL;
 for (isRc = 0; isRc <= 1; ++isRc)
     {
@@ -471,36 +483,36 @@
 }
 
 static void errorSafeQuery(boolean doTrans, boolean queryIsProt, 
 	struct dnaSeq *seq, struct genoFindIndex *gfIdx, 
 	int connectionHandle, char *buf, struct hash *perSeqMaxHash)
 /* Wrap error handling code around index query. */
 {
 int status;
 errorSafeSetup();
 status = setjmp(gfRecover);
 if (status == 0)    /* Always true except after long jump. */
     {
     if (doTrans)
        {
        if (queryIsProt)
-	    transQuery(gfIdx->transGf, seq, connectionHandle, buf);
+	    transQuery(gfIdx->transGf, seq, connectionHandle);
        else
-	    transTransQuery(gfIdx->transGf, seq, connectionHandle, buf);
+	    transTransQuery(gfIdx->transGf, seq, connectionHandle);
        }
     else
-	dnaQuery(gfIdx->untransGf, seq, connectionHandle, buf, perSeqMaxHash);
+	dnaQuery(gfIdx->untransGf, seq, connectionHandle, perSeqMaxHash);
     errorSafeCleanup();
     }
 else    /* They long jumped here because of an error. */
     {
     errorSafeCleanupMess(connectionHandle, 
     	"Error: gfServer out of memory. Try reducing size of query.");
     }
 }
 
 static void errorSafePcr(struct genoFind *gf, char *fPrimer, char *rPrimer, 
 	int maxDistance, int connectionHandle)
 /* Wrap error handling around pcr index query. */
 {
 int status;
 errorSafeSetup();
@@ -998,30 +1010,165 @@
 	printf("%s\n", netRecieveString(sd, buf));
 	}
     }
 close(sd);
 }
 
 static void buildIndex(char *gfxFile, int fileCount, char *seqFiles[])
 /* build pre-computed index for seqFiles and write to gfxFile */
 {
 struct genoFindIndex *gfIdx = genoFindIndexBuild(fileCount, seqFiles, minMatch, maxGap, tileSize,
                                                  repMatch, doTrans, NULL, allowOneMismatch, doMask, stepSize, noSimpRepMask);
 genoFindIndexWrite(gfIdx, gfxFile);
 }
 
 
+static void dynErrorVa(char* msg, va_list args)
+/* Format msg with args, log the text and send "Error: <text>\n" on stdout,
+ * then exit(1).  Does not return. */
+{
+char buf[4096] = "";  /* starts out as an empty string in case nothing gets formatted */
+vsnprintf(buf, sizeof(buf), msg, args);  /* truncates to fit; return value is not a safe index */
+logDebug("Error: %s", buf);
+printf("Error: %s\n", buf);  /* the formatted text, not the raw format string */
+exit(1);
+}
+
+static void dynError(char* msg, ...)
+/* printf-style front end to dynErrorVa(): send back an error response and exit */
+{
+va_list args;
+va_start(args, msg);
+dynErrorVa(msg, args);  /* calls exit(1), so the two lines below are not reached */
+va_end(args);
+exit(1);
+}
+
+static void dynReadBytes(char *buf, int bufSize)
+/* Do one read() of at most bufSize-1 bytes from stdin into buf and NUL-terminate it; dynError() on EOF or read failure. */
+{
+int readSize = read(STDIN_FILENO, buf, bufSize-1);
+if (readSize <= 0)  /* read() returns 0 at end of file and -1 on error */
+    dynError("%s from client", (readSize == 0) ? "EOF" : "read error");
+buf[readSize] = '\0';
+}
+
+
+static void dynReadQuery(char **commandRet, int *qsizeRet, char **genomeNameRet)
+/* Read and validate the query request from stdin.  The command and genome name
+ * are returned as cloned strings; a malformed request is rejected via dynError().
+ * Format is:  signature command qsize genome
+ */
+{
+char buf[256];
+dynReadBytes(buf, sizeof(buf));
+logDebug("query: %s", buf);
+static int nwords = 4;
+char *words[nwords];
+int numWords = chopByWhite(buf, words, nwords);
+if (numWords != nwords)
+    dynError("expected %d words in request, got %d", nwords, numWords);
+if (!sameString(words[0], gfSignature()))
+    dynError("query does not start with signature, got '%s'", words[0]);
+if (!(sameString("query", words[1]) ||
+      sameString("protQuery", words[1]) || sameString("transQuery", words[1])))
+    dynError("invalid command '%s'", words[1]);
+if (!isdigit(words[2][0]))  /* atoi() alone would turn junk into 0 and accept a negative size */
+    dynError("invalid query size '%s'", words[2]);
+*commandRet = cloneString(words[1]);
+*qsizeRet = atoi(words[2]);
+*genomeNameRet = cloneString(words[3]);
+}
+
+static struct dnaSeq* dynReadQuerySeq(int qSize, boolean isTrans, boolean queryIsProt)
+/* Read qSize bytes of query sequence from stdin, filter out junk and clip the
+ * result to maxAaSize (isTrans) or maxNtSize.  Returns the new dnaSeq. */
+{
+struct dnaSeq *seq;
+AllocVar(seq);
+seq->size = qSize;
+seq->dna = needLargeMem(qSize+1);
+if (gfReadMulti(STDIN_FILENO, seq->dna, qSize) != qSize)
+    dynError("read of %d bytes of query sequence failed", qSize);
+seq->dna[qSize] = '\0';  /* only qSize bytes were read and the filters below take no length */
+if (queryIsProt)
+    {
+    seq->size = aaFilteredSize(seq->dna);
+    aaFilter(seq->dna, seq->dna);
+    }
+else
+    {
+    seq->size = dnaFilteredSize(seq->dna);
+    dnaFilter(seq->dna, seq->dna);
+    }
+int maxSize = (isTrans ? maxAaSize : maxNtSize);
+if (seq->size > maxSize)
+    {
+    seq->size = maxSize;
+    seq->dna[maxSize] = 0;
+    }
+
+return seq;
+}
+
+static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char seqFile[PATH_LEN], char gfIdxFile[PATH_LEN])
+/* Write the 2bit and gfidx paths for genomeName under rootDir into seqFile and gfIdxFile (trans or untrans index per isTrans); dynError() if either file does not exist */
+{
+safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, genomeName, genomeName);  /* NOTE(review): genomeName goes into the path unchanged; confirm callers reject '/' and ".." */
+if (!fileExists(seqFile))
+    dynError("sequence file for %s does not exist: %s", genomeName, seqFile);
+safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, genomeName, genomeName, isTrans ? "trans" : "untrans");
+if (!fileExists(gfIdxFile))
+    dynError("gf index file for %s does not exist: %s", genomeName, gfIdxFile);
+}
+
+
+void dynamicServer(char* rootDir)
+/* dynamic server for inetd. Read one query from stdin, load the index of the
+ * requested genome from under rootDir, run the query, answer on stdout, return. */
+{
+char *command, *genomeName;
+int qSize;
+dynReadQuery(&command, &qSize, &genomeName);
+/* genomeName is client-supplied and becomes a directory under rootDir: refuse '/' and a leading '.' */
+char *c;
+for (c = genomeName; *c != '\0' && *c != '/'; ++c)
+    continue;
+if (*c == '/' || genomeName[0] == '.')
+    dynError("invalid genome name '%s'", genomeName);
+
+boolean isTrans = sameString("protQuery", command) || sameString("transQuery", command);
+boolean queryIsProt = sameString(command, "protQuery");
+char seqFile[PATH_LEN];
+char *seqFiles[1] = {seqFile};  // functions expect list of files
+char gfIdxFile[PATH_LEN];
+dynGetDataFiles(rootDir, genomeName, isTrans, seqFiles[0], gfIdxFile);
+
+struct genoFindIndex *gfIdx = genoFindIndexLoad(gfIdxFile, isTrans);
+mustWriteFd(STDOUT_FILENO, "Y", 1);  /* written before the sequence is read; presumably the go-ahead for the client - confirm */
+struct dnaSeq* seq = dynReadQuerySeq(qSize, isTrans, queryIsProt);
+if (isTrans && queryIsProt)
+    transQuery(gfIdx->transGf, seq, STDOUT_FILENO);
+else if (isTrans)
+    transTransQuery(gfIdx->transGf, seq, STDOUT_FILENO);
+else
+    {
+    struct hash *perSeqMaxHash = maybePerSeqMax(1, seqFiles);
+    dnaQuery(gfIdx->untransGf, seq, STDOUT_FILENO, perSeqMaxHash);
+    }
+logDebug("query done");
+netSendString(STDOUT_FILENO, "end");
+}
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *command;
 
 gfCatchPipes();
 dnaUtilOpen();
 optionInit(&argc, argv, optionSpecs);
 command = argv[1];
 if (optionExists("trans"))
     {
     doTrans = TRUE;
     tileSize = 4;
     minMatch = 3;
     maxGap = 0;
@@ -1113,21 +1260,27 @@
 	exit(-1);
 	}
     }
 else if (sameWord(command, "files"))
     {
     if (argc != 4)
 	usage();
     getFileList(argv[2], argv[3]);
     }
 else if (sameWord(command, "index"))
     {
     if (argc < 4)
         usage();
     buildIndex(argv[2], argc-3, argv+3);
     }
+else if (sameWord(command, "dynserver"))
+    {
+    if (argc < 3)
+        usage();
+    dynamicServer(argv[2]);
+    }
 else
     {
     usage();
     }
 return 0;
 }