ee8ef81c6d82157859f16548a27726f4b7421891
markd
  Mon Jul 6 19:22:14 2020 -0700
added support for perSeqMax in dynamic server

diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c
index 7c8a626..4462075 100644
--- src/gfServer/gfServer.c
+++ src/gfServer/gfServer.c
@@ -97,31 +97,33 @@
   "   To get input file list:\n"
   "      gfServer files host port\n"
   "   To generate a precomputed index:\n"
   "      gfServer index gfidx file(s)\n"
   "     where the files are .2bit or .nib format files.  Separate indexes must be created\n"
   "     for untranslated and translated queries.  These can be used with a persistent server\n"
   "     as with 'start -indexFile or a dynamic server. They must follow the naming convention for\n"
   "     for dynamic servers.\n"
   "   To run a dynamic server (usually called by xinet):\n"
   "      gfServer dynserver rootdir\n"
   "     The root directory must contain directories for each genome with the twobit and index\n"
   "     files following the convention:\n"
   "         $rootdir/$genome/$genome.2bit\n"
   "         $rootdir/$genome/$genome.untrans.gfidx\n"
   "         $rootdir/$genome/$genome.trans.gfidx\n"
-  "     Both indexes must exist.\n"
+  "     The -perSeqMax functionality can be implemented by creating a file\n"
+  "         $rootdir/$genome/$genome.perseqmax\n"
+  "     Note: the dynamic, the file names in the perseqmax file MUST NOT contain directories.\n"
   "\n"
   "options:\n"
   "   -tileSize=N     Size of n-mers to index.  Default is 11 for nucleotides, 4 for\n"
   "                   proteins (or translated nucleotides).\n"
   "   -stepSize=N     Spacing between tiles. Default is tileSize.\n"
   "   -minMatch=N     Number of n-mer matches that trigger detailed alignment.\n"
   "                   Default is 2 for nucleotides, 3 for proteins.\n"
   "   -maxGap=N       Number of insertions or deletions allowed between n-mers.\n"
   "                   Default is 2 for nucleotides, 0 for proteins.\n"
   "   -trans          Translate database to protein in 6 frames.  Note: it is best\n"
   "                   to run this on RepeatMasked data in this case.\n"
   "   -log=logFile    Keep a log file that records server requests.\n"
   "   -seqLog         Include sequences in log file (not logged with -syslog).\n"
   "   -ipLog          Include user's IP in log file (not logged with -syslog).\n"
   "   -debugLog       Include debugging info in log file.\n"
@@ -528,60 +530,66 @@
     }
 }
 
 boolean badPcrPrimerSeq(char *s)
 /* Scan NUL-terminated primer s; TRUE as soon as some byte has a negative ntVal entry. */
 {
 unsigned char *pt;
 for (pt = (unsigned char *)s; *pt != 0; ++pt)
     {
     if (ntVal[*pt] < 0)
         return TRUE;
     }
 return FALSE;
 }
 
-static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[])
-/* If options include -perSeqMax=file, then read the sequences named in the file into a hash
- * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */
-{
-struct hash *perSeqMaxHash = NULL;
-char *fileName = optionVal("perSeqMax", NULL);
-if (isNotEmpty(fileName))
+static struct hash *buildPerSeqMax(int fileCount, char *seqFiles[], char* perSeqMaxFile)
+/* do work of building perSeqMaxhash */
 {
-    perSeqMaxHash = hashNew(0);
-    struct lineFile *lf = lineFileOpen(fileName, TRUE);
+struct hash *perSeqMaxHash = hashNew(0);
+struct lineFile *lf = lineFileOpen(perSeqMaxFile, TRUE);
 char *line;
 while (lineFileNextReal(lf, &line))
     {
     // Make sure line contains a valid seq filename (before optional ':seq')
     char *seqFile = trimSpaces(line);
     char copy[strlen(seqFile)+1];
     safecpy(copy, sizeof copy, seqFile);
     char *colon = strrchr(copy, ':');
     if (colon)
         *colon = '\0';
     if (stringArrayIx(copy, seqFiles, fileCount) < 0)
         lineFileAbort(lf, "'%s' does not appear to be a sequence file from the "
                       "command line", copy);
     hashAddInt(perSeqMaxHash, seqFile, 0);
     }
 lineFileClose(&lf);
-    }
 return perSeqMaxHash;
 }
     
+
+static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[])
+/* If options include -perSeqMax=file, then read the sequences named in the file into a hash
+ * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */
+{
+char *fileName = optionVal("perSeqMax", NULL);
+if (isNotEmpty(fileName))
+    return buildPerSeqMax(fileCount, seqFiles, fileName);
+else
+    return NULL;
+}
+
 static void hashZeroVals(struct hash *hash)
 /* Set the value of every element of hash to NULL (0 for ints). */
 {
 struct hashEl *hel;
 struct hashCookie cookie = hashFirst(hash);
 while ((hel = hashNext(&cookie)) != NULL)
     hel->val = 0;
 }
 
 void startServer(char *hostName, char *portName, int fileCount, 
 	char *seqFiles[])
 /* Load up index and hang out in RAM. */
 {
 struct genoFindIndex *gfIdx = NULL;
 char buf[256];
@@ -1102,59 +1110,72 @@
 else
     {
     seq->size = dnaFilteredSize(seq->dna);
     dnaFilter(seq->dna, seq->dna);
     }
 int maxSize = (isTrans ? maxAaSize : maxNtSize);
 if (seq->size > maxSize)
     {
     seq->size = maxSize;
     seq->dna[maxSize] = 0;
     }
 
 return seq;
 }
 
-static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char seqFile[PATH_LEN], char gfIdxFile[PATH_LEN])
+static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char gfIdxFile[PATH_LEN],
+                            struct hash **perSeqMaxHashRet)
 /* get paths for sequence files to handle requests and validate they exist */
 {
+char seqFile[PATH_LEN];
 safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, genomeName, genomeName);
 if (!fileExists(seqFile))
     errAbort("sequence file for %s does not exist: %s", genomeName, seqFile);
+
 safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, genomeName, genomeName, isTrans ? "trans" : "untrans");
 if (!fileExists(gfIdxFile))
     errAbort("gf index file for %s does not exist: %s", genomeName, gfIdxFile);
+
+char perSeqMaxFile[PATH_LEN];
+safef(perSeqMaxFile, PATH_LEN, "%s/%s/%s.perseqmax", rootDir, genomeName, genomeName);
+*perSeqMaxHashRet = NULL;
+if (fileExists(perSeqMaxFile))
+    {
+    /* only the basename of the file is saved in the index */
+    char *slash = strrchr(seqFile, '/');
+    char *seqFiles[1] = {(slash != NULL) ? slash + 1 : seqFile};
+    *perSeqMaxHashRet = buildPerSeqMax(1, seqFiles, perSeqMaxFile);
+    }
 }
 
 static void dynamicServerQuery(char *command, int qSize, char *genomeName,
-                               char *seqFiles[1], struct genoFindIndex *gfIdx)
+                               struct genoFindIndex *gfIdx, struct hash *perSeqMaxHash)
 /* handle search queries */
 {
 mustWriteFd(STDOUT_FILENO, "Y", 1);
 
 boolean queryIsProt = sameString(command, "protQuery");
 struct dnaSeq* seq = dynReadQuerySeq(qSize, gfIdx->isTrans, queryIsProt);
 if (gfIdx->isTrans)
     {
     if (queryIsProt)
         transQuery(gfIdx->transGf, seq, STDOUT_FILENO);
     else
         transTransQuery(gfIdx->transGf, seq, STDOUT_FILENO);
     }
 else
     {
-    struct hash *perSeqMaxHash = maybePerSeqMax(1, seqFiles);
     dnaQuery(gfIdx->untransGf, seq, STDOUT_FILENO, perSeqMaxHash);
     }
 netSendString(STDOUT_FILENO, "end");
 logDebug("query done");
 }
 
 static void dynamicServerInfo(char *command, char *genomeName, struct genoFindIndex *gfIdx)
 /* handle one of the info commands */
 {
 char buf[256];
 struct genoFind *gf = gfIdx->isTrans ? gfIdx->transGf[0][0] : gfIdx->untransGf;
 sprintf(buf, "version %s", gfVersion);
 netSendString(STDOUT_FILENO, buf);
 sprintf(buf, "type %s", (gfIdx->isTrans ? "translated" : "nucleotide"));
 netSendString(STDOUT_FILENO, buf);
@@ -1169,42 +1190,41 @@
 
 static void dynamicServer(char* rootDir)
 /* dynamic server for inetd. Read query from stdin, open index, query, respond, exit.
  * only one query at a time */
 {
 // make sure error is logged
 pushWarnHandler(dynWarnErrorVa);
 
 char *command, *genomeName;
 int qSize;
 boolean isTrans;
 dynReadCommand(&command, &qSize, &isTrans, &genomeName);
 logInfo("dynserver: %s %s %s size=%d ", command, genomeName, (isTrans ? "trans" : "untrans"), qSize);
 
 time_t startTime = clock1000();
-char seqFile[PATH_LEN];
-char *seqFiles[1] = {seqFile};  // functions expect list of files
 char gfIdxFile[PATH_LEN];
-dynGetDataFiles(rootDir, genomeName, isTrans, seqFiles[0], gfIdxFile);
+struct hash *perSeqMaxHash = NULL;
+dynGetDataFiles(rootDir, genomeName, isTrans, gfIdxFile, &perSeqMaxHash);
 logInfo("dynserver: index loading completed in %4.3f seconds", 0.001 * (clock1000() - startTime));
 startTime = clock1000();
 
 struct genoFindIndex *gfIdx = genoFindIndexLoad(gfIdxFile, isTrans);
 if (endsWith(command, "Info"))
     dynamicServerInfo(command, genomeName, gfIdx);
 else
-    dynamicServerQuery(command, qSize, genomeName, seqFiles, gfIdx);
+    dynamicServerQuery(command, qSize, genomeName, gfIdx, perSeqMaxHash);
 logInfo("dynserver: %s completed in %4.3f seconds", command, 0.001 * (clock1000() - startTime));
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *command;
 
 gfCatchPipes();
 dnaUtilOpen();
 optionInit(&argc, argv, optionSpecs);
 command = argv[1];
 if (optionExists("trans"))
     {
     doTrans = TRUE;