src/gfServer/gfServer.c a74c645505a135d5c1e923afe477cdeb9c15bded

a74c645505a135d5c1e923afe477cdeb9c15bded
markd
  Wed Jul 8 22:27:42 2020 -0700
added support for including multiple container directories in the genome dir path passed to gfServer

diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c
index 4462075..a189186 100644
--- src/gfServer/gfServer.c
+++ src/gfServer/gfServer.c
@@ -94,36 +94,36 @@
   "      gfServer pcrDirect fPrimer rPrimer file(s).2bit\n"
   "   To figure out usage level:\n"
   "      gfServer status host port\n"
   "   To get input file list:\n"
   "      gfServer files host port\n"
   "   To generate a precomputed index:\n"
   "      gfServer index gfidx file(s)\n"
   "     where the files are .2bit or .nib format files.  Separate indexes must be created\n"
   "     for untranslated and translated queries.  These can be used with a persistent server\n"
   "     as with 'start -indexFile or a dynamic server. They must follow the naming convention for\n"
   "     for dynamic servers.\n"
   "   To run a dynamic server (usually called by xinet):\n"
   "      gfServer dynserver rootdir\n"
   "     The root directory must contain directories for each genome with the twobit and index\n"
   "     files following the convention:\n"
-  "         $rootdir/$genome/$genome.2bit\n"
-  "         $rootdir/$genome/$genome.untrans.gfidx\n"
-  "         $rootdir/$genome/$genome.trans.gfidx\n"
+  "         $rootdir/$containingDirs/$genome/$genome.2bit\n"
+  "         $rootdir/$containingDirs/$genome/$genome.untrans.gfidx\n"
+  "         $rootdir/$containingDirs/$genome/$genome.trans.gfidx\n"
+  "     Where the contain directories are optional.\n"
   "     The -perSeqMax functionality can be implemented by creating a file\n"
   "         $rootdir/$genome/$genome.perseqmax\n"
-  "     Note: the dynamic, the file names in the perseqmax file MUST NOT contain directories.\n"
   "\n"
   "options:\n"
   "   -tileSize=N     Size of n-mers to index.  Default is 11 for nucleotides, 4 for\n"
   "                   proteins (or translated nucleotides).\n"
   "   -stepSize=N     Spacing between tiles. Default is tileSize.\n"
   "   -minMatch=N     Number of n-mer matches that trigger detailed alignment.\n"
   "                   Default is 2 for nucleotides, 3 for proteins.\n"
   "   -maxGap=N       Number of insertions or deletions allowed between n-mers.\n"
   "                   Default is 2 for nucleotides, 0 for proteins.\n"
   "   -trans          Translate database to protein in 6 frames.  Note: it is best\n"
   "                   to run this on RepeatMasked data in this case.\n"
   "   -log=logFile    Keep a log file that records server requests.\n"
   "   -seqLog         Include sequences in log file (not logged with -syslog).\n"
   "   -ipLog          Include user's IP in log file (not logged with -syslog).\n"
   "   -debugLog       Include debugging info in log file.\n"
@@ -131,31 +131,31 @@
   "   -logFacility=facility  Log to the specified syslog facility - default local0.\n"
   "   -mask           Use masking from .2bit file.\n"
   "   -repMatch=N     Number of occurrences of a tile (n-mer) that triggers repeat masking the\n"
   "                   tile. Default is %d.\n"
   "   -noSimpRepMask  Suppresses simple repeat masking.\n"
   "   -maxDnaHits=N   Maximum number of hits for a DNA query that are sent from the server.\n"
   "                   Default is %d.\n"
   "   -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n"
   "                   Default is %d.\n"
   "   -maxNtSize=N    Maximum size of untranslated DNA query sequence.\n"
   "                   Default is %d.\n"
   "   -maxAaSize=N    Maximum size of protein or translated DNA queries.\n"
   "                   Default is %d.\n"
   "   -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n"
   "                   -maxDnaHits will be applied to each filename[:seq] separately: each may\n"
-  "                   have at most maxDnaHits/2 hits.\n"
+  "                   have at most maxDnaHits/2 hits.  The filename MUST not include the directory.\n"
   "                   Useful for assemblies with many alternate/patch sequences.\n"
   "   -canStop        If set, a quit message will actually take down the server.\n"
   "   -indexFile      Index file create by `gfServer index'. Saving index can speed up\n"
   "                   gfServer startup by two orders of magnitude.  The parameters must\n"
   "                   exactly match the parameters when the file is written or bad things\n"
   "                   will happen.\n"
   ,	gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize
   );
 
 }
 /*
   Note about file(s) specified in the start command:
       The path(s) specified here are sent back exactly as-is
       to clients such as gfClient, hgBlat, webBlat.
       It is intended that relative paths are used.
@@ -530,46 +530,57 @@
     }
 }
 
 boolean badPcrPrimerSeq(char *s)
 /* Return TRUE if have a character we can't handle in sequence. */
 {
 unsigned char c;
 while ((c = *s++) != 0)
     {
     if (ntVal[c] < 0)
         return TRUE;
     }
 return FALSE;
 }
 
+static boolean haveFileBaseName(char *baseName, int fileCount, char *seqFiles[])
+/* Return TRUE if findTail(seqFiles[i], '/') equals baseName for any of the fileCount entries, else FALSE. */
+{
+int i;
+for (i = 0; i < fileCount; i++)
+    if (sameString(findTail(seqFiles[i], '/'), baseName))
+        return TRUE;
+return FALSE;
+}
+
+
 static struct hash *buildPerSeqMax(int fileCount, char *seqFiles[], char* perSeqMaxFile)
 /* do work of building perSeqMaxhash */
 {
 struct hash *perSeqMaxHash = hashNew(0);
 struct lineFile *lf = lineFileOpen(perSeqMaxFile, TRUE);
 char *line;
 while (lineFileNextReal(lf, &line))
     {
-    // Make sure line contains a valid seq filename (before optional ':seq')
-    char *seqFile = trimSpaces(line);
+    // Make sure line contains a valid seq filename (before optional ':seq'), directories are ignored
+    char *seqFile = findTail(trimSpaces(line), '/');
     char copy[strlen(seqFile)+1];
     safecpy(copy, sizeof copy, seqFile);
     char *colon = strrchr(copy, ':');
     if (colon)
         *colon = '\0';
-    if (stringArrayIx(copy, seqFiles, fileCount) < 0)
+    if (!haveFileBaseName(copy, fileCount, seqFiles))
         lineFileAbort(lf, "'%s' does not appear to be a sequence file from the "
                       "command line", copy);
     hashAddInt(perSeqMaxHash, seqFile, 0);
     }
 lineFileClose(&lf);
 return perSeqMaxHash;
 }
     
 
 static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[])
 /* If options include -perSeqMax=file, then read the sequences named in the file into a hash
  * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */
 {
 char *fileName = optionVal("perSeqMax", NULL);
 if (isNotEmpty(fileName))
@@ -1036,71 +1047,75 @@
 int msgLen = vsnprintf(buf, sizeof(buf) - 1, msg, args);
 buf[msgLen] = '\0';
 logError("%s", buf);
 printf("Error: %s\n", buf);
 }
 
 static void dynReadBytes(char *buf, int bufSize)
 /* read pending bytes */
 {
 int readSize = read(STDIN_FILENO, buf, bufSize-1);
 if (readSize < 0)
     errAbort("EOF from client");
 buf[readSize] = '\0';
 }
 
-static void dynReadCommand(char **commandRet, int *qsizeRet, boolean *isTransRet, char **genomeNameRet)
+static void dynReadCommand(char **commandRet, int *qsizeRet, boolean *isTransRet,
+                           char **genomeNameRet, char **dynGenomeDirRet)
 /* read query request from stdin, same as server expect includes database
  * Format for query commands:
- *  signature+command qsize genome
+ *  signature+command qsize dynGenomeDir
  * Formats for info command:
- *  signature+command genome
+ *  signature+command dynGenomeDir
  */
 {
 char buf[256];
 dynReadBytes(buf, sizeof(buf));
 logDebug("query: %s", buf);
 
 if (!startsWith(gfSignature(), buf))
     errAbort("query does not start with signature, got '%s'", buf);
 
 char *words[5];
 int numWords = chopByWhite(buf, words, ArraySize(words));
 if (numWords == 0)
     errAbort("empty command");
 char *command = buf + strlen(gfSignature());
 *commandRet = cloneString(command);
 *isTransRet = sameString("protQuery", command) || sameString("transQuery", command)
     || sameString("transInfo", command);
 
 if (sameString("query", command) || sameString("protQuery", command)
       || sameString("transQuery", command))
     {
     if (numWords != 3)
         errAbort("expected 3 words in query command, got %d", numWords);
     *qsizeRet = atoi(words[1]);
-    *genomeNameRet = cloneString(words[2]);
+    *dynGenomeDirRet = cloneString(words[2]);
     }
 else if (sameString("untransInfo", command) || sameString("transInfo", command))
     {
     if (numWords != 2)
         errAbort("expected 2 words in query command, got %d", numWords);
     *qsizeRet = 0;
-    *genomeNameRet = cloneString(words[1]);
+    *dynGenomeDirRet = cloneString(words[1]);
     }
 else
     errAbort("invalid command '%s'", command);
+
+// genomeName is what findTail() returns for '/' on the directory read from the client; NOTE(review): no ".." check is visible here - confirm
+*genomeNameRet = cloneString(findTail(*dynGenomeDirRet, '/'));
 }
 
 static struct dnaSeq* dynReadQuerySeq(int qSize, boolean isTrans, boolean queryIsProt)
 /* read the DNA sequence from the query, filtering junk  */
 {
 struct dnaSeq *seq;
 AllocVar(seq);
 seq->size = qSize;
 seq->dna = needLargeMem(qSize+1);
 if (gfReadMulti(STDIN_FILENO, seq->dna, qSize) != qSize)
     errAbort("read of %d bytes of query sequence failed", qSize);
 seq->dna[qSize] = '\0';
 
 if (queryIsProt)
     {
@@ -1110,45 +1125,46 @@
 else
     {
     seq->size = dnaFilteredSize(seq->dna);
     dnaFilter(seq->dna, seq->dna);
     }
 int maxSize = (isTrans ? maxAaSize : maxNtSize);
 if (seq->size > maxSize)
     {
     seq->size = maxSize;
     seq->dna[maxSize] = 0;
     }
 
 return seq;
 }
 
-static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char gfIdxFile[PATH_LEN],
+static void dynGetDataFiles(char *rootDir, char *genomeName, char *dynGenomeDir,
+                            boolean isTrans, char gfIdxFile[PATH_LEN],
                             struct hash **perSeqMaxHashRet)
 /* get paths for sequence files to handle requests and validate they exist */
 {
 char seqFile[PATH_LEN];
-safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, genomeName, genomeName);
+safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, dynGenomeDir, genomeName);
 if (!fileExists(seqFile))
     errAbort("sequence file for %s does not exist: %s", genomeName, seqFile);
 
-safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, genomeName, genomeName, isTrans ? "trans" : "untrans");
+safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, dynGenomeDir, genomeName, isTrans ? "trans" : "untrans");
 if (!fileExists(gfIdxFile))
     errAbort("gf index file for %s does not exist: %s", genomeName, gfIdxFile);
 
 char perSeqMaxFile[PATH_LEN];
-safef(perSeqMaxFile, PATH_LEN, "%s/%s/%s.perseqmax", rootDir, genomeName, genomeName);
+safef(perSeqMaxFile, PATH_LEN, "%s/%s/%s.perseqmax", rootDir, dynGenomeDir, genomeName);
 *perSeqMaxHashRet = NULL;
 if (fileExists(perSeqMaxFile))
     {
     /* only the basename of the file is saved in the index */
     char *slash = strrchr(seqFile, '/');
     char *seqFiles[1] = {(slash != NULL) ? slash + 1 : seqFile};
     *perSeqMaxHashRet = buildPerSeqMax(1, seqFiles, perSeqMaxFile);
     }
 }
 
 static void dynamicServerQuery(char *command, int qSize, char *genomeName,
                                struct genoFindIndex *gfIdx, struct hash *perSeqMaxHash)
 /* handle search queries */
 {
 mustWriteFd(STDOUT_FILENO, "Y", 1);
@@ -1183,40 +1199,40 @@
 netSendString(STDOUT_FILENO, buf);
 sprintf(buf, "stepSize %d", gf->stepSize);
 netSendString(STDOUT_FILENO, buf);
 sprintf(buf, "minMatch %d", gf->minMatch);
 netSendString(STDOUT_FILENO, buf);
 netSendString(STDOUT_FILENO, "end");
 }
 
 static void dynamicServer(char* rootDir)
 /* dynamic server for inetd. Read query from stdin, open index, query, respond, exit.
  * only one query at a time */
 {
 // make sure error is logged
 pushWarnHandler(dynWarnErrorVa);
 
-char *command, *genomeName;
+char *command, *genomeName, *dynGenomeDir;
 int qSize;
 boolean isTrans;
-dynReadCommand(&command, &qSize, &isTrans, &genomeName);
-logInfo("dynserver: %s %s %s size=%d ", command, genomeName, (isTrans ? "trans" : "untrans"), qSize);
+dynReadCommand(&command, &qSize, &isTrans, &genomeName, &dynGenomeDir);
+logInfo("dynserver: %s %s %s %s size=%d ", command, genomeName, dynGenomeDir, (isTrans ? "trans" : "untrans"), qSize);
 
 time_t startTime = clock1000();
 char gfIdxFile[PATH_LEN];
 struct hash *perSeqMaxHash = NULL;
-dynGetDataFiles(rootDir, genomeName, isTrans, gfIdxFile, &perSeqMaxHash);
+dynGetDataFiles(rootDir, genomeName, dynGenomeDir, isTrans, gfIdxFile, &perSeqMaxHash);
 logInfo("dynserver: index loading completed in %4.3f seconds", 0.001 * (clock1000() - startTime));
 startTime = clock1000();
 
 struct genoFindIndex *gfIdx = genoFindIndexLoad(gfIdxFile, isTrans);
 if (endsWith(command, "Info"))
     dynamicServerInfo(command, genomeName, gfIdx);
 else
     dynamicServerQuery(command, qSize, genomeName, gfIdx, perSeqMaxHash);
 logInfo("dynserver: %s completed in %4.3f seconds", command, 0.001 * (clock1000() - startTime));
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *command;