a74c645505a135d5c1e923afe477cdeb9c15bded markd Wed Jul 8 22:27:42 2020 -0700 added including multiple container directoies in genome dir path to gfServer diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c index 4462075..a189186 100644 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@ -94,36 +94,36 @@ " gfServer pcrDirect fPrimer rPrimer file(s).2bit\n" " To figure out usage level:\n" " gfServer status host port\n" " To get input file list:\n" " gfServer files host port\n" " To generate a precomputed index:\n" " gfServer index gfidx file(s)\n" " where the files are .2bit or .nib format files. Separate indexes must be created\n" " for untranslated and translated queries. These can be used with a persistent server\n" " as with 'start -indexFile or a dynamic server. They must follow the naming convention for\n" " for dynamic servers.\n" " To run a dynamic server (usually called by xinet):\n" " gfServer dynserver rootdir\n" " The root directory must contain directories for each genome with the twobit and index\n" " files following the convention:\n" - " $rootdir/$genome/$genome.2bit\n" - " $rootdir/$genome/$genome.untrans.gfidx\n" - " $rootdir/$genome/$genome.trans.gfidx\n" + " $rootdir/$containingDirs/$genome/$genome.2bit\n" + " $rootdir/$containingDirs/$genome/$genome.untrans.gfidx\n" + " $rootdir/$containingDirs/$genome/$genome.trans.gfidx\n" + " Where the contain directories are optional.\n" " The -perSeqMax functionality can be implemented by creating a file\n" " $rootdir/$genome/$genome.perseqmax\n" - " Note: the dynamic, the file names in the perseqmax file MUST NOT contain directories.\n" "\n" "options:\n" " -tileSize=N Size of n-mers to index. Default is 11 for nucleotides, 4 for\n" " proteins (or translated nucleotides).\n" " -stepSize=N Spacing between tiles. 
Default is tileSize.\n" " -minMatch=N Number of n-mer matches that trigger detailed alignment.\n" " Default is 2 for nucleotides, 3 for proteins.\n" " -maxGap=N Number of insertions or deletions allowed between n-mers.\n" " Default is 2 for nucleotides, 0 for proteins.\n" " -trans Translate database to protein in 6 frames. Note: it is best\n" " to run this on RepeatMasked data in this case.\n" " -log=logFile Keep a log file that records server requests.\n" " -seqLog Include sequences in log file (not logged with -syslog).\n" " -ipLog Include user's IP in log file (not logged with -syslog).\n" " -debugLog Include debugging info in log file.\n" @@ -131,31 +131,31 @@ " -logFacility=facility Log to the specified syslog facility - default local0.\n" " -mask Use masking from .2bit file.\n" " -repMatch=N Number of occurrences of a tile (n-mer) that triggers repeat masking the\n" " tile. Default is %d.\n" " -noSimpRepMask Suppresses simple repeat masking.\n" " -maxDnaHits=N Maximum number of hits for a DNA query that are sent from the server.\n" " Default is %d.\n" " -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n" " Default is %d.\n" " -maxNtSize=N Maximum size of untranslated DNA query sequence.\n" " Default is %d.\n" " -maxAaSize=N Maximum size of protein or translated DNA queries.\n" " Default is %d.\n" " -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n" " -maxDnaHits will be applied to each filename[:seq] separately: each may\n" - " have at most maxDnaHits/2 hits.\n" + " have at most maxDnaHits/2 hits. The filename MUST not include the directory.\n" " Useful for assemblies with many alternate/patch sequences.\n" " -canStop If set, a quit message will actually take down the server.\n" " -indexFile Index file create by `gfServer index'. Saving index can speed up\n" " gfServer startup by two orders of magnitude. 
The parameters must\n" " exactly match the parameters when the file is written or bad things\n" " will happen.\n" , gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize ); } /* Note about file(s) specified in the start command: The path(s) specified here are sent back exactly as-is to clients such as gfClient, hgBlat, webBlat. It is intended that relative paths are used. @@ -530,46 +530,57 @@ } } boolean badPcrPrimerSeq(char *s) /* Return TRUE if have a character we can't handle in sequence. */ { unsigned char c; while ((c = *s++) != 0) { if (ntVal[c] < 0) return TRUE; } return FALSE; } +static boolean haveFileBaseName(char *baseName, int fileCount, char *seqFiles[]) +/* Return TRUE if findTail(seqFiles[i], '/') equals baseName for some i < fileCount, FALSE otherwise */ +{ +int i; +for (i = 0; i < fileCount; i++) + if (sameString(findTail(seqFiles[i], '/'), baseName)) + return TRUE; +return FALSE; +} + + static struct hash *buildPerSeqMax(int fileCount, char *seqFiles[], char* perSeqMaxFile) /* do work of building perSeqMaxhash */ { struct hash *perSeqMaxHash = hashNew(0); struct lineFile *lf = lineFileOpen(perSeqMaxFile, TRUE); char *line; while (lineFileNextReal(lf, &line)) { - // Make sure line contains a valid seq filename (before optional ':seq') - char *seqFile = trimSpaces(line); + // Make sure line contains a valid seq filename (before optional ':seq'), directories are ignored + char *seqFile = findTail(trimSpaces(line), '/'); char copy[strlen(seqFile)+1]; safecpy(copy, sizeof copy, seqFile); char *colon = strrchr(copy, ':'); if (colon) *colon = '\0'; - if (stringArrayIx(copy, seqFiles, fileCount) < 0) + if (!haveFileBaseName(copy, fileCount, seqFiles)) lineFileAbort(lf, "'%s' does not appear to be a sequence file from the " "command line", copy); hashAddInt(perSeqMaxHash, seqFile, 0); } lineFileClose(&lf); return perSeqMaxHash; } static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[]) /* If options include -perSeqMax=file, then read the sequences named 
in the file into a hash * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */ { char *fileName = optionVal("perSeqMax", NULL); if (isNotEmpty(fileName)) @@ -1036,71 +1047,75 @@ int msgLen = vsnprintf(buf, sizeof(buf) - 1, msg, args); buf[msgLen] = '\0'; logError("%s", buf); printf("Error: %s\n", buf); } static void dynReadBytes(char *buf, int bufSize) /* read pending bytes */ { int readSize = read(STDIN_FILENO, buf, bufSize-1); if (readSize < 0) errAbort("EOF from client"); buf[readSize] = '\0'; } -static void dynReadCommand(char **commandRet, int *qsizeRet, boolean *isTransRet, char **genomeNameRet) +static void dynReadCommand(char **commandRet, int *qsizeRet, boolean *isTransRet, + char **genomeNameRet, char **dynGenomeDirRet) /* read query request from stdin, same as server expect includes database * Format for query commands: - * signature+command qsize genome + * signature+command qsize dynGenomeDir * Formats for info command: - * signature+command genome + * signature+command dynGenomeDir */ { char buf[256]; dynReadBytes(buf, sizeof(buf)); logDebug("query: %s", buf); if (!startsWith(gfSignature(), buf)) errAbort("query does not start with signature, got '%s'", buf); char *words[5]; int numWords = chopByWhite(buf, words, ArraySize(words)); if (numWords == 0) errAbort("empty command"); char *command = buf + strlen(gfSignature()); *commandRet = cloneString(command); *isTransRet = sameString("protQuery", command) || sameString("transQuery", command) || sameString("transInfo", command); if (sameString("query", command) || sameString("protQuery", command) || sameString("transQuery", command)) { if (numWords != 3) errAbort("expected 3 words in query command, got %d", numWords); *qsizeRet = atoi(words[1]); - *genomeNameRet = cloneString(words[2]); + *dynGenomeDirRet = cloneString(words[2]); } else if (sameString("untransInfo", command) || sameString("transInfo", command)) { if (numWords != 2) errAbort("expected 2 words in query 
command, got %d", numWords); *qsizeRet = 0; - *genomeNameRet = cloneString(words[1]); + *dynGenomeDirRet = cloneString(words[1]); } else errAbort("invalid command '%s'", command); + +// parse genomeName out of directory +*genomeNameRet = cloneString(findTail(*dynGenomeDirRet, '/')); } static struct dnaSeq* dynReadQuerySeq(int qSize, boolean isTrans, boolean queryIsProt) /* read the DNA sequence from the query, filtering junk */ { struct dnaSeq *seq; AllocVar(seq); seq->size = qSize; seq->dna = needLargeMem(qSize+1); if (gfReadMulti(STDIN_FILENO, seq->dna, qSize) != qSize) errAbort("read of %d bytes of query sequence failed", qSize); seq->dna[qSize] = '\0'; if (queryIsProt) { @@ -1110,45 +1125,46 @@ else { seq->size = dnaFilteredSize(seq->dna); dnaFilter(seq->dna, seq->dna); } int maxSize = (isTrans ? maxAaSize : maxNtSize); if (seq->size > maxSize) { seq->size = maxSize; seq->dna[maxSize] = 0; } return seq; } -static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char gfIdxFile[PATH_LEN], +static void dynGetDataFiles(char *rootDir, char *genomeName, char *dynGenomeDir, + boolean isTrans, char gfIdxFile[PATH_LEN], struct hash **perSeqMaxHashRet) /* get paths for sequence files to handle requests and validate they exist */ { char seqFile[PATH_LEN]; -safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, genomeName, genomeName); +safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, dynGenomeDir, genomeName); if (!fileExists(seqFile)) errAbort("sequence file for %s does not exist: %s", genomeName, seqFile); -safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, genomeName, genomeName, isTrans ? "trans" : "untrans"); +safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, dynGenomeDir, genomeName, isTrans ? 
"trans" : "untrans"); if (!fileExists(gfIdxFile)) errAbort("gf index file for %s does not exist: %s", genomeName, gfIdxFile); char perSeqMaxFile[PATH_LEN]; -safef(perSeqMaxFile, PATH_LEN, "%s/%s/%s.perseqmax", rootDir, genomeName, genomeName); +safef(perSeqMaxFile, PATH_LEN, "%s/%s/%s.perseqmax", rootDir, dynGenomeDir, genomeName); *perSeqMaxHashRet = NULL; if (fileExists(perSeqMaxFile)) { /* only the basename of the file is saved in the index */ char *slash = strrchr(seqFile, '/'); char *seqFiles[1] = {(slash != NULL) ? slash + 1 : seqFile}; *perSeqMaxHashRet = buildPerSeqMax(1, seqFiles, perSeqMaxFile); } } static void dynamicServerQuery(char *command, int qSize, char *genomeName, struct genoFindIndex *gfIdx, struct hash *perSeqMaxHash) /* handle search queries */ { mustWriteFd(STDOUT_FILENO, "Y", 1); @@ -1183,40 +1199,40 @@ netSendString(STDOUT_FILENO, buf); sprintf(buf, "stepSize %d", gf->stepSize); netSendString(STDOUT_FILENO, buf); sprintf(buf, "minMatch %d", gf->minMatch); netSendString(STDOUT_FILENO, buf); netSendString(STDOUT_FILENO, "end"); } static void dynamicServer(char* rootDir) /* dynamic server for inetd. Read query from stdin, open index, query, respond, exit. * only one query at a time */ { // make sure error is logged pushWarnHandler(dynWarnErrorVa); -char *command, *genomeName; +char *command, *genomeName, *dynGenomeDir; int qSize; boolean isTrans; -dynReadCommand(&command, &qSize, &isTrans, &genomeName); -logInfo("dynserver: %s %s %s size=%d ", command, genomeName, (isTrans ? "trans" : "untrans"), qSize); +dynReadCommand(&command, &qSize, &isTrans, &genomeName, &dynGenomeDir); +logInfo("dynserver: %s %s %s %s size=%d ", command, genomeName, dynGenomeDir, (isTrans ? 
"trans" : "untrans"), qSize); time_t startTime = clock1000(); char gfIdxFile[PATH_LEN]; struct hash *perSeqMaxHash = NULL; -dynGetDataFiles(rootDir, genomeName, isTrans, gfIdxFile, &perSeqMaxHash); +dynGetDataFiles(rootDir, genomeName, dynGenomeDir, isTrans, gfIdxFile, &perSeqMaxHash); logInfo("dynserver: index loading completed in %4.3f seconds", 0.001 * (clock1000() - startTime)); startTime = clock1000(); struct genoFindIndex *gfIdx = genoFindIndexLoad(gfIdxFile, isTrans); if (endsWith(command, "Info")) dynamicServerInfo(command, genomeName, gfIdx); else dynamicServerQuery(command, qSize, genomeName, gfIdx, perSeqMaxHash); logInfo("dynserver: %s completed in %4.3f seconds", command, 0.001 * (clock1000() - startTime)); } int main(int argc, char *argv[]) /* Process command line. */ { char *command;