ee8ef81c6d82157859f16548a27726f4b7421891 markd Mon Jul 6 19:22:14 2020 -0700 added support for perSeqMax in dynamic server diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c index 7c8a626..4462075 100644 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@ -97,31 +97,33 @@ " To get input file list:\n" " gfServer files host port\n" " To generate a precomputed index:\n" " gfServer index gfidx file(s)\n" " where the files are .2bit or .nib format files. Separate indexes must be created\n" " for untranslated and translated queries. These can be used with a persistent server\n" " as with 'start -indexFile or a dynamic server. They must follow the naming convention for\n" " for dynamic servers.\n" " To run a dynamic server (usually called by xinet):\n" " gfServer dynserver rootdir\n" " The root directory must contain directories for each genome with the twobit and index\n" " files following the convention:\n" " $rootdir/$genome/$genome.2bit\n" " $rootdir/$genome/$genome.untrans.gfidx\n" " $rootdir/$genome/$genome.trans.gfidx\n" - " Both indexes must exist.\n" + " The -perSeqMax functionality can be implemented by creating a file\n" + " $rootdir/$genome/$genome.perseqmax\n" + " Note: the dynamic, the file names in the perseqmax file MUST NOT contain directories.\n" "\n" "options:\n" " -tileSize=N Size of n-mers to index. Default is 11 for nucleotides, 4 for\n" " proteins (or translated nucleotides).\n" " -stepSize=N Spacing between tiles. Default is tileSize.\n" " -minMatch=N Number of n-mer matches that trigger detailed alignment.\n" " Default is 2 for nucleotides, 3 for proteins.\n" " -maxGap=N Number of insertions or deletions allowed between n-mers.\n" " Default is 2 for nucleotides, 0 for proteins.\n" " -trans Translate database to protein in 6 frames. Note: it is best\n" " to run this on RepeatMasked data in this case.\n" " -log=logFile Keep a log file that records server requests.\n" " -seqLog Include sequences in log file (not logged with -syslog).\n" " -ipLog Include user's IP in log file (not logged with -syslog).\n" " -debugLog Include debugging info in log file.\n" @@ -528,60 +530,66 @@ } } boolean badPcrPrimerSeq(char *s) /* Return TRUE if have a character we can't handle in sequence. */ { unsigned char c; while ((c = *s++) != 0) { if (ntVal[c] < 0) return TRUE; } return FALSE; } -static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[]) -/* If options include -perSeqMax=file, then read the sequences named in the file into a hash - * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */ -{ -struct hash *perSeqMaxHash = NULL; -char *fileName = optionVal("perSeqMax", NULL); -if (isNotEmpty(fileName)) +static struct hash *buildPerSeqMax(int fileCount, char *seqFiles[], char* perSeqMaxFile) +/* do work of building perSeqMaxhash */ { - perSeqMaxHash = hashNew(0); - struct lineFile *lf = lineFileOpen(fileName, TRUE); +struct hash *perSeqMaxHash = hashNew(0); +struct lineFile *lf = lineFileOpen(perSeqMaxFile, TRUE); char *line; while (lineFileNextReal(lf, &line)) { // Make sure line contains a valid seq filename (before optional ':seq') char *seqFile = trimSpaces(line); char copy[strlen(seqFile)+1]; safecpy(copy, sizeof copy, seqFile); char *colon = strrchr(copy, ':'); if (colon) *colon = '\0'; if (stringArrayIx(copy, seqFiles, fileCount) < 0) lineFileAbort(lf, "'%s' does not appear to be a sequence file from the " "command line", copy); hashAddInt(perSeqMaxHash, seqFile, 0); } lineFileClose(&lf); - } return perSeqMaxHash; } + +static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[]) +/* If options include -perSeqMax=file, then read the sequences named in the file into a hash + * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */ +{ +char *fileName = optionVal("perSeqMax", NULL); +if (isNotEmpty(fileName)) + return buildPerSeqMax(fileCount, seqFiles, fileName); +else + return NULL; +} + static void hashZeroVals(struct hash *hash) /* Set the value of every element of hash to NULL (0 for ints). */ { struct hashEl *hel; struct hashCookie cookie = hashFirst(hash); while ((hel = hashNext(&cookie)) != NULL) hel->val = 0; } void startServer(char *hostName, char *portName, int fileCount, char *seqFiles[]) /* Load up index and hang out in RAM. */ { struct genoFindIndex *gfIdx = NULL; char buf[256]; @@ -1102,59 +1110,72 @@ else { seq->size = dnaFilteredSize(seq->dna); dnaFilter(seq->dna, seq->dna); } int maxSize = (isTrans ? maxAaSize : maxNtSize); if (seq->size > maxSize) { seq->size = maxSize; seq->dna[maxSize] = 0; } return seq; } -static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char seqFile[PATH_LEN], char gfIdxFile[PATH_LEN]) +static void dynGetDataFiles(char *rootDir, char* genomeName, boolean isTrans, char gfIdxFile[PATH_LEN], + struct hash **perSeqMaxHashRet) /* get paths for sequence files to handle requests and validate they exist */ { +char seqFile[PATH_LEN]; safef(seqFile, PATH_LEN, "%s/%s/%s.2bit", rootDir, genomeName, genomeName); if (!fileExists(seqFile)) errAbort("sequence file for %s does not exist: %s", genomeName, seqFile); + safef(gfIdxFile, PATH_LEN, "%s/%s/%s.%s.gfidx", rootDir, genomeName, genomeName, isTrans ? "trans" : "untrans"); if (!fileExists(gfIdxFile)) errAbort("gf index file for %s does not exist: %s", genomeName, gfIdxFile); + +char perSeqMaxFile[PATH_LEN]; +safef(perSeqMaxFile, PATH_LEN, "%s/%s/%s.perseqmax", rootDir, genomeName, genomeName); +*perSeqMaxHashRet = NULL; +if (fileExists(perSeqMaxFile)) + { + /* only the basename of the file is saved in the index */ + char *slash = strrchr(seqFile, '/'); + char *seqFiles[1] = {(slash != NULL) ? slash + 1 : seqFile}; + *perSeqMaxHashRet = buildPerSeqMax(1, seqFiles, perSeqMaxFile); + } } static void dynamicServerQuery(char *command, int qSize, char *genomeName, - char *seqFiles[1], struct genoFindIndex *gfIdx) + struct genoFindIndex *gfIdx, struct hash *perSeqMaxHash) /* handle search queries */ { mustWriteFd(STDOUT_FILENO, "Y", 1); boolean queryIsProt = sameString(command, "protQuery"); struct dnaSeq* seq = dynReadQuerySeq(qSize, gfIdx->isTrans, queryIsProt); if (gfIdx->isTrans) { if (queryIsProt) transQuery(gfIdx->transGf, seq, STDOUT_FILENO); else transTransQuery(gfIdx->transGf, seq, STDOUT_FILENO); } else { - struct hash *perSeqMaxHash = maybePerSeqMax(1, seqFiles); dnaQuery(gfIdx->untransGf, seq, STDOUT_FILENO, perSeqMaxHash); } netSendString(STDOUT_FILENO, "end"); logDebug("query done"); } static void dynamicServerInfo(char *command, char *genomeName, struct genoFindIndex *gfIdx) /* handle one of the info commands */ { char buf[256]; struct genoFind *gf = gfIdx->isTrans ? gfIdx->transGf[0][0] : gfIdx->untransGf; sprintf(buf, "version %s", gfVersion); netSendString(STDOUT_FILENO, buf); sprintf(buf, "type %s", (gfIdx->isTrans ? "translated" : "nucleotide")); netSendString(STDOUT_FILENO, buf); @@ -1169,42 +1190,41 @@ static void dynamicServer(char* rootDir) /* dynamic server for inetd. Read query from stdin, open index, query, respond, exit. * only one query at a time */ { // make sure error is logged pushWarnHandler(dynWarnErrorVa); char *command, *genomeName; int qSize; boolean isTrans; dynReadCommand(&command, &qSize, &isTrans, &genomeName); logInfo("dynserver: %s %s %s size=%d ", command, genomeName, (isTrans ? "trans" : "untrans"), qSize); time_t startTime = clock1000(); -char seqFile[PATH_LEN]; -char *seqFiles[1] = {seqFile}; // functions expect list of files char gfIdxFile[PATH_LEN]; -dynGetDataFiles(rootDir, genomeName, isTrans, seqFiles[0], gfIdxFile); +struct hash *perSeqMaxHash = NULL; +dynGetDataFiles(rootDir, genomeName, isTrans, gfIdxFile, &perSeqMaxHash); logInfo("dynserver: index loading completed in %4.3f seconds", 0.001 * (clock1000() - startTime)); startTime = clock1000(); struct genoFindIndex *gfIdx = genoFindIndexLoad(gfIdxFile, isTrans); if (endsWith(command, "Info")) dynamicServerInfo(command, genomeName, gfIdx); else - dynamicServerQuery(command, qSize, genomeName, seqFiles, gfIdx); + dynamicServerQuery(command, qSize, genomeName, gfIdx, perSeqMaxHash); logInfo("dynserver: %s completed in %4.3f seconds", command, 0.001 * (clock1000() - startTime)); } int main(int argc, char *argv[]) /* Process command line. */ { char *command; gfCatchPipes(); dnaUtilOpen(); optionInit(&argc, argv, optionSpecs); command = argv[1]; if (optionExists("trans")) { doTrans = TRUE;