766edf73c99da98b3935f42c234d14acf08c2c0e markd Sat Jun 27 19:37:45 2020 -0700 move index building to a separate gfServer subcommand diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c index 1723c85..3403c72 100644 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@ -33,31 +33,30 @@ {"maxDnaHits", OPTION_INT}, {"maxGap", OPTION_INT}, {"maxNtSize", OPTION_INT}, {"maxTransHits", OPTION_INT}, {"minMatch", OPTION_INT}, {"repMatch", OPTION_INT}, {"seqLog", OPTION_BOOLEAN}, {"ipLog", OPTION_BOOLEAN}, {"debugLog", OPTION_BOOLEAN}, {"stepSize", OPTION_INT}, {"tileSize", OPTION_INT}, {"trans", OPTION_BOOLEAN}, {"syslog", OPTION_BOOLEAN}, {"perSeqMax", OPTION_STRING}, {"noSimpRepMask", OPTION_BOOLEAN}, - {"writeIndex", OPTION_BOOLEAN}, {"indexFile", OPTION_STRING}, {NULL, 0} }; int maxNtSize = 40000; int maxAaSize = 8000; int minMatch = gfMinMatch; /* Can be overridden from command line. */ int tileSize = gfTileSize; /* Can be overridden from command line. */ int stepSize = 0; /* Can be overridden from command line. */ boolean doTrans = FALSE; /* Do translation? */ boolean allowOneMismatch = FALSE; boolean noSimpRepMask = FALSE; int repMatch = 1024; /* Can be overridden from command line. */ @@ -85,30 +84,36 @@ " gfServer query host port probe.fa\n" " To query a server with protein sequence:\n" " gfServer protQuery host port probe.fa\n" " To query a server with translated DNA sequence:\n" " gfServer transQuery host port probe.fa\n" " To query server with PCR primers:\n" " gfServer pcr host port fPrimer rPrimer maxDistance\n" " To process one probe fa file against a .2bit format genome (not starting server):\n" " gfServer direct probe.fa file(s).2bit\n" " To test PCR without starting server:\n" " gfServer pcrDirect fPrimer rPrimer file(s).2bit\n" " To figure out usage level:\n" " gfServer status host port\n" " To get input file list:\n" " gfServer files host port\n" + " To generate a precomputed index:\n" + " gfServer index gfidx file(s)\n" + " where the files are .2bit or .nib format files. Separate indexes must be created\n" + " for untranslated and translated queries. These can be used with a persistent server\n" + " as with 'start -indexFile or a dynamic server. They must follow the naming convention for\n" + " for dynamic servers.\n" "options:\n" " -tileSize=N Size of n-mers to index. Default is 11 for nucleotides, 4 for\n" " proteins (or translated nucleotides).\n" " -stepSize=N Spacing between tiles. Default is tileSize.\n" " -minMatch=N Number of n-mer matches that trigger detailed alignment.\n" " Default is 2 for nucleotides, 3 for proteins.\n" " -maxGap=N Number of insertions or deletions allowed between n-mers.\n" " Default is 2 for nucleotides, 0 for proteins.\n" " -trans Translate database to protein in 6 frames. Note: it is best\n" " to run this on RepeatMasked data in this case.\n" " -log=logFile Keep a log file that records server requests.\n" " -seqLog Include sequences in log file (not logged with -syslog).\n" " -ipLog Include user's IP in log file (not logged with -syslog).\n" " -debugLog Include debugging info in log file.\n" " -syslog Log to syslog.\n" @@ -118,31 +123,30 @@ " tile. Default is %d.\n" " -noSimpRepMask Suppresses simple repeat masking.\n" " -maxDnaHits=N Maximum number of hits for a DNA query that are sent from the server.\n" " Default is %d.\n" " -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n" " Default is %d.\n" " -maxNtSize=N Maximum size of untranslated DNA query sequence.\n" " Default is %d.\n" " -maxAaSize=N Maximum size of protein or translated DNA queries.\n" " Default is %d.\n" " -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n" " -maxDnaHits will be applied to each filename[:seq] separately: each may\n" " have at most maxDnaHits/2 hits.\n" " Useful for assemblies with many alternate/patch sequences.\n" " -canStop If set, a quit message will actually take down the server.\n" - " -writeIndex Write the in-memory index to indexFile after building and exit.\n" " -indexFile File for index. If -writeIndex is specified, the file is created,\n" " otherwise it is loaded from this file. Saving index can speed up\n" " gfServer startup by two orders of magnitude. The parameters must\n" " exactly match the parameters when the file is written or bad things\n" " will happen.\n" , gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize ); } /* Note about file(s) specified in the start command: The path(s) specified here are sent back exactly as-is to clients such as gfClient, hgBlat, webBlat. It is intended that relative paths are used. Absolute paths starting with '/' tend not to work @@ -579,44 +583,38 @@ time_t curtime; struct tm *loctime; char timestr[256]; netBlockBrokenPipes(); curtime = time (NULL); /* Get the current time. */ loctime = localtime (&curtime); /* Convert it to local time representation. */ strftime (timestr, sizeof(timestr), "%Y-%m-%d %H:%M", loctime); /* formate datetime as string */ logInfo("gfServer version %s on host %s, port %s (%s)", gfVersion, hostName, portName, timestr); struct hash *perSeqMaxHash = maybePerSeqMax(fileCount, seqFiles); time_t startIndexTime = clock1000(); -if (writeIndex || (!writeIndex && (indexFile == NULL))) +if (indexFile == NULL) { char *desc = doTrans ? "translated" : "untranslated"; uglyf("starting %s server...\n", desc); logInfo("setting up %s index", desc); gfIdx = genoFindIndexBuild(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, doTrans, NULL, allowOneMismatch, doMask, stepSize, noSimpRepMask); logInfo("indexing building complete in %4.3f seconds", 0.001 * (clock1000() - startIndexTime)); - if (writeIndex) - { - genoFindIndexWrite(gfIdx, indexFile); - logInfo("index file built, exiting: %s", indexFile); - exit(0); - } } else { gfIdx = genoFindIndexLoad(indexFile, doTrans); logInfo("indexing loading complete in %4.3f seconds", 0.001 * (clock1000() - startIndexTime)); } /* Set up socket. Get ready to listen to it. */ socketHandle = netAcceptingSocket(port, 100); if (socketHandle < 0) errAbort("Fatal Error: Unable to open listening socket on port %d.", port); logInfo("Server ready for queries!"); printf("Server ready for queries!\n"); int connectFailCount = 0; @@ -991,30 +989,39 @@ sprintf(buf, "%sfiles", gfSignature()); mustWriteFd(sd, buf, strlen(buf)); /* Get count of files, and then each file name. */ if (netGetString(sd, buf) != NULL) { fileCount = atoi(buf); for (i=0; i<fileCount; ++i) { printf("%s\n", netRecieveString(sd, buf)); } } close(sd); } +static void buildIndex(char *gfxFile, int fileCount, char *seqFiles[]) +/* build pre-computed index for seqFiles and write to gfxFile */ +{ +struct genoFindIndex *gfIdx = genoFindIndexBuild(fileCount, seqFiles, minMatch, maxGap, tileSize, + repMatch, doTrans, NULL, allowOneMismatch, doMask, stepSize, noSimpRepMask); +genoFindIndexWrite(gfIdx, gfxFile); +} + + int main(int argc, char *argv[]) /* Process command line. */ { char *command; gfCatchPipes(); dnaUtilOpen(); optionInit(&argc, argv, optionSpecs); command = argv[1]; if (optionExists("trans")) { doTrans = TRUE; tileSize = 4; minMatch = 3; maxGap = 0; @@ -1100,21 +1107,27 @@ else if (sameWord(command, "status")) { if (argc != 4) usage(); if (statusServer(argv[2], argv[3])) { exit(-1); } } else if (sameWord(command, "files")) { if (argc != 4) usage(); getFileList(argv[2], argv[3]); } +else if (sameWord(command, "index")) + { + if (argc < 4) + usage(); + buildIndex(argv[2], argc-3, argv+3); + } else { usage(); } return 0; }