155197c63e3259d0922c5c3a0e859f4812129db4 galt Sun May 17 17:52:03 2020 -0700 gfServer and blat and isPcr extended with new commandline option -noSimpRepMask to help with small genomes. refs #25477 diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c index 1def652..45717e0 100644 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@ -32,42 +32,44 @@ {"maxAaSize", OPTION_INT}, {"maxDnaHits", OPTION_INT}, {"maxGap", OPTION_INT}, {"maxNtSize", OPTION_INT}, {"maxTransHits", OPTION_INT}, {"minMatch", OPTION_INT}, {"repMatch", OPTION_INT}, {"seqLog", OPTION_BOOLEAN}, {"ipLog", OPTION_BOOLEAN}, {"debugLog", OPTION_BOOLEAN}, {"stepSize", OPTION_INT}, {"tileSize", OPTION_INT}, {"trans", OPTION_BOOLEAN}, {"syslog", OPTION_BOOLEAN}, {"perSeqMax", OPTION_STRING}, + {"noSimpRepMask", OPTION_BOOLEAN}, {NULL, 0} }; int maxNtSize = 40000; int maxAaSize = 8000; int minMatch = gfMinMatch; /* Can be overridden from command line. */ int tileSize = gfTileSize; /* Can be overridden from command line. */ int stepSize = 0; /* Can be overridden from command line. */ boolean doTrans = FALSE; /* Do translation? */ boolean allowOneMismatch = FALSE; +boolean noSimpRepMask = FALSE; int repMatch = 1024; /* Can be overridden from command line. */ int maxDnaHits = 100; /* Can be overridden from command line. */ int maxTransHits = 200; /* Can be overridden from command line. */ int maxGap = gfMaxGap; boolean seqLog = FALSE; boolean ipLog = FALSE; boolean doMask = FALSE; boolean canStop = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "gfServer v %s - Make a server to quickly find where DNA occurs in genome\n" " To set up a server:\n" @@ -98,30 +100,31 @@ " -minMatch=N Number of n-mer matches that trigger detailed alignment.\n" " Default is 2 for nucleotides, 3 for proteins.\n" " -maxGap=N Number of insertions or deletions allowed between n-mers.\n" " Default is 2 for nucleotides, 0 for proteins.\n" " -trans Translate database to protein in 6 frames. Note: it is best\n" " to run this on RepeatMasked data in this case.\n" " -log=logFile Keep a log file that records server requests.\n" " -seqLog Include sequences in log file (not logged with -syslog).\n" " -ipLog Include user's IP in log file (not logged with -syslog).\n" " -debugLog Include debugging info in log file.\n" " -syslog Log to syslog.\n" " -logFacility=facility Log to the specified syslog facility - default local0.\n" " -mask Use masking from .2bit file.\n" " -repMatch=N Number of occurrences of a tile (n-mer) that triggers repeat masking the\n" " tile. Default is %d.\n" + " -noSimpRepMask Suppresses simple repeat masking.\n" " -maxDnaHits=N Maximum number of hits for a DNA query that are sent from the server.\n" " Default is %d.\n" " -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n" " Default is %d.\n" " -maxNtSize=N Maximum size of untranslated DNA query sequence.\n" " Default is %d.\n" " -maxAaSize=N Maximum size of protein or translated DNA queries.\n" " Default is %d.\n" " -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n" " -maxDnaHits will be applied to each filename[:seq] separately: each may\n" " have at most maxDnaHits/2 hits.\n" " Useful for assemblies with many alternate/patch sequences.\n" " -canStop If set, a quit message will actually take down the server.\n" , gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize ); @@ -143,31 +146,31 @@ void genoFindDirect(char *probeName, int fileCount, char *seqFiles[]) /* Don't set up server - just directly look for matches. */ { struct genoFind *gf = NULL; struct lineFile *lf = lineFileOpen(probeName, TRUE); struct dnaSeq seq; int hitCount = 0, clumpCount = 0, oneHit; ZeroVar(&seq); if (doTrans) errAbort("Don't support translated direct stuff currently, sorry"); gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, FALSE, - allowOneMismatch, stepSize); + allowOneMismatch, stepSize, noSimpRepMask); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { struct lm *lm = lmInit(0); struct gfClump *clumpList = gfFindClumps(gf, &seq, lm, &oneHit), *clump; hitCount += oneHit; for (clump = clumpList; clump != NULL; clump = clump->next) { ++clumpCount; printf("%s ", seq.name); gfClumpDump(gf, clump, stdout); } gfClumpFreeList(&clumpList); lmCleanup(&lm); } @@ -175,31 +178,31 @@ genoFindFree(&gf); } void genoPcrDirect(char *fPrimer, char *rPrimer, int fileCount, char *seqFiles[]) /* Do direct PCR for testing purposes. */ { struct genoFind *gf = NULL; int fPrimerSize = strlen(fPrimer); int rPrimerSize = strlen(rPrimer); struct gfClump *clumpList, *clump; time_t startTime, endTime; startTime = clock1000(); gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, FALSE, - allowOneMismatch, stepSize); + allowOneMismatch, stepSize, noSimpRepMask); endTime = clock1000(); printf("Index built in %4.3f seconds\n", 0.001 * (endTime - startTime)); printf("plus strand:\n"); startTime = clock1000(); clumpList = gfPcrClumps(gf, fPrimer, fPrimerSize, rPrimer, rPrimerSize, 0, 4*1024); endTime = clock1000(); printf("Index searched in %4.3f seconds\n", 0.001 * (endTime - startTime)); for (clump = clumpList; clump != NULL; clump = clump->next) { /* Clumps from gfPcrClumps have already had target->start subtracted out * of their coords, but gfClumpDump assumes they have not and does the * subtraction; rather than write a new gfClumpDump, tweak here: */ clump->tStart += clump->target->start; clump->tEnd += clump->target->start; @@ -572,38 +575,38 @@ netBlockBrokenPipes(); curtime = time (NULL); /* Get the current time. */ loctime = localtime (&curtime); /* Convert it to local time representation. */ strftime (timestr, sizeof(timestr), "%Y-%m-%d %H:%M", loctime); /* formate datetime as string */ logInfo("gfServer version %s on host %s, port %s (%s)", gfVersion, hostName, portName, timestr); struct hash *perSeqMaxHash = maybePerSeqMax(fileCount, seqFiles); if (doTrans) { uglyf("starting translated server...\n"); logInfo("setting up translated index"); gfIndexTransNibsAndTwoBits(transGf, fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, NULL, allowOneMismatch, - doMask, stepSize); + doMask, stepSize, noSimpRepMask); } else { uglyf("starting untranslated server...\n"); logInfo("setting up untranslated index"); gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, - maxGap, tileSize, repMatch, NULL, allowOneMismatch, stepSize); + maxGap, tileSize, repMatch, NULL, allowOneMismatch, stepSize, noSimpRepMask); } logInfo("indexing complete"); /* Set up socket. Get ready to listen to it. */ socketHandle = netAcceptingSocket(port, 100); if (socketHandle < 0) errAbort("Fatal Error: Unable to open listening socket on port %d.", port); logInfo("Server ready for queries!"); printf("Server ready for queries!\n"); int connectFailCount = 0; for (;;) { ZeroVar(&fromAddr); fromLen = sizeof(fromAddr); @@ -1006,30 +1009,31 @@ tileSize = optionInt("tileSize", tileSize); stepSize = optionInt("stepSize", tileSize); if (optionExists("repMatch")) repMatch = optionInt("repMatch", 0); else repMatch = gfDefaultRepMatch(tileSize, stepSize, doTrans); minMatch = optionInt("minMatch", minMatch); maxDnaHits = optionInt("maxDnaHits", maxDnaHits); maxTransHits = optionInt("maxTransHits", maxTransHits); maxNtSize = optionInt("maxNtSize", maxNtSize); maxAaSize = optionInt("maxAaSize", maxAaSize); seqLog = optionExists("seqLog"); ipLog = optionExists("ipLog"); doMask = optionExists("mask"); canStop = optionExists("canStop"); +noSimpRepMask = optionExists("noSimpRepMask"); if (argc < 2) usage(); if (optionExists("log")) logOpenFile(argv[0], optionVal("log", NULL)); if (optionExists("syslog")) logOpenSyslog(argv[0], optionVal("logFacility", NULL)); if (optionExists("debugLog")) logSetMinPriority("debug"); if (sameWord(command, "direct")) { if (argc < 4) usage(); genoFindDirect(argv[2], argc-3, argv+3); }