155197c63e3259d0922c5c3a0e859f4812129db4
galt
  Sun May 17 17:52:03 2020 -0700
gfServer and blat and isPcr extended with new commandline option -noSimpRepMask to help with small genomes. refs #25477

diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c
index 1def652..45717e0 100644
--- src/gfServer/gfServer.c
+++ src/gfServer/gfServer.c
@@ -32,42 +32,44 @@
     {"maxAaSize", OPTION_INT},
     {"maxDnaHits", OPTION_INT},
     {"maxGap", OPTION_INT},
     {"maxNtSize", OPTION_INT},
     {"maxTransHits", OPTION_INT},
     {"minMatch", OPTION_INT},
     {"repMatch", OPTION_INT},
     {"seqLog", OPTION_BOOLEAN},
     {"ipLog", OPTION_BOOLEAN},
     {"debugLog", OPTION_BOOLEAN},
     {"stepSize", OPTION_INT},
     {"tileSize", OPTION_INT},
     {"trans", OPTION_BOOLEAN},
     {"syslog", OPTION_BOOLEAN},
     {"perSeqMax", OPTION_STRING},
+    {"noSimpRepMask", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 
 int maxNtSize = 40000;
 int maxAaSize = 8000;
 
 int minMatch = gfMinMatch;	/* Can be overridden from command line. */
 int tileSize = gfTileSize;	/* Can be overridden from command line. */
 int stepSize = 0;		/* Can be overridden from command line. */
 boolean doTrans = FALSE;	/* Do translation? */
 boolean allowOneMismatch = FALSE; 
+boolean noSimpRepMask = FALSE;
 int repMatch = 1024;    /* Can be overridden from command line. */
 int maxDnaHits = 100;   /* Can be overridden from command line. */
 int maxTransHits = 200; /* Can be overridden from command line. */
 int maxGap = gfMaxGap;
 boolean seqLog = FALSE;
 boolean ipLog = FALSE;
 boolean doMask = FALSE;
 boolean canStop = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "gfServer v %s - Make a server to quickly find where DNA occurs in genome\n"
   "   To set up a server:\n"
@@ -98,30 +100,31 @@
   "   -minMatch=N     Number of n-mer matches that trigger detailed alignment.\n"
   "                   Default is 2 for nucleotides, 3 for proteins.\n"
   "   -maxGap=N       Number of insertions or deletions allowed between n-mers.\n"
   "                   Default is 2 for nucleotides, 0 for proteins.\n"
   "   -trans          Translate database to protein in 6 frames.  Note: it is best\n"
   "                   to run this on RepeatMasked data in this case.\n"
   "   -log=logFile    Keep a log file that records server requests.\n"
   "   -seqLog         Include sequences in log file (not logged with -syslog).\n"
   "   -ipLog          Include user's IP in log file (not logged with -syslog).\n"
   "   -debugLog       Include debugging info in log file.\n"
   "   -syslog         Log to syslog.\n"
   "   -logFacility=facility  Log to the specified syslog facility - default local0.\n"
   "   -mask           Use masking from .2bit file.\n"
   "   -repMatch=N     Number of occurrences of a tile (n-mer) that triggers repeat masking the\n"
   "                   tile. Default is %d.\n"
+  "   -noSimpRepMask  Suppresses simple repeat masking.\n"
   "   -maxDnaHits=N   Maximum number of hits for a DNA query that are sent from the server.\n"
   "                   Default is %d.\n"
   "   -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n"
   "                   Default is %d.\n"
   "   -maxNtSize=N    Maximum size of untranslated DNA query sequence.\n"
   "                   Default is %d.\n"
   "   -maxAaSize=N    Maximum size of protein or translated DNA queries.\n"
   "                   Default is %d.\n"
   "   -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n"
   "                   -maxDnaHits will be applied to each filename[:seq] separately: each may\n"
   "                   have at most maxDnaHits/2 hits.\n"
   "                   Useful for assemblies with many alternate/patch sequences.\n"
   "   -canStop        If set, a quit message will actually take down the server.\n"
   ,	gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize
   );
@@ -143,31 +146,31 @@
 
 void genoFindDirect(char *probeName, int fileCount, char *seqFiles[])
 /* Don't set up server - just directly look for matches. */
 {
 struct genoFind *gf = NULL;
 struct lineFile *lf = lineFileOpen(probeName, TRUE);
 struct dnaSeq seq;
 int hitCount = 0, clumpCount = 0, oneHit;
 ZeroVar(&seq);
 
 if (doTrans)
     errAbort("Don't support translated direct stuff currently, sorry");
 
 gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, maxGap, 
 	tileSize, repMatch, FALSE,
-	allowOneMismatch, stepSize);
+	allowOneMismatch, stepSize, noSimpRepMask);
 
 while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
     {
     struct lm *lm = lmInit(0);
     struct gfClump *clumpList = gfFindClumps(gf, &seq, lm, &oneHit), *clump;
     hitCount += oneHit;
     for (clump = clumpList; clump != NULL; clump = clump->next)
 	{
 	++clumpCount;
 	printf("%s ", seq.name);
 	gfClumpDump(gf, clump, stdout);
 	}
     gfClumpFreeList(&clumpList);
     lmCleanup(&lm);
     }
@@ -175,31 +178,31 @@
 genoFindFree(&gf);
 }
 
 void genoPcrDirect(char *fPrimer, char *rPrimer, int fileCount, char *seqFiles[])
 /* Do direct PCR for testing purposes. */
 {
 struct genoFind *gf = NULL;
 int fPrimerSize = strlen(fPrimer);
 int rPrimerSize = strlen(rPrimer);
 struct gfClump *clumpList, *clump;
 time_t startTime, endTime;
 
 startTime = clock1000();
 gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, maxGap, 
 	tileSize, repMatch, FALSE,
-	allowOneMismatch, stepSize);
+	allowOneMismatch, stepSize, noSimpRepMask);
 endTime = clock1000();
 printf("Index built in %4.3f seconds\n", 0.001 * (endTime - startTime));
 
 printf("plus strand:\n");
 startTime = clock1000();
 clumpList = gfPcrClumps(gf, fPrimer, fPrimerSize, rPrimer, rPrimerSize, 0, 4*1024);
 endTime = clock1000();
 printf("Index searched in %4.3f seconds\n", 0.001 * (endTime - startTime));
 for (clump = clumpList; clump != NULL; clump = clump->next)
     {
     /* Clumps from gfPcrClumps have already had target->start subtracted out 
      * of their coords, but gfClumpDump assumes they have not and does the 
      * subtraction; rather than write a new gfClumpDump, tweak here: */
     clump->tStart += clump->target->start;
     clump->tEnd += clump->target->start;
@@ -572,38 +575,38 @@
 netBlockBrokenPipes();
 
 curtime = time (NULL);           /* Get the current time. */
 loctime = localtime (&curtime);  /* Convert it to local time representation. */
 strftime (timestr, sizeof(timestr), "%Y-%m-%d %H:%M", loctime); /* formate datetime as string */
 								
 logInfo("gfServer version %s on host %s, port %s  (%s)", gfVersion, 
 	hostName, portName, timestr);
 struct hash *perSeqMaxHash = maybePerSeqMax(fileCount, seqFiles);
 if (doTrans)
     {
     uglyf("starting translated server...\n");
     logInfo("setting up translated index");
     gfIndexTransNibsAndTwoBits(transGf, fileCount, seqFiles, 
     	minMatch, maxGap, tileSize, repMatch, NULL, allowOneMismatch, 
-	doMask, stepSize);
+	doMask, stepSize, noSimpRepMask);
     }
 else
     {
     uglyf("starting untranslated server...\n");
     logInfo("setting up untranslated index");
     gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, 
-    	maxGap, tileSize, repMatch, NULL, allowOneMismatch, stepSize);
+    	maxGap, tileSize, repMatch, NULL, allowOneMismatch, stepSize, noSimpRepMask);
     }
 logInfo("indexing complete");
 
 /* Set up socket.  Get ready to listen to it. */
 socketHandle = netAcceptingSocket(port, 100);
 if (socketHandle < 0)
     errAbort("Fatal Error: Unable to open listening socket on port %d.", port);
 
 logInfo("Server ready for queries!");
 printf("Server ready for queries!\n");
 int connectFailCount = 0;
 for (;;)
     {
     ZeroVar(&fromAddr);
     fromLen = sizeof(fromAddr);
@@ -1006,30 +1009,31 @@
 tileSize = optionInt("tileSize", tileSize);
 stepSize = optionInt("stepSize", tileSize);
 if (optionExists("repMatch"))
     repMatch = optionInt("repMatch", 0);
 else
     repMatch = gfDefaultRepMatch(tileSize, stepSize, doTrans);
 minMatch = optionInt("minMatch", minMatch);
 maxDnaHits = optionInt("maxDnaHits", maxDnaHits);
 maxTransHits = optionInt("maxTransHits", maxTransHits);
 maxNtSize = optionInt("maxNtSize", maxNtSize);
 maxAaSize = optionInt("maxAaSize", maxAaSize);
 seqLog = optionExists("seqLog");
 ipLog = optionExists("ipLog");
 doMask = optionExists("mask");
 canStop = optionExists("canStop");
+noSimpRepMask = optionExists("noSimpRepMask");
 if (argc < 2)
     usage();
 if (optionExists("log"))
     logOpenFile(argv[0], optionVal("log", NULL));
 if (optionExists("syslog"))
     logOpenSyslog(argv[0], optionVal("logFacility", NULL));
 if (optionExists("debugLog"))
     logSetMinPriority("debug");
 
 if (sameWord(command, "direct"))
     {
     if (argc < 4)
         usage();
     genoFindDirect(argv[2], argc-3, argv+3);
     }