155197c63e3259d0922c5c3a0e859f4812129db4
galt
  Sun May 17 17:52:03 2020 -0700
gfServer and blat and isPcr extended with new commandline option -noSimpRepMask to help with small genomes. refs #25477

diff --git src/jkOwnLib/genoFind.c src/jkOwnLib/genoFind.c
index 604746d..ae05696 100644
--- src/jkOwnLib/genoFind.c
+++ src/jkOwnLib/genoFind.c
@@ -98,31 +98,31 @@
 {
 if (isPep)
     {
     if (tileSize < 3 || tileSize > 8)
 	errAbort("protein tileSize must be between 3 and 8");
     }
 else
     {
     if (tileSize < 6 || tileSize > 18)
 	errAbort("DNA tileSize must be between 6 and 18");
     }
 }
 
 static struct genoFind *gfNewEmpty(int minMatch, int maxGap, 
 	int tileSize, int stepSize, int maxPat,
-	char *oocFile, boolean isPep, boolean allowOneMismatch)
+	char *oocFile, boolean isPep, boolean allowOneMismatch, boolean noSimpRepMask)
 /* Return an empty pattern space. oocFile parameter may be NULL*/
 {
 struct genoFind *gf;
 int tileSpaceSize;
 int segSize = 0;
 
 gfCheckTileSize(tileSize, isPep);
 if (stepSize == 0)
     stepSize = tileSize;
 AllocVar(gf);
 if (isPep)
     {
     if (tileSize > 5)
         {
 	segSize = tileSize - 5;
@@ -133,49 +133,51 @@
     {
     int seedBitSize;
     if (tileSize > 12)
         {
 	segSize = tileSize - 12;
 	}
     seedBitSize = (tileSize-segSize)*2;
     tileSpaceSize = gf->tileSpaceSize = (1<<seedBitSize);
     gf->tileMask = tileSpaceSize-1;
     }
 gf->segSize = segSize;
 gf->tileSize = tileSize;
 gf->stepSize = stepSize;
 gf->isPep = isPep;
 gf->allowOneMismatch = allowOneMismatch;
+gf->noSimpRepMask = noSimpRepMask;
 if (segSize > 0)
     {
     gf->endLists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->endLists[0]));
     maxPat = BIGNUM;	/* Don't filter out overused on the big ones.  It is
                          * unnecessary and would be quite complex. */
     }
 else
     {
     gf->lists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->lists[0]));
     }
 gf->listSizes = needHugeZeroedMem(tileSpaceSize * sizeof(gf->listSizes[0]));
 gf->minMatch = minMatch;
 gf->maxGap = maxGap;
 gf->maxPat = maxPat;
 if (oocFile != NULL)
     {
     if (segSize > 0)
         errAbort("Don't yet support ooc on large tile sizes");
     oocMaskCounts(oocFile, gf->listSizes, tileSize, maxPat);
+    if (!gf->noSimpRepMask)
          oocMaskSimpleRepeats(gf->listSizes, tileSize, maxPat);
     }
 return gf;
 }
 
 static int ntLookup[256];
 
 static void initNtLookup()
 {
 static boolean initted = FALSE;
 if (!initted)
     {
     int i;
     for (i=0; i<ArraySize(ntLookup); ++i)
         ntLookup[i] = -1;
@@ -500,42 +502,43 @@
 int maxPat = gf->maxPat;
 int overCount = 0;
 
 for (i=0; i<tileSpaceSize; ++i)
     {
     if (sizes[i] < maxPat)
 	{
         sizes[i] = 0;
 	++overCount;
 	}
     }
 }
 
 struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[],
 	int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
-	boolean allowOneMismatch, int stepSize)
+	boolean allowOneMismatch, int stepSize, boolean noSimpRepMask)
 /* Make index for all nibs and .2bits in list. 
  *      minMatch - minimum number of matching tiles to trigger alignments
  *      maxGap   - maximum deviation from diagonal of tiles
  *      tileSize - size of tile in nucleotides
  *      maxPat   - maximum use of tile to not be considered a repeat
  *      oocFile  - .ooc format file that lists repeat tiles.  May be NULL. 
  *      allowOneMismatch - allow one mismatch in a tile.  
- *      stepSize - space between tiles.  Zero means default (which is tileSize). */
+ *      stepSize - space between tiles.  Zero means default (which is tileSize). 
+ *      noSimpRepMask - skip simple repeat masking. */
 {
 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize,
-	maxPat, oocFile, FALSE, allowOneMismatch);
+	maxPat, oocFile, FALSE, allowOneMismatch, noSimpRepMask);
 int i;
 bits32 offset = 0, nibSize;
 char *fileName;
 struct gfSeqSource *ss;
 long long totalBases = 0, warnAt = maxTotalBases();
 int totalSeq = 0;
 
 if (allowOneMismatch)
     errAbort("Don't currently support allowOneMismatch in gfIndexNibsAndTwoBits");
 if (stepSize == 0)
     stepSize = gf->tileSize;
 for (i=0; i<fileCount; ++i)
     {
     fileName = fileNames[i];
     if (twoBitIsFile(fileName))
@@ -592,30 +595,36 @@
 	    dnaSeqFree(&seq);
 	    }
 	twoBitClose(&tbf);
 	}
     }
 gf->totalSeqSize = offset;
 gfZeroOverused(gf);
 printf("Done adding\n");
 return gf;
 }
 
 static void maskSimplePepRepeat(struct genoFind *gf)
 /* Remove tiles from index that represent repeats
  * of period one and two. */
 {
+
+if (gf->noSimpRepMask)
+    {
+    return;
+    }
+
 int i;
 int tileSize = gf->tileSize;
 int maxPat = gf->maxPat;
 bits32 *listSizes = gf->listSizes;
 
 for (i=0; i<20; ++i)
     {
     int j, k;
     for (j=0; j<20; ++j)
 	{
 	int tile = 0;
 	for (k=0; k<tileSize; ++k)
 	    {
 	    tile *= 20;
 	    if (k&1)
@@ -704,50 +713,50 @@
 	struct genoFind *gf = transGf[isRc][frame];
 	ss = gf->sources + sourceIx;
 	gfAddSeq(gf, t3->trans[frame], offset[isRc][frame]);
 	ss->fileName = cloneString(fileName);
 	ss->start = offset[isRc][frame];
 	offset[isRc][frame] += t3->trans[frame]->size;
 	ss->end = offset[isRc][frame];
 	}
     trans3Free(&t3);
     }
 }
 
 void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], 
     int fileCount, char *fileNames[], 
     int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
-    boolean allowOneMismatch, boolean doMask, int stepSize)
+    boolean allowOneMismatch, boolean doMask, int stepSize, boolean noSimpRepMask)
 /* Make translated (6 frame) index for all .nib and .2bit files. */
 {
 struct genoFind *gf;
 int i,isRc, frame;
 bits32 offset[2][3];
 char *fileName;
 struct dnaSeq *seq;
 int sourceCount = 0;
 long long totalBases = 0, warnAt = maxTotalBases();
 
 if (allowOneMismatch)
     errAbort("Don't currently support allowOneMismatch in gfIndexTransNibsAndTwoBits");
 /* Allocate indices for all reading frames. */
 for (isRc=0; isRc <= 1; ++isRc)
     {
     for (frame = 0; frame < 3; ++frame)
 	{
 	transGf[isRc][frame] = gf = gfNewEmpty(minMatch, maxGap, 
-		tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch);
+		tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch, noSimpRepMask);
 	}
     }
 
 /* Mask simple AA repeats (of period 1 and 2). */
 for (isRc = 0; isRc <= 1; ++isRc)
     for (frame = 0; frame < 3; ++frame)
 	maskSimplePepRepeat(transGf[isRc][frame]);
 
 /* Scan through .nib and .2bit files once counting tiles. */
 for (i=0; i<fileCount; ++i)
     {
     fileName = fileNames[i];
     printf("Counting %s\n", fileName);
     if (nibIsFile(fileName))
 	{
@@ -908,37 +917,37 @@
 struct hash *hash = hashNew(18);
 bioSeq *seq;
 for (seq = seqList; seq != NULL; seq = seq->next)
     {
     if (hashLookup(hash, seq->name))
         errAbort("Error: sequence name %s is repeated in the database, all names must be unique.",
 		seq->name);
     hashAdd(hash, seq->name, NULL);
     }
 hashFree(&hash);
 }
 
 struct genoFind *gfIndexSeq(bioSeq *seqList,
 	int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 
 	boolean isPep, boolean allowOneMismatch, boolean maskUpper,
-	int stepSize)
+	int stepSize, boolean noSimpRepMask)
 /* Make index for all seqs in list.  For DNA sequences upper case bits will
  * be unindexed. */
 {
 checkUniqueNames(seqList);
 struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize, maxPat, 
-				oocFile, isPep, allowOneMismatch);
+				oocFile, isPep, allowOneMismatch, noSimpRepMask);
 if (stepSize == 0)
     stepSize = tileSize;
 if (gf->segSize > 0)
     {
     gfLargeIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
     }
 else
     {
     gfSmallIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper);
     }
 return gf;
 }
 
 static int bCmpSeqSource(const void *vTarget, const void *vRange)
 /* Compare function for binary search of gfSeqSource. */
@@ -1949,36 +1958,36 @@
  * reading frames. Used for translated/translated protein comparisons. */
 {
 int qFrame;
 int oneHit;
 int hitCount = 0;
 
 for (qFrame = 0; qFrame<3; ++qFrame)
     {
     gfTransFindClumps(gfs, seqs[qFrame], clumps[qFrame], lm, &oneHit);
     hitCount += oneHit;
     }
 *retHitCount = hitCount;
 }
 
 void gfMakeOoc(char *outName, char *files[], int fileCount, 
-	int tileSize, bits32 maxPat, enum gfType tType)
+	int tileSize, bits32 maxPat, enum gfType tType, boolean noSimpRepMask)
 /* Count occurences of tiles in seqList and make a .ooc file. */
 {
 boolean dbIsPep = (tType == gftProt || tType == gftDnaX || tType == gftRnaX);
 struct genoFind *gf = gfNewEmpty(gfMinMatch, gfMaxGap, tileSize, tileSize,
-	maxPat, NULL, dbIsPep, FALSE);
+	maxPat, NULL, dbIsPep, FALSE, noSimpRepMask);
 bits32 *sizes = gf->listSizes;
 int tileSpaceSize = gf->tileSpaceSize;
 bioSeq *seq, *seqList;
 bits32 sig = oocSig, psz = tileSize;
 bits32 i;
 int oocCount = 0;
 char *inName;
 FILE *f = mustOpen(outName, "w");
 
 if (gf->segSize > 0)
     errAbort("Don't yet know how to make ooc files for large tile sizes.");
 for (i=0; i<fileCount; ++i)
     {
     inName = files[i];
     printf("Loading %s\n", inName);