155197c63e3259d0922c5c3a0e859f4812129db4
galt
  Sun May 17 17:52:03 2020 -0700
gfServer and blat and isPcr extended with new commandline option -noSimpRepMask to help with small genomes. refs #25477

diff --git src/blat/blat.c src/blat/blat.c
index dbb5a51..a48924c 100644
--- src/blat/blat.c
+++ src/blat/blat.c
@@ -31,30 +31,31 @@
 
 /* Variables that can be set from command line. */
 int tileSize = 11;
 int stepSize = 0;	/* Default (same as tileSize) */
 int minMatch = 2;
 int minScore = 30;
 int maxGap = 2;
 int repMatch = 1024*4;
 int dotEvery = 0;
 boolean oneOff = FALSE;
 boolean noHead = FALSE;
 boolean trimA = FALSE;
 boolean trimHardA = FALSE;
 boolean trimT = FALSE;
 boolean fastMap = FALSE;
+boolean noSimpRepMask = FALSE;
 char *makeOoc = NULL;
 char *ooc = NULL;
 enum gfType qType = gftDna;
 enum gfType tType = gftDna;
 char *mask = NULL;
 char *repeats = NULL;
 char *qMask = NULL;
 double minRepDivergence = 15;
 double minIdentity = 90;
 char *outputFormat = "psl";
 
 
 void usage()
 /* Explain usage and exit. */
 {
@@ -103,30 +104,31 @@
   "   -minScore=N    Sets minimum score.  This is the matches minus the \n"
   "                  mismatches minus some sort of gap penalty.  Default is 30.\n"
   "   -minIdentity=N Sets minimum sequence identity (in percent).  Default is\n"
   "                  90 for nucleotide searches, 25 for protein or translated\n"
   "                  protein searches.\n"
   "   -maxGap=N      Sets the size of maximum gap between tiles in a clump.  Usually\n"
   "                  set from 0 to 3.  Default is 2. Only relevent for minMatch > 1.\n"
   "   -noHead        Suppresses .psl header (so it's just a tab-separated file).\n"
   "   -makeOoc=N.ooc Make overused tile file. Target needs to be complete genome.\n"
   "   -repMatch=N    Sets the number of repetitions of a tile allowed before\n"
   "                  it is marked as overused.  Typically this is 256 for tileSize\n"
   "                  12, 1024 for tile size 11, 4096 for tile size 10.\n"
   "                  Default is 1024.  Typically comes into play only with makeOoc.\n"
   "                  Also affected by stepSize: when stepSize is halved, repMatch is\n"
   "                  doubled to compensate.\n"
+  "   -noSimpRepMask Suppresses simple repeat masking.\n"
   "   -mask=type     Mask out repeats.  Alignments won't be started in masked region\n"
   "                  but may extend through it in nucleotide searches.  Masked areas\n"
   "                  are ignored entirely in protein or translated searches. Types are:\n"
   "                    lower - mask out lower-cased sequence\n"
   "                    upper - mask out upper-cased sequence\n"
   "                    out   - mask according to database.out RepeatMasker .out file\n"
   "                    file.out - mask database according to RepeatMasker file.out\n"
   "   -qMask=type    Mask out repeats in query sequence.  Similar to -mask above, but\n"
   "                  for query rather than target sequence.\n"
   "   -repeats=type  Type is same as mask types above.  Repeat bases will not be\n"
   "                  masked in any way, but matches in repeat areas will be reported\n"
   "                  separately from matches in other areas in the psl output.\n"
   "   -minRepDivergence=NN   Minimum percent divergence of repeats to allow \n"
   "                  them to be unmasked.  Default is 15.  Only relevant for \n"
   "                  masking using RepeatMasker .out files.\n"
@@ -173,30 +175,31 @@
    {"makeOoc", OPTION_STRING},
    {"repMatch", OPTION_INT},
    {"mask", OPTION_STRING},
    {"qMask", OPTION_STRING},
    {"repeats", OPTION_STRING},
    {"minRepDivergence", OPTION_FLOAT},
    {"dots", OPTION_INT},
    {"trimT", OPTION_BOOLEAN},
    {"noTrimA", OPTION_BOOLEAN},
    {"trimHardA", OPTION_BOOLEAN},
    {"fastMap", OPTION_BOOLEAN},
    {"out", OPTION_STRING},
    {"fine", OPTION_BOOLEAN},
    {"maxIntron", OPTION_INT},
    {"extendThroughN", OPTION_BOOLEAN},
+   {"noSimpRepMask", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 
 /* Stuff to support various output formats. */
 struct gfOutput *gvo;		/* Overall output controller */
 
 void searchOneStrand(struct dnaSeq *seq, struct genoFind *gf, FILE *psl, 
 	boolean isRc, struct hash *maskHash, Bits *qMaskBits)
 /* Search for seq in index, align it, and write results to psl. */
 {
 if (fastMap && (seq->size > MAXSINGLEPIECESIZE))
     errAbort("Maximum single piece size (%d) exceeded by query %s of size (%d). "
 	"Larger pieces will have to be split up until no larger than this limit "
 	"when the -fastMap option is used."	
@@ -484,31 +487,31 @@
     gvo->fileHead(gvo, out);
 
 for (isRc = FALSE; isRc <= 1; ++isRc)
     {
     /* Initialize local pointer arrays to NULL to prevent surprises. */
     for (frame = 0; frame < 3; ++frame)
 	{
 	gfs[frame] = NULL;
 	dbSeqLists[frame] = NULL;
 	}
 
     t3List = seqListToTrans3List(untransList, dbSeqLists, &t3Hash);
     for (frame = 0; frame < 3; ++frame)
 	{
 	gfs[frame] = gfIndexSeq(dbSeqLists[frame], minMatch, maxGap, tileSize, 
-		repMatch, ooc, TRUE, oneOff, FALSE, stepSize);
+		repMatch, ooc, TRUE, oneOff, FALSE, stepSize, noSimpRepMask);
 	}
 
     for (i=0; i<queryCount; ++i)
         {
 	aaSeq qSeq;
 
 	lf = lineFileOpen(queryFiles[i], TRUE);
 	while (faMixedSpeedReadNext(lf, &qSeq.dna, &qSeq.size, &qSeq.name))
 	    {
 	    dotOut();
 	    /* Put it into right case and optionally mask on case. */
 	    if (forceLower)
 	        toLowerN(qSeq.dna, qSeq.size);
 	    else if (forceUpper)
 	        toUpperN(qSeq.dna, qSeq.size);
@@ -556,31 +559,31 @@
 char **dbFiles, **queryFiles;
 int dbCount, queryCount;
 struct dnaSeq *dbSeqList, *seq;
 struct genoFind *gf;
 boolean tIsProt = (tType == gftProt);
 boolean qIsProt = (qType == gftProt);
 boolean bothSimpleNuc = (tType == gftDna && (qType == gftDna || qType == gftRna));
 boolean bothSimpleProt = (tIsProt && qIsProt);
 FILE *f = mustOpen(outName, "w");
 boolean showStatus = (f != stdout);
 
 databaseName = dbFile;
 gfClientFileArray(dbFile, &dbFiles, &dbCount);
 if (makeOoc != NULL)
     {
-    gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType);
+    gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType, noSimpRepMask);
     if (showStatus)
 	printf("Done making %s\n", makeOoc);
     exit(0);
     }
 gfClientFileArray(queryFile, &queryFiles, &queryCount);
 dbSeqList = gfClientSeqList(dbCount, dbFiles, tIsProt, tType == gftDnaX, repeats, 
 	minRepDivergence, showStatus);
 databaseSeqCount = slCount(dbSeqList);
 for (seq = dbSeqList; seq != NULL; seq = seq->next)
     databaseLetters += seq->size;
 
 gvo = gfOutputAny(outputFormat, minIdentity*10, qIsProt, tIsProt, noHead, 
 	databaseName, databaseSeqCount, databaseLetters, minIdentity, f);
 
 if (bothSimpleNuc || bothSimpleProt)
@@ -593,31 +596,31 @@
 	maskHash = newHash(0);
 	for (seq = dbSeqList; seq != NULL; seq = seq->next)
 	    {
 	    Bits *maskedBits = maskFromUpperCaseSeq(seq);
 	    hashAdd(maskHash, seq->name, maskedBits);
 	    }
 	}
 
     /* Handle masking and indexing.  If masking is off, we want the indexer
      * to see unmasked sequence, otherwise we want it to see masked.  However
      * after indexing we always want it unmasked, because things are always
      * unmasked for the extension phase. */
     if (mask == NULL && !bothSimpleProt)
         gfClientUnmask(dbSeqList);
     gf = gfIndexSeq(dbSeqList, minMatch, maxGap, tileSize, repMatch, ooc, 
-    	tIsProt, oneOff, FALSE, stepSize);
+    	tIsProt, oneOff, FALSE, stepSize, noSimpRepMask);
     if (mask != NULL)
         gfClientUnmask(dbSeqList);
 
     searchOneIndex(queryCount, queryFiles, gf, outName, tIsProt, maskHash, f, showStatus);
     freeHash(&maskHash);
     }
 else if (tType == gftDnaX && qType == gftProt)
     {
     bigBlat(dbSeqList, queryCount, queryFiles, outName, FALSE, TRUE, f, showStatus);
     }
 else if (tType == gftDnaX && (qType == gftDnaX || qType == gftRnaX))
     {
     bigBlat(dbSeqList, queryCount, queryFiles, outName, TRUE, qType == gftDnaX, f, showStatus);
     }
 else
@@ -702,30 +705,31 @@
     oneOff = FALSE;
     maxGap = 0;
     }
 
 /* Get tile size and related parameters from user and make sure
  * they are within range. */
 tileSize = optionInt("tileSize", tileSize);
 stepSize = optionInt("stepSize", tileSize);
 minMatch = optionInt("minMatch", minMatch);
 oneOff = optionExists("oneOff");
 fastMap = optionExists("fastMap");
 minScore = optionInt("minScore", minScore);
 maxGap = optionInt("maxGap", maxGap);
 minRepDivergence = optionFloat("minRepDivergence", minRepDivergence);
 minIdentity = optionFloat("minIdentity", minIdentity);
+noSimpRepMask = optionExists("noSimpRepMask");
 gfCheckTileSize(tileSize, tIsProtLike);
 if (minMatch < 0)
     errAbort("minMatch must be at least 1");
 if (maxGap > 100)
     errAbort("maxGap must be less than 100");
 
 
 
 /* Set repMatch parameter from command line, or
  * to reasonable value that depends on tile size. */
 if (optionExists("repMatch"))
     repMatch = optionInt("repMatch", repMatch);
 else
     repMatch = gfDefaultRepMatch(tileSize, stepSize, tIsProtLike);