155197c63e3259d0922c5c3a0e859f4812129db4
galt
  Sun May 17 17:52:03 2020 -0700
gfServer and blat and isPcr extended with new commandline option -noSimpRepMask to help with small genomes. refs #25477

diff --git src/inc/genoFind.h src/inc/genoFind.h
index 483a6f3..6077e70 100644
--- src/inc/genoFind.h
+++ src/inc/genoFind.h
@@ -89,30 +89,31 @@
 /* An index of all K-mers in the genome. */
     {
     int maxPat;                          /* Max # of times pattern can occur
                                           * before it is ignored. */
     int minMatch;                        /* Minimum number of tile hits needed
                                           * to trigger a clump hit. */
     int maxGap;                          /* Max gap between tiles in a clump. */
     int tileSize;			 /* Size of each N-mer. */
     int stepSize;			 /* Spacing between N-mers. */
     int tileSpaceSize;                   /* Number of N-mer values. */
     int tileMask;			 /* 1-s for each N-mer. */
     int sourceCount;			 /* Count of source files. */
     struct gfSeqSource *sources;         /* List of sequence sources. */
     bool isPep;			 	 /* Is a peptide. */
     bool allowOneMismatch;		 /* Allow a single mismatch? */
+    bool noSimpRepMask;			  /* Skip simple repeat masking if TRUE. */
     int segSize;			 /* Index is segmented if non-zero. */
     bits32 totalSeqSize;		 /* Total size of all sequences. */
     bits32 *listSizes;                   /* Size of list for each N-mer */
     void *allocated;                     /* Storage space for all lists. */
     bits32 **lists;                      /* A list for each N-mer. Used if
                                           * isSegmented is false. */
     bits16 **endLists;                   /* A more complex list for each N-mer.
                                           * Used if isSegmented is true.
 					  * Values come in groups of threes.
 					  * The first is the packed last few
 					  * letters of the tile.  The next two
 					  * are the offset in the genome.  This
 					  * would be a struct but that would take
 					  * 8 bytes instead of 6, or nearly an
 					  * extra gigabyte of RAM. */
@@ -218,58 +219,60 @@
 
 void gfOutputHead(struct gfOutput *out, FILE *f);
 /* Write out header if any. */
 
 void gfOutputFree(struct gfOutput **pOut);
 /* Free up output. */
 
 /* -------- Routines to build up index ------------ */
 
 void gfCheckTileSize(int tileSize, boolean isPep);
 /* Check that tile size is legal.  Abort if not. */
 
 struct genoFind *gfIndexSeq(bioSeq *seqList,
 	int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
 	boolean isPep, boolean allowOneMismatch, boolean maskUpper,
-	int stepSize);
+	int stepSize, boolean noSimpRepMask);
 /* Make index for all seqs in list. 
  *      minMatch - minimum number of matching tiles to trigger alignments
  *      maxGap   - maximum deviation from diagonal of tiles
  *      tileSize - size of tile in nucleotides
  *      maxPat   - maximum use of tile to not be considered a repeat
  *      oocFile  - .ooc format file that lists repeat tiles.  May be NULL. 
  *      isPep    - TRUE if indexing proteins, FALSE for DNA. 
  *      maskUpper - Mask out upper case sequence (currently only for nucleotides).
  *      stepSize - space between tiles.  Zero means default (which is tileSize). 
+ *      noSimpRepMask - skip simple repeat masking. 
  * For DNA sequences upper case bits will be unindexed. */
 
 struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[],
 	int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, 
-	boolean allowOneMismatch, int stepSize);
+	boolean allowOneMismatch, int stepSize, boolean noSimpRepMask);
 /* Make index for all .nib and .2bits in list. 
  *      minMatch - minimum number of matching tiles to trigger alignments
  *      maxGap   - maximum deviation from diagonal of tiles
  *      tileSize - size of tile in nucleotides
  *      maxPat   - maximum use of tile to not be considered a repeat
  *      oocFile  - .ooc format file that lists repeat tiles.  May be NULL. 
  *      allowOneMismatch - allow one mismatch in a tile.  
- *      stepSize - space between tiles.  Zero means default (which is tileSize). */
+ *      stepSize - space between tiles.  Zero means default (which is tileSize).
+ *      noSimpRepMask - skip simple repeat masking. */
 
 void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], 
     int fileCount, char *fileNames[], 
     int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile,
-    boolean allowOneMismatch, boolean mask, int stepSize);
+    boolean allowOneMismatch, boolean mask, int stepSize, boolean noSimpRepMask);
 /* Make translated (6 frame) index for all .nib and .2bit files. */
 
 /* -------- Routines to scan index for homolgous areas ------------ */
 
 struct gfClump *gfFindClumps(struct genoFind *gf, struct dnaSeq *seq, 
 	struct lm *lm, int *retHitCount);
 /* Find clumps associated with one sequence. */
 
 struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq, 
         Bits *qMaskBits, int qMaskOffset,
 	struct lm *lm, int *retHitCount);
 /* Find clumps associated with one sequence soft-masking seq according to qMaskBits */
 
 struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq, 
 	Bits *qMaskBits, int qMaskOffset, struct lm *lm, 
@@ -344,45 +347,45 @@
 /* Search indexed translated genome on server with an dna sequence.  Translate
  * this sequence in three frames. Load homologous bits of genome locally
  * and do detailed alignment.  Call 'outFunction' with each alignment
  * that is found. */
 
 int gfMayConnect(char *hostName, char *portName);
 /* Set up our network connection to server, or return -1. */
 
 int gfConnect(char *hostName, char *portName);
 /* Set up our network connection to server. Aborts on error. */
 
 int gfDefaultRepMatch(int tileSize, int stepSize, boolean protTiles);
 /* Figure out appropriate step repMatch value. */
 
 void gfMakeOoc(char *outName, char *files[], int fileCount, 
-	int tileSize, bits32 maxPat, enum gfType tType);
+	int tileSize, bits32 maxPat, enum gfType tType, boolean noSimpRepMask);
 /* Count occurences of tiles in seqList and make a .ooc file. */
 
 void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf, 
    boolean isRc, int minScore, Bits *qMaskBits, struct gfOutput *out,
    boolean fastMap, boolean band);
 /* Chop up query into pieces, align each, and stitch back
  * together again. */
 
 void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3], 
    struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna,
    int minScore, struct gfOutput *out);
 /* Chop up query into pieces, align each in translated space, and stitch back
  * together again as nucleotides. */
 
 struct gfClump *gfPcrClumps(struct genoFind *gf, 
         char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
 	int minDistance, int maxDistance);
 /* Find possible PCR hits.  The fPrimer and rPrimer are on opposite strands.
  * Note: unlike clumps from other query functions, PCR clumps from this 
  * function have already had clump->target->start subtracted from 
  * clump->tStart and clump->tEnd so that the coords are relative to that 
  * target sequence (not the collection of all target sequences). */
 
 #define MAXSINGLEPIECESIZE 5000 /* maximum size of a single piece */
 
-#define gfVersion "36x5"	/* Current BLAT version number */
+#define gfVersion "36x6"	/* Current BLAT version number */
 
 #endif /* GENOFIND_H */