155197c63e3259d0922c5c3a0e859f4812129db4 galt Sun May 17 17:52:03 2020 -0700 gfServer and blat and isPcr extended with new commandline option -noSimpRepMask to help with small genomes. refs #25477 diff --git src/inc/genoFind.h src/inc/genoFind.h index 483a6f3..6077e70 100644 --- src/inc/genoFind.h +++ src/inc/genoFind.h @@ -89,30 +89,31 @@ /* An index of all K-mers in the genome. */ { int maxPat; /* Max # of times pattern can occur * before it is ignored. */ int minMatch; /* Minimum number of tile hits needed * to trigger a clump hit. */ int maxGap; /* Max gap between tiles in a clump. */ int tileSize; /* Size of each N-mer. */ int stepSize; /* Spacing between N-mers. */ int tileSpaceSize; /* Number of N-mer values. */ int tileMask; /* 1-s for each N-mer. */ int sourceCount; /* Count of source files. */ struct gfSeqSource *sources; /* List of sequence sources. */ bool isPep; /* Is a peptide. */ bool allowOneMismatch; /* Allow a single mismatch? */ + bool noSimpRepMask; /* Dis-Allow simple repeat masking. */ int segSize; /* Index is segmented if non-zero. */ bits32 totalSeqSize; /* Total size of all sequences. */ bits32 *listSizes; /* Size of list for each N-mer */ void *allocated; /* Storage space for all lists. */ bits32 **lists; /* A list for each N-mer. Used if * isSegmented is false. */ bits16 **endLists; /* A more complex list for each N-mer. * Used if isSegmented is true. * Values come in groups of threes. * The first is the packed last few * letters of the tile. The next two * are the offset in the genome. This * would be a struct but that would take * 8 bytes instead of 6, or nearly an * extra gigabyte of RAM. */ @@ -218,58 +219,60 @@ void gfOutputHead(struct gfOutput *out, FILE *f); /* Write out header if any. */ void gfOutputFree(struct gfOutput **pOut); /* Free up output. */ /* -------- Routines to build up index ------------ */ void gfCheckTileSize(int tileSize, boolean isPep); /* Check that tile size is legal. Abort if not. */ struct genoFind *gfIndexSeq(bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean allowOneMismatch, boolean maskUpper, - int stepSize); + int stepSize, boolean noSimpRepMask); /* Make index for all seqs in list. * minMatch - minimum number of matching tiles to trigger alignments * maxGap - maximum deviation from diagonal of tiles * tileSize - size of tile in nucleotides * maxPat - maximum use of tile to not be considered a repeat * oocFile - .ooc format file that lists repeat tiles. May be NULL. * isPep - TRUE if indexing proteins, FALSE for DNA. * maskUpper - Mask out upper case sequence (currently only for nucleotides). * stepSize - space between tiles. Zero means default (which is tileSize). + * noSimpRepMask - skip simple repeat masking. * For DNA sequences upper case bits will be unindexed. */ struct genoFind *gfIndexNibsAndTwoBits(int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, - boolean allowOneMismatch, int stepSize); + boolean allowOneMismatch, int stepSize, boolean noSimpRepMask); /* Make index for all .nib and .2bits in list. * minMatch - minimum number of matching tiles to trigger alignments * maxGap - maximum deviation from diagonal of tiles * tileSize - size of tile in nucleotides * maxPat - maximum use of tile to not be considered a repeat * oocFile - .ooc format file that lists repeat tiles. May be NULL. * allowOneMismatch - allow one mismatch in a tile. - * stepSize - space between tiles. Zero means default (which is tileSize). */ + * stepSize - space between tiles. Zero means default (which is tileSize). + * noSimpRepMask - skip simple repeat masking. */ void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, - boolean allowOneMismatch, boolean mask, int stepSize); + boolean allowOneMismatch, boolean mask, int stepSize, boolean noSimpRepMask); /* Make translated (6 frame) index for all .nib and .2bit files. */ /* -------- Routines to scan index for homolgous areas ------------ */ struct gfClump *gfFindClumps(struct genoFind *gf, struct dnaSeq *seq, struct lm *lm, int *retHitCount); /* Find clumps associated with one sequence. */ struct gfClump *gfFindClumpsWithQmask(struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, int *retHitCount); /* Find clumps associated with one sequence soft-masking seq according to qMaskBits */ struct gfHit *gfFindHitsInRegion(struct genoFind *gf, bioSeq *seq, Bits *qMaskBits, int qMaskOffset, struct lm *lm, @@ -344,45 +347,45 @@ /* Search indexed translated genome on server with an dna sequence. Translate * this sequence in three frames. Load homologous bits of genome locally * and do detailed alignment. Call 'outFunction' with each alignment * that is found. */ int gfMayConnect(char *hostName, char *portName); /* Set up our network connection to server, or return -1. */ int gfConnect(char *hostName, char *portName); /* Set up our network connection to server. Aborts on error. */ int gfDefaultRepMatch(int tileSize, int stepSize, boolean protTiles); /* Figure out appropriate step repMatch value. */ void gfMakeOoc(char *outName, char *files[], int fileCount, - int tileSize, bits32 maxPat, enum gfType tType); + int tileSize, bits32 maxPat, enum gfType tType, boolean noSimpRepMask); /* Count occurences of tiles in seqList and make a .ooc file. */ void gfLongDnaInMem(struct dnaSeq *query, struct genoFind *gf, boolean isRc, int minScore, Bits *qMaskBits, struct gfOutput *out, boolean fastMap, boolean band); /* Chop up query into pieces, align each, and stitch back * together again. */ void gfLongTransTransInMem(struct dnaSeq *query, struct genoFind *gfs[3], struct hash *t3Hash, boolean qIsRc, boolean tIsRc, boolean qIsRna, int minScore, struct gfOutput *out); /* Chop up query into pieces, align each in translated space, and stitch back * together again as nucleotides. */ struct gfClump *gfPcrClumps(struct genoFind *gf, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize, int minDistance, int maxDistance); /* Find possible PCR hits. The fPrimer and rPrimer are on opposite strands. * Note: unlike clumps from other query functions, PCR clumps from this * function have already had clump->target->start subtracted from * clump->tStart and clump->tEnd so that the coords are relative to that * target sequence (not the collection of all target sequences). */ #define MAXSINGLEPIECESIZE 5000 /* maximum size of a single piece */ -#define gfVersion "36x5" /* Current BLAT version number */ +#define gfVersion "36x6" /* Current BLAT version number */ #endif /* GENOFIND_H */