155197c63e3259d0922c5c3a0e859f4812129db4 galt Sun May 17 17:52:03 2020 -0700 gfServer and blat and isPcr extended with new commandline option -noSimpRepMask to help with small genomes. refs #25477 diff --git src/jkOwnLib/genoFind.c src/jkOwnLib/genoFind.c index 604746d..ae05696 100644 --- src/jkOwnLib/genoFind.c +++ src/jkOwnLib/genoFind.c @@ -98,31 +98,31 @@ { if (isPep) { if (tileSize < 3 || tileSize > 8) errAbort("protein tileSize must be between 3 and 8"); } else { if (tileSize < 6 || tileSize > 18) errAbort("DNA tileSize must be between 6 and 18"); } } static struct genoFind *gfNewEmpty(int minMatch, int maxGap, int tileSize, int stepSize, int maxPat, - char *oocFile, boolean isPep, boolean allowOneMismatch) + char *oocFile, boolean isPep, boolean allowOneMismatch, boolean noSimpRepMask) /* Return an empty pattern space. oocFile parameter may be NULL*/ { struct genoFind *gf; int tileSpaceSize; int segSize = 0; gfCheckTileSize(tileSize, isPep); if (stepSize == 0) stepSize = tileSize; AllocVar(gf); if (isPep) { if (tileSize > 5) { segSize = tileSize - 5; @@ -133,49 +133,51 @@ { int seedBitSize; if (tileSize > 12) { segSize = tileSize - 12; } seedBitSize = (tileSize-segSize)*2; tileSpaceSize = gf->tileSpaceSize = (1<tileMask = tileSpaceSize-1; } gf->segSize = segSize; gf->tileSize = tileSize; gf->stepSize = stepSize; gf->isPep = isPep; gf->allowOneMismatch = allowOneMismatch; +gf->noSimpRepMask = noSimpRepMask; if (segSize > 0) { gf->endLists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->endLists[0])); maxPat = BIGNUM; /* Don't filter out overused on the big ones. It is * unnecessary and would be quite complex. */ } else { gf->lists = needHugeZeroedMem(tileSpaceSize * sizeof(gf->lists[0])); } gf->listSizes = needHugeZeroedMem(tileSpaceSize * sizeof(gf->listSizes[0])); gf->minMatch = minMatch; gf->maxGap = maxGap; gf->maxPat = maxPat; if (oocFile != NULL) { if (segSize > 0) errAbort("Don't yet support ooc on large tile sizes"); oocMaskCounts(oocFile, gf->listSizes, tileSize, maxPat); + if (!gf->noSimpRepMask) oocMaskSimpleRepeats(gf->listSizes, tileSize, maxPat); } return gf; } static int ntLookup[256]; static void initNtLookup() { static boolean initted = FALSE; if (!initted) { int i; for (i=0; imaxPat; int overCount = 0; for (i=0; itileSize; for (i=0; itotalSeqSize = offset; gfZeroOverused(gf); printf("Done adding\n"); return gf; } static void maskSimplePepRepeat(struct genoFind *gf) /* Remove tiles from index that represent repeats * of period one and two. */ { + +if (gf->noSimpRepMask) + { + return; + } + int i; int tileSize = gf->tileSize; int maxPat = gf->maxPat; bits32 *listSizes = gf->listSizes; for (i=0; i<20; ++i) { int j, k; for (j=0; j<20; ++j) { int tile = 0; for (k=0; ksources + sourceIx; gfAddSeq(gf, t3->trans[frame], offset[isRc][frame]); ss->fileName = cloneString(fileName); ss->start = offset[isRc][frame]; offset[isRc][frame] += t3->trans[frame]->size; ss->end = offset[isRc][frame]; } trans3Free(&t3); } } void gfIndexTransNibsAndTwoBits(struct genoFind *transGf[2][3], int fileCount, char *fileNames[], int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, - boolean allowOneMismatch, boolean doMask, int stepSize) + boolean allowOneMismatch, boolean doMask, int stepSize, boolean noSimpRepMask) /* Make translated (6 frame) index for all .nib and .2bit files. */ { struct genoFind *gf; int i,isRc, frame; bits32 offset[2][3]; char *fileName; struct dnaSeq *seq; int sourceCount = 0; long long totalBases = 0, warnAt = maxTotalBases(); if (allowOneMismatch) errAbort("Don't currently support allowOneMismatch in gfIndexTransNibsAndTwoBits"); /* Allocate indices for all reading frames. */ for (isRc=0; isRc <= 1; ++isRc) { for (frame = 0; frame < 3; ++frame) { transGf[isRc][frame] = gf = gfNewEmpty(minMatch, maxGap, - tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch); + tileSize, stepSize, maxPat, oocFile, TRUE, allowOneMismatch, noSimpRepMask); } } /* Mask simple AA repeats (of period 1 and 2). */ for (isRc = 0; isRc <= 1; ++isRc) for (frame = 0; frame < 3; ++frame) maskSimplePepRepeat(transGf[isRc][frame]); /* Scan through .nib and .2bit files once counting tiles. */ for (i=0; inext) { if (hashLookup(hash, seq->name)) errAbort("Error: sequence name %s is repeated in the database, all names must be unique.", seq->name); hashAdd(hash, seq->name, NULL); } hashFree(&hash); } struct genoFind *gfIndexSeq(bioSeq *seqList, int minMatch, int maxGap, int tileSize, int maxPat, char *oocFile, boolean isPep, boolean allowOneMismatch, boolean maskUpper, - int stepSize) + int stepSize, boolean noSimpRepMask) /* Make index for all seqs in list. For DNA sequences upper case bits will * be unindexed. */ { checkUniqueNames(seqList); struct genoFind *gf = gfNewEmpty(minMatch, maxGap, tileSize, stepSize, maxPat, - oocFile, isPep, allowOneMismatch); + oocFile, isPep, allowOneMismatch, noSimpRepMask); if (stepSize == 0) stepSize = tileSize; if (gf->segSize > 0) { gfLargeIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper); } else { gfSmallIndexSeq(gf, seqList, minMatch, maxGap, tileSize, maxPat, oocFile, isPep, maskUpper); } return gf; } static int bCmpSeqSource(const void *vTarget, const void *vRange) /* Compare function for binary search of gfSeqSource. */ @@ -1949,36 +1958,36 @@ * reading frames. Used for translated/translated protein comparisons. */ { int qFrame; int oneHit; int hitCount = 0; for (qFrame = 0; qFrame<3; ++qFrame) { gfTransFindClumps(gfs, seqs[qFrame], clumps[qFrame], lm, &oneHit); hitCount += oneHit; } *retHitCount = hitCount; } void gfMakeOoc(char *outName, char *files[], int fileCount, - int tileSize, bits32 maxPat, enum gfType tType) + int tileSize, bits32 maxPat, enum gfType tType, boolean noSimpRepMask) /* Count occurences of tiles in seqList and make a .ooc file. */ { boolean dbIsPep = (tType == gftProt || tType == gftDnaX || tType == gftRnaX); struct genoFind *gf = gfNewEmpty(gfMinMatch, gfMaxGap, tileSize, tileSize, - maxPat, NULL, dbIsPep, FALSE); + maxPat, NULL, dbIsPep, FALSE, noSimpRepMask); bits32 *sizes = gf->listSizes; int tileSpaceSize = gf->tileSpaceSize; bioSeq *seq, *seqList; bits32 sig = oocSig, psz = tileSize; bits32 i; int oocCount = 0; char *inName; FILE *f = mustOpen(outName, "w"); if (gf->segSize > 0) errAbort("Don't yet know how to make ooc files for large tile sizes."); for (i=0; i