155197c63e3259d0922c5c3a0e859f4812129db4 galt Sun May 17 17:52:03 2020 -0700 gfServer and blat and isPcr extended with new commandline option -noSimpRepMask to help with small genomes. refs #25477 diff --git src/blat/blat.c src/blat/blat.c index dbb5a51..a48924c 100644 --- src/blat/blat.c +++ src/blat/blat.c @@ -31,30 +31,31 @@ /* Variables that can be set from command line. */ int tileSize = 11; int stepSize = 0; /* Default (same as tileSize) */ int minMatch = 2; int minScore = 30; int maxGap = 2; int repMatch = 1024*4; int dotEvery = 0; boolean oneOff = FALSE; boolean noHead = FALSE; boolean trimA = FALSE; boolean trimHardA = FALSE; boolean trimT = FALSE; boolean fastMap = FALSE; +boolean noSimpRepMask = FALSE; char *makeOoc = NULL; char *ooc = NULL; enum gfType qType = gftDna; enum gfType tType = gftDna; char *mask = NULL; char *repeats = NULL; char *qMask = NULL; double minRepDivergence = 15; double minIdentity = 90; char *outputFormat = "psl"; void usage() /* Explain usage and exit. */ { @@ -103,30 +104,31 @@ " -minScore=N Sets minimum score. This is the matches minus the \n" " mismatches minus some sort of gap penalty. Default is 30.\n" " -minIdentity=N Sets minimum sequence identity (in percent). Default is\n" " 90 for nucleotide searches, 25 for protein or translated\n" " protein searches.\n" " -maxGap=N Sets the size of maximum gap between tiles in a clump. Usually\n" " set from 0 to 3. Default is 2. Only relevent for minMatch > 1.\n" " -noHead Suppresses .psl header (so it's just a tab-separated file).\n" " -makeOoc=N.ooc Make overused tile file. Target needs to be complete genome.\n" " -repMatch=N Sets the number of repetitions of a tile allowed before\n" " it is marked as overused. Typically this is 256 for tileSize\n" " 12, 1024 for tile size 11, 4096 for tile size 10.\n" " Default is 1024. Typically comes into play only with makeOoc.\n" " Also affected by stepSize: when stepSize is halved, repMatch is\n" " doubled to compensate.\n" + " -noSimpRepMask Suppresses simple repeat masking.\n" " -mask=type Mask out repeats. Alignments won't be started in masked region\n" " but may extend through it in nucleotide searches. Masked areas\n" " are ignored entirely in protein or translated searches. Types are:\n" " lower - mask out lower-cased sequence\n" " upper - mask out upper-cased sequence\n" " out - mask according to database.out RepeatMasker .out file\n" " file.out - mask database according to RepeatMasker file.out\n" " -qMask=type Mask out repeats in query sequence. Similar to -mask above, but\n" " for query rather than target sequence.\n" " -repeats=type Type is same as mask types above. Repeat bases will not be\n" " masked in any way, but matches in repeat areas will be reported\n" " separately from matches in other areas in the psl output.\n" " -minRepDivergence=NN Minimum percent divergence of repeats to allow \n" " them to be unmasked. Default is 15. Only relevant for \n" " masking using RepeatMasker .out files.\n" @@ -173,30 +175,31 @@ {"makeOoc", OPTION_STRING}, {"repMatch", OPTION_INT}, {"mask", OPTION_STRING}, {"qMask", OPTION_STRING}, {"repeats", OPTION_STRING}, {"minRepDivergence", OPTION_FLOAT}, {"dots", OPTION_INT}, {"trimT", OPTION_BOOLEAN}, {"noTrimA", OPTION_BOOLEAN}, {"trimHardA", OPTION_BOOLEAN}, {"fastMap", OPTION_BOOLEAN}, {"out", OPTION_STRING}, {"fine", OPTION_BOOLEAN}, {"maxIntron", OPTION_INT}, {"extendThroughN", OPTION_BOOLEAN}, + {"noSimpRepMask", OPTION_BOOLEAN}, {NULL, 0}, }; /* Stuff to support various output formats. */ struct gfOutput *gvo; /* Overall output controller */ void searchOneStrand(struct dnaSeq *seq, struct genoFind *gf, FILE *psl, boolean isRc, struct hash *maskHash, Bits *qMaskBits) /* Search for seq in index, align it, and write results to psl. */ { if (fastMap && (seq->size > MAXSINGLEPIECESIZE)) errAbort("Maximum single piece size (%d) exceeded by query %s of size (%d). " "Larger pieces will have to be split up until no larger than this limit " "when the -fastMap option is used." @@ -484,31 +487,31 @@ gvo->fileHead(gvo, out); for (isRc = FALSE; isRc <= 1; ++isRc) { /* Initialize local pointer arrays to NULL to prevent surprises. */ for (frame = 0; frame < 3; ++frame) { gfs[frame] = NULL; dbSeqLists[frame] = NULL; } t3List = seqListToTrans3List(untransList, dbSeqLists, &t3Hash); for (frame = 0; frame < 3; ++frame) { gfs[frame] = gfIndexSeq(dbSeqLists[frame], minMatch, maxGap, tileSize, - repMatch, ooc, TRUE, oneOff, FALSE, stepSize); + repMatch, ooc, TRUE, oneOff, FALSE, stepSize, noSimpRepMask); } for (i=0; i<queryCount; ++i) { aaSeq qSeq; lf = lineFileOpen(queryFiles[i], TRUE); while (faMixedSpeedReadNext(lf, &qSeq.dna, &qSeq.size, &qSeq.name)) { dotOut(); /* Put it into right case and optionally mask on case. */ if (forceLower) toLowerN(qSeq.dna, qSeq.size); else if (forceUpper) toUpperN(qSeq.dna, qSeq.size); @@ -556,31 +559,31 @@ char **dbFiles, **queryFiles; int dbCount, queryCount; struct dnaSeq *dbSeqList, *seq; struct genoFind *gf; boolean tIsProt = (tType == gftProt); boolean qIsProt = (qType == gftProt); boolean bothSimpleNuc = (tType == gftDna && (qType == gftDna || qType == gftRna)); boolean bothSimpleProt = (tIsProt && qIsProt); FILE *f = mustOpen(outName, "w"); boolean showStatus = (f != stdout); databaseName = dbFile; gfClientFileArray(dbFile, &dbFiles, &dbCount); if (makeOoc != NULL) { - gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType); + gfMakeOoc(makeOoc, dbFiles, dbCount, tileSize, repMatch, tType, noSimpRepMask); if (showStatus) printf("Done making %s\n", makeOoc); exit(0); } gfClientFileArray(queryFile, &queryFiles, &queryCount); dbSeqList = gfClientSeqList(dbCount, dbFiles, tIsProt, tType == gftDnaX, repeats, minRepDivergence, showStatus); databaseSeqCount = slCount(dbSeqList); for (seq = dbSeqList; seq != NULL; seq = seq->next) databaseLetters += seq->size; gvo = gfOutputAny(outputFormat, minIdentity*10, qIsProt, tIsProt, noHead, databaseName, databaseSeqCount, databaseLetters, minIdentity, f); if (bothSimpleNuc || bothSimpleProt) @@ -593,31 +596,31 @@ maskHash = newHash(0); for (seq = dbSeqList; seq != NULL; seq = seq->next) { Bits *maskedBits = maskFromUpperCaseSeq(seq); hashAdd(maskHash, seq->name, maskedBits); } } /* Handle masking and indexing. If masking is off, we want the indexer * to see unmasked sequence, otherwise we want it to see masked. However * after indexing we always want it unmasked, because things are always * unmasked for the extension phase. */ if (mask == NULL && !bothSimpleProt) gfClientUnmask(dbSeqList); gf = gfIndexSeq(dbSeqList, minMatch, maxGap, tileSize, repMatch, ooc, - tIsProt, oneOff, FALSE, stepSize); + tIsProt, oneOff, FALSE, stepSize, noSimpRepMask); if (mask != NULL) gfClientUnmask(dbSeqList); searchOneIndex(queryCount, queryFiles, gf, outName, tIsProt, maskHash, f, showStatus); freeHash(&maskHash); } else if (tType == gftDnaX && qType == gftProt) { bigBlat(dbSeqList, queryCount, queryFiles, outName, FALSE, TRUE, f, showStatus); } else if (tType == gftDnaX && (qType == gftDnaX || qType == gftRnaX)) { bigBlat(dbSeqList, queryCount, queryFiles, outName, TRUE, qType == gftDnaX, f, showStatus); } else @@ -702,30 +705,31 @@ oneOff = FALSE; maxGap = 0; } /* Get tile size and related parameters from user and make sure * they are within range. */ tileSize = optionInt("tileSize", tileSize); stepSize = optionInt("stepSize", tileSize); minMatch = optionInt("minMatch", minMatch); oneOff = optionExists("oneOff"); fastMap = optionExists("fastMap"); minScore = optionInt("minScore", minScore); maxGap = optionInt("maxGap", maxGap); minRepDivergence = optionFloat("minRepDivergence", minRepDivergence); minIdentity = optionFloat("minIdentity", minIdentity); +noSimpRepMask = optionExists("noSimpRepMask"); gfCheckTileSize(tileSize, tIsProtLike); if (minMatch < 0) errAbort("minMatch must be at least 1"); if (maxGap > 100) errAbort("maxGap must be less than 100"); /* Set repMatch parameter from command line, or * to reasonable value that depends on tile size. */ if (optionExists("repMatch")) repMatch = optionInt("repMatch", repMatch); else repMatch = gfDefaultRepMatch(tileSize, stepSize, tIsProtLike);