457e1eaee582c512cf6e860884f9f6677c8cf218 angie Mon Aug 5 15:16:45 2019 -0700 Added -perSeqMax=file option to gfServer, for assemblies with many alt/fix sequences. refs #23112 gfServer's default maxDnaHits=100 is exceeded easily for transcripts like NM_020535.3 that have clump matches to many spots on chr19 and its dozens of _alts, so only the largest exon matches make the cutoff and we lose the best full transcript alignments. Passing in -perSeqMax with a list of _alt and _fix filename:seq names enables us to apply the limit separately to each alt (maxDnaHits/2 per alt), so full transcript matches are found. There can be many more results for hgBlat to work through, which slows it down, but this should apply only when there are many clump matches to many alts. In the normal case in which a transcript has relatively few hits across the genome, and none to alts, hgBlat's performance is not affected. diff --git src/gfServer/gfServer.c src/gfServer/gfServer.c index 36f8e01..1def652 100644 --- src/gfServer/gfServer.c +++ src/gfServer/gfServer.c @@ -9,51 +9,53 @@ #include "portable.h" #include "net.h" #include "dnautil.h" #include "dnaseq.h" #include "nib.h" #include "twoBit.h" #include "fa.h" #include "dystring.h" #include "errAbort.h" #include "memalloc.h" #include "genoFind.h" #include "options.h" #include "trans3.h" #include "log.h" #include "internet.h" +#include "hash.h" static struct optionSpec optionSpecs[] = { {"canStop", OPTION_BOOLEAN}, {"log", OPTION_STRING}, {"logFacility", OPTION_STRING}, {"mask", OPTION_BOOLEAN}, {"maxAaSize", OPTION_INT}, {"maxDnaHits", OPTION_INT}, {"maxGap", OPTION_INT}, {"maxNtSize", OPTION_INT}, {"maxTransHits", OPTION_INT}, {"minMatch", OPTION_INT}, {"repMatch", OPTION_INT}, {"seqLog", OPTION_BOOLEAN}, {"ipLog", OPTION_BOOLEAN}, {"debugLog", OPTION_BOOLEAN}, {"stepSize", OPTION_INT}, {"tileSize", OPTION_INT}, {"trans", OPTION_BOOLEAN}, {"syslog", OPTION_BOOLEAN}, + {"perSeqMax", OPTION_STRING}, {NULL, 0} }; int maxNtSize = 40000; int maxAaSize = 8000; int minMatch = gfMinMatch; /* Can be overridden from command line. */ int tileSize = gfTileSize; /* Can be overridden from command line. */ int stepSize = 0; /* Can be overridden from command line. */ boolean doTrans = FALSE; /* Do translation? */ boolean allowOneMismatch = FALSE; int repMatch = 1024; /* Can be overridden from command line. */ int maxDnaHits = 100; /* Can be overridden from command line. */ int maxTransHits = 200; /* Can be overridden from command line. */ @@ -104,30 +106,34 @@ " -ipLog Include user's IP in log file (not logged with -syslog).\n" " -debugLog Include debugging info in log file.\n" " -syslog Log to syslog.\n" " -logFacility=facility Log to the specified syslog facility - default local0.\n" " -mask Use masking from .2bit file.\n" " -repMatch=N Number of occurrences of a tile (n-mer) that triggers repeat masking the\n" " tile. Default is %d.\n" " -maxDnaHits=N Maximum number of hits for a DNA query that are sent from the server.\n" " Default is %d.\n" " -maxTransHits=N Maximum number of hits for a translated query that are sent from the server.\n" " Default is %d.\n" " -maxNtSize=N Maximum size of untranslated DNA query sequence.\n" " Default is %d.\n" " -maxAaSize=N Maximum size of protein or translated DNA queries.\n" " Default is %d.\n" + " -perSeqMax=file File contains one seq filename (possibly with ':seq' suffix) per line.\n" + " -maxDnaHits will be applied to each filename[:seq] separately: each may\n" + " have at most maxDnaHits/2 hits.\n" + " Useful for assemblies with many alternate/patch sequences.\n" " -canStop If set, a quit message will actually take down the server.\n" , gfVersion, repMatch, maxDnaHits, maxTransHits, maxNtSize, maxAaSize ); } /* Note about file(s) specified in the start command: The path(s) specified here are sent back exactly as-is to clients such as gfClient, hgBlat, webBlat. It is intended that relative paths are used. Absolute paths starting with '/' tend not to work unless the client is on the same machine as the server. For use with hgBlat and webBlat, cd to the directory where the file is and use the plain file name with no slashes. hgBlat will append the path(s) given to dbDb.nibPath. @@ -220,52 +226,60 @@ { if (!isdigit(portName[0])) errAbort("Expecting a port number got %s", portName); return atoi(portName); } /* Some variables to gather statistics on usage. */ long baseCount = 0, blatCount = 0, aaCount = 0, pcrCount = 0; int warnCount = 0; int noSigCount = 0; int missCount = 0; int trimCount = 0; void dnaQuery(struct genoFind *gf, struct dnaSeq *seq, - int connectionHandle, char buf[256]) + int connectionHandle, char buf[256], struct hash *perSeqMaxHash) /* Handle a query for DNA/DNA match. */ { struct gfClump *clumpList = NULL, *clump; int limit = 1000; int clumpCount = 0, hitCount = -1; struct lm *lm = lmInit(0); if (seq->size > gf->tileSize + gf->stepSize + gf->stepSize) limit = maxDnaHits; clumpList = gfFindClumps(gf, seq, lm, &hitCount); if (clumpList == NULL) ++missCount; for (clump = clumpList; clump != NULL; clump = clump->next) { struct gfSeqSource *ss = clump->target; sprintf(buf, "%d\t%d\t%s\t%d\t%d\t%d", clump->qStart, clump->qEnd, ss->fileName, clump->tStart-ss->start, clump->tEnd-ss->start, clump->hitCount); netSendString(connectionHandle, buf); ++clumpCount; - if (--limit < 0) + int perSeqCount = -1; + if (perSeqMaxHash && + ((perSeqCount = hashIntValDefault(perSeqMaxHash, ss->fileName, -1)) >= 0)) + { + if (perSeqCount >= (maxDnaHits / 2)) + break; + hashIncInt(perSeqMaxHash, ss->fileName); + } + else if (--limit < 0) break; } gfClumpFreeList(&clumpList); lmCleanup(&lm); logDebug("%lu %d clumps, %d hits", clock1000(), clumpCount, hitCount); } void transQuery(struct genoFind *transGf[2][3], aaSeq *seq, int connectionHandle, char buf[256]) /* Handle a query for protein/translated DNA match. */ { struct gfClump *clumps[3], *clump; int isRc, frame; char strand; struct dyString *dy = newDyString(1024); @@ -429,48 +443,48 @@ { memTrackerEnd(); popAbortHandler(); // must come after memTracker } static void errorSafeCleanupMess(int connectionHandle, char *message) /* Clean up and report problem. */ { errorSafeCleanup(); logError("Recovering from error via longjmp"); netSendString(connectionHandle, message); } static void errorSafeQuery(boolean doTrans, boolean queryIsProt, struct dnaSeq *seq, struct genoFind *gf, struct genoFind *transGf[2][3], - int connectionHandle, char *buf) + int connectionHandle, char *buf, struct hash *perSeqMaxHash) /* Wrap error handling code around index query. */ { int status; errorSafeSetup(); status = setjmp(gfRecover); if (status == 0) /* Always true except after long jump. */ { if (doTrans) { if (queryIsProt) transQuery(transGf, seq, connectionHandle, buf); else transTransQuery(transGf, seq, connectionHandle, buf); } else - dnaQuery(gf, seq, connectionHandle, buf); + dnaQuery(gf, seq, connectionHandle, buf, perSeqMaxHash); errorSafeCleanup(); } else /* They long jumped here because of an error. */ { errorSafeCleanupMess(connectionHandle, "Error: gfServer out of memory. Try reducing size of query."); } } static void errorSafePcr(struct genoFind *gf, char *fPrimer, char *rPrimer, int maxDistance, int connectionHandle) /* Wrap error handling around pcr index query. */ { int status; errorSafeSetup(); @@ -487,55 +501,95 @@ } } boolean badPcrPrimerSeq(char *s) /* Return TRUE if have a character we can't handle in sequence. */ { unsigned char c; while ((c = *s++) != 0) { if (ntVal[c] < 0) return TRUE; } return FALSE; } +static struct hash *maybePerSeqMax(int fileCount, char *seqFiles[]) +/* If options include -perSeqMax=file, then read the sequences named in the file into a hash + * for testing membership in the set of sequences to exclude from -maxDnaHits accounting. */ +{ +struct hash *perSeqMaxHash = NULL; +char *fileName = optionVal("perSeqMax", NULL); +if (isNotEmpty(fileName)) + { + perSeqMaxHash = hashNew(0); + struct lineFile *lf = lineFileOpen(fileName, TRUE); + char *line; + while (lineFileNextReal(lf, &line)) + { + // Make sure line contains a valid seq filename (before optional ':seq') + char *seqFile = trimSpaces(line); + char copy[strlen(seqFile)+1]; + safecpy(copy, sizeof copy, seqFile); + char *colon = strrchr(copy, ':'); + if (colon) + *colon = '\0'; + if (stringArrayIx(copy, seqFiles, fileCount) < 0) + lineFileAbort(lf, "'%s' does not appear to be a sequence file from the " + "command line", copy); + hashAddInt(perSeqMaxHash, seqFile, 0); + } + lineFileClose(&lf); + } +return perSeqMaxHash; +} + +static void hashZeroVals(struct hash *hash) +/* Set the value of every element of hash to NULL (0 for ints). */ +{ +struct hashEl *hel; +struct hashCookie cookie = hashFirst(hash); +while ((hel = hashNext(&cookie)) != NULL) + hel->val = 0; +} + void startServer(char *hostName, char *portName, int fileCount, char *seqFiles[]) /* Load up index and hang out in RAM. */ { struct genoFind *gf = NULL; static struct genoFind *transGf[2][3]; char buf[256]; char *line, *command; struct sockaddr_in6 fromAddr; socklen_t fromLen; int readSize; int socketHandle = 0, connectionHandle = 0; int port = atoi(portName); time_t curtime; struct tm *loctime; char timestr[256]; netBlockBrokenPipes(); curtime = time (NULL); /* Get the current time. */ loctime = localtime (&curtime); /* Convert it to local time representation. */ strftime (timestr, sizeof(timestr), "%Y-%m-%d %H:%M", loctime); /* formate datetime as string */ logInfo("gfServer version %s on host %s, port %s (%s)", gfVersion, hostName, portName, timestr); +struct hash *perSeqMaxHash = maybePerSeqMax(fileCount, seqFiles); if (doTrans) { uglyf("starting translated server...\n"); logInfo("setting up translated index"); gfIndexTransNibsAndTwoBits(transGf, fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, NULL, allowOneMismatch, doMask, stepSize); } else { uglyf("starting untranslated server...\n"); logInfo("setting up untranslated index"); gf = gfIndexNibsAndTwoBits(fileCount, seqFiles, minMatch, maxGap, tileSize, repMatch, NULL, allowOneMismatch, stepSize); } @@ -703,31 +757,33 @@ ++trimCount; seq.size = maxSize; seq.dna[maxSize] = 0; } if (queryIsProt) aaCount += seq.size; else baseCount += seq.size; if (seqLog && (logGetFile() != NULL)) { FILE *lf = logGetFile(); faWriteNext(lf, "query", seq.dna, seq.size); fflush(lf); } errorSafeQuery(doTrans, queryIsProt, &seq, gf, - transGf, connectionHandle, buf); + transGf, connectionHandle, buf, perSeqMaxHash); + if (perSeqMaxHash) + hashZeroVals(perSeqMaxHash); } freez(&seq.dna); } netSendString(connectionHandle, "end"); } } } } else if (sameString("pcr", command)) { char *f = nextWord(&line); char *r = nextWord(&line); char *s = nextWord(&line); int maxDistance; ++pcrCount;