b8456c1e9a533efa1cce9556dd2fd717f83bc7f4 hiram Thu Jul 13 17:13:36 2017 -0700 get the bed output file open immediately in case there is trouble and do not thread unless explicitly requested refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index f4f3a18..c9793ae 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -64,46 +64,46 @@ void usage() /* Explain usage and exit. */ { errAbort( "crisprKmers - annotate crispr guide sequences\n" "usage:\n" " crisprKmers <sequence>\n" "options:\n" " where <sequence> is a .2bit file or .fa fasta sequence\n" " -verbose=N - for debugging control processing steps with level of verbose:\n" " -bed=<file> - output results to given bed9+ file\n" " -offTargets=<file> - output off target data to given file\n" " -ranges=<file> - use specified bed3 file to limit which guides are\n" " - measured, only those with any overlap to these bed items.\n" - " -memLimit=N - N number of gigabytes for each thread, default: 8\n" - " when memory size goes beyond N gB program will thread\n" + " -memLimit=N - N number of gigabytes for each thread, default: no threading\n" + " when memory size goes beyond N gB program will thread gB/N times\n" " -dumpKmers=<file> - NOT VALID after scan of sequence, output kmers to file\n" " - process will exit after this, use -loadKmers to continue\n" " -loadKmers=<file> - NOT VALID load kmers from previous scan of sequence from -dumpKmers" ); } static char *bedFileOut = NULL; /* set with -bed=<file> argument, kmer output */ static char *offTargets = NULL; /* write offTargets to <file> */ static FILE *offFile = NULL; /* file handle to write off targets */ static char *ranges = NULL; /* use ranges <file> to limit scanning */ static struct hash *rangesHash = NULL; /* ranges into hash + binkeeper */ static char *dumpKmers = NULL; /* file name to write out kmers from scan */ static char *loadKmers = NULL; /* file name to read in kmers from previous scan */ -static int memLimit = 8; /* gB limit before going into thread mode */ +static int memLimit = 0; /* gB limit before going into thread mode */ static int threadCount = 1; /* will be adjusted depending upon vmPeak */ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"offTargets", OPTION_STRING}, {"ranges", OPTION_STRING}, {"memLimit", OPTION_INT}, {"dumpKmers", OPTION_STRING}, {"loadKmers", OPTION_STRING}, {NULL, 0}, }; #define negativeStrand 0x0000800000000000 @@ -468,40 +468,36 @@ else { int offSum = offBy[1][index] + offBy[2][index] + offBy[3][index] + offBy[4][index]; if (offSum < 100) return highScore; else if (offSum < 150) return mediumScore; else if (offSum < 250) return lowScore; else return twoMany; } } -static void countsOutput(struct crisprList *all) +static void countsOutput(struct crisprList *all, FILE *bedFH) /* everything has been scanned and counted, print out all the data from arrays*/ { long startTime = clock1000(); long long itemsOutput = 0; -FILE *bedFH = NULL; -if (bedFileOut) - bedFH = mustOpen(bedFileOut, "w"); - struct crisprList *list; long long totalOut = 0; for (list = all; list; list = list->next) { long long c = 0; for (c = 0; c < list->crisprCount; ++c) { ++itemsOutput; int negativeOffset = 0; char strand = '+'; if (negativeStrand & list->sequence[c]) { strand = '-'; negativeOffset = 3; } @@ -521,31 +517,31 @@ verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString); if (bedFH) fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); if (list->offBy[0][c]) verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString); ++totalOut; } } if (bedFH) carefulClose(&bedFH); long elapsedMs = clock1000() - startTime; timingMessage("copyToArray:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item"); -} // static void countsOutput(struct crisprList *all) +} // static void countsOutput(struct crisprList *all, FILE *bedFH) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of crisprs */ { verbose(1, "#\tscanning sequence file: %s\n", inFile); dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; long elapsedMs = 0; long scanStart = clock1000(); long long totalCrisprs = 0; /* scanning all sequences, setting up crisprs on the listReturn */ @@ -792,55 +788,52 @@ /* this queryVsTarget can be used by threads, it should be safe, * remains to be proven. */ static void queryVsTarget(struct crisprList *query, struct crisprList *target, int threadCount, int threadId) /* run the query crisprs list against the target list in the array structures */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; long long totalCrisprsTarget = 0; long long totalCompares = 0; struct loopControl *control = NULL; AllocVar(control); -if (threadCount > 1) - verbose(1, "# thread %d of %d threads running queryVsTarget()\n", - 1+threadId, threadCount); long processStart = clock1000(); long elapsedMs = 0; for (qList = query; qList; qList = qList->next) { long long qCount = 0; totalCrisprsQuery += qList->crisprCount; - verbose(1, "# queryVsTarget %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); if (threadCount > 1) { setLoopEnds(control, qList->crisprCount, threadCount, threadId); if (control->listStart >= qList->crisprCount) continue; /* next chrom, no part to be done for this thread */ } else { control->listStart = 0; control->listEnd = qList->crisprCount; } - verbose(1, "# thread # %d of %d running %s items [ %lld : %lld )\n", - 1 + threadId, threadCount, qList->chrom, control->listStart, + verbose(1, "# thread %d of %d running %s %lld items [ %lld : %lld )\n", + 1 + threadId, threadCount, qList->chrom, + control->listEnd - control->listStart, control->listStart, control->listEnd); totalCrisprsTarget += control->listEnd - control->listStart; for (qCount = control->listStart; qCount < control->listEnd; ++qCount) { struct crisprList *tList = NULL; for (tList = target; tList; tList = tList->next) { long long tCount = 0; totalCompares += tList->crisprCount; for (tCount = 0; tCount < tList->crisprCount; ++tCount) { /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and * the 'fortyBits &' eliminates the negativeStrand bit */ @@ -1054,31 +1047,31 @@ threadData->query = query; threadData->target = target; threadIds[pt] = *threadData; int rc = pthread_create(&threads[pt], NULL, threadFunction, &threadIds[pt]); if (rc) { errAbort("Unexpected error %d from pthread_create(): %s",rc,strerror(rc)); } } /* Wait for threads to finish */ for (pt = 0; pt < threadCount; ++pt) pthread_join(threads[pt], NULL); } -static void crisprKmers(char *sequence) +static void crisprKmers(char *sequence, FILE *bedFH) /* crisprKmers - find and annotate crispr sequences. */ { struct crisprList *queryGuides = NULL; struct crisprList *allGuides = NULL; if (loadKmers) allGuides = readKmers(loadKmers); else allGuides = scanSequence(sequence); if (dumpKmers) { writeKmers(allGuides, dumpKmers); return; } @@ -1093,68 +1086,72 @@ offFile = mustOpen(offTargets, "w"); /* if ranges have been specified, construct queryList of kmers to measure */ if (rangesHash) { /* result here is two exclusive sets: query, and allGuides * the query crisprs have been extracted from the allGuides */ queryGuides = rangeExtraction(& allGuides); } if (queryGuides) copyToArray(queryGuides); if (allGuides) copyToArray(allGuides); vmPeak = currentVmPeak(); verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak); /* larger example: 62646196 kB */ - if ((vmPeak >> 20) > memLimit) // the >> 20 converts kB to gB + if ((memLimit > 0) && ((vmPeak >> 20) > memLimit)) // the >> 20 converts kB to gB { threadCount = 1 + ((vmPeak >> 20) / memLimit); verbose(1, "# over %d Gb at %lld kB, threadCount: %d\n", memLimit, vmPeak, threadCount); } if (queryGuides) // when range selected some query sequences { if (allGuides) // if there are any left on the all list { if (threadCount > 1) runThreads(threadCount, queryGuides, allGuides); else queryVsTarget(queryGuides, allGuides, 1, 0); } /* then run up the query vs. itself avoiding self vs. self */ queryVsSelf(queryGuides); - countsOutput(queryGuides); + countsOutput(queryGuides, bedFH); } else { queryVsSelf(allGuides); /* run up all vs. all avoiding self vs. self */ - countsOutput(allGuides); + countsOutput(allGuides, bedFH); } carefulClose(&offFile); } -} // static void crisprKmers(char *sequence) +} // static void crisprKmers(char *sequence, FILE *bedFH) int main(int argc, char *argv[]) /* Process command line, initialize translation arrays and call the process */ { optionInit(&argc, argv, options); if (argc != 2) usage(); verbose(0, "# running verboseLevel: %d\n", verboseLevel()); bedFileOut = optionVal("bed", bedFileOut); dumpKmers = optionVal("dumpKmers", dumpKmers); loadKmers = optionVal("loadKmers", loadKmers); offTargets = optionVal("offTargets", offTargets); memLimit = optionInt("memLimit", memLimit); ranges = optionVal("ranges", ranges); if (ranges) rangesHash = readRanges(ranges); +FILE *bedFH = NULL; +if (bedFileOut) + bedFH = mustOpen(bedFileOut, "w"); + initOrderedNtVal(); /* set up orderedNtVal[] */ -crisprKmers(argv[1]); +crisprKmers(argv[1], bedFH); if (verboseLevel() > 1) printVmPeak(); return 0; }