b8456c1e9a533efa1cce9556dd2fd717f83bc7f4
hiram
  Thu Jul 13 17:13:36 2017 -0700
get the bed output file open immediately in case there is trouble and do not thread unless explicitly requested refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index f4f3a18..c9793ae 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -64,46 +64,46 @@
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "crisprKmers - annotate crispr guide sequences\n"
   "usage:\n"
   "   crisprKmers <sequence>\n"
   "options:\n"
   "   where <sequence> is a .2bit file or .fa fasta sequence\n"
   "   -verbose=N - for debugging control processing steps with level of verbose:\n"
   "   -bed=<file> - output results to given bed9+ file\n"
   "   -offTargets=<file> - output off target data to given file\n"
   "   -ranges=<file> - use specified bed3 file to limit which guides are\n"
   "                  - measured, only those with any overlap to these bed items.\n"
-  "   -memLimit=N - N number of gigabytes for each thread, default: 8\n"
-  "                 when memory size goes beyond N gB program will thread\n"
+  "   -memLimit=N - N number of gigabytes for each thread, default: no threading\n"
+  "               when memory size goes beyond N gB program will thread gB/N times\n"
   "   -dumpKmers=<file> - NOT VALID after scan of sequence, output kmers to file\n"
   "                     - process will exit after this, use -loadKmers to continue\n"
   "   -loadKmers=<file> - NOT VALID load kmers from previous scan of sequence from -dumpKmers"
   );
 }
 
 static char *bedFileOut = NULL;	/* set with -bed=<file> argument, kmer output */
 static char *offTargets = NULL;	/* write offTargets to <file> */
 static FILE *offFile = NULL;	/* file handle to write off targets */
 static char *ranges = NULL;	/* use ranges <file> to limit scanning */
 static struct hash *rangesHash = NULL;	/* ranges into hash + binkeeper */
 static char *dumpKmers = NULL;	/* file name to write out kmers from scan */
 static char *loadKmers = NULL;	/* file name to read in kmers from previous scan */
-static int memLimit = 8;	/* gB limit before going into thread mode */
+static int memLimit = 0;	/* gB limit before going into thread mode */
 static int threadCount = 1;	/* will be adjusted depending upon vmPeak */
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bed", OPTION_STRING},
    {"offTargets", OPTION_STRING},
    {"ranges", OPTION_STRING},
    {"memLimit", OPTION_INT},
    {"dumpKmers", OPTION_STRING},
    {"loadKmers", OPTION_STRING},
    {NULL, 0},
 };
 
 #define negativeStrand	0x0000800000000000
 
@@ -468,40 +468,36 @@
 else
     {
     int offSum = offBy[1][index] + offBy[2][index] + offBy[3][index] +
 		offBy[4][index];
     if (offSum < 100)
        return highScore;
     else if (offSum < 150)
        return mediumScore;
     else if (offSum < 250)
        return lowScore;
     else
        return twoMany;
     }
 }
 
-static void countsOutput(struct crisprList *all)
+static void countsOutput(struct crisprList *all, FILE *bedFH)
 /* everything has been scanned and counted, print out all the data from arrays*/
 {
 long startTime = clock1000();
 long long itemsOutput = 0;
 
-FILE *bedFH = NULL;
-if (bedFileOut)
-    bedFH = mustOpen(bedFileOut, "w");
-
 struct crisprList *list;
 long long totalOut = 0;
 for (list = all; list; list = list->next)
     {
     long long c = 0;
     for (c = 0; c < list->crisprCount; ++c)
         {
 	++itemsOutput;
 	int negativeOffset = 0;
         char strand = '+';
         if (negativeStrand & list->sequence[c])
 	    {
             strand = '-';
 	    negativeOffset = 3;
 	    }
@@ -521,31 +517,31 @@
 verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString);
 
 	if (bedFH)
 	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c],  list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
 
         if (list->offBy[0][c])
            verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString);
 	++totalOut;
 	}
     }
 if (bedFH)
     carefulClose(&bedFH);
 
 long elapsedMs = clock1000() - startTime;
 timingMessage("copyToArray:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item");
-}	// static void countsOutput(struct crisprList *all)
+}	//	static void countsOutput(struct crisprList *all, FILE *bedFH)
 
 static struct crisprList *scanSequence(char *inFile)
 /* scan the given file, return list of crisprs */
 {
 verbose(1, "#\tscanning sequence file: %s\n", inFile);
 dnaUtilOpen();
 struct dnaLoad *dl = dnaLoadOpen(inFile);
 struct dnaSeq *seq;
 struct crisprList *listReturn = NULL;
 
 long elapsedMs = 0;
 long scanStart = clock1000();
 long long totalCrisprs = 0;
 
 /* scanning all sequences, setting up crisprs on the listReturn */
@@ -792,55 +788,52 @@
 
 
 /* this queryVsTarget can be used by threads, it should be safe,
  * remains to be proven.
  */
 static void queryVsTarget(struct crisprList *query, struct crisprList *target,
     int threadCount, int threadId)
 /* run the query crisprs list against the target list in the array structures */
 {
 struct crisprList *qList = NULL;
 long long totalCrisprsQuery = 0;
 long long totalCrisprsTarget = 0;
 long long totalCompares = 0;
 struct loopControl *control = NULL;
 AllocVar(control);
-if (threadCount > 1)
-    verbose(1, "# thread %d of %d threads running queryVsTarget()\n",
-	1+threadId, threadCount);
 
 long processStart = clock1000();
 long elapsedMs = 0;
 
 for (qList = query; qList; qList = qList->next)
     {
     long long qCount = 0;
     totalCrisprsQuery += qList->crisprCount;
-    verbose(1, "# queryVsTarget %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
     if (threadCount > 1)
 	{
 	setLoopEnds(control, qList->crisprCount, threadCount, threadId);
 	if (control->listStart >= qList->crisprCount)
 	    continue;	/* next chrom, no part to be done for this thread */
 	}
     else
 	{
 	control->listStart = 0;
 	control->listEnd = qList->crisprCount;
 	}
-    verbose(1, "# thread # %d of %d running %s items [ %lld : %lld )\n",
-	1 + threadId, threadCount, qList->chrom, control->listStart,
+    verbose(1, "# thread %d of %d running %s %lld items [ %lld : %lld )\n",
+	1 + threadId, threadCount, qList->chrom,
+	    control->listEnd - control->listStart, control->listStart,
 		control->listEnd);
     totalCrisprsTarget += control->listEnd - control->listStart;
     for (qCount = control->listStart; qCount < control->listEnd; ++qCount)
 	{
         struct crisprList *tList = NULL;
         for (tList = target; tList; tList = tList->next)
             {
             long long tCount = 0;
 	    totalCompares += tList->crisprCount;
             for (tCount = 0; tCount < tList->crisprCount; ++tCount)
                 {
 		/* the XOR determine differences in two sequences, the
 		 * shift right 6 removes the PAM sequence and
 		 * the 'fortyBits &' eliminates the negativeStrand bit
 		 */
@@ -1054,31 +1047,31 @@
     threadData->query = query;
     threadData->target = target;
     threadIds[pt] = *threadData;
     int rc = pthread_create(&threads[pt], NULL, threadFunction, &threadIds[pt]);
     if (rc)
         {
         errAbort("Unexpected error %d from pthread_create(): %s",rc,strerror(rc));
         }
     }
 
 /* Wait for threads to finish */
 for (pt = 0; pt < threadCount; ++pt)
     pthread_join(threads[pt], NULL);
 }
 
-static void crisprKmers(char *sequence)
+static void crisprKmers(char *sequence, FILE *bedFH)
 /* crisprKmers - find and annotate crispr sequences. */
 {
 struct crisprList *queryGuides = NULL;
 struct crisprList *allGuides = NULL;
 
 if (loadKmers)
     allGuides = readKmers(loadKmers);
 else
     allGuides = scanSequence(sequence);
 
 if (dumpKmers)
     {
     writeKmers(allGuides, dumpKmers);
     return;
     }
@@ -1093,68 +1086,72 @@
 	offFile = mustOpen(offTargets, "w");
     /* if ranges have been specified, construct queryList of kmers to measure */
     if (rangesHash)
         {
 	/* result here is two exclusive sets: query, and allGuides
 	 *    the query crisprs have been extracted from the allGuides */
         queryGuides = rangeExtraction(& allGuides);
         }
     if (queryGuides)
         copyToArray(queryGuides);
     if (allGuides)
 	copyToArray(allGuides);
     vmPeak = currentVmPeak();
     verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak);
     /* larger example: 62646196 kB */
-    if ((vmPeak >> 20) > memLimit)	// the >> 20 converts kB to gB
+    if ((memLimit > 0) && ((vmPeak >> 20) > memLimit))	// the >> 20 converts kB to gB
 	{
 	threadCount = 1 + ((vmPeak >> 20) / memLimit);
 	verbose(1, "# over %d Gb at %lld kB, threadCount: %d\n", memLimit, vmPeak, threadCount);
 	}
     if (queryGuides)	// when range selected some query sequences
 	{
 	if (allGuides) // if there are any left on the all list
 	    {
 	    if (threadCount > 1)
 		runThreads(threadCount, queryGuides, allGuides);
 	    else
 		queryVsTarget(queryGuides, allGuides, 1, 0);
 	    }
 	/* then run up the query vs. itself avoiding self vs. self */
         queryVsSelf(queryGuides);
-        countsOutput(queryGuides);
+        countsOutput(queryGuides, bedFH);
         }
     else
         {
 	queryVsSelf(allGuides); /* run up all vs. all avoiding self vs. self */
-	countsOutput(allGuides);
+	countsOutput(allGuides, bedFH);
         }
 
     carefulClose(&offFile);
     }
-}	// static void crisprKmers(char *sequence)
+}	// static void crisprKmers(char *sequence, FILE *bedFH)
 
 int main(int argc, char *argv[])
 /* Process command line, initialize translation arrays and call the process */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 
 verbose(0, "# running verboseLevel: %d\n", verboseLevel());
 
 bedFileOut = optionVal("bed", bedFileOut);
 dumpKmers = optionVal("dumpKmers", dumpKmers);
 loadKmers = optionVal("loadKmers", loadKmers);
 offTargets = optionVal("offTargets", offTargets);
 memLimit = optionInt("memLimit", memLimit);
 ranges = optionVal("ranges", ranges);
 if (ranges)
     rangesHash = readRanges(ranges);
 
+FILE *bedFH = NULL;
+if (bedFileOut)
+    bedFH = mustOpen(bedFileOut, "w");
+
 initOrderedNtVal();	/* set up orderedNtVal[] */
-crisprKmers(argv[1]);
+crisprKmers(argv[1], bedFH);
 
 if (verboseLevel() > 1)
     printVmPeak();
 return 0;
 }