d75dbdc035385f4122259f47091fc09c06ad16c2
hiram
  Thu Jul 13 11:29:41 2017 -0700
appears to work OK with threads refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index 48c210a..f4f3a18 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -353,34 +353,35 @@
     cl->start = (long long *)needLargeMem(memSize);
     memSize = 5 * sizeof(int *);
     cl->offBy = (int **)needLargeMem(memSize);
     memSize = cl->crisprCount * sizeof(int);
     int r = 0;
     for (r = 0; r < 5; ++r)
         cl->offBy[r] = (int *)needLargeZeroedMem(memSize);
 
     long long i = 0;
     struct crispr *c = NULL;
     for (c = cl->chromCrisprs; c; c = c->next)
         {
 	++itemsCopied;
         cl->sequence[i] = c->sequence;
         cl->start[i] = c->start;
+	++i;
         }
     }
 long elapsedMs = clock1000() - startTime;
-timingMessage("copyToArray:", itemsCopied, "items copied", elapsedMs, "items/sec", "seconds/item");
+timingMessage("copyToArray", itemsCopied, "items copied", elapsedMs, "items/sec", "seconds/item");
 }	//	static void copyToArray(struct crisprList *list)	*/
 
 static struct crisprList *generateKmers(struct dnaSeq *seq)
 {
 struct crispr *crisprSet = NULL;
 struct crisprList *returnList = NULL;
 AllocVar(returnList);
 returnList->chrom = cloneString(seq->name);
 returnList->size = seq->size;
 int i;
 DNA *dna;
 long long chromPosition = 0;
 long long startGap = 0;
 long long gapCount = 0;
 int kmerLength = 0;
@@ -499,33 +500,33 @@
 	++itemsOutput;
 	int negativeOffset = 0;
         char strand = '+';
         if (negativeStrand & list->sequence[c])
 	    {
             strand = '-';
 	    negativeOffset = 3;
 	    }
 	long long txStart = list->start[c] + negativeOffset;
 	long long txEnd = txStart + 20;
 
         int totalOffs = list->offBy[0][c] + list->offBy[1][c] +
 	    list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c];
 
         char *color = itemColor(list->offBy, c);
-        char kmerString[32];
+        char kmerString[33];
         kmerValToString(kmerString, list->sequence[c], 3);
-        char pamString[32];
+        char pamString[33];
         kmerPAMString(pamString, list->sequence[c]);
 
         if (0 == totalOffs)
 verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString);
 
 	if (bedFH)
 	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c],  list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
 
         if (list->offBy[0][c])
            verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString);
 	++totalOut;
 	}
     }
 if (bedFH)
     carefulClose(&bedFH);
@@ -706,150 +707,186 @@
     }
 
 double score2 = 1.0;
 if (distCount > 1)
     {
     double avgDist = (double)distSum / distCount;
     score2 = 1.0 / (((19-avgDist)/19.0) * 4 + 1);
     }
 double score3 = 1.0;
 if (mmCount > 0)
     score3 = 1.0 / (mmCount * mmCount);
 
 return (score1 * score2 * score3 * 100);
 } //	static double calcHitScore(long long sequence1, long long sequence2)
 
+#ifdef NOT
+static double calcCfdScore(long long sequence1, long long sequence2)
+/* calcCfdScore - from Max' crispor.py script and paper:
+ *    https://www.ncbi.nlm.nih.gov/pubmed/27380939 */
+{
+return 0.0;
+}	// static double calcCfdScore(long long sequence1, long long sequence2)
+#endif
+
+static void misMatchString(char *stringReturn, long long misMatch)
+/* return ascii string ...*.....*.*.....*.. to indicate mis matches */
+{
+int i = 0;
+long long bitMask = 0xc000000000;
+while (bitMask)
+    {
+    stringReturn[i] = '.';
+    if (bitMask & misMatch)
+	stringReturn[i] = '*';
+    ++i;
+    bitMask >>= 2;
+    }
+stringReturn[i] = 0;
+}	// static void misMatchString(char *returnString, long long misMatch)
+
 static void recordOffTargets(struct crisprList *query,
     struct crisprList *target, int bitsOn, long long qIndex,
-	long long tIndex)
+	long long tIndex, long long twoBitMisMatch)
 /* bitsOn is from 1 to 4, record this match when less than 1000 total */
 {
-char queryString[32];	/* local storage for string */
-char targetString[32];	/* local storage for string */
-char queryPAM[32];	/* local storage for string */
-char targetPAM[32];	/* local storage for string */
+char queryString[33];	/* local storage for string */
+char targetString[33];	/* local storage for string */
+char queryPAM[33];	/* local storage for string */
+char targetPAM[33];	/* local storage for string */
+char misMatch[33];	/* ...*.....*.*... ascii string represent misMatch */
 
 if (query->offBy[0][qIndex] ) // no need to accumulate if 0-mismatch > 0
     return;
 
 if (offFile)
     {
     int i = 0;
     int bitsOnSum = 0;
     for (i = 1; i < 5; ++i)
         bitsOnSum += query->offBy[i][qIndex];
 
+    misMatchString(misMatch, twoBitMisMatch);
+
     if (bitsOnSum < 1000)	// could be command line option limit
         {
         double hitScore =
 		calcHitScore(query->sequence[qIndex], target->sequence[tIndex]);
+//        double cfdScore =
+//		calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]);
         kmerValToString(queryString, query->sequence[qIndex], 3);
         kmerValToString(targetString, target->sequence[tIndex], 3);
 	kmerPAMString(queryPAM, query->sequence[qIndex]);
 	kmerPAMString(targetPAM, target->sequence[tIndex]);
-        fprintf(offFile, "%s:%lld %c %s %s %.2f\t", query->chrom,
-            query->start[qIndex],
+        fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f\t%s:%lld %c\n",
+	    query->chrom, query->start[qIndex],
 		negativeStrand & query->sequence[qIndex] ? '-' : '+',
-                queryString, queryPAM, hitScore);
-        fprintf(offFile, "%s:%lld %c %s %s\t%d\n", target->chrom,
+                queryString, queryPAM, targetString, targetPAM,
+                misMatch, bitsOn, hitScore, target->chrom,
 		target->start[tIndex],
-		negativeStrand & target->sequence[tIndex] ? '-' : '+',
-                targetString, targetPAM, bitsOn);
+		negativeStrand & target->sequence[tIndex] ? '-' : '+');
         }
     }
 }	//	static void recordOffTargets(struct crisprList *query,
 	//	    struct crisprList *target, int bitsOn, long long qIndex,
 	//		long long tIndex)
 
-static void queryVsAll(struct crisprList *query, struct crisprList *target,
+
+/* this queryVsTarget can be used by threads, it should be safe,
+ * remains to be proven.
+ */
+static void queryVsTarget(struct crisprList *query, struct crisprList *target,
     int threadCount, int threadId)
 /* run the query crisprs list against the target list in the array structures */
 {
 struct crisprList *qList = NULL;
 long long totalCrisprsQuery = 0;
 long long totalCrisprsTarget = 0;
 long long totalCompares = 0;
 struct loopControl *control = NULL;
 AllocVar(control);
 if (threadCount > 1)
-    verbose(1, "# thread %d of %d threads running queryVsAll()\n",
-	threadId, threadCount);
+    verbose(1, "# thread %d of %d threads running queryVsTarget()\n",
+	1+threadId, threadCount);
 
 long processStart = clock1000();
 long elapsedMs = 0;
 
 for (qList = query; qList; qList = qList->next)
     {
     long long qCount = 0;
     totalCrisprsQuery += qList->crisprCount;
-    verbose(1, "# queryVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
+    verbose(1, "# queryVsTarget %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
     if (threadCount > 1)
 	{
 	setLoopEnds(control, qList->crisprCount, threadCount, threadId);
 	if (control->listStart >= qList->crisprCount)
 	    continue;	/* next chrom, no part to be done for this thread */
 	}
     else
 	{
 	control->listStart = 0;
 	control->listEnd = qList->crisprCount;
 	}
+    verbose(1, "# thread # %d of %d running %s items [ %lld : %lld )\n",
+	1 + threadId, threadCount, qList->chrom, control->listStart,
+	    control->listEnd);
     totalCrisprsTarget += control->listEnd - control->listStart;
     for (qCount = control->listStart; qCount < control->listEnd; ++qCount)
 	{
         struct crisprList *tList = NULL;
         for (tList = target; tList; tList = tList->next)
             {
             long long tCount = 0;
 	    totalCompares += tList->crisprCount;
             for (tCount = 0; tCount < tList->crisprCount; ++tCount)
                 {
 		/* the XOR determine differences in two sequences, the
 		 * shift right 6 removes the PAM sequence and
 		 * the 'fortyBits &' eliminates the negativeStrand bit
 		 */
                 long long misMatch = fortyBits &
                     ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6);
                 if (misMatch)
                     {
                     /* possible misMatch bit values: 01 10 11
                      *  turn those three values into just: 01
                      */
                     misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555;
                     int bitsOn = _mm_popcnt_u64(misMatch);
 		    if (bitsOn < 5)
 			{
-			recordOffTargets(qList, tList, bitsOn, qCount, tCount);
+			recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch);
 			qList->offBy[bitsOn][qCount] += 1;
 //			tList->offBy[bitsOn][tCount] += 1; not needed
 			}
                     }
                 else
                     { 	/* no misMatch, identical crisprs */
                     qList->offBy[0][qCount] += 1;
 //                  tList->offBy[0][tCount] += 1;	not needed
                     }
                 } // for (tCount = 0; tCount < tList->crisprCount; ++tCount)
             }	//	for (tList = target; tList; tList = tList->next)
 	}	//	for (qCount = 0; qCount < qList->crisprCount; ++qCount)
     }	//	for (qList = query; qList; qList = qList->next)
 verbose(1, "# done with scanning, check timing\n");
 elapsedMs = clock1000() - processStart;
-timingMessage("queryVsAll", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr");
-timingMessage("queryVsAll", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr");
-timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec", "seconds/compare");
-}	/* static struct crisprList *queryVsAll(struct crisprList *query,
+timingMessage("queryVsTarget", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr");
+timingMessage("queryVsTarget", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr");
+timingMessage("queryVsTarget", totalCompares, "total comparisons", elapsedMs, "compares/sec", "seconds/compare");
+}	/* static struct crisprList *queryVsTarget(struct crisprList *query,
 	    struct crisprList *target) */
 
 static void queryVsSelf(struct crisprList *all)
 /* run this 'all' list vs. itself avoiding self to self comparisons */
 {
 struct crisprList *qList = NULL;
 long long totalCrisprsQuery = 0;
 long long totalCrisprsCompare = 0;
 
 long processStart = clock1000();
 long elapsedMs = 0;
 
 /* query runs through all chroms */
 for (qList = all; qList; qList = qList->next)
     {
@@ -871,53 +908,54 @@
 		/* the XOR determine differences in two sequences, the
 		 * shift right 6 removes the PAM sequence and
 		 * the 'fortyBits &' eliminates the negativeStrand bit
 		 */
                 long long misMatch = fortyBits &
                     ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6);
                 if (misMatch)
                     {
                     /* possible misMatch bit values: 01 10 11
                      *  turn those three values into just: 01
                      */
                     misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555;
                     int bitsOn = _mm_popcnt_u64(misMatch);
 		    if (bitsOn < 5)
 			{
-			recordOffTargets(qList, tList, bitsOn, qCount, tCount);
+			recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch);
 			qList->offBy[bitsOn][qCount] += 1;
 			tList->offBy[bitsOn][tCount] += 1;
 			}
                     }
                 else
                     { 	/* no misMatch, identical crisprs */
                     qList->offBy[0][qCount] += 1;
                     tList->offBy[0][tCount] += 1;
                     }
                 } // for (tCount = 0; tCount < tList->crisprCount; ++tCount)
                 tStart = 0;	/* following chroms run through all */
             }	//	for (tList = target; tList; tList = tList->next)
 	}	//	for (qCount = 0; qCount < qList->crisprCount; ++qCount)
     }	//	for (qList = query; qList; qList = qList->next)
 elapsedMs = clock1000() - processStart;
 timingMessage("queryVsSelf", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr");
 timingMessage("queryVsSelf", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec", "seconds/compare");
 }	/* static void queryVsSelf(struct crisprList *all) */
 
 static struct crisprList *readKmers(char *fileIn)
 /* read in kmer list from 'fileIn', return list structure */
 {
+errAbort("# XXX readKmers function not implemented\n");
 verbose(1, "# reading crisprs from: %s\n", fileIn);
 struct crisprList *listReturn = NULL;
 struct lineFile *lf = lineFileOpen(fileIn, TRUE);
 char *row[10];
 int wordCount = 0;
 long long crisprsInput = 0;
 
 long startMs = clock1000();
 
 while (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row))) )
     {
     if (3 != wordCount)
 	errAbort("expecing three words at line %d in file %s, found: %d",
 	    lf->lineIx, fileIn, wordCount);
     struct crisprList *newItem = NULL;
@@ -945,82 +983,66 @@
 	    newItem->crisprCount, lf->lineIx, fileIn, verifyCount);
     crisprsInput += verifyCount;
     }
 
 lineFileClose(&lf);
 
 long elapsedMs = clock1000() - startMs;
 timingMessage("readKmers", crisprsInput, "crisprs read in", elapsedMs, "crisprs/sec", "seconds/crispr");
 
 return listReturn;
 }	//	static struct crisprList *readKmers(char *fileIn)
 
 static void writeKmers(struct crisprList *all, char *fileOut)
 /* write kmer list 'all' to 'fileOut' */
 {
+errAbort("# XXX writeKmers function not implemented\n");
 FILE *fh = mustOpen(fileOut, "w");
 struct crisprList *list = NULL;
 long long crisprsWritten = 0;
+char kmerString[33];
+char pamString[33];
 
 long startMs = clock1000();
 
 slReverse(&all);
 for (list = all; list; list = list->next)
     {
     fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size);
     struct crispr *c = NULL;
     slReverse(&list->chromCrisprs);
     for (c = list->chromCrisprs; c; c = c->next)
 	{
-	fprintf(fh, "%lld\t%lld\t%c\n", c->sequence,
+        kmerValToString(kmerString, c->sequence, 3);
+        kmerPAMString(pamString, c->sequence);
+	fprintf(fh, "%s\t%s\t%lld\t%c\n", kmerString, pamString,
 	    c->start, negativeStrand & c->sequence ? '-' : '+');
 	++crisprsWritten;
 	}
     }
 carefulClose(&fh);
 
 long elapsedMs = clock1000() - startMs;
 timingMessage("writeKmers", crisprsWritten, "crisprs written", elapsedMs, "crisprs/sec", "seconds/crispr");
 }	//	static void writeKmers(struct crisprList *all, char *fileOut)
 
 static void *threadFunction(void *id)
 /* thread entry */
 {
 struct threadControl *tId = (struct threadControl *)id;
 
-queryVsAll(tId->query, tId->target, tId->threadCount, tId->threadId);
-
-#ifdef NOT
-/*
- * for each list to process, divide the list into tId->threadCount parts,
- * then one thread runs part number tId->threadId bits of that list
- */
-struct loopControl *control = NULL;
-AllocVar(control);
-
-
-long long listCount = 1;
-while (listCount < 100) {
-setLoopEnds(control, listCount, tId->threadCount, tId->threadId);
-
-if (control->listStart >= listCount)
-  verbose(1, "#\t%lld\t# %d of %d does not run from %lld to < %lld in list of %lld size\n", listCount, tId->threadId, tId->threadCount, control->listStart, control->listEnd, listCount);
-else
-  verbose(1, "#\t%lld\t# %d of %d runs from %lld to < %lld in list of %lld size\n", listCount, tId->threadId, tId->threadCount, control->listStart, control->listEnd, listCount);
-  listCount += 13;
-}
-#endif
+queryVsTarget(tId->query, tId->target, tId->threadCount, tId->threadId);
 
 return NULL;
 }
 
 static void runThreads(int threadCount, struct crisprList *query,
     struct crisprList *target)
 {
 struct threadControl *threadIds = NULL;
 AllocArray(threadIds, threadCount);
 
 pthread_t *threads = NULL;
 AllocArray(threads, threadCount);
 
 int pt = 0;
 for (pt = 0; pt < threadCount; ++pt)
@@ -1036,31 +1058,30 @@
     if (rc)
         {
         errAbort("Unexpected error %d from pthread_create(): %s",rc,strerror(rc));
         }
     }
 
 /* Wait for threads to finish */
 for (pt = 0; pt < threadCount; ++pt)
     pthread_join(threads[pt], NULL);
 }
 
 static void crisprKmers(char *sequence)
 /* crisprKmers - find and annotate crispr sequences. */
 {
 struct crisprList *queryGuides = NULL;
-// struct crisprList *countedCrisprs = NULL;
 struct crisprList *allGuides = NULL;
 
 if (loadKmers)
     allGuides = readKmers(loadKmers);
 else
     allGuides = scanSequence(sequence);
 
 if (dumpKmers)
     {
     writeKmers(allGuides, dumpKmers);
     return;
     }
 
 long long vmPeak = currentVmPeak();
 verbose(1, "# vmPeak after scanSequence: %lld kB\n", vmPeak);
@@ -1081,34 +1102,34 @@
         copyToArray(queryGuides);
     if (allGuides)
 	copyToArray(allGuides);
     vmPeak = currentVmPeak();
     verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak);
     /* larger example: 62646196 kB */
     if ((vmPeak >> 20) > memLimit)	// the >> 20 converts kB to gB
 	{
 	threadCount = 1 + ((vmPeak >> 20) / memLimit);
 	verbose(1, "# over %d Gb at %lld kB, threadCount: %d\n", memLimit, vmPeak, threadCount);
 	}
     if (queryGuides)	// when range selected some query sequences
 	{
 	if (allGuides) // if there are any left on the all list
 	    {
-	    if (0 == 1 && threadCount > 1)
+	    if (threadCount > 1)
 		runThreads(threadCount, queryGuides, allGuides);
 	    else
-		queryVsAll(queryGuides, allGuides, 1, 0);
+		queryVsTarget(queryGuides, allGuides, 1, 0);
 	    }
 	/* then run up the query vs. itself avoiding self vs. self */
         queryVsSelf(queryGuides);
         countsOutput(queryGuides);
         }
     else
         {
 	queryVsSelf(allGuides); /* run up all vs. all avoiding self vs. self */
 	countsOutput(allGuides);
         }
 
     carefulClose(&offFile);
     }
 }	// static void crisprKmers(char *sequence)