47a02857acb0e52455455f577dbe801d9b2b0a66
hiram
  Sat Jul 8 23:04:40 2017 -0700
now recording off-target data and discovered that (a > 1) is dramatically different from (a >> 1) with no warning of course, refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index 4dc93cb..6a0595b 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -18,49 +18,53 @@
 /* Explain usage and exit. */
 {
 errAbort(
   "crisprKmers - annotate crispr sequences\n"
   "usage:\n"
   "   crisprKmers <sequence>\n"
   "options:\n"
   "   where <sequence> is a .2bit file or .fa fasta sequence\n"
   "   -verbose=N - control processing steps with level of verbose:\n"
   "                default N=1 - only find crisprs, set up listings\n"
   "                N=2 - empty process crispr list into new list\n"
   "                N=3 - only count identicals during processing\n"
   "                N=4 - attempt measure all off targets and\n"
   "                      print out all crisprs as bed format\n"
   "   -bed=<file> - output kmers to given bed9+ file\n"
+  "   -offTargets=<file> - output offTargets to given file\n"
   "   -ranges=<file> - use specified bed3 file to limit which crisprs are\n"
   "                  - measured, only those with any overlap to these bed items.\n"
   "   -dumpKmers=<file> - after scan of sequence, output kmers to file\n"
   "                     - process will exit after this, use -loadKmers to continue\n"
   "   -loadKmers=<file> - load kmers from previous scan of sequence from -dumpKmers\n"
   "   NOTE: It is faster to simply scan the sequence to get the system ready\n"
   "         to go.  Reading in the kmer file takes longer than scanning."
   );
 }
 
-static char *bedFileOut = NULL;	/* set with -bed=<file> argument */
-static char *ranges = NULL;	/* set with -ranges=<file> argument */
+static char *bedFileOut = NULL;	/* set with -bed=<file> argument, kmer output */
+static char *offTargets = NULL;	/* write offTargets to <file> */
+static FILE *offFile = NULL;	/* file handle to write off targets */
+static char *ranges = NULL;	/* use ranges <file> to limit scanning */
 static struct hash *rangesHash = NULL;	/* ranges into hash + binkeeper */
 static char *dumpKmers = NULL;	/* file name to write out kmers from scan */
 static char *loadKmers = NULL;	/* file name to read in kmers from previous scan */
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bed", OPTION_STRING},
+   {"offTargets", OPTION_STRING},
    {"ranges", OPTION_STRING},
    {"dumpKmers", OPTION_STRING},
    {"loadKmers", OPTION_STRING},
    {NULL, 0},
 };
 
 struct crispr
 /* one chromosome set of crisprs */
     {
     struct crispr *next;		/* Next in list. */
     long long sequence;	/* sequence value in 2bit format */
     long long start;		/* chromosome start 0-relative */
     char strand;			/* strand: + or - */
     };
 
@@ -163,67 +167,73 @@
 /* Compare slPairs on start value */
 {
 const struct crispr *a = *((struct crispr **)va);
 const struct crispr *b = *((struct crispr **)vb);
 long long aVal = a->start;
 long long bVal = b->start;
 if (aVal < bVal)
     return -1;
 else if (aVal > bVal)
     return 1;
 else
     return 0;
 }
 #endif
 
-static char *kmerPAMStringArray(long long val)
+/* these two kmerVal to strings routines could be reduced to one and they
+ * could use lookup tables to run faster
+ */
+static char *kmerPAMString(long long val)
 /* return the ASCII string for last three bases in then binary sequence value */
 {
 static char pamString[32];
 long long twoBitMask = 0x30;
 int shiftCount = 4;
 int baseCount = 0;
 
 while (twoBitMask)
     {
     int base = (val & twoBitMask) >> shiftCount;
     pamString[baseCount++] = bases[base];
     twoBitMask >>= 2;
     shiftCount -= 2;
     }
 pamString[baseCount] = 0;
 return pamString;
-}	//	static char *kmerValToStringArray(struct crispr *c, int trim)
+}	//	static char *kmerPAMString(struct crispr *c, int trim)
 
-static char *kmerValToStringArray(long long val, int trim)
+/* beware, this needs to be used immediately upon return since it is
+ * returning its static answer
+ */
+static char *kmerValToString(long long val, int trim)
 /* return ASCII string for binary sequence value */
 {
 static char kmerString[32];
 long long twoBitMask = 0x300000000000;
 int shiftCount = 44;
 int baseCount = 0;
 
 while (twoBitMask && (shiftCount >= (2*trim)))
     {
     int base = (val & twoBitMask) >> shiftCount;
     kmerString[baseCount++] = bases[base];
     twoBitMask >>= 2;
     shiftCount -= 2;
     }
 kmerString[baseCount] = 0;
 return kmerString;
-}	//	static char *kmerValToStringArray(long long val, int trim)
+}	//	static char *kmerValToString(long long val, int trim)
 
 static long long revComp(long long val)
 /* reverse complement the 2-bit numerical value kmer */
 {
 /* complement bases and add 18 0 bits
  *    because this divide and conquer works on 64 bits, not 46,
  *      hence the addition of the 18 zero bits which fall out
  * The 'val ^ fortySixBits' does the complement, the shifting and
  *     masking does the reversing.
  */
 register long long v = (val ^ fortySixBits) << 18;
 v = ((v & high32bits) >> 32) | ((v & low32bits) << 32);
 v = ((v & high16bits) >> 16) | ((v & low16bits) << 16);
 v = ((v & high8bits) >> 8) | ((v & low8bits) << 8);
 v = ((v & high4bits) >> 4) | ((v & low4bits) << 4);
@@ -324,66 +334,66 @@
             if (startGap != chromPosition)
                 verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+");
             else
                 verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+");
             kmerLength = 0;  /* reset, start over */
             kmerVal = 0;
             }	// else if (val >= 0)
     ++chromPosition;
     }	// for (i=0; i < seq->size; ++i)
 // slReverse(&crisprSet);	// save time, order not important at this time
 returnList->chromCrisprs = crisprSet;
 returnList->crisprCount = slCount(crisprSet);
 return returnList;
 }	// static struct crisprList *generateKmers(struct dnaSeq *seq)
 
-static void showCountsArray(struct crisprList *all)
+static void countsOutput(struct crisprList *all)
 /* everything has been scanned and counted, print out all the data from arrays*/
 {
 FILE *bedFH = NULL;
 if (bedFileOut)
     bedFH = mustOpen(bedFileOut, "w");
 
 struct crisprList *list;
 long long totalOut = 0;
 for (list = all; list; list = list->next)
     {
     long long c = 0;
     for (c = 0; c < list->crisprCount; ++c)
         {
 	int negativeOffset = 0;
         if (list->strand[c] == '-')
 	    negativeOffset = 3;
 	long long txStart = list->start[c] + negativeOffset;
-	long long txEnd = txStart + 20 + negativeOffset;
+	long long txEnd = txStart + 20;
         int totalOffs = list->offBy[0][c] + list->offBy[1][c] +
 	    list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c];
 
         if (0 == totalOffs)
-verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], list->strand[c], kmerValToStringArray(list->sequence[c], 3));
+verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3));
 
 	if (bedFH)
-	    fprintf(bedFH, "%s\t%lld\t%lld\t%s\t%d\t%c\t%lld\t%lld\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, kmerValToStringArray(list->sequence[c], 3), list->offBy[0][c], list->strand[c], txStart, txEnd, kmerPAMStringArray(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
+	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%d\t%d\t%d\t%d\t%s\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c],  list->offBy[0][c], list->strand[c], txStart, txEnd, kmerPAMString(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], kmerValToString(list->sequence[c], 3));
 
         if (list->offBy[0][c])
-           verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], list->strand[c], kmerValToStringArray(list->sequence[c], 3));
+           verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3));
 	++totalOut;
 	}
     }
 if (bedFH)
     carefulClose(&bedFH);
-}	// static void showCountsArray(struct crisprList *all)
+}	// static void countsOutput(struct crisprList *all)
 
 static struct crisprList *scanSequence(char *inFile)
 /* scan the given file, return list of crisprs */
 {
 verbose(1, "#\tscanning sequence file: %s\n", inFile);
 dnaUtilOpen();
 struct dnaLoad *dl = dnaLoadOpen(inFile);
 struct dnaSeq *seq;
 struct crisprList *listReturn = NULL;
 
 long elapsedMs = 0;
 long scanStart = clock1000();
 long long totalCrisprs = 0;
 
 /* scanning all sequences, setting up crisprs on the listReturn */
@@ -418,40 +428,40 @@
 struct crisprList *all = *allReference;
 struct crisprList *listReturn = NULL;
 struct crisprList *list = NULL;
 int inputChromCount = slCount(all);
 long long returnListCrisprCount = 0;
 long long examinedCrisprCount = 0;
 struct crisprList *prevChromList = NULL;
 
 long elapsedMs = 0;
 long scanStart = clock1000();
 
 struct crisprList *nextList = NULL;
 for (list = all; list; list = nextList)
     {
     nextList = list->next;	// remember before perhaps lost
+    examinedCrisprCount += list->crisprCount;
     struct crispr *c = NULL;
     struct binKeeper *bk = hashFindVal(rangesHash, list->chrom);
     struct crispr *newCrispr = NULL;
     if (bk != NULL)
         {
 	struct crispr *prevCrispr = NULL;
 	struct crispr *next = NULL;
         for (c = list->chromCrisprs; c; c = next)
             {
-	    ++examinedCrisprCount;
             struct binElement *hitList = NULL;
 	    next = c->next;	// remember before perhaps lost
             int start = c->start;
             if (c->strand == '-')
                 start += 2;
             int end = start + 20;
             hitList = binKeeperFind(bk, start, end);
             if (hitList)
 		{
 		++returnListCrisprCount;
 		if (prevCrispr)	// remove this one from the all list
 		    prevCrispr->next = next;
                 else
                     list->chromCrisprs = next;	// removing the first one
 		c->next = NULL;
@@ -477,169 +487,196 @@
 		prevChromList->next = nextList;
 	    else
                 all = nextList;	// removing the first one
 	    }
 	else
 	    {
 	    list->crisprCount = slCount(list->chromCrisprs);
 	    verbose(1, "# range scan same chrom list still has %lld crisprs on chrom %s\n",
 		list->crisprCount, list->chrom);
 	    }
 	}
     prevChromList = list;
     }	//	for (list = all; list; list = list->next)
 
 elapsedMs = clock1000() - scanStart;
-verbose(1, "# range scan, input %d chromosomes, return %d chromosomes\n",
-    inputChromCount, slCount(listReturn));
+verbose(1, "# range scanning %d chroms, return %d selected chroms, leaving %d chroms\n",
+    inputChromCount, slCount(listReturn), slCount(all));
 timingMessage("range scan", examinedCrisprCount, "examined crisprs", elapsedMs, "crisprs/sec");
 timingMessage("range scan", returnListCrisprCount, "returned crisprs", elapsedMs, "crisprs/sec");
 if (NULL == all)
     {
     allReference = NULL;	// they have all been removed
     verbose(1, "# range scan, every single chrom has been removed\n");
     }
 else if (*allReference != all)
     {
     verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all);
     *allReference = all;
     }
 return listReturn;
 }	//	static crisprList *rangeExtraction(crisprList *all)
 
-static void queryVsAllArray(struct crisprList *query,
+static void recordOffTargets(struct crisprList *query,
+    struct crisprList *target, int bitsOn, long long qIndex,
+	long long tIndex)
+/* bitsOn is from 1 to 4, record this match when less than 10 count */
+{
+if (offFile)
+    {
+    if (query->offBy[bitsOn][qIndex] < 10)
+	{/* needs to be two prints as the kmerValToString return is confused */
+	fprintf(offFile, "%s:%lld %c %s %#020llx\t", query->chrom, query->start[qIndex],
+	    query->strand[qIndex], kmerValToString(query->sequence[qIndex], 3),
+(unsigned long long)((query->sequence[qIndex] & 0x3fffffffffc0)>>6));
+        fprintf(offFile, "%s:%lld %c %s %#020llx\t%d\n", target->chrom,
+	    target->start[tIndex], target->strand[tIndex],
+		kmerValToString(target->sequence[tIndex], 3),
+(unsigned long long)((target->sequence[tIndex] & 0x3fffffffffc0)>>6), bitsOn);
+	}
+    }
+}
+
+static void queryVsAll(struct crisprList *query,
     struct crisprList *target)
 /* run the query crisprs list against the target list in the array structures */
 {
 struct crisprList *qList = NULL;
 long long totalCrisprsQuery = 0;
+long long totalCrisprsTarget = 0;
 long long totalCompares = 0;
 
 long processStart = clock1000();
 long elapsedMs = 0;
 
 for (qList = query; qList; qList = qList->next)
     {
     long long qCount = 0;
     totalCrisprsQuery += qList->crisprCount;
-    verbose(1, "# queryVsAllArray %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
+    verbose(1, "# queryVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
     for (qCount = 0; qCount < qList->crisprCount; ++qCount)
 	{
         struct crisprList *tList = NULL;
         for (tList = target; tList; tList = tList->next)
             {
             long long tCount = 0;
 	    totalCompares += tList->crisprCount;
             if (0 == qCount)
-                verbose(1, "# queryVsAllArray %lld target crisprs on chrom %s\n", tList->crisprCount, tList->chrom);
+{
+totalCrisprsTarget += tList->crisprCount;
+verbose(1, "# queryVsAll %lld target crisprs on chrom %s %lld targets %lld compares\n", tList->crisprCount, tList->chrom, totalCrisprsTarget, totalCompares);
+}
             for (tCount = 0; tCount < tList->crisprCount; ++tCount)
                 {
                 /* the XOR will determine differences in two sequences
                  *  the shift right 6 removes the PAM sequence
                  */
                 long long misMatch =
                     (qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6;
                 if (misMatch)
                     {
                     /* possible misMatch bit values: 01 10 11
                      *  turn those three values into just: 01
                      */
-                    misMatch = (misMatch | (misMatch > 1)) & 0x5555555555;
+                    misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555;
                     int bitsOn = _mm_popcnt_u64(misMatch);
 		    if (bitsOn < 5)
 			{
+			recordOffTargets(qList, tList, bitsOn, qCount, tCount);
 			qList->offBy[bitsOn][qCount] += 1;
 			tList->offBy[bitsOn][tCount] += 1;
 			}
                     }
                 else
                     { 	/* no misMatch, identical crisprs */
                     qList->offBy[0][qCount] += 1;
                     tList->offBy[0][tCount] += 1;
                     }
                 } // for (tCount = 0; tCount < tList->crisprCount; ++tCount)
             }	//	for (tList = target; tList; tList = tList->next)
 	}	//	for (qCount = 0; qCount < qList->crisprCount; ++qCount)
     }	//	for (qList = query; qList; qList = qList->next)
 verbose(1, "# done with scanning, check timing\n");
 elapsedMs = clock1000() - processStart;
-timingMessage("queryVsAllArray", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec");
-timingMessage("queryVsAllArray", totalCompares, "total comparisons", elapsedMs, "compares/sec");
-}	/* static struct crisprList *queryVsAllArray(struct crisprList *query,
+timingMessage("queryVsAll", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec");
+timingMessage("queryVsAll", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec");
+timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec");
+}	/* static struct crisprList *queryVsAll(struct crisprList *query,
 	    struct crisprList *target) */
 
-static void allVsAllArray(struct crisprList *all)
+static void allVsAll(struct crisprList *all)
 /* run this 'all' list vs. itself with arrays */
 {
 struct crisprList *qList = NULL;
 long long totalCrisprsQuery = 0;
 long long totalCrisprsCompare = 0;
 
 long processStart = clock1000();
 long elapsedMs = 0;
 
 /* query runs through all chroms */
 for (qList = all; qList; qList = qList->next)
     {
     long long qCount = 0;
     totalCrisprsQuery += qList->crisprCount;
-    verbose(1, "# allVsAllArray %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
+    verbose(1, "# allVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
     /* query runs through all kmers on this chrom */
     for (qCount = 0; qCount < qList->crisprCount; ++qCount)
 	{
 	/* target starts on same chrom as query, and
 	   at next kmer after query for this first chrom */
         long long tStart = qCount+1;
         struct crisprList *tList = NULL;
         for (tList = qList; tList; tList = tList->next)
             {
             long long tCount = tStart;
 	    totalCrisprsCompare += tList->crisprCount - tStart;
             for (tCount = tStart; tCount < tList->crisprCount; ++tCount)
                 {
                 /* the XOR will determine differences in two sequences
                  *  the shift right 6 removes the PAM sequence
                  */
                 long long misMatch =
                     (qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6;
                 if (misMatch)
                     {
                     /* possible misMatch bit values: 01 10 11
                      *  turn those three values into just: 01
                      */
-                    misMatch = (misMatch | (misMatch > 1)) & 0x5555555555;
+                    misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555;
                     int bitsOn = _mm_popcnt_u64(misMatch);
 		    if (bitsOn < 5)
 			{
+			recordOffTargets(qList, tList, bitsOn, qCount, tCount);
 			qList->offBy[bitsOn][qCount] += 1;
 			tList->offBy[bitsOn][tCount] += 1;
 			}
                     }
                 else
                     { 	/* no misMatch, identical crisprs */
                     qList->offBy[0][qCount] += 1;
                     tList->offBy[0][tCount] += 1;
                     }
                 } // for (tCount = 0; tCount < tList->crisprCount; ++tCount)
                 tStart = 0;	/* following chroms run through all */
             }	//	for (tList = target; tList; tList = tList->next)
 	}	//	for (qCount = 0; qCount < qList->crisprCount; ++qCount)
     }	//	for (qList = query; qList; qList = qList->next)
 elapsedMs = clock1000() - processStart;
-timingMessage("allVsAllArray", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec");
-timingMessage("allVsAllArray", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec");
-}	/* static struct crisprList *allVsAllArray(struct crisprList *query,
+timingMessage("allVsAll", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec");
+timingMessage("allVsAll", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec");
+}	/* static struct crisprList *allVsAll(struct crisprList *query,
 	    struct crisprList *target) */
 
 static struct crisprList *readKmers(char *fileIn)
 /* read in kmer list from 'fileIn', return list structure */
 {
 verbose(1, "# reading crisprs from: %s\n", fileIn);
 struct crisprList *listReturn = NULL;
 struct lineFile *lf = lineFileOpen(fileIn, TRUE);
 char *row[10];
 int wordCount = 0;
 long long crisprsInput = 0;
 
 long startMs = clock1000();
 
 while (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row))) )
@@ -719,64 +756,68 @@
 
 if (loadKmers)
     allCrisprs = readKmers(loadKmers);
 else
     allCrisprs = scanSequence(sequence);
 
 if (dumpKmers)
     {
     writeKmers(allCrisprs, dumpKmers);
     return;
     }
 
 /* processing those crisprs */
 if (verboseLevel() > 1)
     {
+    if (offTargets)
+	offFile = mustOpen(offTargets, "w");
     /* if ranges have been specified, construct queryList of kmers to measure */
     if (rangesHash)
         {
 	/* result here is two exclusive sets: query, and allCrisprs
 	 *    the query crisprs have been extracted from the allCrisprs */
         queryCrisprs = rangeExtraction(& allCrisprs);
         copyToArray(queryCrisprs);
 	if (allCrisprs)	// if there are any left on the all list
 	    {
 	    copyToArray(allCrisprs);
             /* first run up query vs. all */
-            queryVsAllArray(queryCrisprs, allCrisprs);
+            queryVsAll(queryCrisprs, allCrisprs);
 	    }
         /* then run up the query vs. itself avoiding self vs. self */
-        allVsAllArray(queryCrisprs);
-        showCountsArray(queryCrisprs);
+        allVsAll(queryCrisprs);
+        countsOutput(queryCrisprs);
         }
     else
         {
 	copyToArray(allCrisprs);
         /* run up all vs. all avoiding self vs. self */
-        allVsAllArray(allCrisprs);
-        showCountsArray(allCrisprs);
+        allVsAll(allCrisprs);
+        countsOutput(allCrisprs);
         }
+    carefulClose(&offFile);
     }
 }	// static void crisprKmers(char *sequence)
 
 int main(int argc, char *argv[])
 /* Process command line, initialize translation arrays and call the process */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 
 verbose(0, "# running verboseLevel: %d\n", verboseLevel());
 
 bedFileOut = optionVal("bed", bedFileOut);
 dumpKmers = optionVal("dumpKmers", dumpKmers);
 loadKmers = optionVal("loadKmers", loadKmers);
+offTargets = optionVal("offTargets", offTargets);
 ranges = optionVal("ranges", ranges);
 if (ranges)
     rangesHash = readRanges(ranges);
 
 initOrderedNtVal();	/* set up orderedNtVal[] */
 crisprKmers(argv[1]);
 if (verboseLevel() > 1)
     printVmPeak();
 return 0;
 }