131be66e554e489d599a2e54988a45f1985e194e
hiram
  Mon Jul 17 08:27:54 2017 -0700
correct selection of guides vs. ranges to avoid duplicates when ranges are split refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index ebbc08b..662d4f9 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -93,30 +93,32 @@
 static char *loadKmers = NULL;	/* file name to read in kmers from previous scan */
 static int memLimit = 0;	/* gB limit before going into thread mode */
 static int threadCount = 1;	/* will be adjusted depending upon vmPeak */
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bed", OPTION_STRING},
    {"offTargets", OPTION_STRING},
    {"ranges", OPTION_STRING},
    {"memLimit", OPTION_INT},
    {"dumpKmers", OPTION_STRING},
    {"loadKmers", OPTION_STRING},
    {NULL, 0},
 };
 
+#define	guideSize	20	// 20 bases
+#define	pamSize		3	//  3 bases
 #define negativeStrand	0x0000800000000000
 
 #define fortyEightBits	0xffffffffffff
 #define fortySixBits	0x3fffffffffff
 #define fortyBits	0xffffffffff
 #define high32bits	0xffffffff00000000
 #define low32bits	0x00000000ffffffff
 #define high16bits	0xffff0000ffff0000
 #define low16bits	0x0000ffff0000ffff
 #define high8bits	0xff00ff00ff00ff00
 #define low8bits	0x00ff00ff00ff00ff
 #define high4bits	0xf0f0f0f0f0f0f0f0
 #define low4bits	0x0f0f0f0f0f0f0f0f
 #define high2bits	0xcccccccccccccccc
 #define low2bits	0x3333333333333333
@@ -201,31 +203,31 @@
 bases[G_BASE] = 'G';
 bases[T_BASE] = 'T';
 }	//	static void initOrderedNtVal()
 
 static void timingMessage(char *prefix, long long count, char *message,
     long ms, char *units, char *invUnits)
 {
 double perSecond = 0.0;
 double inverse = 0.0;
 if ((ms > 0) && (count > 0))
     {
     perSecond = 1000.0 * count / ms;
     inverse = 1.0 / perSecond;
     }
 
-verbose(1, "# %s: %lld %s @ %ld ms -> %.2f %s == %g %s\n", prefix, count, message, ms, perSecond, units, inverse, invUnits);
+verbose(1, "# %s: %lld %s @ %ld ms\n#\t%.2f %s == %g %s\n", prefix, count, message, ms, perSecond, units, inverse, invUnits);
 }
 
 static void setLoopEnds(struct loopControl *control, long long listSize,
     int partCount, int partNumber)
 /* for multiple thread processing, given a list of listSize,
  #    a partCount and a partNumber
  *    return listStart, listEnd indexes in control structure
  */
 {
 long long partSize = 1 + listSize / partCount;
 long long listStart = partSize * partNumber;
 long long listEnd = partSize * (partNumber + 1);
 if (listEnd > listSize)
     listEnd = listSize;
 control->listStart = listStart;
@@ -487,60 +489,60 @@
 long long itemsOutput = 0;
 
 struct crisprList *list;
 long long totalOut = 0;
 for (list = all; list; list = list->next)
     {
     long long c = 0;
     for (c = 0; c < list->crisprCount; ++c)
         {
 	++itemsOutput;
 	int negativeOffset = 0;
         char strand = '+';
         if (negativeStrand & list->sequence[c])
 	    {
             strand = '-';
-	    negativeOffset = 3;
+	    negativeOffset = pamSize;
 	    }
 	long long txStart = list->start[c] + negativeOffset;
-	long long txEnd = txStart + 20;
+	long long txEnd = txStart + guideSize;
 
         int totalOffs = list->offBy[0][c] + list->offBy[1][c] +
 	    list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c];
 
         char *color = itemColor(list->offBy, c);
         char kmerString[33];
-        kmerValToString(kmerString, list->sequence[c], 3);
+        kmerValToString(kmerString, list->sequence[c], pamSize);
         char pamString[33];
         kmerPAMString(pamString, list->sequence[c]);
 
         if (0 == totalOffs)
 verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString);
 
 	if (bedFH)
-	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c],  list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
+	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
 
-        if (list->offBy[0][c])
-           verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString);
+//        if (list->offBy[0][c])
+//           verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString);
 	++totalOut;
 	}
     }
 if (bedFH)
     carefulClose(&bedFH);
 
 long elapsedMs = clock1000() - startTime;
-timingMessage("copyToArray:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item");
+timingMessage("countsOutput:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item");
 }	//	static void countsOutput(struct crisprList *all, FILE *bedFH)
 
 static struct crisprList *scanSequence(char *inFile)
 /* scan the given file, return list of crisprs */
 {
 verbose(1, "#\tscanning sequence file: %s\n", inFile);
 dnaUtilOpen();
 struct dnaLoad *dl = dnaLoadOpen(inFile);
 struct dnaSeq *seq;
 struct crisprList *listReturn = NULL;
 
 long elapsedMs = 0;
 long scanStart = clock1000();
 long long totalCrisprs = 0;
 
@@ -589,46 +591,47 @@
     {
     nextList = list->next;	// remember before perhaps lost
     long long beforeCrisprCount = list->crisprCount;
     examinedCrisprCount += list->crisprCount;
     struct crispr *c = NULL;
     struct binKeeper *bk = hashFindVal(rangesHash, list->chrom);
     struct crispr *newCrispr = NULL;
     if (bk != NULL)
         {
 	struct crispr *prevCrispr = NULL;
 	struct crispr *next = NULL;
         for (c = list->chromCrisprs; c; c = next)
             {
             struct binElement *hitList = NULL;
 	    next = c->next;	// remember before perhaps lost
-            int start = c->start;
-            if (negativeStrand & c->sequence)
-                start += 2;
-            int end = start + 20;
-            hitList = binKeeperFind(bk, start, end);
+	    // select any guide that is at least half contained in any range
+	    int midPoint = c->start + ((pamSize + guideSize) >> 1);
+//            if (negativeStrand & c->sequence)
+//                start += 2;
+//            int end = midPoint + 1;
+            hitList = binKeeperFind(bk, midPoint, midPoint + 1);
             if (hitList)
 		{
-		if (prevCrispr)	// remove this one from the all list
+		if (prevCrispr)	// remove this one from the 'all' list
 		    prevCrispr->next = next;
                 else
                     list->chromCrisprs = next;	// removing the first one
-		c->next = NULL;
+		c->next = NULL;			// new item for new list
 		slAddHead(&newCrispr, c);	// constructing new list
 		}
 	    else
-		prevCrispr = c;	// remains on all list
+		prevCrispr = c;	// remains on 'all' list
 	    slFreeList(&hitList);
             }
 	}
     if (newCrispr)
 	{
 	struct crisprList *newItem = NULL;
 	AllocVar(newItem);
 	newItem->chrom = cloneString(list->chrom);
 	newItem->size = list->size;
 	newItem->chromCrisprs = newCrispr;
 	newItem->crisprCount = slCount(newCrispr);
 	returnListCrisprCount += newItem->crisprCount;
 	slAddHead(&listReturn, newItem);
         if (NULL == list->chromCrisprs)	// all have been removed for this chrom
 	    {
@@ -673,31 +676,31 @@
 static double calcHitScore(long long sequence1, long long sequence2)
 /* calcHitScore - from Max' crispor.py script and paper:
  *    https://www.ncbi.nlm.nih.gov/pubmed/27380939 */
 {
 double score1 = 1.0;
 int mmCount = 0;
 int lastMmPos = -1;
 /* the XOR determines differences in two sequences, the shift
  * right 6 removes the PAM sequence and the 'fortyBits &' eliminates
  * the negativeStrand bit
  */
 long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6);
 int distCount = 0;
 int distSum = 0;
 int pos = 0;
-for (pos = 0; pos < 20; ++pos)
+for (pos = 0; pos < guideSize; ++pos)
     {
     int diff = misMatch & 0x3;
     if (diff)
 	{
 	++mmCount;
         if (lastMmPos != -1)
 	    {
 	    distSum += pos-lastMmPos;
 	    ++distCount;
 	    }
 	score1 *= 1 - hitScoreM[pos];
 	lastMmPos = pos;
 	}
     misMatch >>= 2;
     }
@@ -733,56 +736,55 @@
     {
     stringReturn[i] = '.';
     if (bitMask & misMatch)
 	stringReturn[i] = '*';
     ++i;
     bitMask >>= 2;
     }
 stringReturn[i] = 0;
 }	// static void misMatchString(char *returnString, long long misMatch)
 
 static void recordOffTargets(struct crisprList *query,
     struct crisprList *target, int bitsOn, long long qIndex,
 	long long tIndex, long long twoBitMisMatch)
 /* bitsOn is from 1 to 4, record this match when less than 1000 total */
 {
+if (query->offBy[0][qIndex] ) // no need to accumulate if 0 mismatch > 0
+    return;
+
+if (offFile)
+    {
     char queryString[33];	/* local storage for string */
     char targetString[33];	/* local storage for string */
     char queryPAM[33];	/* local storage for string */
     char targetPAM[33];	/* local storage for string */
     char misMatch[33];	/* ...*.....*.*... ascii string represent misMatch */
 
-if (query->offBy[0][qIndex] ) // no need to accumulate if 0-mismatch > 0
-    return;
-
-if (offFile)
-    {
     int i = 0;
     int bitsOnSum = 0;
     for (i = 1; i < 5; ++i)
         bitsOnSum += query->offBy[i][qIndex];
 
-    misMatchString(misMatch, twoBitMisMatch);
-
     if (bitsOnSum < 1000)	// could be command line option limit
         {
         double hitScore =
 		calcHitScore(query->sequence[qIndex], target->sequence[tIndex]);
 //        double cfdScore =
 //		calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]);
-        kmerValToString(queryString, query->sequence[qIndex], 3);
-        kmerValToString(targetString, target->sequence[tIndex], 3);
+        misMatchString(misMatch, twoBitMisMatch);
+        kmerValToString(queryString, query->sequence[qIndex], pamSize);
+        kmerValToString(targetString, target->sequence[tIndex], pamSize);
 	kmerPAMString(queryPAM, query->sequence[qIndex]);
 	kmerPAMString(targetPAM, target->sequence[tIndex]);
         fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f\t%s:%lld %c\n",
 	    query->chrom, query->start[qIndex],
 		negativeStrand & query->sequence[qIndex] ? '-' : '+',
                 queryString, queryPAM, targetString, targetPAM,
                 misMatch, bitsOn, hitScore, target->chrom,
 		target->start[tIndex],
 		negativeStrand & target->sequence[tIndex] ? '-' : '+');
         }
     }
 }	//	static void recordOffTargets(struct crisprList *query,
 	//	    struct crisprList *target, int bitsOn, long long qIndex,
 	//		long long tIndex)
 
@@ -993,31 +995,31 @@
 struct crisprList *list = NULL;
 long long crisprsWritten = 0;
 char kmerString[33];
 char pamString[33];
 
 long startMs = clock1000();
 
 slReverse(&all);
 for (list = all; list; list = list->next)
     {
     fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size);
     struct crispr *c = NULL;
     slReverse(&list->chromCrisprs);
     for (c = list->chromCrisprs; c; c = c->next)
 	{
-        kmerValToString(kmerString, c->sequence, 3);
+        kmerValToString(kmerString, c->sequence, pamSize);
         kmerPAMString(pamString, c->sequence);
 	fprintf(fh, "%s\t%s\t%lld\t%c\n", kmerString, pamString,
 	    c->start, negativeStrand & c->sequence ? '-' : '+');
 	++crisprsWritten;
 	}
     }
 carefulClose(&fh);
 
 long elapsedMs = clock1000() - startMs;
 timingMessage("writeKmers", crisprsWritten, "crisprs written", elapsedMs, "crisprs/sec", "seconds/crispr");
 }	//	static void writeKmers(struct crisprList *all, char *fileOut)
 
 static void *threadFunction(void *id)
 /* thread entry */
 {