95a4ab24a874500faece4f5e71815b950697afb3
hiram
  Mon Jul 10 00:16:59 2017 -0700
adding some fake color to the bed output to test runtime on some larger ranges refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index 6a0595b..d4ea4f0 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -119,31 +119,30 @@
 bases[C_BASE] = 'C';
 bases[G_BASE] = 'G';
 bases[T_BASE] = 'T';
 }	//	static void initOrderedNtVal()
 
 static void timingMessage(char *prefix, long long count, char *message,
     long ms, char *units)
 {
 double perSecond = 0.0;
 if (ms > 0)
     perSecond = 1000.0 * count / ms;
 
 verbose(1, "# %s: %lld %s @ %ld ms -> %.2f %s\n", prefix, count, message, ms, perSecond, units);
 }
 
-
 static struct hash *readRanges(char *bedFile)
 /* Read bed and return it as a hash keyed by chromName
  * with binKeeper values.  (from: src/hg/bedIntersect/bedIntersec.c) */
 {
 struct hash *hash = newHash(0);	/* key is chromName, value is binkeeper data */
 struct lineFile *lf = lineFileOpen(bedFile, TRUE);
 char *row[3];
 
 while (lineFileNextRow(lf, row, 3))
     {
     struct binKeeper *bk;
     struct bed3 *item;
     struct hashEl *hel = hashLookup(hash, row[0]);
     if (hel == NULL)
        {
@@ -232,54 +231,60 @@
  * The 'val ^ fortySixBits' does the complement, the shifting and
  *     masking does the reversing.
  */
 register long long v = (val ^ fortySixBits) << 18;
 v = ((v & high32bits) >> 32) | ((v & low32bits) << 32);
 v = ((v & high16bits) >> 16) | ((v & low16bits) << 16);
 v = ((v & high8bits) >> 8) | ((v & low8bits) << 8);
 v = ((v & high4bits) >> 4) | ((v & low4bits) << 4);
 v = ((v & high2bits) >> 2) | ((v & low2bits) << 2);
 return v;
 }	//	static long long revComp(long long val)
 
 static void copyToArray(struct crisprList *list)
 /* copy the crispr list data into arrays */
 {
+long startTime = clock1000();
 struct crisprList *cl = NULL;
+long long itemsCopied = 0;
+
 for (cl = list; cl; cl = cl->next)
     {
     size_t memSize = cl->crisprCount * sizeof(long long);
     cl->sequence = (long long *)needLargeMem(memSize);
     cl->start = (long long *)needLargeMem(memSize);
     memSize = cl->crisprCount * sizeof(char);
     cl->strand = (char *)needLargeMem(memSize);
     memSize = 5 * sizeof(int *);
     cl->offBy = (int **)needLargeMem(memSize);
     memSize = cl->crisprCount * sizeof(int);
     int r = 0;
     for (r = 0; r < 5; ++r)
         cl->offBy[r] = (int *)needLargeZeroedMem(memSize);
 
     long long i = 0;
     struct crispr *c = NULL;
     for (c = cl->chromCrisprs; c; c = c->next)
         {
+	++itemsCopied;
         cl->sequence[i] = c->sequence;
         cl->start[i] = c->start;
         cl->strand[i++] = c->strand;
         }
     }
+long elapsedMs = clock1000() - startTime;
+timingMessage("copyToArray:", itemsCopied, "items copied", elapsedMs, "items/sec");
 }	//	static void copyToArray(struct crisprList *list)	*/
 
 static struct crisprList *generateKmers(struct dnaSeq *seq)
 {
 struct crispr *crisprSet = NULL;
 struct crisprList *returnList = NULL;
 AllocVar(returnList);
 returnList->chrom = cloneString(seq->name);
 returnList->size = seq->size;
 int i;
 DNA *dna;
 long long chromPosition = 0;
 long long startGap = 0;
 long long gapCount = 0;
 int kmerLength = 0;
@@ -334,65 +339,112 @@
             if (startGap != chromPosition)
                 verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+");
             else
                 verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+");
             kmerLength = 0;  /* reset, start over */
             kmerVal = 0;
             }	// else if (val >= 0)
     ++chromPosition;
     }	// for (i=0; i < seq->size; ++i)
 // slReverse(&crisprSet);	// save time, order not important at this time
 returnList->chromCrisprs = crisprSet;
 returnList->crisprCount = slCount(crisprSet);
 return returnList;
 }	// static struct crisprList *generateKmers(struct dnaSeq *seq)
 
+static char *itemColor(int **offBy, long long index)
+{
+static char *notUnique = "150,150,150";
+// static char *notUniqueAlt = "150,150,150";
+static char *twoMany = "120,120,120";
+// static char *twoManyAlt = "120,120,120";
+static char *highScore = "0,255,0";	// > 70
+static char *mediumScore = "128,128,0";	// > 50
+static char *lowScore = "255,0,0";	// >= 50
+// static char *lowestScore = "80,80,80";	// < 50
+// static char *lowestScoreAlt = "80,80,80";	// < 50
+if (offBy[0][index])
+    return notUnique;
+else
+    {
+    int offSum = offBy[1][index] + offBy[2][index] + offBy[3][index] +
+		offBy[4][index];
+    if (offSum < 100)
+       return highScore;
+    else if (offSum < 150)
+       return mediumScore;
+    else if (offSum < 250)
+       return lowScore;
+    else
+       return twoMany;
+    }
+}
+// sample of offBy sums from the chrM vs. hg38 run:
+// Q1 91.000000
+// median 150.000000
+// Q3 251.000000
+// average 238.378477
+// min 4.000000
+// max 9023.000000
+// count 2193
+// total 522764.000000
+// standard deviation 424.309155
+
 static void countsOutput(struct crisprList *all)
 /* everything has been scanned and counted, print out all the data from arrays*/
 {
+long startTime = clock1000();
+long long itemsOutput = 0;
+
 FILE *bedFH = NULL;
 if (bedFileOut)
     bedFH = mustOpen(bedFileOut, "w");
 
 struct crisprList *list;
 long long totalOut = 0;
 for (list = all; list; list = list->next)
     {
     long long c = 0;
     for (c = 0; c < list->crisprCount; ++c)
         {
+	++itemsOutput;
 	int negativeOffset = 0;
         if (list->strand[c] == '-')
 	    negativeOffset = 3;
 	long long txStart = list->start[c] + negativeOffset;
 	long long txEnd = txStart + 20;
         int totalOffs = list->offBy[0][c] + list->offBy[1][c] +
 	    list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c];
 
+        char *color = itemColor(list->offBy, c);
+
         if (0 == totalOffs)
 verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3));
 
 	if (bedFH)
-	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%d\t%d\t%d\t%d\t%s\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c],  list->offBy[0][c], list->strand[c], txStart, txEnd, kmerPAMString(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], kmerValToString(list->sequence[c], 3));
+	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c],  list->offBy[0][c], list->strand[c], txStart, txEnd, color, color, color, kmerValToString(list->sequence[c], 3), kmerPAMString(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
 
         if (list->offBy[0][c])
            verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3));
 	++totalOut;
 	}
     }
 if (bedFH)
     carefulClose(&bedFH);
+
+long elapsedMs = clock1000() - startTime;
+timingMessage("copyToArray:", itemsOutput, "items output", elapsedMs, "items/sec");
 }	// static void countsOutput(struct crisprList *all)
 
 static struct crisprList *scanSequence(char *inFile)
 /* scan the given file, return list of crisprs */
 {
 verbose(1, "#\tscanning sequence file: %s\n", inFile);
 dnaUtilOpen();
 struct dnaLoad *dl = dnaLoadOpen(inFile);
 struct dnaSeq *seq;
 struct crisprList *listReturn = NULL;
 
 long elapsedMs = 0;
 long scanStart = clock1000();
 long long totalCrisprs = 0;
 
@@ -428,155 +480,158 @@
 struct crisprList *all = *allReference;
 struct crisprList *listReturn = NULL;
 struct crisprList *list = NULL;
 int inputChromCount = slCount(all);
 long long returnListCrisprCount = 0;
 long long examinedCrisprCount = 0;
 struct crisprList *prevChromList = NULL;
 
 long elapsedMs = 0;
 long scanStart = clock1000();
 
 struct crisprList *nextList = NULL;
 for (list = all; list; list = nextList)
     {
     nextList = list->next;	// remember before perhaps lost
+    long long beforeCrisprCount = list->crisprCount;
     examinedCrisprCount += list->crisprCount;
     struct crispr *c = NULL;
     struct binKeeper *bk = hashFindVal(rangesHash, list->chrom);
     struct crispr *newCrispr = NULL;
     if (bk != NULL)
         {
 	struct crispr *prevCrispr = NULL;
 	struct crispr *next = NULL;
         for (c = list->chromCrisprs; c; c = next)
             {
             struct binElement *hitList = NULL;
 	    next = c->next;	// remember before perhaps lost
             int start = c->start;
             if (c->strand == '-')
                 start += 2;
             int end = start + 20;
             hitList = binKeeperFind(bk, start, end);
             if (hitList)
 		{
-		++returnListCrisprCount;
 		if (prevCrispr)	// remove this one from the all list
 		    prevCrispr->next = next;
                 else
                     list->chromCrisprs = next;	// removing the first one
 		c->next = NULL;
 		slAddHead(&newCrispr, c);	// constructing new list
 		}
 	    else
 		prevCrispr = c;	// remains on all list
 	    slFreeList(&hitList);
             }
 	}
     if (newCrispr)
 	{
 	struct crisprList *newItem = NULL;
 	AllocVar(newItem);
 	newItem->chrom = cloneString(list->chrom);
 	newItem->size = list->size;
 	newItem->chromCrisprs = newCrispr;
 	newItem->crisprCount = slCount(newCrispr);
+	returnListCrisprCount += newItem->crisprCount;
 	slAddHead(&listReturn, newItem);
         if (NULL == list->chromCrisprs)	// all have been removed for this chrom
 	    {
 	    if (prevChromList)	// remove it from the chrom list
 		prevChromList->next = nextList;
 	    else
                 all = nextList;	// removing the first one
 	    }
 	else
 	    {
 	    list->crisprCount = slCount(list->chromCrisprs);
-	    verbose(1, "# range scan same chrom list still has %lld crisprs on chrom %s\n",
-		list->crisprCount, list->chrom);
+            long long removedCrisprs = beforeCrisprCount - list->crisprCount;
+	    verbose(1, "# range scan chrom %s had %lld crisprs, removed %lld leaving %lld target\n",
+		list->chrom, beforeCrisprCount, removedCrisprs, list->crisprCount);
 	    }
 	}
     prevChromList = list;
     }	//	for (list = all; list; list = list->next)
 
 elapsedMs = clock1000() - scanStart;
 verbose(1, "# range scanning %d chroms, return %d selected chroms, leaving %d chroms\n",
     inputChromCount, slCount(listReturn), slCount(all));
+long long targetCrisprCount = examinedCrisprCount - returnListCrisprCount;
 timingMessage("range scan", examinedCrisprCount, "examined crisprs", elapsedMs, "crisprs/sec");
-timingMessage("range scan", returnListCrisprCount, "returned crisprs", elapsedMs, "crisprs/sec");
+timingMessage("range scan", targetCrisprCount, "remaining target crisprs", elapsedMs, "crisprs/sec");
+timingMessage("range scan", returnListCrisprCount, "returned query crisprs", elapsedMs, "crisprs/sec");
 if (NULL == all)
     {
     allReference = NULL;	// they have all been removed
     verbose(1, "# range scan, every single chrom has been removed\n");
     }
 else if (*allReference != all)
     {
     verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all);
     *allReference = all;
     }
 return listReturn;
 }	//	static crisprList *rangeExtraction(crisprList *all)
 
 static void recordOffTargets(struct crisprList *query,
     struct crisprList *target, int bitsOn, long long qIndex,
 	long long tIndex)
 /* bitsOn is from 1 to 4, record this match when less than 10 count */
 {
 if (offFile)
     {
     if (query->offBy[bitsOn][qIndex] < 10)
 	{/* needs to be two prints as the kmerValToString return is confused */
-	fprintf(offFile, "%s:%lld %c %s %#020llx\t", query->chrom, query->start[qIndex],
-	    query->strand[qIndex], kmerValToString(query->sequence[qIndex], 3),
-(unsigned long long)((query->sequence[qIndex] & 0x3fffffffffc0)>>6));
-        fprintf(offFile, "%s:%lld %c %s %#020llx\t%d\n", target->chrom,
+	fprintf(offFile, "%s:%lld %c %s\t", query->chrom, query->start[qIndex],
+	    query->strand[qIndex], kmerValToString(query->sequence[qIndex], 3));
+        fprintf(offFile, "%s:%lld %c %s\t%d\n", target->chrom,
 	    target->start[tIndex], target->strand[tIndex],
-		kmerValToString(target->sequence[tIndex], 3),
-(unsigned long long)((target->sequence[tIndex] & 0x3fffffffffc0)>>6), bitsOn);
-	}
+		kmerValToString(target->sequence[tIndex], 3), bitsOn);
 	}
     }
+}	//	static void recordOffTargets(struct crisprList *query,
+	//	    struct crisprList *target, int bitsOn, long long qIndex,
+	//		long long tIndex)
 
-static void queryVsAll(struct crisprList *query,
-    struct crisprList *target)
+static void queryVsAll(struct crisprList *query, struct crisprList *target)
 /* run the query crisprs list against the target list in the array structures */
 {
 struct crisprList *qList = NULL;
 long long totalCrisprsQuery = 0;
 long long totalCrisprsTarget = 0;
 long long totalCompares = 0;
 
 long processStart = clock1000();
 long elapsedMs = 0;
 
 for (qList = query; qList; qList = qList->next)
     {
     long long qCount = 0;
     totalCrisprsQuery += qList->crisprCount;
     verbose(1, "# queryVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
     for (qCount = 0; qCount < qList->crisprCount; ++qCount)
 	{
         struct crisprList *tList = NULL;
         for (tList = target; tList; tList = tList->next)
             {
             long long tCount = 0;
 	    totalCompares += tList->crisprCount;
-            if (0 == qCount)
-{
-totalCrisprsTarget += tList->crisprCount;
-verbose(1, "# queryVsAll %lld target crisprs on chrom %s %lld targets %lld compares\n", tList->crisprCount, tList->chrom, totalCrisprsTarget, totalCompares);
-}
+//            if (0 == qCount)
+// {
+// totalCrisprsTarget += tList->crisprCount;
+// verbose(1, "# queryVsAll %lld target crisprs on chrom %s %lld targets %lld compares\n", tList->crisprCount, tList->chrom, totalCrisprsTarget, totalCompares);
+// }
             for (tCount = 0; tCount < tList->crisprCount; ++tCount)
                 {
                 /* the XOR will determine differences in two sequences
                  *  the shift right 6 removes the PAM sequence
                  */
                 long long misMatch =
                     (qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6;
                 if (misMatch)
                     {
                     /* possible misMatch bit values: 01 10 11
                      *  turn those three values into just: 01
                      */
                     misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555;
                     int bitsOn = _mm_popcnt_u64(misMatch);
 		    if (bitsOn < 5)
@@ -592,31 +647,31 @@
                     tList->offBy[0][tCount] += 1;
                     }
                 } // for (tCount = 0; tCount < tList->crisprCount; ++tCount)
             }	//	for (tList = target; tList; tList = tList->next)
 	}	//	for (qCount = 0; qCount < qList->crisprCount; ++qCount)
     }	//	for (qList = query; qList; qList = qList->next)
 verbose(1, "# done with scanning, check timing\n");
 elapsedMs = clock1000() - processStart;
 timingMessage("queryVsAll", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec");
 timingMessage("queryVsAll", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec");
 timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec");
 }	/* static struct crisprList *queryVsAll(struct crisprList *query,
 	    struct crisprList *target) */
 
 static void allVsAll(struct crisprList *all)
-/* run this 'all' list vs. itself with arrays */
+/* run this 'all' list vs. itself avoiding self to self comparisons */
 {
 struct crisprList *qList = NULL;
 long long totalCrisprsQuery = 0;
 long long totalCrisprsCompare = 0;
 
 long processStart = clock1000();
 long elapsedMs = 0;
 
 /* query runs through all chroms */
 for (qList = all; qList; qList = qList->next)
     {
     long long qCount = 0;
     totalCrisprsQuery += qList->crisprCount;
     verbose(1, "# allVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom);
     /* query runs through all kmers on this chrom */