0e23fd19b0162759909cee1861a70decad3e85f0
hiram
  Tue Jul 25 22:04:10 2017 -0700
marking duplicates during first queryVsSelf refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index 5da7ab7..2a4ea99 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -98,54 +98,56 @@
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"bed", OPTION_STRING},
    {"offTargets", OPTION_STRING},
    {"ranges", OPTION_STRING},
    {"threads", OPTION_INT},
    {"dumpGuides", OPTION_STRING},
    {"loadKmers", OPTION_STRING},
    {NULL, 0},
 };
 
 #define	guideSize	20	// 20 bases
 #define	pamSize		3	//  3 bases
 #define negativeStrand	0x0000800000000000
+#define duplicateGuide	0x0001000000000000
 
 #define fortyEightBits	0xffffffffffff
 #define fortySixBits	0x3fffffffffff
 #define fortyBits	0xffffffffff
 #define high32bits	0xffffffff00000000
 #define low32bits	0x00000000ffffffff
 #define high16bits	0xffff0000ffff0000
 #define low16bits	0x0000ffff0000ffff
 #define high8bits	0xff00ff00ff00ff00
 #define low8bits	0x00ff00ff00ff00ff
 #define high4bits	0xf0f0f0f0f0f0f0f0
 #define low4bits	0x0f0f0f0f0f0f0f0f
 #define high2bits	0xcccccccccccccccc
 #define low2bits	0x3333333333333333
 
 #define pamMask 0x3f
 
 //  0x0000 0000 0000 0000
 //    6348 4732 3116 15-0
 
 // sequence word structure:
 // bits 5-0 - PAM sequence in 2bit encoding for 3 bases
 // bits 45-6 - 20 base sequence in 2bit encoding format
 // bit 47 - negative strand indication
+// bit 48 - duplicate guide indication
 // considering using other bits during processing to mark
 // an item for no more consideration
 
 // sizeof(struct crispr): 32
 struct crispr
 /* one chromosome set of guides */
     {
     struct crispr *next;		/* Next in list. */
     long long sequence;	/* sequence value in 2bit format */
     long long start;		/* chromosome start 0-relative */
     };
 
 // sizeof(struct crisprList): 72
 struct crisprList
 /* all chromosome sets of guides */
@@ -469,84 +471,89 @@
 static char *green = "50,205,50";	/* #32cd32 */
 static char *yellow = "255,255,0";	/* #ffff00 */
 static char *black = "0,0,0";
 static char *red = "170,1,20";		/* #aa0114 */
 if (mitScore > 50)
     return green;
 else if (mitScore > 20)
     return yellow;
 else if (mitScore == -1)
     return black;
 else
     return red;
 }
 
 static void countsOutput(struct crisprList *all, FILE *bedFH)
-/* everything has been scanned and counted, print out all the data from arrays*/
+/* all done scanning and counting, print out all the guide data */
 {
 long startTime = clock1000();
 long long itemsOutput = 0;
+long long duplicatesMarked = 0;
 
 struct crisprList *list;
 long long totalOut = 0;
 for (list = all; list; list = list->next)
     {
     long long c;
     for (c = 0; c < list->crisprCount; ++c)
         {
 	++itemsOutput;
+	if (list->sequence[c] & duplicateGuide)
+	    ++duplicatesMarked;
 	int negativeOffset = 0;
         char strand = '+';
         if (negativeStrand & list->sequence[c])
 	    {
             strand = '-';
 	    negativeOffset = pamSize;
 	    }
 	long long txStart = list->start[c] + negativeOffset;
 	long long txEnd = txStart + guideSize;
 
         int mitScoreCount = + list->offBy[1][c] +
 	    list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c];
 
         char kmerString[33];
         kmerValToString(kmerString, list->sequence[c], pamSize);
         char pamString[33];
         kmerPAMString(pamString, list->sequence[c]);
 
 	if (bedFH)
 	    {
 	    long mitScore = 1.0;
             if (mitScoreCount > 0)  /* note: interger <- float conversion  */
 	    {
 		mitScore = roundf((list->mitSum[c] / (float) mitScoreCount));
 	    }
-	    else if (list->offBy[0][c] > 0)
+	    else if (list->sequence[c] & duplicateGuide)
 	    {
 		mitScore = 0.0;
 	    }
             char *color = scoreToColor(mitScore);
 	    /* the end zero will be filled in by post-processing, it will be
 	     *   the _offset into the crisprDetails.tab file
 	     */
 	    fprintf(bedFH, "%s\t%lld\t%lld\t\t%ld\t%c\t%lld\t%lld\t%s\t%s\t%s\t%d,%d,%d,%d,%d\t0\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, mitScore, strand, txStart, txEnd, color, kmerString, pamString, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
 	    }
 	++totalOut;
 	}
     }
 if (bedFH)
     carefulClose(&bedFH);
 
+verbose(1, "# countsOutput: %lld guides marked as duplicate sequence\n",
+    duplicatesMarked);
 timingMessage("countsOutput:", itemsOutput, "items output", startTime,
     "items/sec", "seconds/item");
 
 }	//	static void countsOutput(struct crisprList *all, FILE *bedFH)
 
 static struct crisprList *scanSequence(char *inFile)
 /* scan the given file, return list of guides */
 {
 verbose(1, "# scanning sequence file: %s\n", inFile);
 dnaUtilOpen();
 struct dnaLoad *dl = dnaLoadOpen(inFile);
 struct dnaSeq *seq;
 struct crisprList *listReturn = NULL;
 
 long startTime = clock1000();
@@ -689,31 +696,31 @@
     0,0,0.014,0,0,
     0.395,0.317,0,0.389,0.079,
     0.445,0.508,0.613,0.851,0.732,
     0.828,0.615,0.804,0.685,0.583
 };
 
 static float calcHitScore(long long sequence1, long long sequence2)
 /* calcHitScore - from Max's crispor.py script and paper:
  *    https://www.ncbi.nlm.nih.gov/pubmed/27380939 */
 {
 float score1 = 1.0;
 int mmCount = 0;
 int lastMmPos = -1;
 /* the XOR determines differences in two sequences, the shift
  * right 6 removes the PAM sequence and the 'fortyBits &' eliminates
- * the negativeStrand bit
+ * the negativeStrand and duplicateGuide bits
  */
 long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6);
 int distCount = 0;
 int distSum = 0;
 int pos;
 for (pos = 0; pos < guideSize; ++pos)
     {
     int diff = misMatch & 0x3;
     if (diff)
 	{
 	++mmCount;
         if (lastMmPos != -1)
 	    {
 	    distSum += pos-lastMmPos;
 	    ++distCount;
@@ -868,31 +875,31 @@
 	{ -1, 0.227272727, 0.764705882, 0.6 },
 	{ 0.5, -1, 0.058823529, 0.3 },
 	{ 0.9375, 0.428571429, -1, 0.7 },
 	{ 0.5625, 0.090909091, 0.176470588, -1 },
     },
 };
 
 /* Cutting Frequency Determination */
 static float calcCfdScore(long long sequence1, long long sequence2)
 /* calcCfdScore - from cfd_score_calculator.py script and paper:
  *    https://www.ncbi.nlm.nih.gov/pubmed/27380939 */
 {
 float score = 1.0;
 /* the XOR determine differences in two sequences, the
  * shift right 6 removes the PAM sequence and
- * the 'fortyBits &' eliminates the negativeStrand bit
+ * the 'fortyBits &' eliminates the negativeStrand and duplicateGuide bits
  */
 long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6);
 long long misMatchBitMask = 0xc000000000;
 long long twoBitMask = 0x300000000000;
 int shiftRight = 44;	/* to move the 2bits to bits 1,0 */
 int index = 0;
 
 while (misMatchBitMask)
     {
     if (misMatchBitMask & misMatch)
 	{
         int queryIndex = (sequence1 & twoBitMask) >> shiftRight;
         int targetIndex = (sequence2 & twoBitMask) >> shiftRight;
 	score *= cfdMmScores[index][queryIndex][targetIndex];
 	}
@@ -988,158 +995,176 @@
 
 
 /* this queryVsTarget can be used by threads, appears to be safe */
 static void queryVsTarget(struct crisprList *query, struct crisprList *target,
     int threadCount, int threadId)
 /* run the query guides list against the target list in the array structures */
 {
 struct crisprList *qList;
 long long totalCrisprsQuery = 0;
 long long totalCrisprsTarget = 0;
 long long totalCompares = 0;
 struct loopControl *control = NULL;
 AllocVar(control);
 
 long startTime = clock1000();
+long long duplicatesMarked = 0;
 
 for (qList = query; qList; qList = qList->next)
     {
     totalCrisprsQuery += qList->crisprCount;
     if (threadCount > 1)
 	{
 	setLoopEnds(control, qList->crisprCount, threadCount, threadId);
 	if (control->listStart >= qList->crisprCount)
 	    continue;	/* next chrom, no part to be done for this thread */
 	}
     else
 	{
 	control->listStart = 0;
 	control->listEnd = qList->crisprCount;
 	}
     verbose(1, "# thread %d of %d running %s %lld items [ %lld : %lld )\n",
 	1 + threadId, threadCount, qList->chrom,
 	    control->listEnd - control->listStart, control->listStart,
 		control->listEnd);
     totalCrisprsTarget += control->listEnd - control->listStart;
     long long qCount;
     for (qCount = control->listStart; qCount < control->listEnd; ++qCount)
 	{
+        if (qList->sequence[qCount] & duplicateGuide)
+	    continue;	/* already marked as duplicate */
         struct crisprList *tList;
         for (tList = target; tList; tList = tList->next)
             {
             long long tCount;
             totalCompares += tList->crisprCount;
             for (tCount = 0; tCount < tList->crisprCount; ++tCount)
                 {
                 /* the XOR determine differences in two sequences, the
                  * shift right 6 removes the PAM sequence and
-                 * the 'fortyBits &' eliminates the negativeStrand bit
+                 * the 'fortyBits &' eliminates the negativeStrand and
+		 * duplicateGuide bits
                  */
                 long long misMatch = fortyBits &
                     ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6);
                 if (misMatch)
                     {
                     /* possible misMatch bit values: 01 10 11
                      *  turn those three values into just: 01
                      */
                     misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555;
                     int bitsOn = _mm_popcnt_u64(misMatch);
                     if (bitsOn < 5)
                         {
-                        recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch);
+                        recordOffTargets(qList, tList, bitsOn, qCount,
+			    tCount, misMatch);
                         qList->offBy[bitsOn][qCount] += 1;
 //			tList->offBy[bitsOn][tCount] += 1; not needed
                         }
                     }
                 else
                     { 	/* no misMatch, identical guides */
                     qList->offBy[0][qCount] += 1;
+                    qList->sequence[qCount] |= duplicateGuide;
+		    ++duplicatesMarked;
 //                  tList->offBy[0][tCount] += 1;	not needed
                     }
                 } // for (tCount = 0; tCount < tList->crisprCount; ++tCount)
             }	//	for (tList = target; tList; tList = tList->next)
 	}	//	for (qCount = 0; qCount < qList->crisprCount; ++qCount)
     }	//	for (qList = query; qList; qList = qList->next)
 
+verbose(1, "# queryVsTarget: an additional %lld duplicates marked\n",
+    duplicatesMarked);
 timingMessage("queryVsTarget", totalCrisprsQuery, "query guides processed",
     startTime, "guides/sec", "seconds/guide");
 timingMessage("queryVsTarget", totalCrisprsTarget, "vs target guides",
     startTime, "guides/sec", "seconds/guide");
 timingMessage("queryVsTarget", totalCompares, "total comparisons",
     startTime, "compares/sec", "seconds/compare");
 
 }	/* static struct crisprList *queryVsTarget(struct crisprList *query,
 	    struct crisprList *target) */
 
 static void queryVsSelf(struct crisprList *all)
 /* run this 'all' list vs. itself avoiding self to self comparisons */
 {
 struct crisprList *qList;
 long long totalCrisprsQuery = 0;
 long long totalCrisprsCompare = 0;
 
 long startTime = clock1000();
 
+long long duplicatesMarked = 0;
+
 /* query runs through all chroms */
 for (qList = all; qList; qList = qList->next)
     {
     long long qCount;
     totalCrisprsQuery += qList->crisprCount;
     verbose(1, "# queryVsSelf %lld query guides on chrom %s\n", qList->crisprCount, qList->chrom);
     for (qCount = 0; qCount < qList->crisprCount; ++qCount)
 	{
 	/* target starts on same chrom as query, and
 	   at next kmer after query for this first chrom */
         long long tStart = qCount+1;
         struct crisprList *tList;
+        if (qList->sequence[qCount] & duplicateGuide)
+	    continue;	/* already marked as duplicate */
         for (tList = qList; tList; tList = tList->next)
             {
             long long tCount;
 	    totalCrisprsCompare += tList->crisprCount - tStart;
             for (tCount = tStart; tCount < tList->crisprCount; ++tCount)
                 {
 		/* the XOR determine differences in two sequences, the
 		 * shift right 6 removes the PAM sequence and
-		 * the 'fortyBits &' eliminates the negativeStrand bit
+		 * the 'fortyBits &' eliminates the negativeStrand and
+		 * duplicateGuide bits
 		 */
                 long long misMatch = fortyBits &
                     ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6);
                 if (misMatch)
                     {
                     /* possible misMatch bit values: 01 10 11
                      *  turn those three values into just: 01
                      */
                     misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555;
                     int bitsOn = _mm_popcnt_u64(misMatch);
 		    if (bitsOn < 5)
 			{
 			recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch);
 			qList->offBy[bitsOn][qCount] += 1;
 			tList->offBy[bitsOn][tCount] += 1;
 			}
                     }
                 else
                     { 	/* no misMatch, identical guides */
+                    qList->sequence[qCount] |= duplicateGuide;
+                    tList->sequence[tCount] |= duplicateGuide;
                     qList->offBy[0][qCount] += 1;
                     tList->offBy[0][tCount] += 1;
+		    ++duplicatesMarked;
                     }
                 } // for (tCount = 0; tCount < tList->crisprCount; ++tCount)
                 tStart = 0;	/* following chroms run through all */
             }	//	for (tList = target; tList; tList = tList->next)
 	}	//	for (qCount = 0; qCount < qList->crisprCount; ++qCount)
     }	//	for (qList = query; qList; qList = qList->next)
 
+verbose(1, "# queryVsSelf: counted %lld duplicate guides\n", duplicatesMarked);
 timingMessage("queryVsSelf", totalCrisprsQuery, "guides processed",
     startTime, "guides/sec", "seconds/guide");
 timingMessage("queryVsSelf", totalCrisprsCompare, "total comparisons",
     startTime, "compares/sec", "seconds/compare");
 
 }	/* static void queryVsSelf(struct crisprList *all) */
 
 static struct crisprList *readKmers(char *fileIn)
 /* read in kmer list from 'fileIn', return list structure */
 {
 errAbort("# XXX readKmers function not implemented\n");
 verbose(1, "# reading guides from: %s\n", fileIn);
 struct crisprList *listReturn = NULL;
 struct lineFile *lf = lineFileOpen(fileIn, TRUE);
 char *row[10];
@@ -1186,50 +1211,50 @@
 
 return listReturn;
 }	//	static struct crisprList *readKmers(char *fileIn)
 
 static void writeGuides(struct crisprList *all, char *fileOut)
 /* write guide list 'all' to 'fileOut' */
 {
 FILE *fh = mustOpen(fileOut, "w");
 struct crisprList *list;
 long long crisprsWritten = 0;
 char kmerString[33];
 char pamString[33];
 
 long startTime = clock1000();
 
-slReverse(&all);
+/* slReverse(&all);   * can not do this here, destroy's caller's list,
+                      * caller's value of all doesn't change */
+
 for (list = all; list; list = list->next)
     {
     fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size);
     struct crispr *c;
-//    slReverse(&list->chromCrisprs);
     for (c = list->chromCrisprs; c; c = c->next)
 	{
         kmerValToString(kmerString, c->sequence, pamSize);
         kmerPAMString(pamString, c->sequence);
 	fprintf(fh, "%s\t%s\t%lld\t%c\n", kmerString, pamString,
 	    c->start, negativeStrand & c->sequence ? '-' : '+');
 	++crisprsWritten;
 	}
     }
 carefulClose(&fh);
 
 timingMessage("writeGuides", crisprsWritten, "guides written", startTime,
     "guides/sec", "seconds/guide");
-
 }	//	static void writeGuides(struct crisprList *all, char *fileOut)
 
 static void *threadFunction(void *id)
 /* thread entry */
 {
 struct threadControl *tId = (struct threadControl *)id;
 
 queryVsTarget(tId->query, tId->target, tId->threadCount, tId->threadId);
 
 return NULL;
 }
 
 static void runThreads(int threadCount, struct crisprList *query,
     struct crisprList *target)
 {
@@ -1298,39 +1323,39 @@
     if (queryGuides)
         copyToArray(queryGuides);
     if (allGuides)
 	copyToArray(allGuides);
     vmPeak = currentVmPeak();
     verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak);
     /* larger example: 62646196 kB */
     if (threads > 1)
 	{
 	int gB = vmPeak >> 20;	/* convert kB to Gb */
 	threadCount = threads;
 	verbose(1, "# at %d Gb (%lld kB), running %d threads\n", gB, vmPeak, threadCount);
 	}
     if (queryGuides)	// when range selected some query sequences
 	{
+	/* the query vs. itself avoiding self vs. self and marking dups */
+        queryVsSelf(queryGuides);
 	if (allGuides) // if there are any left on the all list
 	    {
 	    if (threadCount > 1)
 		runThreads(threadCount, queryGuides, allGuides);
 	    else
-		queryVsTarget(queryGuides, allGuides, 1, 0);
+		queryVsTarget(queryGuides, allGuides, 0, 0);
 	    }
-	/* then run up the query vs. itself avoiding self vs. self */
-        queryVsSelf(queryGuides);
         countsOutput(queryGuides, bedFH);
         }
     else
         {
 	queryVsSelf(allGuides); /* run up all vs. all avoiding self vs. self */
 	countsOutput(allGuides, bedFH);
         }
 
     carefulClose(&offFile);
     }
 }	// static void crisprKmers(char *sequence, FILE *bedFH)
 
 int main(int argc, char *argv[])
 /* Process command line, initialize translation arrays and call the process */
 {