47205c92c3cffc5a7db8835b772e32b8717bd253
hiram
  Wed Jul 5 10:31:25 2017 -0700
now counting bits with intrinsics builtin popcnt instruction and better representation of the base bit values refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index 1580c05..12123ce 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -1,16 +1,17 @@
 /* crisprKmers - annotate crispr sequences. */
+#include <popcntintrin.h>
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "dnaLoad.h"
 #include "portable.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "crisprKmers - annotate crispr sequences\n"
   "usage:\n"
@@ -23,205 +24,214 @@
   "                N=3 - only count identicals during processing\n"
   "                N=4 - attempt measure all off targets and\n"
   "                      print out all crisprs as bed format"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 struct crispr
 /* one chromosome set of crisprs */
     {
     struct crispr *next;		/* Next in list. */
-    unsigned long long sequence;	/* sequence value in 2bit format */
+    long long sequence;	/* sequence value in 2bit format */
     unsigned long long start;		/* chromosome start 0-relative */
     char strand;			/* strand: + or - */
     int offBy0;		/* counting number of off targets, 0 mismatch */
     int offBy1;		/* counting number of off targets, 1 mismatch */
     int offBy2;		/* counting number of off targets, 2 mismatch */
     int offBy3;		/* counting number of off targets, 3 mismatch */
     int offBy4;		/* counting number of off targets, 4 mismatch */
     };
 
 struct crisprList
 /* all chromosome sets of crisprs */
     {
     struct crisprList *next;	/* Next in list. */
     char *chrom;		/* chrom name */
     int size;			/* chrom size */
     struct crispr *chromCrisprs;	/* all the crisprs on this chrom */
     unsigned long long crisprCount;	/* number of crisprs on this chrom */
     };
 
+#define A_BASE	0
+#define C_BASE	1
+#define G_BASE	2
+#define T_BASE	3
+#define U_BASE	3
+static int orderedNtVal[256];	/* values in alpha order: ACGT 00 01 10 11 */
+				/* for easier sorting and complementing */
 static char bases[4];  /* for binary to ascii conversion */
-static char revBases[4]; /* reverse complement for binary to ascii conversion */
-static unsigned long long revValue[4];	/* for reverse complement binary bit values */
+
+#define fortySixBits	0x3fffffffffff
+#define fortyEixhtBits	0xffffffffffff
+#define high32bits	0xffffffff00000000
+#define low32bits	0x00000000ffffffff
+#define high16bits	0xffff0000ffff0000
+#define low16bits	0x0000ffff0000ffff
+#define high8bits	0xff00ff00ff00ff00
+#define low8bits	0x00ff00ff00ff00ff
+#define high4bits	0xf0f0f0f0f0f0f0f0
+#define low4bits	0x0f0f0f0f0f0f0f0f
+#define high2bits	0xcccccccccccccccc
+#define low2bits	0x3333333333333333
+
+static void initOrderedNtVal()
+{
+int i;
+for (i=0; i<ArraySize(orderedNtVal); i++)
+    orderedNtVal[i] = -1;
+orderedNtVal['A'] = orderedNtVal['a'] = A_BASE;
+orderedNtVal['C'] = orderedNtVal['c'] = C_BASE;
+orderedNtVal['G'] = orderedNtVal['g'] = G_BASE;
+orderedNtVal['T'] = orderedNtVal['t'] = T_BASE;
+orderedNtVal['U'] = orderedNtVal['u'] = T_BASE;
+bases[A_BASE] = 'A';
+bases[C_BASE] = 'C';
+bases[G_BASE] = 'G';
+bases[T_BASE] = 'T';
+}
 
 #ifdef NOT
 // Do not need this, slReverse does this job, perhaps a bit more efficiently
 static int slCmpStart(const void *va, const void *vb)
 /* Compare slPairs on start value */
 {
 const struct crispr *a = *((struct crispr **)va);
 const struct crispr *b = *((struct crispr **)vb);
-unsigned long long aVal = a->start;
-unsigned long long bVal = b->start;
+long long aVal = a->start;
+long long bVal = b->start;
 if (aVal < bVal)
     return -1;
 else if (aVal > bVal)
     return 1;
 else
     return 0;
 }
 #endif
 
 static char *kmerValToString(struct crispr *c, int trim)
 /* print out ASCII string for binary sequence value */
 {
-unsigned long long val = c->sequence;
-// char strand = c->strand;
-// strand = '+'; // IS STRAND NO LONGER NEEDED ?
+long long val = c->sequence;
 static char kmerString[32];
 long long twoBitMask = 0x300000000000;
 int shiftCount = 44;
 int baseCount = 0;
 
-#ifdef NOT
-kmerString[baseCount] = 0;
-if ('-' == strand)
-    {
-    twoBitMask = 0x3;
-    shiftCount = 0;
-    while (shiftCount < 46)
-        {
-        int base = (val & twoBitMask) >> shiftCount;
-        kmerString[baseCount++] = revBases[base];
-        twoBitMask <<= 2;
-        shiftCount += 2;
-        }
-    }
-else
-    {
-#endif
 while (twoBitMask && (shiftCount >= (2*trim)))
     {
     int base = (val & twoBitMask) >> shiftCount;
     kmerString[baseCount++] = bases[base];
     twoBitMask >>= 2;
     shiftCount -= 2;
     }
-#ifdef NOT
-    }
-#endif
 kmerString[baseCount] = 0;
 return kmerString;
 }
  
-static unsigned long long revComp(unsigned long long val)
-/* reverse complement the bit pattern, there is a better way to do this */
-{
-unsigned long long reverse = 0;
-unsigned long long twoBitMask = 0x300000000000;
-int leftShift = 0;
-while(twoBitMask)
-    {
-    int base = (val & twoBitMask) >> (44 - leftShift);
-    reverse |= revValue[base] << leftShift;
-    twoBitMask >>= 2;
-    leftShift += 2;
-    }
-return reverse;
+static long long revComp(long long val)
+/* reverse complement the 2-bit numerical value kmer */
+{
+/* complement bases and add 18 0 bits
+ *    because this divide and conquer works on 64 bits, not 46,
+ *      hence the addition of the 18 zero bits which fall out
+ * The 'val ^ fortySixBits' does the complement, the shifting and
+ *     masking does the reversing.
+ */
+register long long v = (val ^ fortySixBits) << 18;
+v = ((v & high32bits) >> 32) | ((v & low32bits) << 32);
+v = ((v & high16bits) >> 16) | ((v & low16bits) << 16);
+v = ((v & high8bits) >> 8) | ((v & low8bits) << 8);
+v = ((v & high4bits) >> 4) | ((v & low4bits) << 4);
+v = ((v & high2bits) >> 2) | ((v & low2bits) << 2);
+return v;
 }
 
 static struct crisprList *generateKmers(struct dnaSeq *seq)
 {
 struct crispr *crisprSet = NULL;
 struct crisprList *oneChromList = NULL;
 AllocVar(oneChromList);
 oneChromList->chrom = cloneString(seq->name);
 oneChromList->size = seq->size;
 int i;
 DNA *dna;
 unsigned long long chromPosition = 0;
 unsigned long long startGap = 0;
 unsigned long long gapCount = 0;
 int kmerLength = 0;
 long long kmerVal = 0;
-long long fortySixBits = 0x3fffffffffff;
-long long endsGG = (G_BASE_VAL << 2) | G_BASE_VAL;
-long long beginsCC = (long long)((C_BASE_VAL << 2) | C_BASE_VAL) << 42;
+long long endsGG = (G_BASE << 2) | G_BASE;
+long long beginsCC = (long long)((C_BASE << 2) | C_BASE) << 42;
 long long reverseMask = (long long)0xf << 42;
 verbose(4, "#   endsGG: %032llx\n", endsGG);
 verbose(4, "# beginsCC: %032llx\n", beginsCC);
-verbose(4, "#  46 bits: %032llx\n", fortySixBits);
+verbose(4, "#  46 bits: %032llx\n", (unsigned long long) fortySixBits);
 
 dna=seq->dna;
 for (i=0; i < seq->size; ++i)
     {
-    int val;
-    val = ntVal[(int)dna[i]];
-    switch (val)
+    int val = orderedNtVal[(int)dna[i]];
+    if (val >= 0)
 	{
-	case T_BASE_VAL:	// 0
-	case C_BASE_VAL:	// 1
-	case A_BASE_VAL:	// 2
-	case G_BASE_VAL:	// 3
         kmerVal = fortySixBits & ((kmerVal << 2) | val);
         ++kmerLength;
 	if (kmerLength > 22)
 	    {
-		if (endsGG == (kmerVal & 0xf))
+	    if (endsGG == (kmerVal & 0xf))	/* check positive strand */
                 {
                 struct crispr *oneCrispr = NULL;
                 AllocVar(oneCrispr);
                 oneCrispr->start = chromPosition - 22;
                 oneCrispr->strand = '+';
                 oneCrispr->sequence = kmerVal;
                 slAddHead(&crisprSet, oneCrispr);
                 }
-                 if (beginsCC == (kmerVal & reverseMask))
+	    if (beginsCC == (kmerVal & reverseMask))	/* check neg strand */
                 {
                 struct crispr *oneCrispr = NULL;
                 AllocVar(oneCrispr);
                 oneCrispr->start = chromPosition - 22;
                 oneCrispr->strand = '-';
                 oneCrispr->sequence = revComp(kmerVal);
                 slAddHead(&crisprSet, oneCrispr);
                 }
 	    }
-            break;
-        default:
+        }	// if (val >= 0)
+        else
+            {
             ++gapCount;
             startGap = chromPosition;
             /* skip all N's == any value not = 0,1,2,3 */
-          while ( ((i+1) < seq->size) && (0xfffffffd & ntVal[(int)dna[i+1]]))
+            while ( ((i+1) < seq->size) && (orderedNtVal[(int)dna[i+1]] < 0))
                 {
                 ++chromPosition;
                 ++i;
                 }
             if (startGap != chromPosition)
                 verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+");
             else
                 verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+");
             kmerLength = 0;  /* reset, start over */
             kmerVal = 0;
-	}
+            }	// else if (val >= 0)
     ++chromPosition;
-    }
+    }	// for (i=0; i < seq->size; ++i)
 // slReverse(&crisprSet);	// save time, order not important at this time
 oneChromList->chromCrisprs = crisprSet;
 oneChromList->crisprCount = slCount(crisprSet);
 return oneChromList;
 }	// static struct crisprList *generateKmers(struct dnaSeq *seq)
 
 static void showCounts(struct crisprList *all)
 /* everything has been scanned and counted, print out all the data */
 {
 verbose(1, "#\tprint out all data\n");
 struct crisprList *list;
 unsigned long long totalChecked = 0;
 verbose(1, "# %d number of chromosomes to display\n", slCount(all));
 for (list = all; list; list = list->next)
     {
@@ -242,60 +252,89 @@
     }
 verbose(1, "# total crisprs displayed: %llu\n", totalChecked);
 }	// static void showCounts(struct crisprList *all)
 
 static unsigned long long countOffTargets(char *chrom, struct crispr *c, struct crisprList *all)
 /* given one crispr c, scan against all others to find off targets */
 {
 struct crisprList *listPtr;
 unsigned long long totalCompares = 0;
 for (listPtr = all; listPtr; listPtr = listPtr->next)
     {
     struct crispr *crisprPtr = NULL;
     struct crispr *next = NULL;
     for (crisprPtr = listPtr->chromCrisprs; crisprPtr; crisprPtr = next)
 	{
-	/* XOR will determine differences in two sequences */
-        unsigned long long misMatch = c->sequence ^ crisprPtr->sequence;
-        misMatch &= 0x3fffffffffc0;
-        misMatch >>= 6;	/* eliminate *GG from comparison */
+	/* the XOR will determine differences in two sequences
+	 *  the shift right 6 removes the PAM sequence
+	 */
+        unsigned long long misMatch = (c->sequence ^ crisprPtr->sequence) >> 6;
         if (! misMatch)	/* no misMatch, identical crisprs */
 	    {
             c->offBy0 += 1;
             crisprPtr->offBy0 += 1;
-            verbose(4, "# identical: %d %s:%llu %c == %s:%llu %c\t%s == %s\n", c->offBy0, chrom, c->start, c->strand, listPtr->chrom, crisprPtr->start, crisprPtr->strand, kmerValToString(c, 0), kmerValToString(crisprPtr, 0));
+//            verbose(4, "# identical: %d %s:%llu %c == %s:%llu %c\t%s == %s\n", c->offBy0, chrom, c->start, c->strand, listPtr->chrom, crisprPtr->start, crisprPtr->strand, kmerValToString(c, 0), kmerValToString(crisprPtr, 0));
 	    }
         else
 	    {
             if (verboseLevel() > 3)
+		{
+                /* possible misMatch bit values: 01 10 11
+		 *  turn those three values into just: 01
+		 */
+		misMatch = (misMatch | (misMatch > 1)) & 0x5555555555;
+		int bitsOn = _mm_popcnt_u64(misMatch);
+                switch(bitsOn)
+                    {
+                    case 1:
+                        c->offBy1 += 1;
+                        crisprPtr->offBy1 += 1;
+                        break;
+                    case 2:
+                        c->offBy2 += 1;
+                        crisprPtr->offBy2 += 1;
+                        break;
+                    case 3:
+                        c->offBy3 += 1;
+                        crisprPtr->offBy3 += 1;
+                        break;
+                    case 4:
+                        c->offBy4 += 1;
+                        crisprPtr->offBy4 += 1;
+                        break;
+                    default: break;
+                    }
+                }
+#ifdef NOT
                 {	/* this is not efficient, there are better ways */
-                unsigned long long mask = 0x3;
+                long long mask = 0x3;
                 int count = 0;
                 int i;
                 for (i = 0; i < 23; ++i)
                     if (misMatch & mask)
                         ++count;
                     mask <<= 2;
                 switch(count)
                     {
                     case 1: c->offBy1 += 1; crisprPtr->offBy1 += 1; break;
                     case 2: c->offBy2 += 1; crisprPtr->offBy2 += 1; break;
                     case 3: c->offBy3 += 1; crisprPtr->offBy3 += 1; break;
                     case 4: c->offBy4 += 1; crisprPtr->offBy4 += 1; break;
                     default: break;
                     }
                 }
+#endif
 	    }
         ++totalCompares;
 	next = crisprPtr->next;
 	}
     }
 // verbose(3,"#\ttotal comparisons %llu\n", totalCompares);
 return totalCompares;
 }	// static void countOffTargets( . . . )
 
 static void crisprKmers(char *sequence)
 /* crisprKmers - annotate crispr sequences. */
 {
 verbose(1, "#\tscanning sequence: %s\n", sequence);
 dnaUtilOpen();
 struct dnaLoad *dl = dnaLoadOpen(sequence);
@@ -387,31 +426,20 @@
     verbose(1, "# %llu total crisprs processed @ %ld ms -> %.2f crisprs/sec\n", totalCrisprsIn, elapsedMs, perSecond);
     perSecond = 0.0;
     if (elapsedMs > 0)
         perSecond = 1000.0 * totalCompares / elapsedMs;
     verbose(1, "# %llu total comparisons @ %ld ms -> %.2f crisprs/sec\n", totalCompares, elapsedMs, perSecond);
     showCounts(countedCrisprs);
     }
 }	// static void crisprKmers(char *sequence)
 
 int main(int argc, char *argv[])
 /* Process command line, set up translation arrays and call the process */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
+initOrderedNtVal();	/* set up orderedNtVal[] */
 verboseTimeInit();
-bases[0] = 'T';
-bases[1] = 'C';
-bases[2] = 'A';
-bases[3] = 'G';
-revBases[0] = 'A';
-revBases[1] = 'G';
-revBases[2] = 'T';
-revBases[3] = 'C';
-revValue[0] = 2;
-revValue[1] = 3;
-revValue[2] = 0;
-revValue[3] = 1;
 crisprKmers(argv[1]);
 return 0;
 }