48d09ab3cbb4ce1bb172e0479f1e80f878dac655
hiram
  Sat Jul 1 00:16:53 2017 -0700
this is getting close refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index f8f06ff..0307ce1 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -20,94 +20,127 @@
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 struct crispr
 /* one chromosome set of crisprs */
     {
     struct crispr *next;		/* Next in list. */
     unsigned long long sequence;	/* sequence value in 2bit format */
     unsigned long long start;		/* chromosome start 0-relative */
     char strand;			/* strand: + or - */
+    int offBy0;		/* counting number of off targets, 0 mismatch */
+    int offBy1;		/* counting number of off targets, 1 mismatch */
+    int offBy2;		/* counting number of off targets, 2 mismatch */
+    int offBy3;		/* counting number of off targets, 3 mismatch */
+    int offBy4;		/* counting number of off targets, 4 mismatch */
     };
 
 struct crisprList
 /* all chromosome sets of crisprs */
     {
     struct crisprList *next;	/* Next in list. */
     char *chrom;		/* chrom name */
     int size;			/* chrom size */
     struct crispr *chromCrisprs;	/* all the crisprs on this chrom */
     };
 
 static char bases[4];  /* for binary to ascii conversion */
 static char revBases[4]; /* reverse complement for binary to ascii conversion */
+static unsigned long long revValue[4];	/* for reverse complement binary bit values */
 
+#ifdef NOT
+// Do not need this, slReverse does this job, perhaps a bit more efficiently
 static int slCmpStart(const void *va, const void *vb)
 /* Compare slPairs on start value */
 {
 const struct crispr *a = *((struct crispr **)va);
 const struct crispr *b = *((struct crispr **)vb);
 unsigned long long aVal = a->start;
 unsigned long long bVal = b->start;
 if (aVal < bVal)
     return -1;
 else if (aVal > bVal)
     return 1;
 else
     return 0;
 }
+#endif
 
-static char *kmerValToString(long long val, boolean revComp)
+static char *kmerValToString(struct crispr *c, int trim)
+/* print out ASCII string for binary sequence value */
 {
+unsigned long long val = c->sequence;
+// char strand = c->strand;
+// strand = '+'; // IS STRAND NO LONGER NEEDED ?
 static char kmerString[32];
 long long twoBitMask = 0x300000000000;
 int shiftCount = 44;
 int baseCount = 0;
 
+#ifdef NOT
 kmerString[baseCount] = 0;
-if (revComp)
+if ('-' == strand)
     {
     twoBitMask = 0x3;
     shiftCount = 0;
     while (shiftCount < 46)
         {
         int base = (val & twoBitMask) >> shiftCount;
         kmerString[baseCount++] = revBases[base];
         twoBitMask <<= 2;
         shiftCount += 2;
         }
     }
 else
     {
-    while (twoBitMask)
+#endif
+    while (twoBitMask && (shiftCount >= (2*trim)))
         {
-        int base = (val & twoBitMask) >> shiftCount;;
+        int base = (val & twoBitMask) >> shiftCount;
         kmerString[baseCount++] = bases[base];
         twoBitMask >>= 2;
         shiftCount -= 2;
         }
+#ifdef NOT
     }
+#endif
 kmerString[baseCount] = 0;
 return kmerString;
 }
  
-static struct crisprList *scanSeq(struct dnaSeq *seq)
+static unsigned long long revComp(unsigned long long val)
+/* reverse complement the bit pattern, there is a better way to do this */
+{
+unsigned long long reverse = 0;
+unsigned long long twoBitMask = 0x300000000000;
+int leftShift = 0;
+while(twoBitMask)
+    {
+    int base = (val & twoBitMask) >> (44 - leftShift);
+    reverse |= revValue[base] << leftShift;
+    twoBitMask >>= 2;
+    leftShift += 2;
+    }
+return reverse;
+}
+
+static struct crisprList *generateKmers(struct dnaSeq *seq)
 {
 struct crispr *crisprSet = NULL;
 struct crisprList *oneChromList = NULL;
 AllocVar(oneChromList);
 oneChromList->chrom = cloneString(seq->name);
 oneChromList->size = seq->size;
 int i;
 DNA *dna;
 unsigned long long chromPosition = 0;
 unsigned long long startGap = 0;
 unsigned long long gapCount = 0;
 int kmerLength = 0;
 long long kmerVal = 0;
 long long fortySixBits = 0x3fffffffffff;
 long long endsGG = (G_BASE_VAL << 2) | G_BASE_VAL;
@@ -118,118 +151,207 @@
 verbose(4, "#  46 bits: %032llx\n", fortySixBits);
 
 dna=seq->dna;
 for (i=0; i < seq->size; ++i)
     {
     int val;
     val = ntVal[(int)dna[i]];
     switch (val)
 	{
 	case T_BASE_VAL:	// 0
 	case C_BASE_VAL:	// 1
 	case A_BASE_VAL:	// 2
 	case G_BASE_VAL:	// 3
               kmerVal = fortySixBits & ((kmerVal << 2) | val);
               ++kmerLength;
-//              boolean testA = endsGG == ((long long)kmerVal & endsGG);
               if (kmerLength > 22)
                  {
 		if (endsGG == (kmerVal & 0xf))
 		    {
 		    struct crispr *oneCrispr = NULL;
 		    AllocVar(oneCrispr);
                     oneCrispr->start = chromPosition - 22;
                     oneCrispr->strand = '+';
                     oneCrispr->sequence = kmerVal;
                     slAddHead(&crisprSet, oneCrispr);
 		    }
                  if (beginsCC == (kmerVal & reverseMask))
 		    {
 		    struct crispr *oneCrispr = NULL;
 		    AllocVar(oneCrispr);
                     oneCrispr->start = chromPosition - 22;
                     oneCrispr->strand = '-';
-                    oneCrispr->sequence = kmerVal;
+                    oneCrispr->sequence = revComp(kmerVal);
                     slAddHead(&crisprSet, oneCrispr);
 		    }
                  }
             break;
         default:
           ++gapCount;
           startGap = chromPosition;
           /* skip all N's == any value not = 0,1,2,3 */
           while ( ((i+1) < seq->size) && (0xfffffffd & ntVal[(int)dna[i+1]]))
               {
               ++chromPosition;
               ++i;
               }
           if (startGap != chromPosition)
               verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+");
           else
               verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+");
           kmerLength = 0;  /* reset, start over */
           kmerVal = 0;
 	}
     ++chromPosition;
     }
+slReverse(&crisprSet);	// may not need to do this at this time
 oneChromList->chromCrisprs = crisprSet;
 return oneChromList;
+}	// static struct crisprList *generateKmers(struct dnaSeq *seq)
+
+static void showCounts(struct crisprList *all)
+/* everything has been scanned and counted, print out all the data */
+{
+verbose(1, "# showCounts entered\n");
+struct crisprList *list;
+unsigned long long totalChecked = 0;
+verbose(1, "# chrom count %d\n", slCount(all));
+for (list = all; list; list = list->next)
+    {
+    struct crispr *c = NULL;
+    struct crispr *next = NULL;
+verbose(1, "# crisprs count %d on chrom %s\n", slCount(list->chromCrisprs), list->chrom);
+    for (c = list->chromCrisprs; c; c = next)
+	{
+        if (c->strand == '+')
+	    verbose(1, "%s\t%llu\t%llu\t%s\t%d\t%c\t%llu\t%llu\t%d\t%d\t%d\t%d\n", list->chrom, c->start, c->start+23, kmerValToString(c, 3), c->offBy0, c->strand, c->start, c->start+20, c->offBy1, c->offBy2, c->offBy3, c->offBy4);
+	else
+	    verbose(1, "%s\t%llu\t%llu\t%s\t%d\t%c\t%llu\t%llu\t%d\t%d\t%d\t%d\n", list->chrom, c->start, c->start+23, kmerValToString(c, 3), c->offBy0, c->strand, c->start+3, c->start+23, c->offBy1, c->offBy2, c->offBy3, c->offBy4);
+	++totalChecked;
+        if (c->offBy0)
+           verbose(1, "# identical: %d %s:%llu %c\t%s\n", c->offBy0, list->chrom, c->start, c->strand, kmerValToString(c, 3));
+	next = c->next;
+	}
+    }
+verbose(1, "# total checked: %llu\n", totalChecked);
+}	// static void showCounts(struct crisprList *all)
+
+static void countOffTargets(char *chrom, struct crispr *c, struct crisprList *all)
+{
+struct crisprList *listPtr;
+unsigned long long totalCompares = 0;
+for (listPtr = all; listPtr; listPtr = listPtr->next)
+    {
+    struct crispr *crisprPtr = NULL;
+    struct crispr *next = NULL;
+    for (crisprPtr = listPtr->chromCrisprs; crisprPtr; crisprPtr = next)
+	{
+        unsigned long long misMatch = c->sequence ^ crisprPtr->sequence;
+        misMatch &= 0x3fffffffffc0;
+        misMatch >>= 6;
+        if (! misMatch)
+	    {
+            c->offBy0 += 1;
+            crisprPtr->offBy0 += 1;
+            verbose(1, "# identical: %d %s:%llu %c == %s:%llu %c\t%s == %s\n", c->offBy0, chrom, c->start, c->strand, listPtr->chrom, crisprPtr->start, crisprPtr->strand, kmerValToString(c, 0), kmerValToString(crisprPtr, 0));
+	    }
+        else
+	    {
+	    unsigned long long mask = 0x3;
+            int count = 0;
+            int i;
+            for (i = 0; i < 23; ++i)
+		if (misMatch & mask)
+		    ++count;
+                mask <<= 2;
+            switch(count)
+		{
+		case 1: c->offBy1 += 1; crisprPtr->offBy1 += 1; break;
+		case 2: c->offBy2 += 1; crisprPtr->offBy2 += 1; break;
+		case 3: c->offBy3 += 1; crisprPtr->offBy3 += 1; break;
+		case 4: c->offBy4 += 1; crisprPtr->offBy4 += 1; break;
+		default: break;
+		}
 	    }
+        ++totalCompares;
+	next = crisprPtr->next;
+	}
+    }
+}	// static void countOffTargets( . . . )
 
-void crisprKmers(char *sequence)
+static void crisprKmers(char *sequence)
 /* crisprKmers - annotate crispr sequences. */
 {
-verbose(1, "# scanning sequence: %s\n", sequence);
+verbose(1, "#\tscanning sequence: %s\n", sequence);
 dnaUtilOpen();
 struct dnaLoad *dl = dnaLoadOpen(sequence);
 struct dnaSeq *seq;
 struct crisprList *allCrisprs = NULL;
 
 while ((seq = dnaLoadNext(dl)) != NULL)
     {
-    verbose(2, "#\tprocessing: %s, size: %d\n", seq->name, seq->size);
-    struct crisprList *oneList = scanSeq(seq);
+    struct crisprList *oneList = generateKmers(seq);
     slAddHead(&allCrisprs, oneList);
+    verboseTime(2, "#\tcrispr kmer scanning on: %s, size: %d\t", seq->name, seq->size);
     }
-if (verboseLevel() > 2)
+
+if (verboseLevel() > 1)
     {
+    struct crisprList *countedCrisprs = NULL;
+    unsigned long long totalCrisprs = 0;
     int chrCount = slCount(allCrisprs);
-verbose(1, "# printing crispr list, chrCount: %d\n", chrCount);
+    verbose(1, "#\tcrispr list, scanned %d chromosomes, now processing . . .\n", chrCount);
     struct crisprList *listPtr;
-    for (listPtr = allCrisprs; listPtr; listPtr = listPtr->next)
+    struct crisprList *nextChr;
+    for (listPtr = allCrisprs; listPtr; listPtr = nextChr)
 	{
-        struct crispr *crisprPtr;
+        struct crispr *newCrispr = NULL;
+        struct crispr *c = NULL;
         int crisprCount = slCount(listPtr->chromCrisprs);
-        slSort(&listPtr->chromCrisprs, slCmpStart);
-verbose(1, "# printing crispr list, crisprCount: %d on %s\n", crisprCount, listPtr->chrom);
-        for (crisprPtr = listPtr->chromCrisprs; crisprPtr; crisprPtr = crisprPtr->next)
+        totalCrisprs += crisprCount;
+        verbose(1, "#\tcrispr count: %d on %s\n", crisprCount, listPtr->chrom);
+        struct crispr *next;
+        for (c = listPtr->chromCrisprs; c; c = next)
 	    {
-            if ('+' == crisprPtr->strand)
-		{
-	        verbose (3, "%s\t%s\t%llu\t+\n", kmerValToString(crisprPtr->sequence, FALSE), listPtr->chrom, crisprPtr->start);
-		}
-            else
-		{
-	        verbose (3, "%s\t%s\t%llu\t-\n", kmerValToString(crisprPtr->sequence, TRUE), listPtr->chrom, crisprPtr->start);
-		}
-	    }
+	    verbose (3, "%s\t%s\t%llu\t%c\n", kmerValToString(c, 0), listPtr->chrom, c->start, c->strand);
+            next = c->next;	// remember before lost
+//            slRemoveEl(&listPtr->chromCrisprs, c);
+            c->next = NULL;	// taking out of list
+            listPtr->chromCrisprs = next; // first item removed from list
+            if (next)
+                countOffTargets(listPtr->chrom, c, allCrisprs);
+            slAddHead(&newCrispr, c);	// constructing new list
 	    }
+	nextChr = listPtr->next;	// remember before lost
+	listPtr->next = NULL;		// taking out of list
+	listPtr->chromCrisprs = newCrispr;	// the new crispr list
+        allCrisprs = nextChr;		// removes this one from list
+        slAddHead(&countedCrisprs, listPtr);
 	}
+verbose(1, "#\tcounted chr length: %d\n", slCount(countedCrisprs));
+    verboseTime(2, "#\ttotal crispr count: %llu on %d chromosomes\t", totalCrisprs, chrCount);
+    showCounts(countedCrisprs);
     }
+}	// static void crisprKmers(char *sequence)
 
 int main(int argc, char *argv[])
-/* Process command line. */
+/* Process command line, set up translation arrays and call the process */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
+verboseTimeInit();
 bases[0] = 'T';
 bases[1] = 'C';
 bases[2] = 'A';
 bases[3] = 'G';
 revBases[0] = 'A';
 revBases[1] = 'G';
 revBases[2] = 'T';
 revBases[3] = 'C';
+revValue[0] = 2;
+revValue[1] = 3;
+revValue[2] = 0;
+revValue[3] = 1;
 crisprKmers(argv[1]);
 return 0;
 }