d821d48d0f454a343f806d2567929bad4be1a09c
hiram
  Tue Jul 18 13:03:50 2017 -0700
looks like MIT and CFD scores are functioning correctly now refs #18969

diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c
index 93fc106..7460eac 100644
--- src/utils/crisprKmers/crisprKmers.c
+++ src/utils/crisprKmers/crisprKmers.c
@@ -144,57 +144,63 @@
     long long start;		/* chromosome start 0-relative */
     };
 
 // sizeof(struct crisprList): 72
 struct crisprList
 /* all chromosome sets of crisprs */
     {
     struct crisprList *next;	/* Next in list. */
     char *chrom;		/* chrom name */
     int size;			/* chrom size */
     struct crispr *chromCrisprs;	/* all the crisprs on this chrom */
     long long crisprCount;	/* number of crisprs on this chrom */
     long long *sequence;	/* array of the sequences */
     long long *start;		/* array of the starts */
     int **offBy;		/* offBy[5][n] */
+    float *mitSum;		/* accumulating sum of MIT scores */
     };
 
 struct threadControl
 /* data passed to thread to control execution */
     {
     int threadId;	/* this thread Id */
     int threadCount;	/* total threads running */
     struct crisprList *query;	/* running query guides against */
     struct crisprList *target;	/* target guides */
     };
 
 struct loopControl
 /* calculate index for start to < end processing for multiple threads */
     {
     long long listStart;
     long long listEnd;
     };
 
 /* base values here are different than standard '2bit' format
  * these are in order numerically and alphabetically
  * plus they complement to their complement with an XOR with 0x3
  */
 #define A_BASE	0
 #define C_BASE	1
 #define G_BASE	2
 #define T_BASE	3
 #define U_BASE	3
+#define endsGG	((G_BASE << 2) | G_BASE)
+#define endsAG	((A_BASE << 2) | G_BASE)
+#define beginsCT	((long long)((C_BASE << 2) | T_BASE) << 42)
+#define beginsCC	((long long)((C_BASE << 2) | C_BASE) << 42)
+
 static int orderedNtVal[256];	/* values in alpha order: ACGT 00 01 10 11 */
 				/* for easier sorting and complementing */
 static char bases[4];  /* for binary to ascii conversion */
 
 static void initOrderedNtVal()
 /* initialization of base value lookup arrays */
 {
 int i;
 for (i=0; i<ArraySize(orderedNtVal); i++)
     orderedNtVal[i] = -1;
 orderedNtVal['A'] = orderedNtVal['a'] = A_BASE;
 orderedNtVal['C'] = orderedNtVal['c'] = C_BASE;
 orderedNtVal['G'] = orderedNtVal['g'] = G_BASE;
 orderedNtVal['T'] = orderedNtVal['t'] = T_BASE;
 orderedNtVal['U'] = orderedNtVal['u'] = T_BASE;
@@ -347,89 +353,85 @@
 long startTime = clock1000();
 struct crisprList *cl = NULL;
 long long itemsCopied = 0;
 
 for (cl = list; cl; cl = cl->next)
     {
     size_t memSize = cl->crisprCount * sizeof(long long);
     cl->sequence = (long long *)needLargeMem(memSize);
     cl->start = (long long *)needLargeMem(memSize);
     memSize = 5 * sizeof(int *);
     cl->offBy = (int **)needLargeMem(memSize);
     memSize = cl->crisprCount * sizeof(int);
     int r = 0;
     for (r = 0; r < 5; ++r)
         cl->offBy[r] = (int *)needLargeZeroedMem(memSize);
+    memSize = cl->crisprCount * sizeof(float);
+    cl->mitSum = (float *)needLargeMem(memSize);
 
     long long i = 0;
     struct crispr *c = NULL;
     for (c = cl->chromCrisprs; c; c = c->next)
         {
 	++itemsCopied;
         cl->sequence[i] = c->sequence;
         cl->start[i] = c->start;
+        cl->mitSum[i] = 0.0;
 	++i;
         }
     }
 long elapsedMs = clock1000() - startTime;
 timingMessage("copyToArray", itemsCopied, "items copied", elapsedMs, "items/sec", "seconds/item");
 }	//	static void copyToArray(struct crisprList *list)	*/
 
 static struct crisprList *generateKmers(struct dnaSeq *seq)
 {
 struct crispr *crisprSet = NULL;
 struct crisprList *returnList = NULL;
 AllocVar(returnList);
 returnList->chrom = cloneString(seq->name);
 returnList->size = seq->size;
 int i;
 DNA *dna;
 long long chromPosition = 0;
 long long startGap = 0;
 long long gapCount = 0;
 int kmerLength = 0;
 long long kmerVal = 0;
-long long endsAG = (A_BASE << 2) | G_BASE;
-long long endsGG = (G_BASE << 2) | G_BASE;
-long long beginsCT = (long long)((C_BASE << 2) | T_BASE) << 42;
-long long beginsCC = (long long)((C_BASE << 2) | C_BASE) << 42;
 long long reverseMask = (long long)0xf << 42;
-verbose(4, "#   endsAG: %032llx\n", endsAG);
-verbose(4, "#   endsGG: %032llx\n", endsGG);
-verbose(4, "# beginsCT: %032llx\n", beginsCT);
-verbose(4, "# beginsCC: %032llx\n", beginsCC);
 verbose(4, "#  46 bits: %032llx\n", (long long) fortySixBits);
 
 dna=seq->dna;
 for (i=0; i < seq->size; ++i)
     {
     int val = orderedNtVal[(int)dna[i]];
     if (val >= 0)
 	{
         kmerVal = fortySixBits & ((kmerVal << 2) | val);
         ++kmerLength;
 	if (kmerLength > 22)
 	    {
 	    if ((endsAG == (kmerVal & 0xf)) || (endsGG == (kmerVal & 0xf)))
                 {	/* have match for positive strand */
                 struct crispr *oneCrispr = NULL;
                 AllocVar(oneCrispr);
                 oneCrispr->start = chromPosition - 22;
                 oneCrispr->sequence = kmerVal;
                 slAddHead(&crisprSet, oneCrispr);
                 }
-	    if ((beginsCT == (kmerVal & reverseMask)) || (beginsCC == (kmerVal & reverseMask)))
+	    if ((beginsCT == (kmerVal & reverseMask))
+		|| (beginsCC == (kmerVal & reverseMask)))
                 {	/* have match for negative strand */
                 struct crispr *oneCrispr = NULL;
                 AllocVar(oneCrispr);
                 oneCrispr->start = chromPosition - 22;
                 oneCrispr->sequence = revComp(kmerVal);
                 oneCrispr->sequence |= negativeStrand;
                 slAddHead(&crisprSet, oneCrispr);
                 }
 	    }
         }	// if (val >= 0)
         else
             {
             ++gapCount;
             startGap = chromPosition;
             /* skip all N's == any value not = 0,1,2,3 */
@@ -494,47 +496,46 @@
     {
     long long c = 0;
     for (c = 0; c < list->crisprCount; ++c)
         {
 	++itemsOutput;
 	int negativeOffset = 0;
         char strand = '+';
         if (negativeStrand & list->sequence[c])
 	    {
             strand = '-';
 	    negativeOffset = pamSize;
 	    }
 	long long txStart = list->start[c] + negativeOffset;
 	long long txEnd = txStart + guideSize;
 
-        int totalOffs = list->offBy[0][c] + list->offBy[1][c] +
+        int mitScoreCount = + list->offBy[1][c] +
 	    list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c];
 
         char *color = itemColor(list->offBy, c);
         char kmerString[33];
         kmerValToString(kmerString, list->sequence[c], pamSize);
         char pamString[33];
         kmerPAMString(pamString, list->sequence[c]);
 
-        if (0 == totalOffs)
-verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString);
-
 	if (bedFH)
-	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
-
-//        if (list->offBy[0][c])
-//           verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString);
+	    {
+	    long mitScore = 0;
+            if (mitScoreCount > 0)  /* note: interger <- float conversion  */
+		mitScore = roundf((list->mitSum[c] / (float) mitScoreCount));
+	    fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%ld\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, kmerString, pamString, mitScore, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]);
+	    }
 	++totalOut;
 	}
     }
 if (bedFH)
     carefulClose(&bedFH);
 
 long elapsedMs = clock1000() - startTime;
 timingMessage("countsOutput:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item");
 }	//	static void countsOutput(struct crisprList *all, FILE *bedFH)
 
 static struct crisprList *scanSequence(char *inFile)
 /* scan the given file, return list of crisprs */
 {
 verbose(1, "#\tscanning sequence file: %s\n", inFile);
 dnaUtilOpen();
@@ -661,95 +662,95 @@
 timingMessage("range scan", targetCrisprCount, "remaining target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr");
 timingMessage("range scan", returnListCrisprCount, "returned query crisprs", elapsedMs, "crisprs/sec", "seconds/crispr");
 if (NULL == all)
     {
     allReference = NULL;	// they have all been removed
     verbose(1, "# range scan, every single chrom has been removed\n");
     }
 else if (*allReference != all)
     {
     verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all);
     *allReference = all;
     }
 return listReturn;
 }	//	static crisprList *rangeExtraction(crisprList *all)
 
-static double hitScoreM[20] =
+static float hitScoreM[20] =
 {
     0,0,0.014,0,0,
     0.395,0.317,0,0.389,0.079,
     0.445,0.508,0.613,0.851,0.732,
     0.828,0.615,0.804,0.685,0.583
 };
 
-static double calcHitScore(long long sequence1, long long sequence2)
+static float calcHitScore(long long sequence1, long long sequence2)
 /* calcHitScore - from Max's crispor.py script and paper:
  *    https://www.ncbi.nlm.nih.gov/pubmed/27380939 */
 {
-double score1 = 1.0;
+float score1 = 1.0;
 int mmCount = 0;
 int lastMmPos = -1;
 /* the XOR determines differences in two sequences, the shift
  * right 6 removes the PAM sequence and the 'fortyBits &' eliminates
  * the negativeStrand bit
  */
 long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6);
 int distCount = 0;
 int distSum = 0;
 int pos = 0;
 for (pos = 0; pos < guideSize; ++pos)
     {
     int diff = misMatch & 0x3;
     if (diff)
 	{
 	++mmCount;
         if (lastMmPos != -1)
 	    {
 	    distSum += pos-lastMmPos;
 	    ++distCount;
 	    }
 	score1 *= 1 - hitScoreM[pos];
 	lastMmPos = pos;
 	}
     misMatch >>= 2;
     }
 
-double score2 = 1.0;
+float score2 = 1.0;
 if (distCount > 1)
     {
-    double avgDist = (double)distSum / distCount;
+    float avgDist = (float)distSum / distCount;
     score2 = 1.0 / (((19-avgDist)/19.0) * 4 + 1);
     }
-double score3 = 1.0;
+float score3 = 1.0;
 if (mmCount > 0)
     score3 = 1.0 / (mmCount * mmCount);
 
 return (score1 * score2 * score3 * 100);
-} //	static double calcHitScore(long long sequence1, long long sequence2)
+} //	static float calcHitScore(long long sequence1, long long sequence2)
 
-static double pamScores[4][4] = {
+static float pamScores[4][4] = {
     { 0.0, 0.0, 0.25925926, 0.0 },	/* A[ACGT] */
     { 0.0, 0.0, 0.107142857, 0.0 },	/* C[ACGT] */
     { 0.069444444, 0.022222222, 1.0, 0.016129032 },	/* G[ACGT] */
     { 0.0, 0.0, 0.03896104, 0.0 }	/* T[ACGT] */
 };
 
 /* the [20] index is the position in the string
  * the first [4] index is the query base at that position
  * the second [4] index is the target base at that position
  */
-static double cfdMmScores[20][4][4] =
+static float cfdMmScores[20][4][4] =
 {
     {	/* 0 */
 	{ -1, 0.857142857, 1.0, 1.0 },
 	{ 1.0, -1, 0.913043478, 1.0 },
 	{ 0.9, 0.714285714, -1, 1.0 },
 	{ 1.0, 0.857142857, 0.956521739, -1 },
     },
     {	/* 1 */
 	{ -1, 0.785714286, 0.8, 0.727272727 },
 	{ 0.727272727, -1, 0.695652174, 0.909090909 },
 	{ 0.846153846, 0.692307692, -1, 0.636363636 },
 	{ 0.846153846, 0.857142857, 0.84, -1 },
     },
     {	/* 2 */
 	{ -1, 0.428571429, 0.611111111, 0.705882353 },
@@ -850,118 +851,131 @@
     {	/* 18 */
 	{ -1, 0.206896552, 0.375, 0.538461538 },
 	{ 0.428571429, -1, 0.125, 0.461538462 },
 	{ 0.714285714, 0.448275862, -1, 0.666666667 },
 	{ 0.285714286, 0.275862069, 0.25, -1 },
     },
     {	/* 19 */
 	{ -1, 0.227272727, 0.764705882, 0.6 },
 	{ 0.5, -1, 0.058823529, 0.3 },
 	{ 0.9375, 0.428571429, -1, 0.7 },
 	{ 0.5625, 0.090909091, 0.176470588, -1 },
     },
 };
 
 /* Cutting Frequency Determination */
-static double calcCfdScore(long long sequence1, long long sequence2)
+static float calcCfdScore(long long sequence1, long long sequence2)
 /* calcCfdScore - from cfd_score_calculator.py script and paper:
  *    https://www.ncbi.nlm.nih.gov/pubmed/27380939 */
 {
-double score = 1.0;
+float score = 1.0;
 /* the XOR determine differences in two sequences, the
  * shift right 6 removes the PAM sequence and
  * the 'fortyBits &' eliminates the negativeStrand bit
  */
 long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6);
 long long misMatchBitMask = 0xc000000000;
 long long twoBitMask = 0x300000000000;
 int shiftRight = 44;	/* to move the 2bits to bits 1,0 */
 int index = 0;
 
 while (misMatchBitMask)
     {
     if (misMatchBitMask & misMatch)
 	{
         int queryIndex = (sequence1 & twoBitMask) >> shiftRight;
         int targetIndex = (sequence2 & twoBitMask) >> shiftRight;
 	score *= cfdMmScores[index][queryIndex][targetIndex];
 	}
     twoBitMask >>= 2;
     misMatchBitMask >>= 2;
     shiftRight -= 2;
     ++index;
     }
 int pam1 = (sequence2 & 0xc) >> 2;
 int pam2 = (sequence2 & 0x3);
 score *= pamScores[pam1][pam2];
 
 return score;
-}	// static double calcCfdScore(long long sequence1, long long sequence2)
+}	// static float calcCfdScore(long long sequence1, long long sequence2)
 
 static void misMatchString(char *stringReturn, long long misMatch)
 /* return ascii string ...*.....*.*.....*.. to indicate mis matches */
 {
 int i = 0;
 long long twoBitMask = 0xc000000000;
 while (twoBitMask)
     {
     stringReturn[i] = '.';
     if (twoBitMask & misMatch)
 	stringReturn[i] = '*';
     ++i;
     twoBitMask >>= 2;
     }
 stringReturn[i] = 0;
 }	// static void misMatchString(char *returnString, long long misMatch)
 
 static void recordOffTargets(struct crisprList *query,
     struct crisprList *target, int bitsOn, long long qIndex,
 	long long tIndex, long long twoBitMisMatch)
 /* bitsOn is from 1 to 4, record this match when less than 1000 total */
 {
+float mitScore =
+    calcHitScore(query->sequence[qIndex], target->sequence[tIndex]);
+float cfdScore =
+    calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]);
+
+/* Note from Max's script:
+ *	this is a heuristic based on the guideSeq data where alternative
+ *	PAMs represent only ~10% of all cleaveage events.
+ *	We divide the MIT score by 5 to make sure that these off-targets
+ *	are not ranked among the top but still appear in the list somewhat
+ */
+if ( (endsGG == (query->sequence[qIndex] & 0xf)) &&
+	(endsGG != (target->sequence[tIndex] & 0xf) ) )
+    mitScore *= 0.2;
+
+query->mitSum[qIndex] += mitScore;
+
 if (query->offBy[0][qIndex] ) // no need to accumulate if 0 mismatch > 0
     return;
 
 if (offFile)
     {
     char queryString[33];	/* local storage for string */
     char targetString[33];	/* local storage for string */
     char queryPAM[33];	/* local storage for string */
     char targetPAM[33];	/* local storage for string */
     char misMatch[33];	/* ...*.....*.*... ascii string represent misMatch */
 
     int i = 0;
     int bitsOnSum = 0;
     for (i = 1; i < 5; ++i)
         bitsOnSum += query->offBy[i][qIndex];
 
     if (bitsOnSum < 1000)	// could be command line option limit
         {
-        double hitScore =
-		calcHitScore(query->sequence[qIndex], target->sequence[tIndex]);
-        double cfdScore =
-		calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]);
         misMatchString(misMatch, twoBitMisMatch);
         kmerValToString(queryString, query->sequence[qIndex], pamSize);
         kmerValToString(targetString, target->sequence[tIndex], pamSize);
 	kmerPAMString(queryPAM, query->sequence[qIndex]);
 	kmerPAMString(targetPAM, target->sequence[tIndex]);
         fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f %.8f\t%s:%lld %c\n",
 	    query->chrom, query->start[qIndex],
 		negativeStrand & query->sequence[qIndex] ? '-' : '+',
                 queryString, queryPAM, targetString, targetPAM,
-                misMatch, bitsOn, hitScore, cfdScore, target->chrom,
+                misMatch, bitsOn, mitScore, cfdScore, target->chrom,
 		target->start[tIndex],
 		negativeStrand & target->sequence[tIndex] ? '-' : '+');
         }
     }
 }	//	static void recordOffTargets(struct crisprList *query,
 	//	    struct crisprList *target, int bitsOn, long long qIndex,
 	//		long long tIndex)
 
 
 /* this queryVsTarget can be used by threads, it should be safe,
  * remains to be proven.
  */
 static void queryVsTarget(struct crisprList *query, struct crisprList *target,
     int threadCount, int threadId)
 /* run the query crisprs list against the target list in the array structures */