118ed185e6dd714a9896cfbc8287ca81d9decfbb hiram Mon Jul 17 23:14:43 2017 -0700 adding the CFD score calculations refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 662d4f9..93fc106 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -540,44 +540,46 @@ dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; long elapsedMs = 0; long scanStart = clock1000(); long long totalCrisprs = 0; /* scanning all sequences, setting up crisprs on the listReturn */ while ((seq = dnaLoadNext(dl)) != NULL) { if (startsWithNoCase("chrUn", seq->name) || rStringIn("hap", seq->name) || rStringIn("_alt", seq->name) ) { - verbose(1, "# skip chrom: %s\n", seq->name); + verbose(4, "# skip chrom: %s\n", seq->name); continue; } long startTime = clock1000(); struct crisprList *oneList = generateKmers(seq); slAddHead(&listReturn, oneList); totalCrisprs += oneList->crisprCount; elapsedMs = clock1000() - startTime; + if (verboseLevel() > 3) timingMessage(seq->name, oneList->crisprCount, "crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); } elapsedMs = clock1000() - scanStart; timingMessage("scanSequence", totalCrisprs, "total crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); + return listReturn; } /* static crisprList *scanSequence(char *inFile) */ static struct crisprList *rangeExtraction(struct crisprList **allReference) /* given ranges in global rangesHash, construct new list of crisprs that * have any type of overlap with the ranges, also extract those items from * the all list. Returns new list. */ { struct crisprList *all = *allReference; struct crisprList *listReturn = NULL; struct crisprList *list = NULL; int inputChromCount = slCount(all); long long returnListCrisprCount = 0; long long examinedCrisprCount = 0; @@ -659,34 +661,40 @@ timingMessage("range scan", targetCrisprCount, "remaining target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); timingMessage("range scan", returnListCrisprCount, "returned query crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); if (NULL == all) { allReference = NULL; // they have all been removed verbose(1, "# range scan, every single chrom has been removed\n"); } else if (*allReference != all) { verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all); *allReference = all; } return listReturn; } // static crisprList *rangeExtraction(crisprList *all) -static double hitScoreM[20] = {0,0,0.014,0,0,0.395,0.317,0,0.389,0.079,0.445,0.508,0.613,0.851,0.732,0.828,0.615,0.804,0.685,0.583}; +static double hitScoreM[20] = +{ + 0,0,0.014,0,0, + 0.395,0.317,0,0.389,0.079, + 0.445,0.508,0.613,0.851,0.732, + 0.828,0.615,0.804,0.685,0.583 +}; static double calcHitScore(long long sequence1, long long sequence2) -/* calcHitScore - from Max' crispor.py script and paper: +/* calcHitScore - from Max's crispor.py script and paper: * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { double score1 = 1.0; int mmCount = 0; int lastMmPos = -1; /* the XOR determines differences in two sequences, the shift * right 6 removes the PAM sequence and the 'fortyBits &' eliminates * the negativeStrand bit */ long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); int distCount = 0; int distSum = 0; int pos = 0; for (pos = 0; pos < guideSize; ++pos) { @@ -706,92 +714,254 @@ } double score2 = 1.0; if (distCount > 1) { double avgDist = (double)distSum / distCount; score2 = 1.0 / (((19-avgDist)/19.0) * 4 + 1); } double score3 = 1.0; if (mmCount > 0) score3 = 1.0 / (mmCount * mmCount); return (score1 * score2 * score3 * 100); } // static double calcHitScore(long long sequence1, long long sequence2) -#ifdef NOT +static double pamScores[4][4] = { + { 0.0, 0.0, 0.25925926, 0.0 }, /* A[ACGT] */ + { 0.0, 0.0, 0.107142857, 0.0 }, /* C[ACGT] */ + { 0.069444444, 0.022222222, 1.0, 0.016129032 }, /* G[ACGT] */ + { 0.0, 0.0, 0.03896104, 0.0 } /* T[ACGT] */ +}; + +/* the [20] index is the position in the string + * the first [4] index is the query base at that position + * the second [4] index is the target base at that position + */ +static double cfdMmScores[20][4][4] = +{ + { /* 0 */ + { -1, 0.857142857, 1.0, 1.0 }, + { 1.0, -1, 0.913043478, 1.0 }, + { 0.9, 0.714285714, -1, 1.0 }, + { 1.0, 0.857142857, 0.956521739, -1 }, + }, + { /* 1 */ + { -1, 0.785714286, 0.8, 0.727272727 }, + { 0.727272727, -1, 0.695652174, 0.909090909 }, + { 0.846153846, 0.692307692, -1, 0.636363636 }, + { 0.846153846, 0.857142857, 0.84, -1 }, + }, + { /* 2 */ + { -1, 0.428571429, 0.611111111, 0.705882353 }, + { 0.866666667, -1, 0.5, 0.6875 }, + { 0.75, 0.384615385, -1, 0.5 }, + { 0.714285714, 0.428571429, 0.5, -1 }, + }, + { /* 3 */ + { -1, 0.352941176, 0.625, 0.636363636 }, + { 0.842105263, -1, 0.5, 0.8 }, + { 0.9, 0.529411765, -1, 0.363636364 }, + { 0.476190476, 0.647058824, 0.625, -1 }, + }, + { /* 4 */ + { -1, 0.5, 0.72, 0.363636364 }, + { 0.571428571, -1, 0.6, 0.636363636 }, + { 0.866666667, 0.785714286, -1, 0.3 }, + { 0.5, 1.0, 0.64, -1 }, + }, + { /* 5 */ + { -1, 0.454545455, 0.714285714, 0.714285714 }, + { 0.928571429, -1, 0.5, 0.928571429 }, + { 1.0, 0.681818182, -1, 0.666666667 }, + { 0.866666667, 0.909090909, 0.571428571, -1 }, + }, + { /* 6 */ + { -1, 0.4375, 0.705882353, 0.4375 }, + { 0.75, -1, 0.470588235, 0.8125 }, + { 1.0, 0.6875, -1, 0.571428571 }, + { 0.875, 0.6875, 0.588235294, -1 }, + }, + { /* 7 */ + { -1, 0.428571429, 0.733333333, 0.428571429 }, + { 0.65, -1, 0.642857143, 0.875 }, + { 1.0, 0.615384615, -1, 0.625 }, + { 0.8, 1.0, 0.733333333, -1 }, + }, + { /* 8 */ + { -1, 0.571428571, 0.666666667, 0.6 }, + { 0.857142857, -1, 0.619047619, 0.875 }, + { 0.642857143, 0.538461538, -1, 0.533333333 }, + { 0.928571429, 0.923076923, 0.619047619, -1 }, + }, + { /* 9 */ + { -1, 0.333333333, 0.555555556, 0.882352941 }, + { 0.866666667, -1, 0.388888889, 0.941176471 }, + { 0.933333333, 0.4, -1, 0.8125 }, + { 0.857142857, 0.533333333, 0.5, -1 }, + }, + { /* 10 */ + { -1, 0.4, 0.65, 0.307692308 }, + { 0.75, -1, 0.25, 0.307692308 }, + { 1.0, 0.428571429, -1, 0.384615385 }, + { 0.75, 0.666666667, 0.4, -1 }, + }, + { /* 11 */ + { -1, 0.263157895, 0.722222222, 0.333333333 }, + { 0.714285714, -1, 0.444444444, 0.538461538 }, + { 0.933333333, 0.529411765, -1, 0.384615385 }, + { 0.8, 0.947368421, 0.5, -1 }, + }, + { /* 12 */ + { -1, 0.210526316, 0.652173913, 0.3 }, + { 0.384615385, -1, 0.136363636, 0.7 }, + { 0.923076923, 0.421052632, -1, 0.3 }, + { 0.692307692, 0.789473684, 0.260869565, -1 }, + }, + { /* 13 */ + { -1, 0.214285714, 0.466666667, 0.533333333 }, + { 0.35, -1, 0.0, 0.733333333 }, + { 0.75, 0.428571429, -1, 0.266666667 }, + { 0.619047619, 0.285714286, 0.0, -1 }, + }, + { /* 14 */ + { -1, 0.272727273, 0.65, 0.2 }, + { 0.222222222, -1, 0.05, 0.066666667 }, + { 0.941176471, 0.272727273, -1, 0.142857143 }, + { 0.578947368, 0.272727273, 0.05, -1 }, + }, + { /* 15 */ + { -1, 0.0, 0.192307692, 0.0 }, + { 1.0, -1, 0.153846154, 0.307692308 }, + { 1.0, 0.0, -1, 0.0 }, + { 0.909090909, 0.666666667, 0.346153846, -1 }, + }, + { /* 16 */ + { -1, 0.176470588, 0.176470588, 0.133333333 }, + { 0.466666667, -1, 0.058823529, 0.466666667 }, + { 0.933333333, 0.235294118, -1, 0.25 }, + { 0.533333333, 0.705882353, 0.117647059, -1 }, + }, + { /* 17 */ + { -1, 0.19047619, 0.4, 0.5 }, + { 0.538461538, -1, 0.133333333, 0.642857143 }, + { 0.692307692, 0.476190476, -1, 0.666666667 }, + { 0.666666667, 0.428571429, 0.333333333, -1 }, + }, + { /* 18 */ + { -1, 0.206896552, 0.375, 0.538461538 }, + { 0.428571429, -1, 0.125, 0.461538462 }, + { 0.714285714, 0.448275862, -1, 0.666666667 }, + { 0.285714286, 0.275862069, 0.25, -1 }, + }, + { /* 19 */ + { -1, 0.227272727, 0.764705882, 0.6 }, + { 0.5, -1, 0.058823529, 0.3 }, + { 0.9375, 0.428571429, -1, 0.7 }, + { 0.5625, 0.090909091, 0.176470588, -1 }, + }, +}; + +/* Cutting Frequency Determination */ static double calcCfdScore(long long sequence1, long long sequence2) -/* calcCfdScore - from Max' crispor.py script and paper: +/* calcCfdScore - from cfd_score_calculator.py script and paper: * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { -return 0.0; +double score = 1.0; +/* the XOR determine differences in two sequences, the + * shift right 6 removes the PAM sequence and + * the 'fortyBits &' eliminates the negativeStrand bit + */ +long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); +long long misMatchBitMask = 0xc000000000; +long long twoBitMask = 0x300000000000; +int shiftRight = 44; /* to move the 2bits to bits 1,0 */ +int index = 0; + +while (misMatchBitMask) + { + if (misMatchBitMask & misMatch) + { + int queryIndex = (sequence1 & twoBitMask) >> shiftRight; + int targetIndex = (sequence2 & twoBitMask) >> shiftRight; + score *= cfdMmScores[index][queryIndex][targetIndex]; + } + twoBitMask >>= 2; + misMatchBitMask >>= 2; + shiftRight -= 2; + ++index; + } +int pam1 = (sequence2 & 0xc) >> 2; +int pam2 = (sequence2 & 0x3); +score *= pamScores[pam1][pam2]; + +return score; } // static double calcCfdScore(long long sequence1, long long sequence2) -#endif static void misMatchString(char *stringReturn, long long misMatch) /* return ascii string ...*.....*.*.....*.. to indicate mis matches */ { int i = 0; -long long bitMask = 0xc000000000; -while (bitMask) +long long twoBitMask = 0xc000000000; +while (twoBitMask) { stringReturn[i] = '.'; - if (bitMask & misMatch) + if (twoBitMask & misMatch) stringReturn[i] = '*'; ++i; - bitMask >>= 2; + twoBitMask >>= 2; } stringReturn[i] = 0; } // static void misMatchString(char *returnString, long long misMatch) static void recordOffTargets(struct crisprList *query, struct crisprList *target, int bitsOn, long long qIndex, long long tIndex, long long twoBitMisMatch) /* bitsOn is from 1 to 4, record this match when less than 1000 total */ { if (query->offBy[0][qIndex] ) // no need to accumulate if 0 mismatch > 0 return; if (offFile) { char queryString[33]; /* local storage for string */ char targetString[33]; /* local storage for string */ char queryPAM[33]; /* local storage for string */ char targetPAM[33]; /* local storage for string */ char misMatch[33]; /* ...*.....*.*... ascii string represent misMatch */ int i = 0; int bitsOnSum = 0; for (i = 1; i < 5; ++i) bitsOnSum += query->offBy[i][qIndex]; if (bitsOnSum < 1000) // could be command line option limit { double hitScore = calcHitScore(query->sequence[qIndex], target->sequence[tIndex]); -// double cfdScore = -// calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]); + double cfdScore = + calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]); misMatchString(misMatch, twoBitMisMatch); kmerValToString(queryString, query->sequence[qIndex], pamSize); kmerValToString(targetString, target->sequence[tIndex], pamSize); kmerPAMString(queryPAM, query->sequence[qIndex]); kmerPAMString(targetPAM, target->sequence[tIndex]); - fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f\t%s:%lld %c\n", + fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f %.8f\t%s:%lld %c\n", query->chrom, query->start[qIndex], negativeStrand & query->sequence[qIndex] ? '-' : '+', queryString, queryPAM, targetString, targetPAM, - misMatch, bitsOn, hitScore, target->chrom, + misMatch, bitsOn, hitScore, cfdScore, target->chrom, target->start[tIndex], negativeStrand & target->sequence[tIndex] ? '-' : '+'); } } } // static void recordOffTargets(struct crisprList *query, // struct crisprList *target, int bitsOn, long long qIndex, // long long tIndex) /* this queryVsTarget can be used by threads, it should be safe, * remains to be proven. */ static void queryVsTarget(struct crisprList *query, struct crisprList *target, int threadCount, int threadId) /* run the query crisprs list against the target list in the array structures */