d821d48d0f454a343f806d2567929bad4be1a09c hiram Tue Jul 18 13:03:50 2017 -0700 looks like MIT and CFD scores are functioning correctly now refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 93fc106..7460eac 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -144,57 +144,63 @@ long long start; /* chromosome start 0-relative */ }; // sizeof(struct crisprList): 72 struct crisprList /* all chromosome sets of crisprs */ { struct crisprList *next; /* Next in list. */ char *chrom; /* chrom name */ int size; /* chrom size */ struct crispr *chromCrisprs; /* all the crisprs on this chrom */ long long crisprCount; /* number of crisprs on this chrom */ long long *sequence; /* array of the sequences */ long long *start; /* array of the starts */ int **offBy; /* offBy[5][n] */ + float *mitSum; /* accumulating sum of MIT scores */ }; struct threadControl /* data passed to thread to control execution */ { int threadId; /* this thread Id */ int threadCount; /* total threads running */ struct crisprList *query; /* running query guides against */ struct crisprList *target; /* target guides */ }; struct loopControl /* calculate index for start to < end processing for multiple threads */ { long long listStart; long long listEnd; }; /* base values here are different than standard '2bit' format * these are in order numerically and alphabetically * plus they complement to their complement with an XOR with 0x3 */ #define A_BASE 0 #define C_BASE 1 #define G_BASE 2 #define T_BASE 3 #define U_BASE 3 +#define endsGG ((G_BASE << 2) | G_BASE) +#define endsAG ((A_BASE << 2) | G_BASE) +#define beginsCT ((long long)((C_BASE << 2) | T_BASE) << 42) +#define beginsCC ((long long)((C_BASE << 2) | C_BASE) << 42) + static int orderedNtVal[256]; /* values in alpha order: ACGT 00 01 10 11 */ /* for easier sorting and complementing */ static char bases[4]; /* for binary to ascii conversion */ static void initOrderedNtVal() /* initialization of base value lookup arrays */ { int i; for (i=0; inext) { size_t memSize = cl->crisprCount * sizeof(long long); cl->sequence = (long long *)needLargeMem(memSize); cl->start = (long long *)needLargeMem(memSize); memSize = 5 * sizeof(int *); cl->offBy = (int **)needLargeMem(memSize); memSize = cl->crisprCount * sizeof(int); int r = 0; for (r = 0; r < 5; ++r) cl->offBy[r] = (int *)needLargeZeroedMem(memSize); + memSize = cl->crisprCount * sizeof(float); + cl->mitSum = (float *)needLargeMem(memSize); long long i = 0; struct crispr *c = NULL; for (c = cl->chromCrisprs; c; c = c->next) { ++itemsCopied; cl->sequence[i] = c->sequence; cl->start[i] = c->start; + cl->mitSum[i] = 0.0; ++i; } } long elapsedMs = clock1000() - startTime; timingMessage("copyToArray", itemsCopied, "items copied", elapsedMs, "items/sec", "seconds/item"); } // static void copyToArray(struct crisprList *list) */ static struct crisprList *generateKmers(struct dnaSeq *seq) { struct crispr *crisprSet = NULL; struct crisprList *returnList = NULL; AllocVar(returnList); returnList->chrom = cloneString(seq->name); returnList->size = seq->size; int i; DNA *dna; long long chromPosition = 0; long long startGap = 0; long long gapCount = 0; int kmerLength = 0; long long kmerVal = 0; -long long endsAG = (A_BASE << 2) | G_BASE; -long long endsGG = (G_BASE << 2) | G_BASE; -long long beginsCT = (long long)((C_BASE << 2) | T_BASE) << 42; -long long beginsCC = (long long)((C_BASE << 2) | C_BASE) << 42; long long reverseMask = (long long)0xf << 42; -verbose(4, "# endsAG: %032llx\n", endsAG); -verbose(4, "# endsGG: %032llx\n", endsGG); -verbose(4, "# beginsCT: %032llx\n", beginsCT); -verbose(4, "# beginsCC: %032llx\n", beginsCC); verbose(4, "# 46 bits: %032llx\n", (long long) fortySixBits); dna=seq->dna; for (i=0; i < seq->size; ++i) { int val = orderedNtVal[(int)dna[i]]; if (val >= 0) { kmerVal = fortySixBits & ((kmerVal << 2) | val); ++kmerLength; if (kmerLength > 22) { if ((endsAG == (kmerVal & 0xf)) || (endsGG == (kmerVal & 0xf))) { /* have match for positive strand */ struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); oneCrispr->start = chromPosition - 22; oneCrispr->sequence = kmerVal; slAddHead(&crisprSet, oneCrispr); } - if ((beginsCT == (kmerVal & reverseMask)) || (beginsCC == (kmerVal & reverseMask))) + if ((beginsCT == (kmerVal & reverseMask)) + || (beginsCC == (kmerVal & reverseMask))) { /* have match for negative strand */ struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); oneCrispr->start = chromPosition - 22; oneCrispr->sequence = revComp(kmerVal); oneCrispr->sequence |= negativeStrand; slAddHead(&crisprSet, oneCrispr); } } } // if (val >= 0) else { ++gapCount; startGap = chromPosition; /* skip all N's == any value not = 0,1,2,3 */ @@ -494,47 +496,46 @@ { long long c = 0; for (c = 0; c < list->crisprCount; ++c) { ++itemsOutput; int negativeOffset = 0; char strand = '+'; if (negativeStrand & list->sequence[c]) { strand = '-'; negativeOffset = pamSize; } long long txStart = list->start[c] + negativeOffset; long long txEnd = txStart + guideSize; - int totalOffs = list->offBy[0][c] + list->offBy[1][c] + + int mitScoreCount = + list->offBy[1][c] + list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c]; char *color = itemColor(list->offBy, c); char kmerString[33]; kmerValToString(kmerString, list->sequence[c], pamSize); char pamString[33]; kmerPAMString(pamString, list->sequence[c]); - if (0 == totalOffs) -verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString); - if (bedFH) - fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); - -// if (list->offBy[0][c]) -// verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString); + { + long mitScore = 0; + if (mitScoreCount > 0) /* note: interger <- float conversion */ + mitScore = roundf((list->mitSum[c] / (float) mitScoreCount)); + fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%ld\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, kmerString, pamString, mitScore, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); + } ++totalOut; } } if (bedFH) carefulClose(&bedFH); long elapsedMs = clock1000() - startTime; timingMessage("countsOutput:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item"); } // static void countsOutput(struct crisprList *all, FILE *bedFH) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of crisprs */ { verbose(1, "#\tscanning sequence file: %s\n", inFile); dnaUtilOpen(); @@ -661,95 +662,95 @@ timingMessage("range scan", targetCrisprCount, "remaining target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); timingMessage("range scan", returnListCrisprCount, "returned query crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); if (NULL == all) { allReference = NULL; // they have all been removed verbose(1, "# range scan, every single chrom has been removed\n"); } else if (*allReference != all) { verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all); *allReference = all; } return listReturn; } // static crisprList *rangeExtraction(crisprList *all) -static double hitScoreM[20] = +static float hitScoreM[20] = { 0,0,0.014,0,0, 0.395,0.317,0,0.389,0.079, 0.445,0.508,0.613,0.851,0.732, 0.828,0.615,0.804,0.685,0.583 }; -static double calcHitScore(long long sequence1, long long sequence2) +static float calcHitScore(long long sequence1, long long sequence2) /* calcHitScore - from Max's crispor.py script and paper: * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { -double score1 = 1.0; +float score1 = 1.0; int mmCount = 0; int lastMmPos = -1; /* the XOR determines differences in two sequences, the shift * right 6 removes the PAM sequence and the 'fortyBits &' eliminates * the negativeStrand bit */ long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); int distCount = 0; int distSum = 0; int pos = 0; for (pos = 0; pos < guideSize; ++pos) { int diff = misMatch & 0x3; if (diff) { ++mmCount; if (lastMmPos != -1) { distSum += pos-lastMmPos; ++distCount; } score1 *= 1 - hitScoreM[pos]; lastMmPos = pos; } misMatch >>= 2; } -double score2 = 1.0; +float score2 = 1.0; if (distCount > 1) { - double avgDist = (double)distSum / distCount; + float avgDist = (float)distSum / distCount; score2 = 1.0 / (((19-avgDist)/19.0) * 4 + 1); } -double score3 = 1.0; +float score3 = 1.0; if (mmCount > 0) score3 = 1.0 / (mmCount * mmCount); return (score1 * score2 * score3 * 100); -} // static double calcHitScore(long long sequence1, long long sequence2) +} // static float calcHitScore(long long sequence1, long long sequence2) -static double pamScores[4][4] = { +static float pamScores[4][4] = { { 0.0, 0.0, 0.25925926, 0.0 }, /* A[ACGT] */ { 0.0, 0.0, 0.107142857, 0.0 }, /* C[ACGT] */ { 0.069444444, 0.022222222, 1.0, 0.016129032 }, /* G[ACGT] */ { 0.0, 0.0, 0.03896104, 0.0 } /* T[ACGT] */ }; /* the [20] index is the position in the string * the first [4] index is the query base at that position * the second [4] index is the target base at that position */ -static double cfdMmScores[20][4][4] = +static float cfdMmScores[20][4][4] = { { /* 0 */ { -1, 0.857142857, 1.0, 1.0 }, { 1.0, -1, 0.913043478, 1.0 }, { 0.9, 0.714285714, -1, 1.0 }, { 1.0, 0.857142857, 0.956521739, -1 }, }, { /* 1 */ { -1, 0.785714286, 0.8, 0.727272727 }, { 0.727272727, -1, 0.695652174, 0.909090909 }, { 0.846153846, 0.692307692, -1, 0.636363636 }, { 0.846153846, 0.857142857, 0.84, -1 }, }, { /* 2 */ { -1, 0.428571429, 0.611111111, 0.705882353 }, @@ -850,118 +851,131 @@ { /* 18 */ { -1, 0.206896552, 0.375, 0.538461538 }, { 0.428571429, -1, 0.125, 0.461538462 }, { 0.714285714, 0.448275862, -1, 0.666666667 }, { 0.285714286, 0.275862069, 0.25, -1 }, }, { /* 19 */ { -1, 0.227272727, 0.764705882, 0.6 }, { 0.5, -1, 0.058823529, 0.3 }, { 0.9375, 0.428571429, -1, 0.7 }, { 0.5625, 0.090909091, 0.176470588, -1 }, }, }; /* Cutting Frequency Determination */ -static double calcCfdScore(long long sequence1, long long sequence2) +static float calcCfdScore(long long sequence1, long long sequence2) /* calcCfdScore - from cfd_score_calculator.py script and paper: * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { -double score = 1.0; +float score = 1.0; /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and * the 'fortyBits &' eliminates the negativeStrand bit */ long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); long long misMatchBitMask = 0xc000000000; long long twoBitMask = 0x300000000000; int shiftRight = 44; /* to move the 2bits to bits 1,0 */ int index = 0; while (misMatchBitMask) { if (misMatchBitMask & misMatch) { int queryIndex = (sequence1 & twoBitMask) >> shiftRight; int targetIndex = (sequence2 & twoBitMask) >> shiftRight; score *= cfdMmScores[index][queryIndex][targetIndex]; } twoBitMask >>= 2; misMatchBitMask >>= 2; shiftRight -= 2; ++index; } int pam1 = (sequence2 & 0xc) >> 2; int pam2 = (sequence2 & 0x3); score *= pamScores[pam1][pam2]; return score; -} // static double calcCfdScore(long long sequence1, long long sequence2) +} // static float calcCfdScore(long long sequence1, long long sequence2) static void misMatchString(char *stringReturn, long long misMatch) /* return ascii string ...*.....*.*.....*.. to indicate mis matches */ { int i = 0; long long twoBitMask = 0xc000000000; while (twoBitMask) { stringReturn[i] = '.'; if (twoBitMask & misMatch) stringReturn[i] = '*'; ++i; twoBitMask >>= 2; } stringReturn[i] = 0; } // static void misMatchString(char *returnString, long long misMatch) static void recordOffTargets(struct crisprList *query, struct crisprList *target, int bitsOn, long long qIndex, long long tIndex, long long twoBitMisMatch) /* bitsOn is from 1 to 4, record this match when less than 1000 total */ { +float mitScore = + calcHitScore(query->sequence[qIndex], target->sequence[tIndex]); +float cfdScore = + calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]); + +/* Note from Max's script: + * this is a heuristic based on the guideSeq data where alternative + * PAMs represent only ~10% of all cleaveage events. + * We divide the MIT score by 5 to make sure that these off-targets + * are not ranked among the top but still appear in the list somewhat + */ +if ( (endsGG == (query->sequence[qIndex] & 0xf)) && + (endsGG != (target->sequence[tIndex] & 0xf) ) ) + mitScore *= 0.2; + +query->mitSum[qIndex] += mitScore; + if (query->offBy[0][qIndex] ) // no need to accumulate if 0 mismatch > 0 return; if (offFile) { char queryString[33]; /* local storage for string */ char targetString[33]; /* local storage for string */ char queryPAM[33]; /* local storage for string */ char targetPAM[33]; /* local storage for string */ char misMatch[33]; /* ...*.....*.*... ascii string represent misMatch */ int i = 0; int bitsOnSum = 0; for (i = 1; i < 5; ++i) bitsOnSum += query->offBy[i][qIndex]; if (bitsOnSum < 1000) // could be command line option limit { - double hitScore = - calcHitScore(query->sequence[qIndex], target->sequence[tIndex]); - double cfdScore = - calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]); misMatchString(misMatch, twoBitMisMatch); kmerValToString(queryString, query->sequence[qIndex], pamSize); kmerValToString(targetString, target->sequence[tIndex], pamSize); kmerPAMString(queryPAM, query->sequence[qIndex]); kmerPAMString(targetPAM, target->sequence[tIndex]); fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f %.8f\t%s:%lld %c\n", query->chrom, query->start[qIndex], negativeStrand & query->sequence[qIndex] ? '-' : '+', queryString, queryPAM, targetString, targetPAM, - misMatch, bitsOn, hitScore, cfdScore, target->chrom, + misMatch, bitsOn, mitScore, cfdScore, target->chrom, target->start[tIndex], negativeStrand & target->sequence[tIndex] ? '-' : '+'); } } } // static void recordOffTargets(struct crisprList *query, // struct crisprList *target, int bitsOn, long long qIndex, // long long tIndex) /* this queryVsTarget can be used by threads, it should be safe, * remains to be proven. */ static void queryVsTarget(struct crisprList *query, struct crisprList *target, int threadCount, int threadId) /* run the query crisprs list against the target list in the array structures */