0e23fd19b0162759909cee1861a70decad3e85f0 hiram Tue Jul 25 22:04:10 2017 -0700 marking duplicates during first queryVsSelf refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 5da7ab7..2a4ea99 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -98,54 +98,56 @@ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"offTargets", OPTION_STRING}, {"ranges", OPTION_STRING}, {"threads", OPTION_INT}, {"dumpGuides", OPTION_STRING}, {"loadKmers", OPTION_STRING}, {NULL, 0}, }; #define guideSize 20 // 20 bases #define pamSize 3 // 3 bases #define negativeStrand 0x0000800000000000 +#define duplicateGuide 0x0001000000000000 #define fortyEightBits 0xffffffffffff #define fortySixBits 0x3fffffffffff #define fortyBits 0xffffffffff #define high32bits 0xffffffff00000000 #define low32bits 0x00000000ffffffff #define high16bits 0xffff0000ffff0000 #define low16bits 0x0000ffff0000ffff #define high8bits 0xff00ff00ff00ff00 #define low8bits 0x00ff00ff00ff00ff #define high4bits 0xf0f0f0f0f0f0f0f0 #define low4bits 0x0f0f0f0f0f0f0f0f #define high2bits 0xcccccccccccccccc #define low2bits 0x3333333333333333 #define pamMask 0x3f // 0x0000 0000 0000 0000 // 6348 4732 3116 15-0 // sequence word structure: // bits 5-0 - PAM sequence in 2bit encoding for 3 bases // bits 45-6 - 20 base sequence in 2bit encoding format // bit 47 - negative strand indication +// bit 48 - duplicate guide indication // considering using other bits during processing to mark // an item for no more consideration // sizeof(struct crispr): 32 struct crispr /* one chromosome set of guides */ { struct crispr *next; /* Next in list. */ long long sequence; /* sequence value in 2bit format */ long long start; /* chromosome start 0-relative */ }; // sizeof(struct crisprList): 72 struct crisprList /* all chromosome sets of guides */ @@ -469,84 +471,89 @@ static char *green = "50,205,50"; /* #32cd32 */ static char *yellow = "255,255,0"; /* #ffff00 */ static char *black = "0,0,0"; static char *red = "170,1,20"; /* #aa0114 */ if (mitScore > 50) return green; else if (mitScore > 20) return yellow; else if (mitScore == -1) return black; else return red; } static void countsOutput(struct crisprList *all, FILE *bedFH) -/* everything has been scanned and counted, print out all the data from arrays*/ +/* all done scanning and counting, print out all the guide data */ { long startTime = clock1000(); long long itemsOutput = 0; +long long duplicatesMarked = 0; struct crisprList *list; long long totalOut = 0; for (list = all; list; list = list->next) { long long c; for (c = 0; c < list->crisprCount; ++c) { ++itemsOutput; + if (list->sequence[c] & duplicateGuide) + ++duplicatesMarked; int negativeOffset = 0; char strand = '+'; if (negativeStrand & list->sequence[c]) { strand = '-'; negativeOffset = pamSize; } long long txStart = list->start[c] + negativeOffset; long long txEnd = txStart + guideSize; int mitScoreCount = + list->offBy[1][c] + list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c]; char kmerString[33]; kmerValToString(kmerString, list->sequence[c], pamSize); char pamString[33]; kmerPAMString(pamString, list->sequence[c]); if (bedFH) { long mitScore = 1.0; if (mitScoreCount > 0) /* note: interger <- float conversion */ { mitScore = roundf((list->mitSum[c] / (float) mitScoreCount)); } - else if (list->offBy[0][c] > 0) + else if (list->sequence[c] & duplicateGuide) { mitScore = 0.0; } char *color = scoreToColor(mitScore); /* the end zero will be filled in by post-processing, it will be * the _offset into the crisprDetails.tab file */ fprintf(bedFH, "%s\t%lld\t%lld\t\t%ld\t%c\t%lld\t%lld\t%s\t%s\t%s\t%d,%d,%d,%d,%d\t0\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, mitScore, strand, txStart, txEnd, color, kmerString, pamString, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); } ++totalOut; } } if (bedFH) carefulClose(&bedFH); +verbose(1, "# countsOutput: %lld guides marked as duplicate sequence\n", + duplicatesMarked); timingMessage("countsOutput:", itemsOutput, "items output", startTime, "items/sec", "seconds/item"); } // static void countsOutput(struct crisprList *all, FILE *bedFH) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of guides */ { verbose(1, "# scanning sequence file: %s\n", inFile); dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; long startTime = clock1000(); @@ -689,31 +696,31 @@ 0,0,0.014,0,0, 0.395,0.317,0,0.389,0.079, 0.445,0.508,0.613,0.851,0.732, 0.828,0.615,0.804,0.685,0.583 }; static float calcHitScore(long long sequence1, long long sequence2) /* calcHitScore - from Max's crispor.py script and paper: * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { float score1 = 1.0; int mmCount = 0; int lastMmPos = -1; /* the XOR determines differences in two sequences, the shift * right 6 removes the PAM sequence and the 'fortyBits &' eliminates - * the negativeStrand bit + * the negativeStrand and duplicateGuide bits */ long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); int distCount = 0; int distSum = 0; int pos; for (pos = 0; pos < guideSize; ++pos) { int diff = misMatch & 0x3; if (diff) { ++mmCount; if (lastMmPos != -1) { distSum += pos-lastMmPos; ++distCount; @@ -868,31 +875,31 @@ { -1, 0.227272727, 0.764705882, 0.6 }, { 0.5, -1, 0.058823529, 0.3 }, { 0.9375, 0.428571429, -1, 0.7 }, { 0.5625, 0.090909091, 0.176470588, -1 }, }, }; /* Cutting Frequency Determination */ static float calcCfdScore(long long sequence1, long long sequence2) /* calcCfdScore - from cfd_score_calculator.py script and paper: * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { float score = 1.0; /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and - * the 'fortyBits &' eliminates the negativeStrand bit + * the 'fortyBits &' eliminates the negativeStrand and duplicateGuide bits */ long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); long long misMatchBitMask = 0xc000000000; long long twoBitMask = 0x300000000000; int shiftRight = 44; /* to move the 2bits to bits 1,0 */ int index = 0; while (misMatchBitMask) { if (misMatchBitMask & misMatch) { int queryIndex = (sequence1 & twoBitMask) >> shiftRight; int targetIndex = (sequence2 & twoBitMask) >> shiftRight; score *= cfdMmScores[index][queryIndex][targetIndex]; } @@ -988,158 +995,176 @@ /* this queryVsTarget can be used by threads, appears to be safe */ static void queryVsTarget(struct crisprList *query, struct crisprList *target, int threadCount, int threadId) /* run the query guides list against the target list in the array structures */ { struct crisprList *qList; long long totalCrisprsQuery = 0; long long totalCrisprsTarget = 0; long long totalCompares = 0; struct loopControl *control = NULL; AllocVar(control); long startTime = clock1000(); +long long duplicatesMarked = 0; for (qList = query; qList; qList = qList->next) { totalCrisprsQuery += qList->crisprCount; if (threadCount > 1) { setLoopEnds(control, qList->crisprCount, threadCount, threadId); if (control->listStart >= qList->crisprCount) continue; /* next chrom, no part to be done for this thread */ } else { control->listStart = 0; control->listEnd = qList->crisprCount; } verbose(1, "# thread %d of %d running %s %lld items [ %lld : %lld )\n", 1 + threadId, threadCount, qList->chrom, control->listEnd - control->listStart, control->listStart, control->listEnd); totalCrisprsTarget += control->listEnd - control->listStart; long long qCount; for (qCount = control->listStart; qCount < control->listEnd; ++qCount) { + if (qList->sequence[qCount] & duplicateGuide) + continue; /* already marked as duplicate */ struct crisprList *tList; for (tList = target; tList; tList = tList->next) { long long tCount; totalCompares += tList->crisprCount; for (tCount = 0; tCount < tList->crisprCount; ++tCount) { /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and - * the 'fortyBits &' eliminates the negativeStrand bit + * the 'fortyBits &' eliminates the negativeStrand and + * duplicateGuide bits */ long long misMatch = fortyBits & ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6); if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { - recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch); + recordOffTargets(qList, tList, bitsOn, qCount, + tCount, misMatch); qList->offBy[bitsOn][qCount] += 1; // tList->offBy[bitsOn][tCount] += 1; not needed } } else { /* no misMatch, identical guides */ qList->offBy[0][qCount] += 1; + qList->sequence[qCount] |= duplicateGuide; + ++duplicatesMarked; // tList->offBy[0][tCount] += 1; not needed } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) +verbose(1, "# queryVsTarget: an additional %lld duplicates marked\n", + duplicatesMarked); timingMessage("queryVsTarget", totalCrisprsQuery, "query guides processed", startTime, "guides/sec", "seconds/guide"); timingMessage("queryVsTarget", totalCrisprsTarget, "vs target guides", startTime, "guides/sec", "seconds/guide"); timingMessage("queryVsTarget", totalCompares, "total comparisons", startTime, "compares/sec", "seconds/compare"); } /* static struct crisprList *queryVsTarget(struct crisprList *query, struct crisprList *target) */ static void queryVsSelf(struct crisprList *all) /* run this 'all' list vs. itself avoiding self to self comparisons */ { struct crisprList *qList; long long totalCrisprsQuery = 0; long long totalCrisprsCompare = 0; long startTime = clock1000(); +long long duplicatesMarked = 0; + /* query runs through all chroms */ for (qList = all; qList; qList = qList->next) { long long qCount; totalCrisprsQuery += qList->crisprCount; verbose(1, "# queryVsSelf %lld query guides on chrom %s\n", qList->crisprCount, qList->chrom); for (qCount = 0; qCount < qList->crisprCount; ++qCount) { /* target starts on same chrom as query, and at next kmer after query for this first chrom */ long long tStart = qCount+1; struct crisprList *tList; + if (qList->sequence[qCount] & duplicateGuide) + continue; /* already marked as duplicate */ for (tList = qList; tList; tList = tList->next) { long long tCount; totalCrisprsCompare += tList->crisprCount - tStart; for (tCount = tStart; tCount < tList->crisprCount; ++tCount) { /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and - * the 'fortyBits &' eliminates the negativeStrand bit + * the 'fortyBits &' eliminates the negativeStrand and + * duplicateGuide bits */ long long misMatch = fortyBits & ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6); if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch); qList->offBy[bitsOn][qCount] += 1; tList->offBy[bitsOn][tCount] += 1; } } else { /* no misMatch, identical guides */ + qList->sequence[qCount] |= duplicateGuide; + tList->sequence[tCount] |= duplicateGuide; qList->offBy[0][qCount] += 1; tList->offBy[0][tCount] += 1; + ++duplicatesMarked; } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) tStart = 0; /* following chroms run through all */ } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) +verbose(1, "# queryVsSelf: counted %lld duplicate guides\n", duplicatesMarked); timingMessage("queryVsSelf", totalCrisprsQuery, "guides processed", startTime, "guides/sec", "seconds/guide"); timingMessage("queryVsSelf", totalCrisprsCompare, "total comparisons", startTime, "compares/sec", "seconds/compare"); } /* static void queryVsSelf(struct crisprList *all) */ static struct crisprList *readKmers(char *fileIn) /* read in kmer list from 'fileIn', return list structure */ { errAbort("# XXX readKmers function not implemented\n"); verbose(1, "# reading guides from: %s\n", fileIn); struct crisprList *listReturn = NULL; struct lineFile *lf = lineFileOpen(fileIn, TRUE); char *row[10]; @@ -1186,50 +1211,50 @@ return listReturn; } // static struct crisprList *readKmers(char *fileIn) static void writeGuides(struct crisprList *all, char *fileOut) /* write guide list 'all' to 'fileOut' */ { FILE *fh = mustOpen(fileOut, "w"); struct crisprList *list; long long crisprsWritten = 0; char kmerString[33]; char pamString[33]; long startTime = clock1000(); -slReverse(&all); +/* slReverse(&all); * can not do this here, destroy's caller's list, + * caller's value of all doesn't change */ + for (list = all; list; list = list->next) { fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size); struct crispr *c; -// slReverse(&list->chromCrisprs); for (c = list->chromCrisprs; c; c = c->next) { kmerValToString(kmerString, c->sequence, pamSize); kmerPAMString(pamString, c->sequence); fprintf(fh, "%s\t%s\t%lld\t%c\n", kmerString, pamString, c->start, negativeStrand & c->sequence ? '-' : '+'); ++crisprsWritten; } } carefulClose(&fh); timingMessage("writeGuides", crisprsWritten, "guides written", startTime, "guides/sec", "seconds/guide"); - } // static void writeGuides(struct crisprList *all, char *fileOut) static void *threadFunction(void *id) /* thread entry */ { struct threadControl *tId = (struct threadControl *)id; queryVsTarget(tId->query, tId->target, tId->threadCount, tId->threadId); return NULL; } static void runThreads(int threadCount, struct crisprList *query, struct crisprList *target) { @@ -1298,39 +1323,39 @@ if (queryGuides) copyToArray(queryGuides); if (allGuides) copyToArray(allGuides); vmPeak = currentVmPeak(); verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak); /* larger example: 62646196 kB */ if (threads > 1) { int gB = vmPeak >> 20; /* convert kB to Gb */ threadCount = threads; verbose(1, "# at %d Gb (%lld kB), running %d threads\n", gB, vmPeak, threadCount); } if (queryGuides) // when range selected some query sequences { + /* the query vs. itself avoiding self vs. self and marking dups */ + queryVsSelf(queryGuides); if (allGuides) // if there are any left on the all list { if (threadCount > 1) runThreads(threadCount, queryGuides, allGuides); else - queryVsTarget(queryGuides, allGuides, 1, 0); + queryVsTarget(queryGuides, allGuides, 0, 0); } - /* then run up the query vs. itself avoiding self vs. self */ - queryVsSelf(queryGuides); countsOutput(queryGuides, bedFH); } else { queryVsSelf(allGuides); /* run up all vs. all avoiding self vs. self */ countsOutput(allGuides, bedFH); } carefulClose(&offFile); } } // static void crisprKmers(char *sequence, FILE *bedFH) int main(int argc, char *argv[]) /* Process command line, initialize translation arrays and call the process */ {