d75dbdc035385f4122259f47091fc09c06ad16c2 hiram Thu Jul 13 11:29:41 2017 -0700 appears to work OK with threads refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 48c210a..f4f3a18 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -353,34 +353,35 @@ cl->start = (long long *)needLargeMem(memSize); memSize = 5 * sizeof(int *); cl->offBy = (int **)needLargeMem(memSize); memSize = cl->crisprCount * sizeof(int); int r = 0; for (r = 0; r < 5; ++r) cl->offBy[r] = (int *)needLargeZeroedMem(memSize); long long i = 0; struct crispr *c = NULL; for (c = cl->chromCrisprs; c; c = c->next) { ++itemsCopied; cl->sequence[i] = c->sequence; cl->start[i] = c->start; + ++i; } } long elapsedMs = clock1000() - startTime; -timingMessage("copyToArray:", itemsCopied, "items copied", elapsedMs, "items/sec", "seconds/item"); +timingMessage("copyToArray", itemsCopied, "items copied", elapsedMs, "items/sec", "seconds/item"); } // static void copyToArray(struct crisprList *list) */ static struct crisprList *generateKmers(struct dnaSeq *seq) { struct crispr *crisprSet = NULL; struct crisprList *returnList = NULL; AllocVar(returnList); returnList->chrom = cloneString(seq->name); returnList->size = seq->size; int i; DNA *dna; long long chromPosition = 0; long long startGap = 0; long long gapCount = 0; int kmerLength = 0; @@ -499,33 +500,33 @@ ++itemsOutput; int negativeOffset = 0; char strand = '+'; if (negativeStrand & list->sequence[c]) { strand = '-'; negativeOffset = 3; } long long txStart = list->start[c] + negativeOffset; long long txEnd = txStart + 20; int totalOffs = list->offBy[0][c] + list->offBy[1][c] + list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c]; char *color = itemColor(list->offBy, c); - char kmerString[32]; + char kmerString[33]; kmerValToString(kmerString, list->sequence[c], 3); - char pamString[32]; + char pamString[33]; kmerPAMString(pamString, list->sequence[c]); if (0 == totalOffs) verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString); if (bedFH) fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); if (list->offBy[0][c]) verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString); ++totalOut; } } if (bedFH) carefulClose(&bedFH); @@ -706,150 +707,186 @@ } double score2 = 1.0; if (distCount > 1) { double avgDist = (double)distSum / distCount; score2 = 1.0 / (((19-avgDist)/19.0) * 4 + 1); } double score3 = 1.0; if (mmCount > 0) score3 = 1.0 / (mmCount * mmCount); return (score1 * score2 * score3 * 100); } // static double calcHitScore(long long sequence1, long long sequence2) +#ifdef NOT +static double calcCfdScore(long long sequence1, long long sequence2) +/* calcCfdScore - from Max' crispor.py script and paper: + * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ +{ +return 0.0; +} // static double calcCfdScore(long long sequence1, long long sequence2) +#endif + +static void misMatchString(char *stringReturn, long long misMatch) +/* return ascii string ...*.....*.*.....*.. to indicate mis matches */ +{ +int i = 0; +long long bitMask = 0xc000000000; +while (bitMask) + { + stringReturn[i] = '.'; + if (bitMask & misMatch) + stringReturn[i] = '*'; + ++i; + bitMask >>= 2; + } +stringReturn[i] = 0; +} // static void misMatchString(char *returnString, long long misMatch) + static void recordOffTargets(struct crisprList *query, struct crisprList *target, int bitsOn, long long qIndex, - long long tIndex) + long long tIndex, long long twoBitMisMatch) /* bitsOn is from 1 to 4, record this match when less than 1000 total */ { -char queryString[32]; /* local storage for string */ -char targetString[32]; /* local storage for string */ -char queryPAM[32]; /* local storage for string */ -char targetPAM[32]; /* local storage for string */ +char queryString[33]; /* local storage for string */ +char targetString[33]; /* local storage for string */ +char queryPAM[33]; /* local storage for string */ +char targetPAM[33]; /* local storage for string */ +char misMatch[33]; /* ...*.....*.*... ascii string represent misMatch */ if (query->offBy[0][qIndex] ) // no need to accumulate if 0-mismatch > 0 return; if (offFile) { int i = 0; int bitsOnSum = 0; for (i = 1; i < 5; ++i) bitsOnSum += query->offBy[i][qIndex]; + misMatchString(misMatch, twoBitMisMatch); + if (bitsOnSum < 1000) // could be command line option limit { double hitScore = calcHitScore(query->sequence[qIndex], target->sequence[tIndex]); +// double cfdScore = +// calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]); kmerValToString(queryString, query->sequence[qIndex], 3); kmerValToString(targetString, target->sequence[tIndex], 3); kmerPAMString(queryPAM, query->sequence[qIndex]); kmerPAMString(targetPAM, target->sequence[tIndex]); - fprintf(offFile, "%s:%lld %c %s %s %.2f\t", query->chrom, - query->start[qIndex], + fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f\t%s:%lld %c\n", + query->chrom, query->start[qIndex], negativeStrand & query->sequence[qIndex] ? '-' : '+', - queryString, queryPAM, hitScore); - fprintf(offFile, "%s:%lld %c %s %s\t%d\n", target->chrom, + queryString, queryPAM, targetString, targetPAM, + misMatch, bitsOn, hitScore, target->chrom, target->start[tIndex], - negativeStrand & target->sequence[tIndex] ? '-' : '+', - targetString, targetPAM, bitsOn); + negativeStrand & target->sequence[tIndex] ? '-' : '+'); } } } // static void recordOffTargets(struct crisprList *query, // struct crisprList *target, int bitsOn, long long qIndex, // long long tIndex) -static void queryVsAll(struct crisprList *query, struct crisprList *target, + +/* this queryVsTarget can be used by threads, it should be safe, + * remains to be proven. + */ +static void queryVsTarget(struct crisprList *query, struct crisprList *target, int threadCount, int threadId) /* run the query crisprs list against the target list in the array structures */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; long long totalCrisprsTarget = 0; long long totalCompares = 0; struct loopControl *control = NULL; AllocVar(control); if (threadCount > 1) - verbose(1, "# thread %d of %d threads running queryVsAll()\n", - threadId, threadCount); + verbose(1, "# thread %d of %d threads running queryVsTarget()\n", + 1+threadId, threadCount); long processStart = clock1000(); long elapsedMs = 0; for (qList = query; qList; qList = qList->next) { long long qCount = 0; totalCrisprsQuery += qList->crisprCount; - verbose(1, "# queryVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); + verbose(1, "# queryVsTarget %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); if (threadCount > 1) { setLoopEnds(control, qList->crisprCount, threadCount, threadId); if (control->listStart >= qList->crisprCount) continue; /* next chrom, no part to be done for this thread */ } else { control->listStart = 0; control->listEnd = qList->crisprCount; } + verbose(1, "# thread # %d of %d running %s items [ %lld : %lld )\n", + 1 + threadId, threadCount, qList->chrom, control->listStart, + control->listEnd); totalCrisprsTarget += control->listEnd - control->listStart; for (qCount = control->listStart; qCount < control->listEnd; ++qCount) { struct crisprList *tList = NULL; for (tList = target; tList; tList = tList->next) { long long tCount = 0; totalCompares += tList->crisprCount; for (tCount = 0; tCount < tList->crisprCount; ++tCount) { /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and * the 'fortyBits &' eliminates the negativeStrand bit */ long long misMatch = fortyBits & ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6); if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { - recordOffTargets(qList, tList, bitsOn, qCount, tCount); + recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch); qList->offBy[bitsOn][qCount] += 1; // tList->offBy[bitsOn][tCount] += 1; not needed } } else { /* no misMatch, identical crisprs */ qList->offBy[0][qCount] += 1; // tList->offBy[0][tCount] += 1; not needed } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) verbose(1, "# done with scanning, check timing\n"); elapsedMs = clock1000() - processStart; -timingMessage("queryVsAll", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr"); -timingMessage("queryVsAll", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); -timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec", "seconds/compare"); -} /* static struct crisprList *queryVsAll(struct crisprList *query, +timingMessage("queryVsTarget", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr"); +timingMessage("queryVsTarget", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); +timingMessage("queryVsTarget", totalCompares, "total comparisons", elapsedMs, "compares/sec", "seconds/compare"); +} /* static struct crisprList *queryVsTarget(struct crisprList *query, struct crisprList *target) */ static void queryVsSelf(struct crisprList *all) /* run this 'all' list vs. itself avoiding self to self comparisons */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; long long totalCrisprsCompare = 0; long processStart = clock1000(); long elapsedMs = 0; /* query runs through all chroms */ for (qList = all; qList; qList = qList->next) { @@ -871,53 +908,54 @@ /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and * the 'fortyBits &' eliminates the negativeStrand bit */ long long misMatch = fortyBits & ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6); if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { - recordOffTargets(qList, tList, bitsOn, qCount, tCount); + recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch); qList->offBy[bitsOn][qCount] += 1; tList->offBy[bitsOn][tCount] += 1; } } else { /* no misMatch, identical crisprs */ qList->offBy[0][qCount] += 1; tList->offBy[0][tCount] += 1; } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) tStart = 0; /* following chroms run through all */ } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) elapsedMs = clock1000() - processStart; timingMessage("queryVsSelf", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr"); timingMessage("queryVsSelf", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec", "seconds/compare"); } /* static void queryVsSelf(struct crisprList *all) */ static struct crisprList *readKmers(char *fileIn) /* read in kmer list from 'fileIn', return list structure */ { +errAbort("# XXX readKmers function not implemented\n"); verbose(1, "# reading crisprs from: %s\n", fileIn); struct crisprList *listReturn = NULL; struct lineFile *lf = lineFileOpen(fileIn, TRUE); char *row[10]; int wordCount = 0; long long crisprsInput = 0; long startMs = clock1000(); while (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row))) ) { if (3 != wordCount) errAbort("expecing three words at line %d in file %s, found: %d", lf->lineIx, fileIn, wordCount); struct crisprList *newItem = NULL; @@ -945,82 +983,66 @@ newItem->crisprCount, lf->lineIx, fileIn, verifyCount); crisprsInput += verifyCount; } lineFileClose(&lf); long elapsedMs = clock1000() - startMs; timingMessage("readKmers", crisprsInput, "crisprs read in", elapsedMs, "crisprs/sec", "seconds/crispr"); return listReturn; } // static struct crisprList *readKmers(char *fileIn) static void writeKmers(struct crisprList *all, char *fileOut) /* write kmer list 'all' to 'fileOut' */ { +errAbort("# XXX writeKmers function not implemented\n"); FILE *fh = mustOpen(fileOut, "w"); struct crisprList *list = NULL; long long crisprsWritten = 0; +char kmerString[33]; +char pamString[33]; long startMs = clock1000(); slReverse(&all); for (list = all; list; list = list->next) { fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size); struct crispr *c = NULL; slReverse(&list->chromCrisprs); for (c = list->chromCrisprs; c; c = c->next) { - fprintf(fh, "%lld\t%lld\t%c\n", c->sequence, + kmerValToString(kmerString, c->sequence, 3); + kmerPAMString(pamString, c->sequence); + fprintf(fh, "%s\t%s\t%lld\t%c\n", kmerString, pamString, c->start, negativeStrand & c->sequence ? '-' : '+'); ++crisprsWritten; } } carefulClose(&fh); long elapsedMs = clock1000() - startMs; timingMessage("writeKmers", crisprsWritten, "crisprs written", elapsedMs, "crisprs/sec", "seconds/crispr"); } // static void writeKmers(struct crisprList *all, char *fileOut) static void *threadFunction(void *id) /* thread entry */ { struct threadControl *tId = (struct threadControl *)id; -queryVsAll(tId->query, tId->target, tId->threadCount, tId->threadId); - -#ifdef NOT -/* - * for each list to process, divide the list into tId->threadCount parts, - * then one thread runs part number tId->threadId bits of that list - */ -struct loopControl *control = NULL; -AllocVar(control); - - -long long listCount = 1; -while (listCount < 100) { -setLoopEnds(control, listCount, tId->threadCount, tId->threadId); - -if (control->listStart >= listCount) - verbose(1, "#\t%lld\t# %d of %d does not run from %lld to < %lld in list of %lld size\n", listCount, tId->threadId, tId->threadCount, control->listStart, control->listEnd, listCount); -else - verbose(1, "#\t%lld\t# %d of %d runs from %lld to < %lld in list of %lld size\n", listCount, tId->threadId, tId->threadCount, control->listStart, control->listEnd, listCount); - listCount += 13; -} -#endif +queryVsTarget(tId->query, tId->target, tId->threadCount, tId->threadId); return NULL; } static void runThreads(int threadCount, struct crisprList *query, struct crisprList *target) { struct threadControl *threadIds = NULL; AllocArray(threadIds, threadCount); pthread_t *threads = NULL; AllocArray(threads, threadCount); int pt = 0; for (pt = 0; pt < threadCount; ++pt) @@ -1036,31 +1058,30 @@ if (rc) { errAbort("Unexpected error %d from pthread_create(): %s",rc,strerror(rc)); } } /* Wait for threads to finish */ for (pt = 0; pt < threadCount; ++pt) pthread_join(threads[pt], NULL); } static void crisprKmers(char *sequence) /* crisprKmers - find and annotate crispr sequences. */ { struct crisprList *queryGuides = NULL; -// struct crisprList *countedCrisprs = NULL; struct crisprList *allGuides = NULL; if (loadKmers) allGuides = readKmers(loadKmers); else allGuides = scanSequence(sequence); if (dumpKmers) { writeKmers(allGuides, dumpKmers); return; } long long vmPeak = currentVmPeak(); verbose(1, "# vmPeak after scanSequence: %lld kB\n", vmPeak); @@ -1081,34 +1102,34 @@ copyToArray(queryGuides); if (allGuides) copyToArray(allGuides); vmPeak = currentVmPeak(); verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak); /* larger example: 62646196 kB */ if ((vmPeak >> 20) > memLimit) // the >> 20 converts kB to gB { threadCount = 1 + ((vmPeak >> 20) / memLimit); verbose(1, "# over %d Gb at %lld kB, threadCount: %d\n", memLimit, vmPeak, threadCount); } if (queryGuides) // when range selected some query sequences { if (allGuides) // if there are any left on the all list { - if (0 == 1 && threadCount > 1) + if (threadCount > 1) runThreads(threadCount, queryGuides, allGuides); else - queryVsAll(queryGuides, allGuides, 1, 0); + queryVsTarget(queryGuides, allGuides, 1, 0); } /* then run up the query vs. itself avoiding self vs. self */ queryVsSelf(queryGuides); countsOutput(queryGuides); } else { queryVsSelf(allGuides); /* run up all vs. all avoiding self vs. self */ countsOutput(allGuides); } carefulClose(&offFile); } } // static void crisprKmers(char *sequence)