3f6442f9cbf277adf44d00d78a5aaf53e5b0e4c7 hiram Thu Jul 20 12:47:20 2017 -0700 failed experiment to utilize CPU caching, does not help, just for the record checked in, refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 7460eac..0f6a72c 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -60,58 +60,66 @@ #include "sqlNum.h" #include "binRange.h" #include "obscure.h" #include "memalloc.h" void usage() /* Explain usage and exit. */ { errAbort( "crisprKmers - annotate crispr guide sequences\n" "usage:\n" " crisprKmers \n" "options:\n" " where is a .2bit file or .fa fasta sequence\n" " -verbose=N - for debugging control processing steps with level of verbose:\n" + " - verbose < 2 - scan sequence only, no comparisons\n" + " - verbose > 1 - run comparisons\n" " -bed= - output results to given bed9+ file\n" " -offTargets= - output off target data to given file\n" " -ranges= - use specified bed3 file to limit which guides are\n" " - measured, only those with any overlap to these bed items.\n" " -memLimit=N - N number of gigabytes for each thread, default: no threading\n" " when memory size goes beyond N gB program will thread gB/N times\n" + " -cacheCount=N - number of guides to check at one time vs. targets,\n" + " - to take advantage of CPU cache, e.g. 1Mb of cache\n" + " - could hold 16,384 64bit words.\n" " -dumpKmers= - NOT VALID after scan of sequence, output kmers to file\n" " - process will exit after this, use -loadKmers to continue\n" " -loadKmers= - NOT VALID load kmers from previous scan of sequence from -dumpKmers" ); } static char *bedFileOut = NULL; /* set with -bed= argument, kmer output */ static char *offTargets = NULL; /* write offTargets to */ static FILE *offFile = NULL; /* file handle to write off targets */ static char *ranges = NULL; /* use ranges to limit scanning */ static struct hash *rangesHash = NULL; /* ranges into hash + binkeeper */ static char *dumpKmers = NULL; /* file name to write out kmers from scan */ static char *loadKmers = NULL; /* file name to read in kmers from previous scan */ static int memLimit = 0; /* gB limit before going into thread mode */ +static int cacheCount = 0; /* number of query guides to run through + * for each target to utilize CPU cache */ static int threadCount = 1; /* will be adjusted depending upon vmPeak */ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"offTargets", OPTION_STRING}, {"ranges", OPTION_STRING}, {"memLimit", OPTION_INT}, + {"cacheCount", OPTION_INT}, {"dumpKmers", OPTION_STRING}, {"loadKmers", OPTION_STRING}, {NULL, 0}, }; #define guideSize 20 // 20 bases #define pamSize 3 // 3 bases #define negativeStrand 0x0000800000000000 #define fortyEightBits 0xffffffffffff #define fortySixBits 0x3fffffffffff #define fortyBits 0xffffffffff #define high32bits 0xffffffff00000000 #define low32bits 0x00000000ffffffff #define high16bits 0xffff0000ffff0000 @@ -199,41 +207,43 @@ int i; for (i=0; i 0) && (count > 0)) +if ((elapsedMs > 0) && (count > 0)) { - perSecond = 1000.0 * count / ms; + perSecond = 1000.0 * count / elapsedMs; inverse = 1.0 / perSecond; } -verbose(1, "# %s: %lld %s @ %ld ms\n#\t%.2f %s == %g %s\n", prefix, count, message, ms, perSecond, units, inverse, invUnits); +verbose(1, "# %s: %lld %s @ %ld ms\n#\t%.2f %s == %g %s\n", prefix, count, + message, elapsedMs, perSecond, units, inverse, invUnits); } static void setLoopEnds(struct loopControl *control, long long listSize, int partCount, int partNumber) /* for multiple thread processing, given a list of listSize, # a partCount and a partNumber * return listStart, listEnd indexes in control structure */ { long long partSize = 1 + listSize / partCount; long long listStart = partSize * partNumber; long long listEnd = partSize * (partNumber + 1); if (listEnd > listSize) listEnd = listSize; control->listStart = listStart; @@ -339,78 +349,79 @@ * masking does the reversing. */ register long long v = (val ^ fortySixBits) << 18; v = ((v & high32bits) >> 32) | ((v & low32bits) << 32); v = ((v & high16bits) >> 16) | ((v & low16bits) << 16); v = ((v & high8bits) >> 8) | ((v & low8bits) << 8); v = ((v & high4bits) >> 4) | ((v & low4bits) << 4); v = ((v & high2bits) >> 2) | ((v & low2bits) << 2); return v; } // static long long revComp(long long val) static void copyToArray(struct crisprList *list) /* copy the crispr list data into arrays */ { long startTime = clock1000(); -struct crisprList *cl = NULL; +struct crisprList *cl; long long itemsCopied = 0; for (cl = list; cl; cl = cl->next) { size_t memSize = cl->crisprCount * sizeof(long long); cl->sequence = (long long *)needLargeMem(memSize); cl->start = (long long *)needLargeMem(memSize); memSize = 5 * sizeof(int *); cl->offBy = (int **)needLargeMem(memSize); memSize = cl->crisprCount * sizeof(int); - int r = 0; + int r; for (r = 0; r < 5; ++r) cl->offBy[r] = (int *)needLargeZeroedMem(memSize); memSize = cl->crisprCount * sizeof(float); cl->mitSum = (float *)needLargeMem(memSize); long long i = 0; - struct crispr *c = NULL; + struct crispr *c; for (c = cl->chromCrisprs; c; c = c->next) { ++itemsCopied; cl->sequence[i] = c->sequence; cl->start[i] = c->start; cl->mitSum[i] = 0.0; ++i; } } -long elapsedMs = clock1000() - startTime; -timingMessage("copyToArray", itemsCopied, "items copied", elapsedMs, "items/sec", "seconds/item"); + +timingMessage("copyToArray", itemsCopied, "items copied", startTime, + "items/sec", "seconds/item"); + } // static void copyToArray(struct crisprList *list) */ static struct crisprList *generateKmers(struct dnaSeq *seq) { struct crispr *crisprSet = NULL; struct crisprList *returnList = NULL; AllocVar(returnList); returnList->chrom = cloneString(seq->name); returnList->size = seq->size; int i; DNA *dna; long long chromPosition = 0; long long startGap = 0; long long gapCount = 0; int kmerLength = 0; long long kmerVal = 0; long long reverseMask = (long long)0xf << 42; -verbose(4, "# 46 bits: %032llx\n", (long long) fortySixBits); dna=seq->dna; for (i = 0; i < seq->size; ++i) { int val = orderedNtVal[(int)dna[i]]; if (val >= 0) { kmerVal = fortySixBits & ((kmerVal << 2) | val); ++kmerLength; if (kmerLength > 22) { if ((endsAG == (kmerVal & 0xf)) || (endsGG == (kmerVal & 0xf))) { /* have match for positive strand */ struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); @@ -443,177 +454,173 @@ if (startGap != chromPosition) verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+"); else verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+"); kmerLength = 0; /* reset, start over */ kmerVal = 0; } // else if (val >= 0) ++chromPosition; } // for (i = 0; i < seq->size; ++i) // slReverse(&crisprSet); // save time, order not important at this time returnList->chromCrisprs = crisprSet; returnList->crisprCount = slCount(crisprSet); return returnList; } // static struct crisprList *generateKmers(struct dnaSeq *seq) -static char *itemColor(int **offBy, long long index) -/* temporary itemColor, post processing will do this correctly */ -{ -static char *notUnique = "150,150,150"; -// static char *notUniqueAlt = "150,150,150"; -static char *twoMany = "120,120,120"; -// static char *twoManyAlt = "120,120,120"; -static char *highScore = "0,255,0"; // > 70 -static char *mediumScore = "128,128,0"; // > 50 -static char *lowScore = "255,0,0"; // >= 50 -// static char *lowestScore = "80,80,80"; // < 50 -// static char *lowestScoreAlt = "80,80,80"; // < 50 -if (offBy[0][index]) - return notUnique; -else - { - int offSum = offBy[1][index] + offBy[2][index] + offBy[3][index] + - offBy[4][index]; - if (offSum < 100) - return highScore; - else if (offSum < 150) - return mediumScore; - else if (offSum < 250) - return lowScore; +// for future reference, note the Cpf1 system: +// https://benchling.com/pub/cpf1 + +static char *scoreToColor(long mitScore) +/* following scheme from Max's python scripts */ +{ +static char *green = "50,205,50"; /* #32cd32 */ +static char *yellow = "255,255,0"; /* #ffff00 */ +static char *black = "0,0,0"; +static char *red = "170,1,20"; /* #aa0114 */ +if (mitScore > 50) + return green; +else if (mitScore > 20) + return yellow; +else if (mitScore == -1) + return black; else - return twoMany; - } + return red; } static void countsOutput(struct crisprList *all, FILE *bedFH) /* everything has been scanned and counted, print out all the data from arrays*/ { long startTime = clock1000(); long long itemsOutput = 0; struct crisprList *list; long long totalOut = 0; for (list = all; list; list = list->next) { - long long c = 0; + long long c; for (c = 0; c < list->crisprCount; ++c) { ++itemsOutput; int negativeOffset = 0; char strand = '+'; if (negativeStrand & list->sequence[c]) { strand = '-'; negativeOffset = pamSize; } long long txStart = list->start[c] + negativeOffset; long long txEnd = txStart + guideSize; int mitScoreCount = + list->offBy[1][c] + list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c]; - char *color = itemColor(list->offBy, c); char kmerString[33]; kmerValToString(kmerString, list->sequence[c], pamSize); char pamString[33]; kmerPAMString(pamString, list->sequence[c]); if (bedFH) { - long mitScore = 0; + long mitScore = 1.0; if (mitScoreCount > 0) /* note: interger <- float conversion */ + { mitScore = roundf((list->mitSum[c] / (float) mitScoreCount)); - fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%ld\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, kmerString, pamString, mitScore, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); + } + else if (list->offBy[0][c] > 0) + { + mitScore = 0.0; + } + char *color = scoreToColor(mitScore); + fprintf(bedFH, "%s\t%lld\t%lld\t\t%ld\t%c\t%lld\t%lld\t%s\t%s\t%s\t%ld\t%d,%d,%d,%d,%d\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, mitScore, strand, txStart, txEnd, color, kmerString, pamString, mitScore, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); } ++totalOut; } } if (bedFH) carefulClose(&bedFH); -long elapsedMs = clock1000() - startTime; -timingMessage("countsOutput:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item"); +timingMessage("countsOutput:", itemsOutput, "items output", startTime, + "items/sec", "seconds/item"); + } // static void countsOutput(struct crisprList *all, FILE *bedFH) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of crisprs */ { verbose(1, "#\tscanning sequence file: %s\n", inFile); dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; -long elapsedMs = 0; -long scanStart = clock1000(); +long startTime = clock1000(); long long totalCrisprs = 0; /* scanning all sequences, setting up crisprs on the listReturn */ while ((seq = dnaLoadNext(dl)) != NULL) { if (startsWithNoCase("chrUn", seq->name) || rStringIn("hap", seq->name) || rStringIn("_alt", seq->name) ) { verbose(4, "# skip chrom: %s\n", seq->name); continue; } long startTime = clock1000(); struct crisprList *oneList = generateKmers(seq); slAddHead(&listReturn, oneList); totalCrisprs += oneList->crisprCount; - elapsedMs = clock1000() - startTime; if (verboseLevel() > 3) - timingMessage(seq->name, oneList->crisprCount, "crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); + timingMessage(seq->name, oneList->crisprCount, "crisprs", startTime, + "crisprs/sec", "seconds/crispr"); } -elapsedMs = clock1000() - scanStart; -timingMessage("scanSequence", totalCrisprs, "total crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); +timingMessage("scanSequence", totalCrisprs, "total crisprs", startTime, + "crisprs/sec", "seconds/crispr"); return listReturn; } /* static crisprList *scanSequence(char *inFile) */ static struct crisprList *rangeExtraction(struct crisprList **allReference) /* given ranges in global rangesHash, construct new list of crisprs that * have any type of overlap with the ranges, also extract those items from * the all list. Returns new list. */ { struct crisprList *all = *allReference; struct crisprList *listReturn = NULL; -struct crisprList *list = NULL; +struct crisprList *list; int inputChromCount = slCount(all); long long returnListCrisprCount = 0; long long examinedCrisprCount = 0; struct crisprList *prevChromList = NULL; -long elapsedMs = 0; -long scanStart = clock1000(); +long startTime = clock1000(); struct crisprList *nextList = NULL; for (list = all; list; list = nextList) { nextList = list->next; // remember before perhaps lost long long beforeCrisprCount = list->crisprCount; examinedCrisprCount += list->crisprCount; - struct crispr *c = NULL; struct binKeeper *bk = hashFindVal(rangesHash, list->chrom); struct crispr *newCrispr = NULL; if (bk != NULL) { struct crispr *prevCrispr = NULL; struct crispr *next = NULL; + struct crispr *c; for (c = list->chromCrisprs; c; c = next) { struct binElement *hitList = NULL; next = c->next; // remember before perhaps lost // select any guide that is at least half contained in any range int midPoint = c->start + ((pamSize + guideSize) >> 1); // if (negativeStrand & c->sequence) // start += 2; // int end = midPoint + 1; hitList = binKeeperFind(bk, midPoint, midPoint + 1); if (hitList) { if (prevCrispr) // remove this one from the 'all' list prevCrispr->next = next; else @@ -642,37 +649,41 @@ prevChromList->next = nextList; else all = nextList; // removing the first one } else { list->crisprCount = slCount(list->chromCrisprs); long long removedCrisprs = beforeCrisprCount - list->crisprCount; verbose(1, "# range scan chrom %s had %lld crisprs, removed %lld leaving %lld target\n", list->chrom, beforeCrisprCount, removedCrisprs, list->crisprCount); } } prevChromList = list; } // for (list = all; list; list = list->next) -elapsedMs = clock1000() - scanStart; verbose(1, "# range scanning %d chroms, return %d selected chroms, leaving %d chroms\n", inputChromCount, slCount(listReturn), slCount(all)); + long long targetCrisprCount = examinedCrisprCount - returnListCrisprCount; -timingMessage("range scan", examinedCrisprCount, "examined crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); -timingMessage("range scan", targetCrisprCount, "remaining target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); -timingMessage("range scan", returnListCrisprCount, "returned query crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); +timingMessage("range scan", examinedCrisprCount, "examined crisprs", + startTime, "crisprs/sec", "seconds/crispr"); +timingMessage("range scan", targetCrisprCount, "remaining target crisprs", + startTime, "crisprs/sec", "seconds/crispr"); +timingMessage("range scan", returnListCrisprCount, "returned query crisprs", + startTime, "crisprs/sec", "seconds/crispr"); + if (NULL == all) { allReference = NULL; // they have all been removed verbose(1, "# range scan, every single chrom has been removed\n"); } else if (*allReference != all) { verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all); *allReference = all; } return listReturn; } // static crisprList *rangeExtraction(crisprList *all) static float hitScoreM[20] = { @@ -684,31 +695,31 @@ static float calcHitScore(long long sequence1, long long sequence2) /* calcHitScore - from Max's crispor.py script and paper: * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { float score1 = 1.0; int mmCount = 0; int lastMmPos = -1; /* the XOR determines differences in two sequences, the shift * right 6 removes the PAM sequence and the 'fortyBits &' eliminates * the negativeStrand bit */ long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); int distCount = 0; int distSum = 0; -int pos = 0; +int pos; for (pos = 0; pos < guideSize; ++pos) { int diff = misMatch & 0x3; if (diff) { ++mmCount; if (lastMmPos != -1) { distSum += pos-lastMmPos; ++distCount; } score1 *= 1 - hitScoreM[pos]; lastMmPos = pos; } misMatch >>= 2; @@ -911,188 +922,205 @@ ++i; twoBitMask >>= 2; } stringReturn[i] = 0; } // static void misMatchString(char *returnString, long long misMatch) static void recordOffTargets(struct crisprList *query, struct crisprList *target, int bitsOn, long long qIndex, long long tIndex, long long twoBitMisMatch) /* bitsOn is from 1 to 4, record this match when less than 1000 total */ { float mitScore = calcHitScore(query->sequence[qIndex], target->sequence[tIndex]); float cfdScore = calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]); +/* note: interger cfdInt <- cfdScore float conversion */ +int cfdInt = round(cfdScore * 1000); /* Note from Max's script: * this is a heuristic based on the guideSeq data where alternative * PAMs represent only ~10% of all cleaveage events. * We divide the MIT score by 5 to make sure that these off-targets * are not ranked among the top but still appear in the list somewhat */ if ( (endsGG == (query->sequence[qIndex] & 0xf)) && (endsGG != (target->sequence[tIndex] & 0xf) ) ) mitScore *= 0.2; query->mitSum[qIndex] += mitScore; if (query->offBy[0][qIndex] ) // no need to accumulate if 0 mismatch > 0 return; if (offFile) { char queryString[33]; /* local storage for string */ char targetString[33]; /* local storage for string */ char queryPAM[33]; /* local storage for string */ char targetPAM[33]; /* local storage for string */ char misMatch[33]; /* ...*.....*.*... ascii string represent misMatch */ - int i = 0; - int bitsOnSum = 0; + int i; + int offTargetCount = 0; for (i = 1; i < 5; ++i) - bitsOnSum += query->offBy[i][qIndex]; + offTargetCount += query->offBy[i][qIndex]; - if (bitsOnSum < 1000) // could be command line option limit + if (offTargetCount < 1000) // could be command line option limit { misMatchString(misMatch, twoBitMisMatch); kmerValToString(queryString, query->sequence[qIndex], pamSize); kmerValToString(targetString, target->sequence[tIndex], pamSize); kmerPAMString(queryPAM, query->sequence[qIndex]); kmerPAMString(targetPAM, target->sequence[tIndex]); - fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f %.8f\t%s:%lld %c\n", + fprintf(offFile, "%s:%lld %c %s %s %s %s %s;%lld%c;%d %s %d %.8f %.8f %s:%lld %c\n", query->chrom, query->start[qIndex], negativeStrand & query->sequence[qIndex] ? '-' : '+', queryString, queryPAM, targetString, targetPAM, + target->chrom, target->start[tIndex], + negativeStrand & target->sequence[tIndex] ? '-' : '+', cfdInt, misMatch, bitsOn, mitScore, cfdScore, target->chrom, target->start[tIndex], negativeStrand & target->sequence[tIndex] ? '-' : '+'); } } } // static void recordOffTargets(struct crisprList *query, // struct crisprList *target, int bitsOn, long long qIndex, // long long tIndex) /* this queryVsTarget can be used by threads, it should be safe, * remains to be proven. */ static void queryVsTarget(struct crisprList *query, struct crisprList *target, int threadCount, int threadId) /* run the query crisprs list against the target list in the array structures */ { -struct crisprList *qList = NULL; +struct crisprList *qList; long long totalCrisprsQuery = 0; long long totalCrisprsTarget = 0; long long totalCompares = 0; struct loopControl *control = NULL; AllocVar(control); -long processStart = clock1000(); -long elapsedMs = 0; +long startTime = clock1000(); for (qList = query; qList; qList = qList->next) { - long long qCount = 0; totalCrisprsQuery += qList->crisprCount; if (threadCount > 1) { setLoopEnds(control, qList->crisprCount, threadCount, threadId); if (control->listStart >= qList->crisprCount) continue; /* next chrom, no part to be done for this thread */ } else { control->listStart = 0; control->listEnd = qList->crisprCount; } verbose(1, "# thread %d of %d running %s %lld items [ %lld : %lld )\n", 1 + threadId, threadCount, qList->chrom, control->listEnd - control->listStart, control->listStart, control->listEnd); totalCrisprsTarget += control->listEnd - control->listStart; - for (qCount = control->listStart; qCount < control->listEnd; ++qCount) - { - struct crisprList *tList = NULL; + long long qStart = control->listStart; + long long qEnd = control->listEnd; + long long qCount; + for (qCount = control->listStart; qCount < control->listEnd; qCount = qEnd) + { + qStart = qCount; + qEnd = qStart + cacheCount; + if (cacheCount < 1) + qEnd = qStart + 1; + if (qEnd > control->listEnd) + qEnd = control->listEnd; + struct crisprList *tList; for (tList = target; tList; tList = tList->next) { - long long tCount = 0; + long long tCount; totalCompares += tList->crisprCount; for (tCount = 0; tCount < tList->crisprCount; ++tCount) { + long long q; + for (q = qStart; q < qEnd; ++q) + { /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and * the 'fortyBits &' eliminates the negativeStrand bit */ long long misMatch = fortyBits & - ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6); + ((qList->sequence[q] ^ tList->sequence[tCount]) >> 6); if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { - recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch); - qList->offBy[bitsOn][qCount] += 1; + recordOffTargets(qList, tList, bitsOn, q, tCount, misMatch); + qList->offBy[bitsOn][q] += 1; // tList->offBy[bitsOn][tCount] += 1; not needed } } else { /* no misMatch, identical crisprs */ - qList->offBy[0][qCount] += 1; + qList->offBy[0][q] += 1; // tList->offBy[0][tCount] += 1; not needed } + } // for (q = qStart; q < qEnd; ++q) } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) -verbose(1, "# done with scanning, check timing\n"); -elapsedMs = clock1000() - processStart; -timingMessage("queryVsTarget", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr"); -timingMessage("queryVsTarget", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); -timingMessage("queryVsTarget", totalCompares, "total comparisons", elapsedMs, "compares/sec", "seconds/compare"); + +timingMessage("queryVsTarget", totalCrisprsQuery, "query crisprs processed", + startTime, "crisprs/sec", "seconds/crispr"); +timingMessage("queryVsTarget", totalCrisprsTarget, "vs target crisprs", + startTime, "crisprs/sec", "seconds/crispr"); +timingMessage("queryVsTarget", totalCompares, "total comparisons", + startTime, "compares/sec", "seconds/compare"); + } /* static struct crisprList *queryVsTarget(struct crisprList *query, struct crisprList *target) */ static void queryVsSelf(struct crisprList *all) /* run this 'all' list vs. itself avoiding self to self comparisons */ { -struct crisprList *qList = NULL; +struct crisprList *qList; long long totalCrisprsQuery = 0; long long totalCrisprsCompare = 0; -long processStart = clock1000(); -long elapsedMs = 0; +long startTime = clock1000(); /* query runs through all chroms */ for (qList = all; qList; qList = qList->next) { - long long qCount = 0; + long long qCount; totalCrisprsQuery += qList->crisprCount; verbose(1, "# queryVsSelf %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); for (qCount = 0; qCount < qList->crisprCount; ++qCount) { /* target starts on same chrom as query, and at next kmer after query for this first chrom */ long long tStart = qCount+1; - struct crisprList *tList = NULL; + struct crisprList *tList; for (tList = qList; tList; tList = tList->next) { - long long tCount = tStart; + long long tCount; totalCrisprsCompare += tList->crisprCount - tStart; for (tCount = tStart; tCount < tList->crisprCount; ++tCount) { /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and * the 'fortyBits &' eliminates the negativeStrand bit */ long long misMatch = fortyBits & ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6); if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; @@ -1102,47 +1130,50 @@ recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch); qList->offBy[bitsOn][qCount] += 1; tList->offBy[bitsOn][tCount] += 1; } } else { /* no misMatch, identical crisprs */ qList->offBy[0][qCount] += 1; tList->offBy[0][tCount] += 1; } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) tStart = 0; /* following chroms run through all */ } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) -elapsedMs = clock1000() - processStart; -timingMessage("queryVsSelf", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr"); -timingMessage("queryVsSelf", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec", "seconds/compare"); + +timingMessage("queryVsSelf", totalCrisprsQuery, "crisprs processed", + startTime, "crisprs/sec", "seconds/crispr"); +timingMessage("queryVsSelf", totalCrisprsCompare, "total comparisons", + startTime, "compares/sec", "seconds/compare"); + } /* static void queryVsSelf(struct crisprList *all) */ static struct crisprList *readKmers(char *fileIn) /* read in kmer list from 'fileIn', return list structure */ { errAbort("# XXX readKmers function not implemented\n"); verbose(1, "# reading crisprs from: %s\n", fileIn); struct crisprList *listReturn = NULL; struct lineFile *lf = lineFileOpen(fileIn, TRUE); char *row[10]; int wordCount = 0; long long crisprsInput = 0; -long startMs = clock1000(); +long startTime = clock1000(); while (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row))) ) { if (3 != wordCount) errAbort("expecing three words at line %d in file %s, found: %d", lf->lineIx, fileIn, wordCount); struct crisprList *newItem = NULL; AllocVar(newItem); newItem->chrom = cloneString(row[0]); newItem->crisprCount = sqlLongLong(row[1]); newItem->size = sqlLongLong(row[2]); newItem->chromCrisprs = NULL; slAddHead(&listReturn, newItem); long long verifyCount = 0; while ( (verifyCount < newItem->crisprCount) && (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row)))) ) @@ -1153,89 +1184,90 @@ ++verifyCount; struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); oneCrispr->sequence = sqlLongLong(row[0]); oneCrispr->start = sqlLongLong(row[1]); slAddHead(&newItem->chromCrisprs, oneCrispr); } if (verifyCount != newItem->crisprCount) errAbort("expecting %lld kmer items at line %d in file %s, found: %lld", newItem->crisprCount, lf->lineIx, fileIn, verifyCount); crisprsInput += verifyCount; } lineFileClose(&lf); -long elapsedMs = clock1000() - startMs; -timingMessage("readKmers", crisprsInput, "crisprs read in", elapsedMs, "crisprs/sec", "seconds/crispr"); +timingMessage("readKmers", crisprsInput, "crisprs read in", startTime, + "crisprs/sec", "seconds/crispr"); return listReturn; } // static struct crisprList *readKmers(char *fileIn) static void writeKmers(struct crisprList *all, char *fileOut) /* write kmer list 'all' to 'fileOut' */ { errAbort("# XXX writeKmers function not implemented\n"); FILE *fh = mustOpen(fileOut, "w"); -struct crisprList *list = NULL; +struct crisprList *list; long long crisprsWritten = 0; char kmerString[33]; char pamString[33]; -long startMs = clock1000(); +long startTime = clock1000(); slReverse(&all); for (list = all; list; list = list->next) { fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size); - struct crispr *c = NULL; + struct crispr *c; slReverse(&list->chromCrisprs); for (c = list->chromCrisprs; c; c = c->next) { kmerValToString(kmerString, c->sequence, pamSize); kmerPAMString(pamString, c->sequence); fprintf(fh, "%s\t%s\t%lld\t%c\n", kmerString, pamString, c->start, negativeStrand & c->sequence ? '-' : '+'); ++crisprsWritten; } } carefulClose(&fh); -long elapsedMs = clock1000() - startMs; -timingMessage("writeKmers", crisprsWritten, "crisprs written", elapsedMs, "crisprs/sec", "seconds/crispr"); +timingMessage("writeKmers", crisprsWritten, "crisprs written", startTime, + "crisprs/sec", "seconds/crispr"); + } // static void writeKmers(struct crisprList *all, char *fileOut) static void *threadFunction(void *id) /* thread entry */ { struct threadControl *tId = (struct threadControl *)id; queryVsTarget(tId->query, tId->target, tId->threadCount, tId->threadId); return NULL; } static void runThreads(int threadCount, struct crisprList *query, struct crisprList *target) { struct threadControl *threadIds = NULL; AllocArray(threadIds, threadCount); pthread_t *threads = NULL; AllocArray(threads, threadCount); -int pt = 0; +int pt; for (pt = 0; pt < threadCount; ++pt) { struct threadControl *threadData; AllocVar(threadData); threadData->threadId = pt; threadData->threadCount = threadCount; threadData->query = query; threadData->target = target; threadIds[pt] = *threadData; int rc = pthread_create(&threads[pt], NULL, threadFunction, &threadIds[pt]); if (rc) { errAbort("Unexpected error %d from pthread_create(): %s",rc,strerror(rc)); } } @@ -1314,30 +1346,42 @@ int main(int argc, char *argv[]) /* Process command line, initialize translation arrays and call the process */ { optionInit(&argc, argv, options); if (argc != 2) usage(); verbose(0, "# running verboseLevel: %d\n", verboseLevel()); bedFileOut = optionVal("bed", bedFileOut); dumpKmers = optionVal("dumpKmers", dumpKmers); loadKmers = optionVal("loadKmers", loadKmers); offTargets = optionVal("offTargets", offTargets); memLimit = optionInt("memLimit", memLimit); +if (memLimit < 0) + errAbort("specified memLimi is less than 0: %d\n", memLimit); +if (memLimit > 0) + verbose(1, "# memLimit %d gB\n", memLimit); +cacheCount = optionInt("cacheCount", cacheCount); +if (cacheCount < 0) + errAbort("specified cacheCount is less than 0: %d\n", cacheCount); +if (cacheCount > 0) + verbose(1, "# cacheCount %d items\n", cacheCount); ranges = optionVal("ranges", ranges); if (ranges) rangesHash = readRanges(ranges); FILE *bedFH = NULL; if (bedFileOut) + { bedFH = mustOpen(bedFileOut, "w"); + verbose(1, "# output to bed file: %s\n", bedFileOut); + } initOrderedNtVal(); /* set up orderedNtVal[] */ crisprKmers(argv[1], bedFH); if (verboseLevel() > 1) printVmPeak(); return 0; }