feb770798e0b4ad51ec5b7fc1a82fd1363a66ba1 hiram Wed Jul 12 11:51:53 2017 -0700 adding threading and eliminate strand char storage, now does not work correctly refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index beed0c9..48c210a 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -1,21 +1,65 @@ /* crisprKmers - find and annotate crispr sequences. */ /* Copyright (C) 2017 The Regents of the University of California * See README in this or parent directory for licensing information. */ +/* Theory of operation: + a. scan given sequence (2bit of fa or fa.gz file) + b. record all quide sequences, both positive and negative strands, + on a linked list structure, 2bit encoding of the A C G T bases, + with PAM sequence, strand and start coordinates, one linked list + for each chromosome name. + c. if a 'ranges' bed3 file is given, then divide up the linked list + guide sequences into a 'query' list and a 'target' list. + The 'query' list of guide sequences are those that have any overlap + with the 'ranges' bed3 items. The 'target' list is an exclusive + set of all the other guide sequences. + d. Without 'ranges', the full list of sequences can be considerd as + the 'query' sequences. + e. Convert the linked list structures into memory arrays, get all + the sequence data and start coordinates into arrays. This is much + more efficient to work with the arrays than trying to run through + the linked lists. The data happens to become duplicated as it + is copied, it isn't worth the time to try to free the memory for + the data from the linked lists. + f. When working with 'ranges', there are two comparison steps: + 1. compare all the 'query' sequences with all the 'target' sequences, + recording the off-target information for the 'query' sequences + in mis-count arrays and writing the off-target information to a + given file. + 2. compare all 'query' sequences with themselves while avoiding + direct self to self comparison. Recording the same off-target + information as the 'query' vs. 'target' sequences. + g. Without 'ranges', the complete list, aka 'query' list, is compared + to itself while avoiding direct self to self comparison. + Recording same off-target information as when working with 'ranges' + h. Finish off by printing out a bed9+ file with all the data recorded + for off-counts for the 'query' sequences. + + Post-processing needs to go through the recorded off-target output, + to construct final scores for each guide sequence to produce the final + bed9+ output file. + + It doesn't save time to write out all the scanned guide sequences + data to be used by a second run. The original scanning itself is + faster than reading in all that data. +*/ + + #include +#include #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnautil.h" #include "dnaseq.h" #include "dnaLoad.h" #include "portable.h" #include "basicBed.h" #include "sqlNum.h" #include "binRange.h" #include "obscure.h" #include "memalloc.h" void usage() @@ -49,77 +93,110 @@ static char *loadKmers = NULL; /* file name to read in kmers from previous scan */ static int memLimit = 8; /* gB limit before going into thread mode */ static int threadCount = 1; /* will be adjusted depending upon vmPeak */ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"offTargets", OPTION_STRING}, {"ranges", OPTION_STRING}, {"memLimit", OPTION_INT}, {"dumpKmers", OPTION_STRING}, {"loadKmers", OPTION_STRING}, {NULL, 0}, }; +#define negativeStrand 0x0000800000000000 + +#define fortyEightBits 0xffffffffffff +#define fortySixBits 0x3fffffffffff +#define fortyBits 0xffffffffff +#define high32bits 0xffffffff00000000 +#define low32bits 0x00000000ffffffff +#define high16bits 0xffff0000ffff0000 +#define low16bits 0x0000ffff0000ffff +#define high8bits 0xff00ff00ff00ff00 +#define low8bits 0x00ff00ff00ff00ff +#define high4bits 0xf0f0f0f0f0f0f0f0 +#define low4bits 0x0f0f0f0f0f0f0f0f +#define high2bits 0xcccccccccccccccc +#define low2bits 0x3333333333333333 + +#define pamMask 0x3f + +// 0x0000 0000 0000 0000 +// 6348 4732 3116 15-0 + +// sequence word structure: +// bits 5-0 - PAM sequence in 2bit encoding for 3 bases +// bits 45-6 - 20 base sequence in 2bit encoding format +// bit 47 - negative strand indication +// considering using other bits during processing to mark +// an item for no more consideration + // sizeof(struct crispr): 32 struct crispr /* one chromosome set of crisprs */ { struct crispr *next; /* Next in list. */ long long sequence; /* sequence value in 2bit format */ long long start; /* chromosome start 0-relative */ - char strand; /* strand: + or - */ }; // sizeof(struct crisprList): 72 struct crisprList /* all chromosome sets of crisprs */ { struct crisprList *next; /* Next in list. */ char *chrom; /* chrom name */ int size; /* chrom size */ struct crispr *chromCrisprs; /* all the crisprs on this chrom */ long long crisprCount; /* number of crisprs on this chrom */ long long *sequence; /* array of the sequences */ long long *start; /* array of the starts */ - char *strand; /* array of the strand characters */ int **offBy; /* offBy[5][n] */ }; +struct threadControl +/* data passed to thread to control execution */ + { + int threadId; /* this thread Id */ + int threadCount; /* total threads running */ + struct crisprList *query; /* running query guides against */ + struct crisprList *target; /* target guides */ + }; + +struct loopControl +/* calculate index for start to < end processing for multiple threads */ + { + long long listStart; + long long listEnd; + }; + +/* base values here are different than standard '2bit' format + * these are in order numerically and alphabetically + * plus they complement to their complement with an XOR with 0x3 + */ #define A_BASE 0 #define C_BASE 1 #define G_BASE 2 #define T_BASE 3 #define U_BASE 3 static int orderedNtVal[256]; /* values in alpha order: ACGT 00 01 10 11 */ /* for easier sorting and complementing */ static char bases[4]; /* for binary to ascii conversion */ -#define fortySixBits 0x3fffffffffff -#define fortyEixhtBits 0xffffffffffff -#define high32bits 0xffffffff00000000 -#define low32bits 0x00000000ffffffff -#define high16bits 0xffff0000ffff0000 -#define low16bits 0x0000ffff0000ffff -#define high8bits 0xff00ff00ff00ff00 -#define low8bits 0x00ff00ff00ff00ff -#define high4bits 0xf0f0f0f0f0f0f0f0 -#define low4bits 0x0f0f0f0f0f0f0f0f -#define high2bits 0xcccccccccccccccc -#define low2bits 0x3333333333333333 - static void initOrderedNtVal() /* initialization of base value lookup arrays */ { int i; for (i=0; i 0) && (count > 0)) { perSecond = 1000.0 * count / ms; inverse = 1.0 / perSecond; } verbose(1, "# %s: %lld %s @ %ld ms -> %.2f %s == %g %s\n", prefix, count, message, ms, perSecond, units, inverse, invUnits); } +static void setLoopEnds(struct loopControl *control, long long listSize, + int partCount, int partNumber) +/* for multiple thread processing, given a list of listSize, + # a partCount and a partNumber + * return listStart, listEnd indexes in control structure + */ +{ +long long partSize = 1 + listSize / partCount; +long long listStart = partSize * partNumber; +long long listEnd = partSize * (partNumber + 1); +if (listEnd > listSize) + listEnd = listSize; +control->listStart = listStart; +control->listEnd = listEnd; +} + static struct hash *readRanges(char *bedFile) /* Read bed and return it as a hash keyed by chromName * with binKeeper values. (from: src/hg/bedIntersect/bedIntersec.c) */ { struct hash *hash = newHash(0); /* key is chromName, value is binkeeper data */ struct lineFile *lf = lineFileOpen(bedFile, TRUE); char *row[3]; verbose(1, "# using ranges file: %s\n", bedFile); while (lineFileNextRow(lf, row, 3)) { struct binKeeper *bk; struct bed3 *item; struct hashEl *hel = hashLookup(hash, row[0]); @@ -176,121 +269,114 @@ /* Compare slPairs on start value */ { const struct crispr *a = *((struct crispr **)va); const struct crispr *b = *((struct crispr **)vb); long long aVal = a->start; long long bVal = b->start; if (aVal < bVal) return -1; else if (aVal > bVal) return 1; else return 0; } #endif -/* these two kmerVal to strings routines could be reduced to one and they - * could use lookup tables to run faster. This is *definately* not reentrant ! +/* the kmerPAMString and kmerValToString functions could be reduced to + * one single function, and a lookup table might make this faster. */ -static char *kmerPAMString(long long val) +static void kmerPAMString(char *stringReturn, long long val) /* return the ASCII string for last three bases in then binary sequence value */ { -static char pamString[32]; long long twoBitMask = 0x30; int shiftCount = 4; int baseCount = 0; while (twoBitMask) { int base = (val & twoBitMask) >> shiftCount; - pamString[baseCount++] = bases[base]; + stringReturn[baseCount++] = bases[base]; twoBitMask >>= 2; shiftCount -= 2; } -pamString[baseCount] = 0; -return pamString; +stringReturn[baseCount] = 0; } // static char *kmerPAMString(struct crispr *c, int trim) -/* beware, this needs to be used immediately upon return since it is - * returning its static answer - */ -static char *kmerValToString(long long val, int trim) +static void kmerValToString(char *stringReturn, long long val, int trim) /* return ASCII string for binary sequence value */ { -static char kmerString[32]; long long twoBitMask = 0x300000000000; int shiftCount = 44; int baseCount = 0; while (twoBitMask && (shiftCount >= (2*trim))) { int base = (val & twoBitMask) >> shiftCount; - kmerString[baseCount++] = bases[base]; + stringReturn[baseCount++] = bases[base]; twoBitMask >>= 2; shiftCount -= 2; } -kmerString[baseCount] = 0; -return kmerString; -} // static char *kmerValToString(long long val, int trim) +stringReturn[baseCount] = 0; +} // static void kmerValToString(char *stringReturn, long long val, int trim) -static long long revComp(long long val) +/* this revComp is only used in one place, perfectly fine as an inline + * function, plus, the val does *not* have the negativeStrand bit set yet + */ +static inline long long revComp(long long val) /* reverse complement the 2-bit numerical value kmer */ { /* complement bases and add 18 0 bits * because this divide and conquer works on 64 bits, not 46, * hence the addition of the 18 zero bits which fall out * The 'val ^ fortySixBits' does the complement, the shifting and * masking does the reversing. */ register long long v = (val ^ fortySixBits) << 18; v = ((v & high32bits) >> 32) | ((v & low32bits) << 32); v = ((v & high16bits) >> 16) | ((v & low16bits) << 16); v = ((v & high8bits) >> 8) | ((v & low8bits) << 8); v = ((v & high4bits) >> 4) | ((v & low4bits) << 4); v = ((v & high2bits) >> 2) | ((v & low2bits) << 2); return v; } // static long long revComp(long long val) static void copyToArray(struct crisprList *list) /* copy the crispr list data into arrays */ { long startTime = clock1000(); struct crisprList *cl = NULL; long long itemsCopied = 0; for (cl = list; cl; cl = cl->next) { size_t memSize = cl->crisprCount * sizeof(long long); cl->sequence = (long long *)needLargeMem(memSize); cl->start = (long long *)needLargeMem(memSize); - memSize = cl->crisprCount * sizeof(char); - cl->strand = (char *)needLargeMem(memSize); memSize = 5 * sizeof(int *); cl->offBy = (int **)needLargeMem(memSize); memSize = cl->crisprCount * sizeof(int); int r = 0; for (r = 0; r < 5; ++r) cl->offBy[r] = (int *)needLargeZeroedMem(memSize); long long i = 0; struct crispr *c = NULL; for (c = cl->chromCrisprs; c; c = c->next) { ++itemsCopied; cl->sequence[i] = c->sequence; cl->start[i] = c->start; - cl->strand[i++] = c->strand; } } long elapsedMs = clock1000() - startTime; timingMessage("copyToArray:", itemsCopied, "items copied", elapsedMs, "items/sec", "seconds/item"); } // static void copyToArray(struct crisprList *list) */ static struct crisprList *generateKmers(struct dnaSeq *seq) { struct crispr *crisprSet = NULL; struct crisprList *returnList = NULL; AllocVar(returnList); returnList->chrom = cloneString(seq->name); returnList->size = seq->size; int i; DNA *dna; @@ -313,71 +399,71 @@ dna=seq->dna; for (i=0; i < seq->size; ++i) { int val = orderedNtVal[(int)dna[i]]; if (val >= 0) { kmerVal = fortySixBits & ((kmerVal << 2) | val); ++kmerLength; if (kmerLength > 22) { if ((endsAG == (kmerVal & 0xf)) || (endsGG == (kmerVal & 0xf))) { /* have match for positive strand */ struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); oneCrispr->start = chromPosition - 22; - oneCrispr->strand = '+'; oneCrispr->sequence = kmerVal; slAddHead(&crisprSet, oneCrispr); } if ((beginsCT == (kmerVal & reverseMask)) || (beginsCC == (kmerVal & reverseMask))) { /* have match for negative strand */ struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); oneCrispr->start = chromPosition - 22; - oneCrispr->strand = '-'; oneCrispr->sequence = revComp(kmerVal); + oneCrispr->sequence |= negativeStrand; slAddHead(&crisprSet, oneCrispr); } } } // if (val >= 0) else { ++gapCount; startGap = chromPosition; /* skip all N's == any value not = 0,1,2,3 */ while ( ((i+1) < seq->size) && (orderedNtVal[(int)dna[i+1]] < 0)) { ++chromPosition; ++i; } if (startGap != chromPosition) verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+"); else verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+"); kmerLength = 0; /* reset, start over */ kmerVal = 0; } // else if (val >= 0) ++chromPosition; } // for (i=0; i < seq->size; ++i) // slReverse(&crisprSet); // save time, order not important at this time returnList->chromCrisprs = crisprSet; returnList->crisprCount = slCount(crisprSet); return returnList; } // static struct crisprList *generateKmers(struct dnaSeq *seq) static char *itemColor(int **offBy, long long index) +/* temporary itemColor, post processing will do this correctly */ { static char *notUnique = "150,150,150"; // static char *notUniqueAlt = "150,150,150"; static char *twoMany = "120,120,120"; // static char *twoManyAlt = "120,120,120"; static char *highScore = "0,255,0"; // > 70 static char *mediumScore = "128,128,0"; // > 50 static char *lowScore = "255,0,0"; // >= 50 // static char *lowestScore = "80,80,80"; // < 50 // static char *lowestScoreAlt = "80,80,80"; // < 50 if (offBy[0][index]) return notUnique; else { int offSum = offBy[1][index] + offBy[2][index] + offBy[3][index] + @@ -400,48 +486,56 @@ long long itemsOutput = 0; FILE *bedFH = NULL; if (bedFileOut) bedFH = mustOpen(bedFileOut, "w"); struct crisprList *list; long long totalOut = 0; for (list = all; list; list = list->next) { long long c = 0; for (c = 0; c < list->crisprCount; ++c) { ++itemsOutput; int negativeOffset = 0; - if (list->strand[c] == '-') + char strand = '+'; + if (negativeStrand & list->sequence[c]) + { + strand = '-'; negativeOffset = 3; + } long long txStart = list->start[c] + negativeOffset; long long txEnd = txStart + 20; int totalOffs = list->offBy[0][c] + list->offBy[1][c] + list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c]; char *color = itemColor(list->offBy, c); + char kmerString[32]; + kmerValToString(kmerString, list->sequence[c], 3); + char pamString[32]; + kmerPAMString(pamString, list->sequence[c]); if (0 == totalOffs) -verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3)); +verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString); if (bedFH) - fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], list->strand[c], txStart, txEnd, color, color, color, kmerValToString(list->sequence[c], 3), kmerPAMString(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); + fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); if (list->offBy[0][c]) - verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3)); + verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString); ++totalOut; } } if (bedFH) carefulClose(&bedFH); long elapsedMs = clock1000() - startTime; timingMessage("copyToArray:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item"); } // static void countsOutput(struct crisprList *all) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of crisprs */ { verbose(1, "#\tscanning sequence file: %s\n", inFile); dnaUtilOpen(); @@ -499,31 +593,31 @@ nextList = list->next; // remember before perhaps lost long long beforeCrisprCount = list->crisprCount; examinedCrisprCount += list->crisprCount; struct crispr *c = NULL; struct binKeeper *bk = hashFindVal(rangesHash, list->chrom); struct crispr *newCrispr = NULL; if (bk != NULL) { struct crispr *prevCrispr = NULL; struct crispr *next = NULL; for (c = list->chromCrisprs; c; c = next) { struct binElement *hitList = NULL; next = c->next; // remember before perhaps lost int start = c->start; - if (c->strand == '-') + if (negativeStrand & c->sequence) start += 2; int end = start + 20; hitList = binKeeperFind(bk, start, end); if (hitList) { if (prevCrispr) // remove this one from the all list prevCrispr->next = next; else list->chromCrisprs = next; // removing the first one c->next = NULL; slAddHead(&newCrispr, c); // constructing new list } else prevCrispr = c; // remains on all list slFreeList(&hitList); @@ -565,142 +659,168 @@ timingMessage("range scan", targetCrisprCount, "remaining target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); timingMessage("range scan", returnListCrisprCount, "returned query crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); if (NULL == all) { allReference = NULL; // they have all been removed verbose(1, "# range scan, every single chrom has been removed\n"); } else if (*allReference != all) { verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all); *allReference = all; } return listReturn; } // static crisprList *rangeExtraction(crisprList *all) -static int hitScoreM[20] = {0,0,0.014,0,0,0.395,0.317,0,0.389,0.079,0.445,0.508,0.613,0.851,0.732,0.828,0.615,0.804,0.685,0.583}; +static double hitScoreM[20] = {0,0,0.014,0,0,0.395,0.317,0,0.389,0.079,0.445,0.508,0.613,0.851,0.732,0.828,0.615,0.804,0.685,0.583}; -static int calcHitScore(long long sequence1, long long sequence2) +static double calcHitScore(long long sequence1, long long sequence2) +/* calcHitScore - from Max' crispor.py script and paper: + * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { double score1 = 1.0; int mmCount = 0; int lastMmPos = -1; -/* the XOR will determine differences in two sequences, the shift - # right 6 removes the PAM sequence */ -long long misMatch = (sequence1 ^ sequence2) >> 6; +/* the XOR determines differences in two sequences, the shift + * right 6 removes the PAM sequence and the 'fortyBits &' eliminates + * the negativeStrand bit + */ +long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); int distCount = 0; int distSum = 0; int pos = 0; for (pos = 0; pos < 20; ++pos) { int diff = misMatch & 0x3; if (diff) { ++mmCount; if (lastMmPos != -1) { distSum += pos-lastMmPos; ++distCount; } score1 *= 1 - hitScoreM[pos]; lastMmPos = pos; } misMatch >>= 2; } double score2 = 1.0; if (distCount > 1) { double avgDist = (double)distSum / distCount; score2 = 1.0 / (((19-avgDist)/19.0) * 4 + 1); } double score3 = 1.0; if (mmCount > 0) score3 = 1.0 / (mmCount * mmCount); -return (int)(score1 * score2 * score3 * 100); -} +return (score1 * score2 * score3 * 100); +} // static double calcHitScore(long long sequence1, long long sequence2) static void recordOffTargets(struct crisprList *query, struct crisprList *target, int bitsOn, long long qIndex, long long tIndex) /* bitsOn is from 1 to 4, record this match when less than 1000 total */ { +char queryString[32]; /* local storage for string */ +char targetString[32]; /* local storage for string */ +char queryPAM[32]; /* local storage for string */ +char targetPAM[32]; /* local storage for string */ + if (query->offBy[0][qIndex] ) // no need to accumulate if 0-mismatch > 0 return; if (offFile) { int i = 0; int bitsOnSum = 0; for (i = 1; i < 5; ++i) bitsOnSum += query->offBy[i][qIndex]; if (bitsOnSum < 1000) // could be command line option limit - { /* needs to be two fprintfs, as the kmer*String() function - * returns are confused due to static returns */ - int hitScore = + { + double hitScore = calcHitScore(query->sequence[qIndex], target->sequence[tIndex]); - fprintf(offFile, "%s:%lld %c %s %s %d\t", query->chrom, - query->start[qIndex], query->strand[qIndex], - kmerValToString(query->sequence[qIndex], 3), - kmerPAMString(query->sequence[qIndex]), hitScore); + kmerValToString(queryString, query->sequence[qIndex], 3); + kmerValToString(targetString, target->sequence[tIndex], 3); + kmerPAMString(queryPAM, query->sequence[qIndex]); + kmerPAMString(targetPAM, target->sequence[tIndex]); + fprintf(offFile, "%s:%lld %c %s %s %.2f\t", query->chrom, + query->start[qIndex], + negativeStrand & query->sequence[qIndex] ? '-' : '+', + queryString, queryPAM, hitScore); fprintf(offFile, "%s:%lld %c %s %s\t%d\n", target->chrom, - target->start[tIndex], target->strand[tIndex], - kmerValToString(target->sequence[tIndex], 3), - kmerPAMString(target->sequence[tIndex]), bitsOn); + target->start[tIndex], + negativeStrand & target->sequence[tIndex] ? '-' : '+', + targetString, targetPAM, bitsOn); } } } // static void recordOffTargets(struct crisprList *query, // struct crisprList *target, int bitsOn, long long qIndex, // long long tIndex) -static void queryVsAll(struct crisprList *query, struct crisprList *target) +static void queryVsAll(struct crisprList *query, struct crisprList *target, + int threadCount, int threadId) /* run the query crisprs list against the target list in the array structures */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; long long totalCrisprsTarget = 0; long long totalCompares = 0; +struct loopControl *control = NULL; +AllocVar(control); +if (threadCount > 1) + verbose(1, "# thread %d of %d threads running queryVsAll()\n", + threadId, threadCount); long processStart = clock1000(); long elapsedMs = 0; for (qList = query; qList; qList = qList->next) { long long qCount = 0; totalCrisprsQuery += qList->crisprCount; verbose(1, "# queryVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); - for (qCount = 0; qCount < qList->crisprCount; ++qCount) + if (threadCount > 1) + { + setLoopEnds(control, qList->crisprCount, threadCount, threadId); + if (control->listStart >= qList->crisprCount) + continue; /* next chrom, no part to be done for this thread */ + } + else + { + control->listStart = 0; + control->listEnd = qList->crisprCount; + } + totalCrisprsTarget += control->listEnd - control->listStart; + for (qCount = control->listStart; qCount < control->listEnd; ++qCount) { struct crisprList *tList = NULL; for (tList = target; tList; tList = tList->next) { long long tCount = 0; totalCompares += tList->crisprCount; -// if (0 == qCount) -// { -// totalCrisprsTarget += tList->crisprCount; -// verbose(1, "# queryVsAll %lld target crisprs on chrom %s %lld targets %lld compares\n", tList->crisprCount, tList->chrom, totalCrisprsTarget, totalCompares); -// } for (tCount = 0; tCount < tList->crisprCount; ++tCount) { - /* the XOR will determine differences in two sequences - * the shift right 6 removes the PAM sequence + /* the XOR determine differences in two sequences, the + * shift right 6 removes the PAM sequence and + * the 'fortyBits &' eliminates the negativeStrand bit */ - long long misMatch = - (qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6; + long long misMatch = fortyBits & + ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6); if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { recordOffTargets(qList, tList, bitsOn, qCount, tCount); qList->offBy[bitsOn][qCount] += 1; // tList->offBy[bitsOn][tCount] += 1; not needed } } else @@ -708,93 +828,92 @@ qList->offBy[0][qCount] += 1; // tList->offBy[0][tCount] += 1; not needed } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) verbose(1, "# done with scanning, check timing\n"); elapsedMs = clock1000() - processStart; timingMessage("queryVsAll", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr"); timingMessage("queryVsAll", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec", "seconds/crispr"); timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec", "seconds/compare"); } /* static struct crisprList *queryVsAll(struct crisprList *query, struct crisprList *target) */ -static void allVsAll(struct crisprList *all) +static void queryVsSelf(struct crisprList *all) /* run this 'all' list vs. itself avoiding self to self comparisons */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; long long totalCrisprsCompare = 0; long processStart = clock1000(); long elapsedMs = 0; /* query runs through all chroms */ for (qList = all; qList; qList = qList->next) { long long qCount = 0; totalCrisprsQuery += qList->crisprCount; - verbose(1, "# allVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); - /* query runs through all kmers on this chrom */ + verbose(1, "# queryVsSelf %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); for (qCount = 0; qCount < qList->crisprCount; ++qCount) { /* target starts on same chrom as query, and at next kmer after query for this first chrom */ long long tStart = qCount+1; struct crisprList *tList = NULL; for (tList = qList; tList; tList = tList->next) { long long tCount = tStart; totalCrisprsCompare += tList->crisprCount - tStart; for (tCount = tStart; tCount < tList->crisprCount; ++tCount) { - /* the XOR will determine differences in two sequences - * the shift right 6 removes the PAM sequence + /* the XOR determine differences in two sequences, the + * shift right 6 removes the PAM sequence and + * the 'fortyBits &' eliminates the negativeStrand bit */ - long long misMatch = - (qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6; + long long misMatch = fortyBits & + ((qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6); if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { recordOffTargets(qList, tList, bitsOn, qCount, tCount); qList->offBy[bitsOn][qCount] += 1; tList->offBy[bitsOn][tCount] += 1; } } else { /* no misMatch, identical crisprs */ qList->offBy[0][qCount] += 1; tList->offBy[0][tCount] += 1; } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) tStart = 0; /* following chroms run through all */ } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) elapsedMs = clock1000() - processStart; -timingMessage("allVsAll", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr"); -timingMessage("allVsAll", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec", "seconds/compare"); -} /* static struct crisprList *allVsAll(struct crisprList *query, - struct crisprList *target) */ +timingMessage("queryVsSelf", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec", "seconds/crispr"); +timingMessage("queryVsSelf", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec", "seconds/compare"); +} /* static void queryVsSelf(struct crisprList *all) */ static struct crisprList *readKmers(char *fileIn) /* read in kmer list from 'fileIn', return list structure */ { verbose(1, "# reading crisprs from: %s\n", fileIn); struct crisprList *listReturn = NULL; struct lineFile *lf = lineFileOpen(fileIn, TRUE); char *row[10]; int wordCount = 0; long long crisprsInput = 0; long startMs = clock1000(); while (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row))) ) { @@ -807,31 +926,30 @@ newItem->crisprCount = sqlLongLong(row[1]); newItem->size = sqlLongLong(row[2]); newItem->chromCrisprs = NULL; slAddHead(&listReturn, newItem); long long verifyCount = 0; while ( (verifyCount < newItem->crisprCount) && (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row)))) ) { if (3 != wordCount) errAbort("expecing three words at line %d in file %s, found: %d", lf->lineIx, fileIn, wordCount); ++verifyCount; struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); oneCrispr->sequence = sqlLongLong(row[0]); oneCrispr->start = sqlLongLong(row[1]); - oneCrispr->strand = row[2][0]; slAddHead(&newItem->chromCrisprs, oneCrispr); } if (verifyCount != newItem->crisprCount) errAbort("expecting %lld kmer items at line %d in file %s, found: %lld", newItem->crisprCount, lf->lineIx, fileIn, verifyCount); crisprsInput += verifyCount; } lineFileClose(&lf); long elapsedMs = clock1000() - startMs; timingMessage("readKmers", crisprsInput, "crisprs read in", elapsedMs, "crisprs/sec", "seconds/crispr"); return listReturn; } // static struct crisprList *readKmers(char *fileIn) @@ -842,40 +960,102 @@ FILE *fh = mustOpen(fileOut, "w"); struct crisprList *list = NULL; long long crisprsWritten = 0; long startMs = clock1000(); slReverse(&all); for (list = all; list; list = list->next) { fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size); struct crispr *c = NULL; slReverse(&list->chromCrisprs); for (c = list->chromCrisprs; c; c = c->next) { fprintf(fh, "%lld\t%lld\t%c\n", c->sequence, - c->start, c->strand); + c->start, negativeStrand & c->sequence ? '-' : '+'); ++crisprsWritten; } } carefulClose(&fh); long elapsedMs = clock1000() - startMs; timingMessage("writeKmers", crisprsWritten, "crisprs written", elapsedMs, "crisprs/sec", "seconds/crispr"); } // static void writeKmers(struct crisprList *all, char *fileOut) +static void *threadFunction(void *id) +/* thread entry */ +{ +struct threadControl *tId = (struct threadControl *)id; + +queryVsAll(tId->query, tId->target, tId->threadCount, tId->threadId); + +#ifdef NOT +/* + * for each list to process, divide the list into tId->threadCount parts, + * then one thread runs part number tId->threadId bits of that list + */ +struct loopControl *control = NULL; +AllocVar(control); + + +long long listCount = 1; +while (listCount < 100) { +setLoopEnds(control, listCount, tId->threadCount, tId->threadId); + +if (control->listStart >= listCount) + verbose(1, "#\t%lld\t# %d of %d does not run from %lld to < %lld in list of %lld size\n", listCount, tId->threadId, tId->threadCount, control->listStart, control->listEnd, listCount); +else + verbose(1, "#\t%lld\t# %d of %d runs from %lld to < %lld in list of %lld size\n", listCount, tId->threadId, tId->threadCount, control->listStart, control->listEnd, listCount); + listCount += 13; +} +#endif + +return NULL; +} + +static void runThreads(int threadCount, struct crisprList *query, + struct crisprList *target) +{ +struct threadControl *threadIds = NULL; +AllocArray(threadIds, threadCount); + +pthread_t *threads = NULL; +AllocArray(threads, threadCount); + +int pt = 0; +for (pt = 0; pt < threadCount; ++pt) + { + struct threadControl *threadData; + AllocVar(threadData); + threadData->threadId = pt; + threadData->threadCount = threadCount; + threadData->query = query; + threadData->target = target; + threadIds[pt] = *threadData; + int rc = pthread_create(&threads[pt], NULL, threadFunction, &threadIds[pt]); + if (rc) + { + errAbort("Unexpected error %d from pthread_create(): %s",rc,strerror(rc)); + } + } + +/* Wait for threads to finish */ +for (pt = 0; pt < threadCount; ++pt) + pthread_join(threads[pt], NULL); +} + static void crisprKmers(char *sequence) /* crisprKmers - find and annotate crispr sequences. */ { struct crisprList *queryGuides = NULL; // struct crisprList *countedCrisprs = NULL; struct crisprList *allGuides = NULL; if (loadKmers) allGuides = readKmers(loadKmers); else allGuides = scanSequence(sequence); if (dumpKmers) { writeKmers(allGuides, dumpKmers); @@ -892,46 +1072,51 @@ offFile = mustOpen(offTargets, "w"); /* if ranges have been specified, construct queryList of kmers to measure */ if (rangesHash) { /* result here is two exclusive sets: query, and allGuides * the query crisprs have been extracted from the allGuides */ queryGuides = rangeExtraction(& allGuides); } if (queryGuides) copyToArray(queryGuides); if (allGuides) copyToArray(allGuides); vmPeak = currentVmPeak(); verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak); /* larger example: 62646196 kB */ - if ((vmPeak >> 20) > 8) // the >> 20 converts kB to gB + if ((vmPeak >> 20) > memLimit) // the >> 20 converts kB to gB { - threadCount = 1 + ((vmPeak >> 20) / 8); - verbose(1, "# over 8 Gb at %lld kB, threadCount: %d\n", vmPeak, threadCount); + threadCount = 1 + ((vmPeak >> 20) / memLimit); + verbose(1, "# over %d Gb at %lld kB, threadCount: %d\n", memLimit, vmPeak, threadCount); } if (queryGuides) // when range selected some query sequences { if (allGuides) // if there are any left on the all list - queryVsAll(queryGuides, allGuides); + { + if (0 == 1 && threadCount > 1) + runThreads(threadCount, queryGuides, allGuides); + else + queryVsAll(queryGuides, allGuides, 1, 0); + } /* then run up the query vs. itself avoiding self vs. self */ - allVsAll(queryGuides); + queryVsSelf(queryGuides); countsOutput(queryGuides); } else { - allVsAll(allGuides); /* run up all vs. all avoiding self vs. self */ + queryVsSelf(allGuides); /* run up all vs. all avoiding self vs. self */ countsOutput(allGuides); } carefulClose(&offFile); } } // static void crisprKmers(char *sequence) int main(int argc, char *argv[]) /* Process command line, initialize translation arrays and call the process */ { optionInit(&argc, argv, options); if (argc != 2) usage(); verbose(0, "# running verboseLevel: %d\n", verboseLevel());