131be66e554e489d599a2e54988a45f1985e194e hiram Mon Jul 17 08:27:54 2017 -0700 correct selection of guides vs. ranges to avoid duplicates when ranges are split refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index ebbc08b..662d4f9 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -93,30 +93,32 @@ static char *loadKmers = NULL; /* file name to read in kmers from previous scan */ static int memLimit = 0; /* gB limit before going into thread mode */ static int threadCount = 1; /* will be adjusted depending upon vmPeak */ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"offTargets", OPTION_STRING}, {"ranges", OPTION_STRING}, {"memLimit", OPTION_INT}, {"dumpKmers", OPTION_STRING}, {"loadKmers", OPTION_STRING}, {NULL, 0}, }; +#define guideSize 20 // 20 bases +#define pamSize 3 // 3 bases #define negativeStrand 0x0000800000000000 #define fortyEightBits 0xffffffffffff #define fortySixBits 0x3fffffffffff #define fortyBits 0xffffffffff #define high32bits 0xffffffff00000000 #define low32bits 0x00000000ffffffff #define high16bits 0xffff0000ffff0000 #define low16bits 0x0000ffff0000ffff #define high8bits 0xff00ff00ff00ff00 #define low8bits 0x00ff00ff00ff00ff #define high4bits 0xf0f0f0f0f0f0f0f0 #define low4bits 0x0f0f0f0f0f0f0f0f #define high2bits 0xcccccccccccccccc #define low2bits 0x3333333333333333 @@ -201,31 +203,31 @@ bases[G_BASE] = 'G'; bases[T_BASE] = 'T'; } // static void initOrderedNtVal() static void timingMessage(char *prefix, long long count, char *message, long ms, char *units, char *invUnits) { double perSecond = 0.0; double inverse = 0.0; if ((ms > 0) && (count > 0)) { perSecond = 1000.0 * count / ms; inverse = 1.0 / perSecond; } -verbose(1, "# %s: %lld %s @ %ld ms -> %.2f %s == %g %s\n", prefix, count, message, ms, perSecond, units, inverse, invUnits); +verbose(1, "# %s: %lld %s @ %ld ms\n#\t%.2f %s == %g %s\n", prefix, count, message, ms, perSecond, units, inverse, invUnits); } static void setLoopEnds(struct loopControl *control, long long listSize, int partCount, int partNumber) /* for multiple thread processing, given a list of listSize, # a partCount and a partNumber * return listStart, listEnd indexes in control structure */ { long long partSize = 1 + listSize / partCount; long long listStart = partSize * partNumber; long long listEnd = partSize * (partNumber + 1); if (listEnd > listSize) listEnd = listSize; control->listStart = listStart; @@ -487,60 +489,60 @@ long long itemsOutput = 0; struct crisprList *list; long long totalOut = 0; for (list = all; list; list = list->next) { long long c = 0; for (c = 0; c < list->crisprCount; ++c) { ++itemsOutput; int negativeOffset = 0; char strand = '+'; if (negativeStrand & list->sequence[c]) { strand = '-'; - negativeOffset = 3; + negativeOffset = pamSize; } long long txStart = list->start[c] + negativeOffset; - long long txEnd = txStart + 20; + long long txEnd = txStart + guideSize; int totalOffs = list->offBy[0][c] + list->offBy[1][c] + list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c]; char *color = itemColor(list->offBy, c); char kmerString[33]; - kmerValToString(kmerString, list->sequence[c], 3); + kmerValToString(kmerString, list->sequence[c], pamSize); char pamString[33]; kmerPAMString(pamString, list->sequence[c]); if (0 == totalOffs) verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], strand, kmerString); if (bedFH) - fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); + fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], strand, txStart, txEnd, color, color, color, kmerString, pamString, list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); - if (list->offBy[0][c]) - verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString); +// if (list->offBy[0][c]) +// verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], strand, kmerString); ++totalOut; } } if (bedFH) carefulClose(&bedFH); long elapsedMs = clock1000() - startTime; -timingMessage("copyToArray:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item"); +timingMessage("countsOutput:", itemsOutput, "items output", elapsedMs, "items/sec", "seconds/item"); } // static void countsOutput(struct crisprList *all, FILE *bedFH) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of crisprs */ { verbose(1, "#\tscanning sequence file: %s\n", inFile); dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; long elapsedMs = 0; long scanStart = clock1000(); long long totalCrisprs = 0; @@ -589,46 +591,47 @@ { nextList = list->next; // remember before perhaps lost long long beforeCrisprCount = list->crisprCount; examinedCrisprCount += list->crisprCount; struct crispr *c = NULL; struct binKeeper *bk = hashFindVal(rangesHash, list->chrom); struct crispr *newCrispr = NULL; if (bk != NULL) { struct crispr *prevCrispr = NULL; struct crispr *next = NULL; for (c = list->chromCrisprs; c; c = next) { struct binElement *hitList = NULL; next = c->next; // remember before perhaps lost - int start = c->start; - if (negativeStrand & c->sequence) - start += 2; - int end = start + 20; - hitList = binKeeperFind(bk, start, end); + // select any guide that is at least half contained in any range + int midPoint = c->start + ((pamSize + guideSize) >> 1); +// if (negativeStrand & c->sequence) +// start += 2; +// int end = midPoint + 1; + hitList = binKeeperFind(bk, midPoint, midPoint + 1); if (hitList) { - if (prevCrispr) // remove this one from the all list + if (prevCrispr) // remove this one from the 'all' list prevCrispr->next = next; else list->chromCrisprs = next; // removing the first one - c->next = NULL; + c->next = NULL; // new item for new list slAddHead(&newCrispr, c); // constructing new list } else - prevCrispr = c; // remains on all list + prevCrispr = c; // remains on 'all' list slFreeList(&hitList); } } if (newCrispr) { struct crisprList *newItem = NULL; AllocVar(newItem); newItem->chrom = cloneString(list->chrom); newItem->size = list->size; newItem->chromCrisprs = newCrispr; newItem->crisprCount = slCount(newCrispr); returnListCrisprCount += newItem->crisprCount; slAddHead(&listReturn, newItem); if (NULL == list->chromCrisprs) // all have been removed for this chrom { @@ -673,31 +676,31 @@ static double calcHitScore(long long sequence1, long long sequence2) /* calcHitScore - from Max' crispor.py script and paper: * https://www.ncbi.nlm.nih.gov/pubmed/27380939 */ { double score1 = 1.0; int mmCount = 0; int lastMmPos = -1; /* the XOR determines differences in two sequences, the shift * right 6 removes the PAM sequence and the 'fortyBits &' eliminates * the negativeStrand bit */ long long misMatch = fortyBits & ((sequence1 ^ sequence2) >> 6); int distCount = 0; int distSum = 0; int pos = 0; -for (pos = 0; pos < 20; ++pos) +for (pos = 0; pos < guideSize; ++pos) { int diff = misMatch & 0x3; if (diff) { ++mmCount; if (lastMmPos != -1) { distSum += pos-lastMmPos; ++distCount; } score1 *= 1 - hitScoreM[pos]; lastMmPos = pos; } misMatch >>= 2; } @@ -733,56 +736,55 @@ { stringReturn[i] = '.'; if (bitMask & misMatch) stringReturn[i] = '*'; ++i; bitMask >>= 2; } stringReturn[i] = 0; } // static void misMatchString(char *returnString, long long misMatch) static void recordOffTargets(struct crisprList *query, struct crisprList *target, int bitsOn, long long qIndex, long long tIndex, long long twoBitMisMatch) /* bitsOn is from 1 to 4, record this match when less than 1000 total */ { +if (query->offBy[0][qIndex] ) // no need to accumulate if 0 mismatch > 0 + return; + +if (offFile) + { char queryString[33]; /* local storage for string */ char targetString[33]; /* local storage for string */ char queryPAM[33]; /* local storage for string */ char targetPAM[33]; /* local storage for string */ char misMatch[33]; /* ...*.....*.*... ascii string represent misMatch */ -if (query->offBy[0][qIndex] ) // no need to accumulate if 0-mismatch > 0 - return; - -if (offFile) - { int i = 0; int bitsOnSum = 0; for (i = 1; i < 5; ++i) bitsOnSum += query->offBy[i][qIndex]; - misMatchString(misMatch, twoBitMisMatch); - if (bitsOnSum < 1000) // could be command line option limit { double hitScore = calcHitScore(query->sequence[qIndex], target->sequence[tIndex]); // double cfdScore = // calcCfdScore(query->sequence[qIndex], target->sequence[tIndex]); - kmerValToString(queryString, query->sequence[qIndex], 3); - kmerValToString(targetString, target->sequence[tIndex], 3); + misMatchString(misMatch, twoBitMisMatch); + kmerValToString(queryString, query->sequence[qIndex], pamSize); + kmerValToString(targetString, target->sequence[tIndex], pamSize); kmerPAMString(queryPAM, query->sequence[qIndex]); kmerPAMString(targetPAM, target->sequence[tIndex]); fprintf(offFile, "%s:%lld %c %s%s %s%s %s %d %.8f\t%s:%lld %c\n", query->chrom, query->start[qIndex], negativeStrand & query->sequence[qIndex] ? '-' : '+', queryString, queryPAM, targetString, targetPAM, misMatch, bitsOn, hitScore, target->chrom, target->start[tIndex], negativeStrand & target->sequence[tIndex] ? '-' : '+'); } } } // static void recordOffTargets(struct crisprList *query, // struct crisprList *target, int bitsOn, long long qIndex, // long long tIndex) @@ -993,31 +995,31 @@ struct crisprList *list = NULL; long long crisprsWritten = 0; char kmerString[33]; char pamString[33]; long startMs = clock1000(); slReverse(&all); for (list = all; list; list = list->next) { fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size); struct crispr *c = NULL; slReverse(&list->chromCrisprs); for (c = list->chromCrisprs; c; c = c->next) { - kmerValToString(kmerString, c->sequence, 3); + kmerValToString(kmerString, c->sequence, pamSize); kmerPAMString(pamString, c->sequence); fprintf(fh, "%s\t%s\t%lld\t%c\n", kmerString, pamString, c->start, negativeStrand & c->sequence ? '-' : '+'); ++crisprsWritten; } } carefulClose(&fh); long elapsedMs = clock1000() - startMs; timingMessage("writeKmers", crisprsWritten, "crisprs written", elapsedMs, "crisprs/sec", "seconds/crispr"); } // static void writeKmers(struct crisprList *all, char *fileOut) static void *threadFunction(void *id) /* thread entry */ {