47a02857acb0e52455455f577dbe801d9b2b0a66 hiram Sat Jul 8 23:04:40 2017 -0700 now recording off-target data and discovered that (a > 1) is dramatically different from (a >> 1) with no warning of course, refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 4dc93cb..6a0595b 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -18,49 +18,53 @@ /* Explain usage and exit. */ { errAbort( "crisprKmers - annotate crispr sequences\n" "usage:\n" " crisprKmers <sequence>\n" "options:\n" " where <sequence> is a .2bit file or .fa fasta sequence\n" " -verbose=N - control processing steps with level of verbose:\n" " default N=1 - only find crisprs, set up listings\n" " N=2 - empty process crispr list into new list\n" " N=3 - only count identicals during processing\n" " N=4 - attempt measure all off targets and\n" " print out all crisprs as bed format\n" " -bed=<file> - output kmers to given bed9+ file\n" + " -offTargets=<file> - output offTargets to given file\n" " -ranges=<file> - use specified bed3 file to limit which crisprs are\n" " - measured, only those with any overlap to these bed items.\n" " -dumpKmers=<file> - after scan of sequence, output kmers to file\n" " - process will exit after this, use -loadKmers to continue\n" " -loadKmers=<file> - load kmers from previous scan of sequence from -dumpKmers\n" " NOTE: It is faster to simply scan the sequence to get the system ready\n" " to go. Reading in the kmer file takes longer than scanning." ); } -static char *bedFileOut = NULL; /* set with -bed=<file> argument */ -static char *ranges = NULL; /* set with -ranges=<file> argument */ +static char *bedFileOut = NULL; /* set with -bed=<file> argument, kmer output */ +static char *offTargets = NULL; /* write offTargets to <file> */ +static FILE *offFile = NULL; /* file handle to write off targets */ +static char *ranges = NULL; /* use ranges <file> to limit scanning */ static struct hash *rangesHash = NULL; /* ranges into hash + binkeeper */ static char *dumpKmers = NULL; /* file name to write out kmers from scan */ static char *loadKmers = NULL; /* file name to read in kmers from previous scan */ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, + {"offTargets", OPTION_STRING}, {"ranges", OPTION_STRING}, {"dumpKmers", OPTION_STRING}, {"loadKmers", OPTION_STRING}, {NULL, 0}, }; struct crispr /* one chromosome set of crisprs */ { struct crispr *next; /* Next in list. */ long long sequence; /* sequence value in 2bit format */ long long start; /* chromosome start 0-relative */ char strand; /* strand: + or - */ }; @@ -163,67 +167,73 @@ /* Compare slPairs on start value */ { const struct crispr *a = *((struct crispr **)va); const struct crispr *b = *((struct crispr **)vb); long long aVal = a->start; long long bVal = b->start; if (aVal < bVal) return -1; else if (aVal > bVal) return 1; else return 0; } #endif -static char *kmerPAMStringArray(long long val) +/* these two kmerVal to strings routines could be reduced to one and they + * could use lookup tables to run faster + */ +static char *kmerPAMString(long long val) /* return the ASCII string for last three bases in then binary sequence value */ { static char pamString[32]; long long twoBitMask = 0x30; int shiftCount = 4; int baseCount = 0; while (twoBitMask) { int base = (val & twoBitMask) >> shiftCount; pamString[baseCount++] = bases[base]; twoBitMask >>= 2; shiftCount -= 2; } pamString[baseCount] = 0; return pamString; -} // static char *kmerValToStringArray(struct crispr *c, int trim) +} // static char *kmerPAMString(struct crispr *c, int trim) -static char *kmerValToStringArray(long long val, int trim) +/* beware, this needs to be used immediately upon return since it is + * returning its static answer + */ +static char *kmerValToString(long long val, int trim) /* return ASCII string for binary sequence value */ { static char kmerString[32]; long long twoBitMask = 0x300000000000; int shiftCount = 44; int baseCount = 0; while (twoBitMask && (shiftCount >= (2*trim))) { int base = (val & twoBitMask) >> shiftCount; kmerString[baseCount++] = bases[base]; twoBitMask >>= 2; shiftCount -= 2; } kmerString[baseCount] = 0; return kmerString; -} // static char *kmerValToStringArray(long long val, int trim) +} // static char *kmerValToString(long long val, int trim) static long long revComp(long long val) /* reverse complement the 2-bit numerical value kmer */ { /* complement bases and add 18 0 bits * because this divide and conquer works on 64 bits, not 46, * hence the addition of the 18 zero bits which fall out * The 'val ^ fortySixBits' does the complement, the shifting and * masking does the reversing. */ register long long v = (val ^ fortySixBits) << 18; v = ((v & high32bits) >> 32) | ((v & low32bits) << 32); v = ((v & high16bits) >> 16) | ((v & low16bits) << 16); v = ((v & high8bits) >> 8) | ((v & low8bits) << 8); v = ((v & high4bits) >> 4) | ((v & low4bits) << 4); @@ -324,66 +334,66 @@ if (startGap != chromPosition) verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+"); else verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+"); kmerLength = 0; /* reset, start over */ kmerVal = 0; } // else if (val >= 0) ++chromPosition; } // for (i=0; i < seq->size; ++i) // slReverse(&crisprSet); // save time, order not important at this time returnList->chromCrisprs = crisprSet; returnList->crisprCount = slCount(crisprSet); return returnList; } // static struct crisprList *generateKmers(struct dnaSeq *seq) -static void showCountsArray(struct crisprList *all) +static void countsOutput(struct crisprList *all) /* everything has been scanned and counted, print out all the data from arrays*/ { FILE *bedFH = NULL; if (bedFileOut) bedFH = mustOpen(bedFileOut, "w"); struct crisprList *list; long long totalOut = 0; for (list = all; list; list = list->next) { long long c = 0; for (c = 0; c < list->crisprCount; ++c) { int negativeOffset = 0; if (list->strand[c] == '-') negativeOffset = 3; long long txStart = list->start[c] + negativeOffset; - long long txEnd = txStart + 20 + negativeOffset; + long long txEnd = txStart + 20; int totalOffs = list->offBy[0][c] + list->offBy[1][c] + list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c]; if (0 == totalOffs) -verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], list->strand[c], kmerValToStringArray(list->sequence[c], 3)); +verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3)); if (bedFH) - fprintf(bedFH, "%s\t%lld\t%lld\t%s\t%d\t%c\t%lld\t%lld\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, kmerValToStringArray(list->sequence[c], 3), list->offBy[0][c], list->strand[c], txStart, txEnd, kmerPAMStringArray(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); + fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%d\t%d\t%d\t%d\t%s\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], list->strand[c], txStart, txEnd, kmerPAMString(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], kmerValToString(list->sequence[c], 3)); if (list->offBy[0][c]) - verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], list->strand[c], kmerValToStringArray(list->sequence[c], 3)); + verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3)); ++totalOut; } } if (bedFH) carefulClose(&bedFH); -} // static void showCountsArray(struct crisprList *all) +} // static void countsOutput(struct crisprList *all) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of crisprs */ { verbose(1, "#\tscanning sequence file: %s\n", inFile); dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; long elapsedMs = 0; long scanStart = clock1000(); long long totalCrisprs = 0; /* scanning all sequences, setting up crisprs on the listReturn */ @@ -418,40 +428,40 @@ struct crisprList *all = *allReference; struct crisprList *listReturn = NULL; struct crisprList *list = NULL; int inputChromCount = slCount(all); long long returnListCrisprCount = 0; long long examinedCrisprCount = 0; struct crisprList *prevChromList = NULL; long elapsedMs = 0; long scanStart = clock1000(); struct crisprList *nextList = NULL; for (list = all; list; list = nextList) { nextList = list->next; // remember before perhaps lost + examinedCrisprCount += list->crisprCount; struct crispr *c = NULL; struct binKeeper *bk = hashFindVal(rangesHash, list->chrom); struct crispr *newCrispr = NULL; if (bk != NULL) { struct crispr *prevCrispr = NULL; struct crispr *next = NULL; for (c = list->chromCrisprs; c; c = next) { - ++examinedCrisprCount; struct binElement *hitList = NULL; next = c->next; // remember before perhaps lost int start = c->start; if (c->strand == '-') start += 2; int end = start + 20; hitList = binKeeperFind(bk, start, end); if (hitList) { ++returnListCrisprCount; if (prevCrispr) // remove this one from the all list prevCrispr->next = next; else list->chromCrisprs = next; // removing the first one c->next = NULL; @@ -477,169 +487,196 @@ prevChromList->next = nextList; else all = nextList; // removing the first one } else { list->crisprCount = slCount(list->chromCrisprs); verbose(1, "# range scan same chrom list still has %lld crisprs on chrom %s\n", list->crisprCount, list->chrom); } } prevChromList = list; } // for (list = all; list; list = list->next) elapsedMs = clock1000() - scanStart; -verbose(1, "# range scan, input %d chromosomes, return %d chromosomes\n", - inputChromCount, slCount(listReturn)); +verbose(1, "# range scanning %d chroms, return %d selected chroms, leaving %d chroms\n", + inputChromCount, slCount(listReturn), slCount(all)); timingMessage("range scan", examinedCrisprCount, "examined crisprs", elapsedMs, "crisprs/sec"); timingMessage("range scan", returnListCrisprCount, "returned crisprs", elapsedMs, "crisprs/sec"); if (NULL == all) { allReference = NULL; // they have all been removed verbose(1, "# range scan, every single chrom has been removed\n"); } else if (*allReference != all) { verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all); *allReference = all; } return listReturn; } // static crisprList *rangeExtraction(crisprList *all) -static void queryVsAllArray(struct crisprList *query, +static void recordOffTargets(struct crisprList *query, + struct crisprList *target, int bitsOn, long long qIndex, + long long tIndex) +/* bitsOn is from 1 to 4, record this match when less than 10 count */ +{ +if (offFile) + { + if (query->offBy[bitsOn][qIndex] < 10) + {/* needs to be two prints as the kmerValToString return is confused */ + fprintf(offFile, "%s:%lld %c %s %#020llx\t", query->chrom, query->start[qIndex], + query->strand[qIndex], kmerValToString(query->sequence[qIndex], 3), +(unsigned long long)((query->sequence[qIndex] & 0x3fffffffffc0)>>6)); + fprintf(offFile, "%s:%lld %c %s %#020llx\t%d\n", target->chrom, + target->start[tIndex], target->strand[tIndex], + kmerValToString(target->sequence[tIndex], 3), +(unsigned long long)((target->sequence[tIndex] & 0x3fffffffffc0)>>6), bitsOn); + } + } +} + +static void queryVsAll(struct crisprList *query, struct crisprList *target) /* run the query crisprs list against the target list in the array structures */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; +long long totalCrisprsTarget = 0; long long totalCompares = 0; long processStart = clock1000(); long elapsedMs = 0; for (qList = query; qList; qList = qList->next) { long long qCount = 0; totalCrisprsQuery += qList->crisprCount; - verbose(1, "# queryVsAllArray %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); + verbose(1, "# queryVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); for (qCount = 0; qCount < qList->crisprCount; ++qCount) { struct crisprList *tList = NULL; for (tList = target; tList; tList = tList->next) { long long tCount = 0; totalCompares += tList->crisprCount; if (0 == qCount) - verbose(1, "# queryVsAllArray %lld target crisprs on chrom %s\n", tList->crisprCount, tList->chrom); +{ +totalCrisprsTarget += tList->crisprCount; +verbose(1, "# queryVsAll %lld target crisprs on chrom %s %lld targets %lld compares\n", tList->crisprCount, tList->chrom, totalCrisprsTarget, totalCompares); +} for (tCount = 0; tCount < tList->crisprCount; ++tCount) { /* the XOR will determine differences in two sequences * the shift right 6 removes the PAM sequence */ long long misMatch = (qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6; if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ - misMatch = (misMatch | (misMatch > 1)) & 0x5555555555; + misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { + recordOffTargets(qList, tList, bitsOn, qCount, tCount); qList->offBy[bitsOn][qCount] += 1; tList->offBy[bitsOn][tCount] += 1; } } else { /* no misMatch, identical crisprs */ qList->offBy[0][qCount] += 1; tList->offBy[0][tCount] += 1; } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) verbose(1, "# done with scanning, check timing\n"); elapsedMs = clock1000() - processStart; -timingMessage("queryVsAllArray", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec"); -timingMessage("queryVsAllArray", totalCompares, "total comparisons", elapsedMs, "compares/sec"); -} /* static struct crisprList *queryVsAllArray(struct crisprList *query, +timingMessage("queryVsAll", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec"); +timingMessage("queryVsAll", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec"); +timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec"); +} /* static struct crisprList *queryVsAll(struct crisprList *query, struct crisprList *target) */ -static void allVsAllArray(struct crisprList *all) +static void allVsAll(struct crisprList *all) /* run this 'all' list vs. itself with arrays */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; long long totalCrisprsCompare = 0; long processStart = clock1000(); long elapsedMs = 0; /* query runs through all chroms */ for (qList = all; qList; qList = qList->next) { long long qCount = 0; totalCrisprsQuery += qList->crisprCount; - verbose(1, "# allVsAllArray %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); + verbose(1, "# allVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); /* query runs through all kmers on this chrom */ for (qCount = 0; qCount < qList->crisprCount; ++qCount) { /* target starts on same chrom as query, and at next kmer after query for this first chrom */ long long tStart = qCount+1; struct crisprList *tList = NULL; for (tList = qList; tList; tList = tList->next) { long long tCount = tStart; totalCrisprsCompare += tList->crisprCount - tStart; for (tCount = tStart; tCount < tList->crisprCount; ++tCount) { /* the XOR will determine differences in two sequences * the shift right 6 removes the PAM sequence */ long long misMatch = (qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6; if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ - misMatch = (misMatch | (misMatch > 1)) & 0x5555555555; + misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { + recordOffTargets(qList, tList, bitsOn, qCount, tCount); qList->offBy[bitsOn][qCount] += 1; tList->offBy[bitsOn][tCount] += 1; } } else { /* no misMatch, identical crisprs */ qList->offBy[0][qCount] += 1; tList->offBy[0][tCount] += 1; } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) tStart = 0; /* following chroms run through all */ } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) elapsedMs = clock1000() - processStart; -timingMessage("allVsAllArray", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec"); -timingMessage("allVsAllArray", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec"); -} /* static struct crisprList *allVsAllArray(struct crisprList *query, +timingMessage("allVsAll", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec"); +timingMessage("allVsAll", totalCrisprsCompare, "total comparisons", elapsedMs, "compares/sec"); +} /* static struct crisprList *allVsAll(struct crisprList *query, struct crisprList *target) */ static struct crisprList *readKmers(char *fileIn) /* read in kmer list from 'fileIn', return list structure */ { verbose(1, "# reading crisprs from: %s\n", fileIn); struct crisprList *listReturn = NULL; struct lineFile *lf = lineFileOpen(fileIn, TRUE); char *row[10]; int wordCount = 0; long long crisprsInput = 0; long startMs = clock1000(); while (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row))) ) @@ -719,64 +756,68 @@ if (loadKmers) allCrisprs = readKmers(loadKmers); else allCrisprs = scanSequence(sequence); if (dumpKmers) { writeKmers(allCrisprs, dumpKmers); return; } /* processing those crisprs */ if (verboseLevel() > 1) { + if (offTargets) + offFile = mustOpen(offTargets, "w"); /* if ranges have been specified, construct queryList of kmers to measure */ if (rangesHash) { /* result here is two exclusive sets: query, and allCrisprs * the query crisprs have been extracted from the allCrisprs */ queryCrisprs = rangeExtraction(& allCrisprs); copyToArray(queryCrisprs); if (allCrisprs) // if there are any left on the all list { copyToArray(allCrisprs); /* first run up query vs. all */ - queryVsAllArray(queryCrisprs, allCrisprs); + queryVsAll(queryCrisprs, allCrisprs); } /* then run up the query vs. itself avoiding self vs. self */ - allVsAllArray(queryCrisprs); - showCountsArray(queryCrisprs); + allVsAll(queryCrisprs); + countsOutput(queryCrisprs); } else { copyToArray(allCrisprs); /* run up all vs. all avoiding self vs. self */ - allVsAllArray(allCrisprs); - showCountsArray(allCrisprs); + allVsAll(allCrisprs); + countsOutput(allCrisprs); } + carefulClose(&offFile); } } // static void crisprKmers(char *sequence) int main(int argc, char *argv[]) /* Process command line, initialize translation arrays and call the process */ { optionInit(&argc, argv, options); if (argc != 2) usage(); verbose(0, "# running verboseLevel: %d\n", verboseLevel()); bedFileOut = optionVal("bed", bedFileOut); dumpKmers = optionVal("dumpKmers", dumpKmers); loadKmers = optionVal("loadKmers", loadKmers); +offTargets = optionVal("offTargets", offTargets); ranges = optionVal("ranges", ranges); if (ranges) rangesHash = readRanges(ranges); initOrderedNtVal(); /* set up orderedNtVal[] */ crisprKmers(argv[1]); if (verboseLevel() > 1) printVmPeak(); return 0; }