8bd378fa491b953980381f804c91b6f240697d57 hiram Mon Jul 24 10:00:51 2017 -0700 change dumpKmers to dumpGuides and properly recognize only -NGG guides not the -NAG decoys refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index a69e768..5da7ab7 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -68,53 +68,53 @@ errAbort( "crisprKmers - annotate crispr guide sequences\n" "usage:\n" " crisprKmers <sequence>\n" "options:\n" " where <sequence> is a .2bit file or .fa fasta sequence\n" " -verbose=N - for debugging control processing steps with level of verbose:\n" " - verbose < 2 - scan sequence only, no comparisons\n" " - verbose > 1 - run comparisons\n" " -bed=<file> - output results to given bed9+ file\n" " -offTargets=<file> - output off target data to given file\n" " -ranges=<file> - use specified bed3 file to limit which guides are\n" " - measured, only those with any overlap to these bed items.\n" " -threads=N - N number of threads to run, default: no threading\n" " - use when ku is going to allocate more CPUs for big mem jobs.\n" - " -dumpKmers=<file> - NOT VALID after scan of sequence, output kmers to file\n" - " - process will exit after this, use -loadKmers to continue\n" - " -loadKmers=<file> - NOT VALID load kmers from previous scan of sequence from -dumpKmers" + " -dumpGuides=<file> - after scan of sequence, output guide sequences to file" ); } +// " -loadKmers=<file> - NOT VALID load kmers from previous scan of sequence from -dumpKmers" + static char *bedFileOut = NULL; /* set with -bed=<file> argument, kmer output */ static char *offTargets = NULL; /* write offTargets to <file> */ static FILE *offFile = NULL; /* file handle to write off targets */ static char *ranges = NULL; /* use ranges <file> to limit scanning */ static struct hash *rangesHash = NULL; /* ranges into hash + binkeeper */ -static char *dumpKmers = NULL; /* file name to write out kmers from scan */ +static char *dumpGuides = NULL; /* file name to write out guides from scan */ static char *loadKmers = NULL; /* file name to read in kmers from previous scan */ static int threads = 0; /* number of threads to run */ static int threadCount = 1; /* will be adjusted depending upon vmPeak */ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"offTargets", OPTION_STRING}, {"ranges", OPTION_STRING}, {"threads", OPTION_INT}, - {"dumpKmers", OPTION_STRING}, + {"dumpGuides", OPTION_STRING}, {"loadKmers", OPTION_STRING}, {NULL, 0}, }; #define guideSize 20 // 20 bases #define pamSize 3 // 3 bases #define negativeStrand 0x0000800000000000 #define fortyEightBits 0xffffffffffff #define fortySixBits 0x3fffffffffff #define fortyBits 0xffffffffff #define high32bits 0xffffffff00000000 #define low32bits 0x00000000ffffffff #define high16bits 0xffff0000ffff0000 #define low16bits 0x0000ffff0000ffff @@ -127,46 +127,46 @@ #define pamMask 0x3f // 0x0000 0000 0000 0000 // 6348 4732 3116 15-0 // sequence word structure: // bits 5-0 - PAM sequence in 2bit encoding for 3 bases // bits 45-6 - 20 base sequence in 2bit encoding format // bit 47 - negative strand indication // considering using other bits during processing to mark // an item for no more consideration // sizeof(struct crispr): 32 struct crispr -/* one chromosome set of crisprs */ +/* one chromosome set of guides */ { struct crispr *next; /* Next in list. */ long long sequence; /* sequence value in 2bit format */ long long start; /* chromosome start 0-relative */ }; // sizeof(struct crisprList): 72 struct crisprList -/* all chromosome sets of crisprs */ +/* all chromosome sets of guides */ { struct crisprList *next; /* Next in list. */ char *chrom; /* chrom name */ int size; /* chrom size */ - struct crispr *chromCrisprs; /* all the crisprs on this chrom */ - long long crisprCount; /* number of crisprs on this chrom */ + struct crispr *chromCrisprs; /* all the guides on this chrom */ + long long crisprCount; /* number of guides on this chrom */ long long *sequence; /* array of the sequences */ long long *start; /* array of the starts */ int **offBy; /* offBy[5][n] */ float *mitSum; /* accumulating sum of MIT scores */ }; struct threadControl /* data passed to thread to control execution */ { int threadId; /* this thread Id */ int threadCount; /* total threads running */ struct crisprList *query; /* running query guides against */ struct crisprList *target; /* target guides */ }; @@ -529,99 +529,101 @@ */ fprintf(bedFH, "%s\t%lld\t%lld\t\t%ld\t%c\t%lld\t%lld\t%s\t%s\t%s\t%d,%d,%d,%d,%d\t0\n", list->chrom, list->start[c], list->start[c]+pamSize+guideSize, mitScore, strand, txStart, txEnd, color, kmerString, pamString, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); } ++totalOut; } } if (bedFH) carefulClose(&bedFH); timingMessage("countsOutput:", itemsOutput, "items output", startTime, "items/sec", "seconds/item"); } // static void countsOutput(struct crisprList *all, FILE *bedFH) static struct crisprList *scanSequence(char *inFile) -/* scan the given file, return list of crisprs */ +/* scan the given file, return list of guides */ { -verbose(1, "#\tscanning sequence file: %s\n", inFile); +verbose(1, "# scanning sequence file: %s\n", inFile); dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; long startTime = clock1000(); long long totalCrisprs = 0; -/* scanning all sequences, setting up crisprs on the listReturn */ +/* scanning all sequences, setting up guides on the listReturn */ while ((seq = dnaLoadNext(dl)) != NULL) { if (startsWithNoCase("chrUn", seq->name) || rStringIn("hap", seq->name) || rStringIn("_alt", seq->name) ) { verbose(4, "# skip chrom: %s\n", seq->name); continue; } long startTime = clock1000(); struct crisprList *oneList = generateKmers(seq); slAddHead(&listReturn, oneList); totalCrisprs += oneList->crisprCount; if (verboseLevel() > 3) - timingMessage(seq->name, oneList->crisprCount, "crisprs", startTime, - "crisprs/sec", "seconds/crispr"); + timingMessage(seq->name, oneList->crisprCount, "guides", startTime, + "guides/sec", "seconds/guide"); } -timingMessage("scanSequence", totalCrisprs, "total crisprs", startTime, - "crisprs/sec", "seconds/crispr"); +timingMessage("scanSequence", totalCrisprs, "total guides", startTime, + "guides/sec", "seconds/guide"); return listReturn; } /* static crisprList *scanSequence(char *inFile) */ static struct crisprList *rangeExtraction(struct crisprList **allReference) -/* given ranges in global rangesHash, construct new list of crisprs that +/* given ranges in global rangesHash, construct new list of guides that * have any type of overlap with the ranges, also extract those items from * the all list. Returns new list. */ { struct crisprList *all = *allReference; struct crisprList *listReturn = NULL; struct crisprList *list; int inputChromCount = slCount(all); long long returnListCrisprCount = 0; long long examinedCrisprCount = 0; struct crisprList *prevChromList = NULL; long startTime = clock1000(); struct crisprList *nextList = NULL; for (list = all; list; list = nextList) { nextList = list->next; // remember before perhaps lost long long beforeCrisprCount = list->crisprCount; examinedCrisprCount += list->crisprCount; struct binKeeper *bk = hashFindVal(rangesHash, list->chrom); struct crispr *newCrispr = NULL; if (bk != NULL) { struct crispr *prevCrispr = NULL; struct crispr *next = NULL; struct crispr *c; for (c = list->chromCrisprs; c; c = next) { - struct binElement *hitList = NULL; next = c->next; // remember before perhaps lost + if (endsAG == (c->sequence & 0xf)) + continue; // only check guides endsGG + struct binElement *hitList = NULL; // select any guide that is at least half contained in any range int midPoint = c->start + ((pamSize + guideSize) >> 1); // if (negativeStrand & c->sequence) // start += 2; // int end = midPoint + 1; hitList = binKeeperFind(bk, midPoint, midPoint + 1); if (hitList) { if (prevCrispr) // remove this one from the 'all' list prevCrispr->next = next; else list->chromCrisprs = next; // removing the first one c->next = NULL; // new item for new list slAddHead(&newCrispr, c); // constructing new list } @@ -639,47 +641,47 @@ newItem->chromCrisprs = newCrispr; newItem->crisprCount = slCount(newCrispr); returnListCrisprCount += newItem->crisprCount; slAddHead(&listReturn, newItem); if (NULL == list->chromCrisprs) // all have been removed for this chrom { if (prevChromList) // remove it from the chrom list prevChromList->next = nextList; else all = nextList; // removing the first one } else { list->crisprCount = slCount(list->chromCrisprs); long long removedCrisprs = beforeCrisprCount - list->crisprCount; - verbose(1, "# range scan chrom %s had %lld crisprs, removed %lld leaving %lld target\n", + verbose(1, "# range scan chrom %s had %lld guides, removed %lld leaving %lld target\n", list->chrom, beforeCrisprCount, removedCrisprs, list->crisprCount); } } prevChromList = list; } // for (list = all; list; list = list->next) verbose(1, "# range scanning %d chroms, return %d selected chroms, leaving %d chroms\n", inputChromCount, slCount(listReturn), slCount(all)); long long targetCrisprCount = examinedCrisprCount - returnListCrisprCount; -timingMessage("range scan", examinedCrisprCount, "examined crisprs", - startTime, "crisprs/sec", "seconds/crispr"); -timingMessage("range scan", targetCrisprCount, "remaining target crisprs", - startTime, "crisprs/sec", "seconds/crispr"); -timingMessage("range scan", returnListCrisprCount, "returned query crisprs", - startTime, "crisprs/sec", "seconds/crispr"); +timingMessage("range scan", examinedCrisprCount, "examined guides", + startTime, "guides/sec", "seconds/guide"); +timingMessage("range scan", targetCrisprCount, "remaining target guides", + startTime, "guides/sec", "seconds/guide"); +timingMessage("range scan", returnListCrisprCount, "returned query guides", + startTime, "guides/sec", "seconds/guide"); if (NULL == all) { allReference = NULL; // they have all been removed verbose(1, "# range scan, every single chrom has been removed\n"); } else if (*allReference != all) { verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all); *allReference = all; } return listReturn; } // static crisprList *rangeExtraction(crisprList *all) static float hitScoreM[20] = @@ -976,31 +978,31 @@ target->chrom, target->start[tIndex], negativeStrand & target->sequence[tIndex] ? '-' : '+', cfdInt, misMatch, bitsOn, mitScore, cfdScore, target->chrom, target->start[tIndex], negativeStrand & target->sequence[tIndex] ? '-' : '+'); } } } // static void recordOffTargets(struct crisprList *query, // struct crisprList *target, int bitsOn, long long qIndex, // long long tIndex) /* this queryVsTarget can be used by threads, appears to be safe */ static void queryVsTarget(struct crisprList *query, struct crisprList *target, int threadCount, int threadId) -/* run the query crisprs list against the target list in the array structures */ +/* run the query guides list against the target list in the array structures */ { struct crisprList *qList; long long totalCrisprsQuery = 0; long long totalCrisprsTarget = 0; long long totalCompares = 0; struct loopControl *control = NULL; AllocVar(control); long startTime = clock1000(); for (qList = query; qList; qList = qList->next) { totalCrisprsQuery += qList->crisprCount; if (threadCount > 1) { @@ -1037,64 +1039,64 @@ if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch); qList->offBy[bitsOn][qCount] += 1; // tList->offBy[bitsOn][tCount] += 1; not needed } } else - { /* no misMatch, identical crisprs */ + { /* no misMatch, identical guides */ qList->offBy[0][qCount] += 1; // tList->offBy[0][tCount] += 1; not needed } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) -timingMessage("queryVsTarget", totalCrisprsQuery, "query crisprs processed", - startTime, "crisprs/sec", "seconds/crispr"); -timingMessage("queryVsTarget", totalCrisprsTarget, "vs target crisprs", - startTime, "crisprs/sec", "seconds/crispr"); +timingMessage("queryVsTarget", totalCrisprsQuery, "query guides processed", + startTime, "guides/sec", "seconds/guide"); +timingMessage("queryVsTarget", totalCrisprsTarget, "vs target guides", + startTime, "guides/sec", "seconds/guide"); timingMessage("queryVsTarget", totalCompares, "total comparisons", startTime, "compares/sec", "seconds/compare"); } /* static struct crisprList *queryVsTarget(struct crisprList *query, struct crisprList *target) */ static void queryVsSelf(struct crisprList *all) /* run this 'all' list vs. itself avoiding self to self comparisons */ { struct crisprList *qList; long long totalCrisprsQuery = 0; long long totalCrisprsCompare = 0; long startTime = clock1000(); /* query runs through all chroms */ for (qList = all; qList; qList = qList->next) { long long qCount; totalCrisprsQuery += qList->crisprCount; - verbose(1, "# queryVsSelf %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); + verbose(1, "# queryVsSelf %lld query guides on chrom %s\n", qList->crisprCount, qList->chrom); for (qCount = 0; qCount < qList->crisprCount; ++qCount) { /* target starts on same chrom as query, and at next kmer after query for this first chrom */ long long tStart = qCount+1; struct crisprList *tList; for (tList = qList; tList; tList = tList->next) { long long tCount; totalCrisprsCompare += tList->crisprCount - tStart; for (tCount = tStart; tCount < tList->crisprCount; ++tCount) { /* the XOR determine differences in two sequences, the * shift right 6 removes the PAM sequence and * the 'fortyBits &' eliminates the negativeStrand bit @@ -1104,52 +1106,52 @@ if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) { recordOffTargets(qList, tList, bitsOn, qCount, tCount, misMatch); qList->offBy[bitsOn][qCount] += 1; tList->offBy[bitsOn][tCount] += 1; } } else - { /* no misMatch, identical crisprs */ + { /* no misMatch, identical guides */ qList->offBy[0][qCount] += 1; tList->offBy[0][tCount] += 1; } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) tStart = 0; /* following chroms run through all */ } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) -timingMessage("queryVsSelf", totalCrisprsQuery, "crisprs processed", - startTime, "crisprs/sec", "seconds/crispr"); +timingMessage("queryVsSelf", totalCrisprsQuery, "guides processed", + startTime, "guides/sec", "seconds/guide"); timingMessage("queryVsSelf", totalCrisprsCompare, "total comparisons", startTime, "compares/sec", "seconds/compare"); } /* static void queryVsSelf(struct crisprList *all) */ static struct crisprList *readKmers(char *fileIn) /* read in kmer list from 'fileIn', return list structure */ { errAbort("# XXX readKmers function not implemented\n"); -verbose(1, "# reading crisprs from: %s\n", fileIn); +verbose(1, "# reading guides from: %s\n", fileIn); struct crisprList *listReturn = NULL; struct lineFile *lf = lineFileOpen(fileIn, TRUE); char *row[10]; int wordCount = 0; long long crisprsInput = 0; long startTime = clock1000(); while (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row))) ) { if (3 != wordCount) errAbort("expecing three words at line %d in file %s, found: %d", lf->lineIx, fileIn, wordCount); struct crisprList *newItem = NULL; AllocVar(newItem); @@ -1167,69 +1169,68 @@ ++verifyCount; struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); oneCrispr->sequence = sqlLongLong(row[0]); oneCrispr->start = sqlLongLong(row[1]); slAddHead(&newItem->chromCrisprs, oneCrispr); } if (verifyCount != newItem->crisprCount) errAbort("expecting %lld kmer items at line %d in file %s, found: %lld", newItem->crisprCount, lf->lineIx, fileIn, verifyCount); crisprsInput += verifyCount; } lineFileClose(&lf); -timingMessage("readKmers", crisprsInput, "crisprs read in", startTime, - "crisprs/sec", "seconds/crispr"); +timingMessage("readKmers", crisprsInput, "guides read in", startTime, + "guides/sec", "seconds/guide"); return listReturn; } // static struct crisprList *readKmers(char *fileIn) -static void writeKmers(struct crisprList *all, char *fileOut) -/* write kmer list 'all' to 'fileOut' */ +static void writeGuides(struct crisprList *all, char *fileOut) +/* write guide list 'all' to 'fileOut' */ { -errAbort("# XXX writeKmers function not implemented\n"); FILE *fh = mustOpen(fileOut, "w"); struct crisprList *list; long long crisprsWritten = 0; char kmerString[33]; char pamString[33]; long startTime = clock1000(); slReverse(&all); for (list = all; list; list = list->next) { fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size); struct crispr *c; - slReverse(&list->chromCrisprs); +// slReverse(&list->chromCrisprs); for (c = list->chromCrisprs; c; c = c->next) { kmerValToString(kmerString, c->sequence, pamSize); kmerPAMString(pamString, c->sequence); fprintf(fh, "%s\t%s\t%lld\t%c\n", kmerString, pamString, c->start, negativeStrand & c->sequence ? '-' : '+'); ++crisprsWritten; } } carefulClose(&fh); -timingMessage("writeKmers", crisprsWritten, "crisprs written", startTime, - "crisprs/sec", "seconds/crispr"); +timingMessage("writeGuides", crisprsWritten, "guides written", startTime, + "guides/sec", "seconds/guide"); -} // static void writeKmers(struct crisprList *all, char *fileOut) +} // static void writeGuides(struct crisprList *all, char *fileOut) static void *threadFunction(void *id) /* thread entry */ { struct threadControl *tId = (struct threadControl *)id; queryVsTarget(tId->query, tId->target, tId->threadCount, tId->threadId); return NULL; } static void runThreads(int threadCount, struct crisprList *query, struct crisprList *target) { struct threadControl *threadIds = NULL; @@ -1259,50 +1260,52 @@ for (pt = 0; pt < threadCount; ++pt) pthread_join(threads[pt], NULL); } static void crisprKmers(char *sequence, FILE *bedFH) /* crisprKmers - find and annotate crispr sequences. */ { struct crisprList *queryGuides = NULL; struct crisprList *allGuides = NULL; if (loadKmers) allGuides = readKmers(loadKmers); else allGuides = scanSequence(sequence); -if (dumpKmers) - { - writeKmers(allGuides, dumpKmers); - return; - } +// if (dumpGuides) +// { +// writeGuides(allGuides, dumpGuides); +// return; +// } long long vmPeak = currentVmPeak(); verbose(1, "# vmPeak after scanSequence: %lld kB\n", vmPeak); -/* processing those crisprs */ +/* processing those guides */ if (verboseLevel() > 1) { if (offTargets) offFile = mustOpen(offTargets, "w"); /* if ranges have been specified, construct queryList of kmers to measure */ if (rangesHash) { /* result here is two exclusive sets: query, and allGuides - * the query crisprs have been extracted from the allGuides */ + * the query guides have been extracted from the allGuides */ queryGuides = rangeExtraction(& allGuides); + if (dumpGuides) + writeGuides(queryGuides, dumpGuides); } if (queryGuides) copyToArray(queryGuides); if (allGuides) copyToArray(allGuides); vmPeak = currentVmPeak(); verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak); /* larger example: 62646196 kB */ if (threads > 1) { int gB = vmPeak >> 20; /* convert kB to Gb */ threadCount = threads; verbose(1, "# at %d Gb (%lld kB), running %d threads\n", gB, vmPeak, threadCount); } if (queryGuides) // when range selected some query sequences @@ -1326,31 +1329,31 @@ carefulClose(&offFile); } } // static void crisprKmers(char *sequence, FILE *bedFH) int main(int argc, char *argv[]) /* Process command line, initialize translation arrays and call the process */ { optionInit(&argc, argv, options); if (argc != 2) usage(); verbose(0, "# running verboseLevel: %d\n", verboseLevel()); bedFileOut = optionVal("bed", bedFileOut); -dumpKmers = optionVal("dumpKmers", dumpKmers); +dumpGuides = optionVal("dumpGuides", dumpGuides); loadKmers = optionVal("loadKmers", loadKmers); offTargets = optionVal("offTargets", offTargets); threads = optionInt("threads", threads); if (threads < 0) errAbort("specified threads is less than 0: %d\n", threads); if (threads > 0) verbose(1, "# requesting threads: %d\n", threads); ranges = optionVal("ranges", ranges); if (ranges) rangesHash = readRanges(ranges); FILE *bedFH = NULL; if (bedFileOut) { bedFH = mustOpen(bedFileOut, "w");