a6402cd35ac18b36a7d4ee1a6e41df145a519e98 hiram Tue Jul 11 16:03:50 2017 -0700 prepare to thread this process refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 4947dc4..beed0c9 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -1,71 +1,73 @@ /* crisprKmers - find and annotate crispr sequences. */ + +/* Copyright (C) 2017 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + #include #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnautil.h" #include "dnaseq.h" #include "dnaLoad.h" #include "portable.h" #include "basicBed.h" #include "sqlNum.h" #include "binRange.h" #include "obscure.h" #include "memalloc.h" void usage() /* Explain usage and exit. */ { errAbort( - "crisprKmers - annotate crispr sequences\n" + "crisprKmers - annotate crispr guide sequences\n" "usage:\n" " crisprKmers \n" "options:\n" " where is a .2bit file or .fa fasta sequence\n" - " -verbose=N - control processing steps with level of verbose:\n" - " default N=1 - only find crisprs, set up listings\n" - " N=2 - empty process crispr list into new list\n" - " N=3 - only count identicals during processing\n" - " N=4 - attempt measure all off targets and\n" - " print out all crisprs as bed format\n" - " -bed= - output kmers to given bed9+ file\n" - " -offTargets= - output offTargets to given file\n" - " -ranges= - use specified bed3 file to limit which crisprs are\n" + " -verbose=N - for debugging control processing steps with level of verbose:\n" + " -bed= - output results to given bed9+ file\n" + " -offTargets= - output off target data to given file\n" + " -ranges= - use specified bed3 file to limit which guides are\n" " - measured, only those with any overlap to these bed items.\n" - " -dumpKmers= - after scan of sequence, output kmers to file\n" + " -memLimit=N - N number of gigabytes for each thread, default: 8\n" + " when memory size goes beyond N gB program will thread\n" + " -dumpKmers= - NOT VALID after scan of sequence, output kmers to file\n" " - process will exit after this, use -loadKmers to continue\n" - " -loadKmers= - load kmers from previous scan of sequence from -dumpKmers\n" - " NOTE: It is faster to simply scan the sequence to get the system ready\n" - " to go. Reading in the kmer file takes longer than scanning." + " -loadKmers= - NOT VALID load kmers from previous scan of sequence from -dumpKmers" ); } static char *bedFileOut = NULL; /* set with -bed= argument, kmer output */ static char *offTargets = NULL; /* write offTargets to */ static FILE *offFile = NULL; /* file handle to write off targets */ static char *ranges = NULL; /* use ranges to limit scanning */ static struct hash *rangesHash = NULL; /* ranges into hash + binkeeper */ static char *dumpKmers = NULL; /* file name to write out kmers from scan */ static char *loadKmers = NULL; /* file name to read in kmers from previous scan */ +static int memLimit = 8; /* gB limit before going into thread mode */ +static int threadCount = 1; /* will be adjusted depending upon vmPeak */ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"offTargets", OPTION_STRING}, {"ranges", OPTION_STRING}, + {"memLimit", OPTION_INT}, {"dumpKmers", OPTION_STRING}, {"loadKmers", OPTION_STRING}, {NULL, 0}, }; // sizeof(struct crispr): 32 struct crispr /* one chromosome set of crisprs */ { struct crispr *next; /* Next in list. */ long long sequence; /* sequence value in 2bit format */ long long start; /* chromosome start 0-relative */ char strand; /* strand: + or - */ }; @@ -133,30 +135,32 @@ perSecond = 1000.0 * count / ms; inverse = 1.0 / perSecond; } verbose(1, "# %s: %lld %s @ %ld ms -> %.2f %s == %g %s\n", prefix, count, message, ms, perSecond, units, inverse, invUnits); } static struct hash *readRanges(char *bedFile) /* Read bed and return it as a hash keyed by chromName * with binKeeper values. (from: src/hg/bedIntersect/bedIntersec.c) */ { struct hash *hash = newHash(0); /* key is chromName, value is binkeeper data */ struct lineFile *lf = lineFileOpen(bedFile, TRUE); char *row[3]; +verbose(1, "# using ranges file: %s\n", bedFile); + while (lineFileNextRow(lf, row, 3)) { struct binKeeper *bk; struct bed3 *item; struct hashEl *hel = hashLookup(hash, row[0]); if (hel == NULL) { bk = binKeeperNew(0, 1024*1024*1024); hel = hashAdd(hash, row[0], bk); } bk = hel->val; AllocVar (item); item->chrom = hel->name; item->chromStart = lineFileNeedNum(lf, row, 1); item->chromEnd = lineFileNeedNum(lf, row, 2); @@ -173,31 +177,31 @@ { const struct crispr *a = *((struct crispr **)va); const struct crispr *b = *((struct crispr **)vb); long long aVal = a->start; long long bVal = b->start; if (aVal < bVal) return -1; else if (aVal > bVal) return 1; else return 0; } #endif /* these two kmerVal to strings routines could be reduced to one and they - * could use lookup tables to run faster + * could use lookup tables to run faster. This is *definately* not reentrant ! */ static char *kmerPAMString(long long val) /* return the ASCII string for last three bases in then binary sequence value */ { static char pamString[32]; long long twoBitMask = 0x30; int shiftCount = 4; int baseCount = 0; while (twoBitMask) { int base = (val & twoBitMask) >> shiftCount; pamString[baseCount++] = bases[base]; twoBitMask >>= 2; shiftCount -= 2; @@ -851,86 +855,100 @@ { fprintf(fh, "%lld\t%lld\t%c\n", c->sequence, c->start, c->strand); ++crisprsWritten; } } carefulClose(&fh); long elapsedMs = clock1000() - startMs; timingMessage("writeKmers", crisprsWritten, "crisprs written", elapsedMs, "crisprs/sec", "seconds/crispr"); } // static void writeKmers(struct crisprList *all, char *fileOut) static void crisprKmers(char *sequence) /* crisprKmers - find and annotate crispr sequences. */ { -struct crisprList *queryCrisprs = NULL; +struct crisprList *queryGuides = NULL; // struct crisprList *countedCrisprs = NULL; -struct crisprList *allCrisprs = NULL; +struct crisprList *allGuides = NULL; if (loadKmers) - allCrisprs = readKmers(loadKmers); + allGuides = readKmers(loadKmers); else - allCrisprs = scanSequence(sequence); + allGuides = scanSequence(sequence); if (dumpKmers) { - writeKmers(allCrisprs, dumpKmers); + writeKmers(allGuides, dumpKmers); return; } +long long vmPeak = currentVmPeak(); +verbose(1, "# vmPeak after scanSequence: %lld kB\n", vmPeak); + /* processing those crisprs */ if (verboseLevel() > 1) { if (offTargets) offFile = mustOpen(offTargets, "w"); /* if ranges have been specified, construct queryList of kmers to measure */ if (rangesHash) { - /* result here is two exclusive sets: query, and allCrisprs - * the query crisprs have been extracted from the allCrisprs */ - queryCrisprs = rangeExtraction(& allCrisprs); - copyToArray(queryCrisprs); - if (allCrisprs) // if there are any left on the all list + /* result here is two exclusive sets: query, and allGuides + * the query crisprs have been extracted from the allGuides */ + queryGuides = rangeExtraction(& allGuides); + } + if (queryGuides) + copyToArray(queryGuides); + if (allGuides) + copyToArray(allGuides); + vmPeak = currentVmPeak(); + verbose(1, "# vmPeak after copyToArray: %lld kB\n", vmPeak); + /* larger example: 62646196 kB */ + if ((vmPeak >> 20) > 8) // the >> 20 converts kB to gB { - copyToArray(allCrisprs); - /* first run up query vs. all */ - queryVsAll(queryCrisprs, allCrisprs); + threadCount = 1 + ((vmPeak >> 20) / 8); + verbose(1, "# over 8 Gb at %lld kB, threadCount: %d\n", vmPeak, threadCount); } + if (queryGuides) // when range selected some query sequences + { + if (allGuides) // if there are any left on the all list + queryVsAll(queryGuides, allGuides); /* then run up the query vs. itself avoiding self vs. self */ - allVsAll(queryCrisprs); - countsOutput(queryCrisprs); + allVsAll(queryGuides); + countsOutput(queryGuides); } else { - copyToArray(allCrisprs); - /* run up all vs. all avoiding self vs. self */ - allVsAll(allCrisprs); - countsOutput(allCrisprs); + allVsAll(allGuides); /* run up all vs. all avoiding self vs. self */ + countsOutput(allGuides); } + carefulClose(&offFile); } } // static void crisprKmers(char *sequence) int main(int argc, char *argv[]) /* Process command line, initialize translation arrays and call the process */ { optionInit(&argc, argv, options); if (argc != 2) usage(); verbose(0, "# running verboseLevel: %d\n", verboseLevel()); bedFileOut = optionVal("bed", bedFileOut); dumpKmers = optionVal("dumpKmers", dumpKmers); loadKmers = optionVal("loadKmers", loadKmers); offTargets = optionVal("offTargets", offTargets); +memLimit = optionInt("memLimit", memLimit); ranges = optionVal("ranges", ranges); if (ranges) rangesHash = readRanges(ranges); initOrderedNtVal(); /* set up orderedNtVal[] */ crisprKmers(argv[1]); + if (verboseLevel() > 1) printVmPeak(); return 0; }