e20e93ca2a36c232d2fe3b2d918d03674f89d5d4 hiram Fri Jul 7 08:56:14 2017 -0700 the linked lists are too slow, maybe try arrays refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 8d2d122..a1e0304 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -1,83 +1,93 @@ /* crisprKmers - find and annotate crispr sequences. */ #include <popcntintrin.h> #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnautil.h" #include "dnaseq.h" #include "dnaLoad.h" #include "portable.h" #include "basicBed.h" #include "sqlNum.h" #include "binRange.h" +#include "obscure.h" void usage() /* Explain usage and exit. */ { errAbort( "crisprKmers - annotate crispr sequences\n" "usage:\n" " crisprKmers <sequence>\n" "options:\n" " where <sequence> is a .2bit file or .fa fasta sequence\n" " -verbose=N - control processing steps with level of verbose:\n" " default N=1 - only find crisprs, set up listings\n" " N=2 - empty process crispr list into new list\n" " N=3 - only count identicals during processing\n" " N=4 - attempt measure all off targets and\n" " print out all crisprs as bed format\n" " -bed=<file> - output kmers to given bed9+ file\n" " -ranges=<file> - use specified bed3 file to limit which crisprs are\n" - " - measured, only those within bed items." + " - measured, only those with any overlap to these bed items.\n" + " -dumpKmers=<file> - after scan of sequence, output kmers to file\n" + " - process will exit after this, use -loadKmers to continue\n" + " -loadKmers=<file> - load kmers from previous scan of sequence from -dumpKmers\n" + " NOTE: It is faster to simply scan the sequence to get the system ready\n" + " to go. Reading in the kmer file takes longer than scanning." ); } static char *bedFileOut = NULL; /* set with -bed=<file> argument */ static char *ranges = NULL; /* set with -ranges=<file> argument */ static struct hash *rangesHash = NULL; /* ranges into hash + binkeeper */ +static char *dumpKmers = NULL; /* file name to write out kmers from scan */ +static char *loadKmers = NULL; /* file name to read in kmers from previous scan */ /* Command line validation table. */ static struct optionSpec options[] = { {"bed", OPTION_STRING}, {"ranges", OPTION_STRING}, + {"dumpKmers", OPTION_STRING}, + {"loadKmers", OPTION_STRING}, {NULL, 0}, }; struct crispr /* one chromosome set of crisprs */ { struct crispr *next; /* Next in list. */ long long sequence; /* sequence value in 2bit format */ - unsigned long long start; /* chromosome start 0-relative */ + long long start; /* chromosome start 0-relative */ char strand; /* strand: + or - */ int offBy0; /* counting number of off targets, 0 mismatch */ int offBy1; /* counting number of off targets, 1 mismatch */ int offBy2; /* counting number of off targets, 2 mismatch */ int offBy3; /* counting number of off targets, 3 mismatch */ int offBy4; /* counting number of off targets, 4 mismatch */ }; struct crisprList /* all chromosome sets of crisprs */ { struct crisprList *next; /* Next in list. */ char *chrom; /* chrom name */ int size; /* chrom size */ struct crispr *chromCrisprs; /* all the crisprs on this chrom */ - unsigned long long crisprCount; /* number of crisprs on this chrom */ + long long crisprCount; /* number of crisprs on this chrom */ }; #define A_BASE 0 #define C_BASE 1 #define G_BASE 2 #define T_BASE 3 #define U_BASE 3 static int orderedNtVal[256]; /* values in alpha order: ACGT 00 01 10 11 */ /* for easier sorting and complementing */ static char bases[4]; /* for binary to ascii conversion */ #define fortySixBits 0x3fffffffffff #define fortyEixhtBits 0xffffffffffff #define high32bits 0xffffffff00000000 #define low32bits 0x00000000ffffffff @@ -95,30 +105,41 @@ { int i; for (i=0; i<ArraySize(orderedNtVal); i++) orderedNtVal[i] = -1; orderedNtVal['A'] = orderedNtVal['a'] = A_BASE; orderedNtVal['C'] = orderedNtVal['c'] = C_BASE; orderedNtVal['G'] = orderedNtVal['g'] = G_BASE; orderedNtVal['T'] = orderedNtVal['t'] = T_BASE; orderedNtVal['U'] = orderedNtVal['u'] = T_BASE; bases[A_BASE] = 'A'; bases[C_BASE] = 'C'; bases[G_BASE] = 'G'; bases[T_BASE] = 'T'; } // static void initOrderedNtVal() +static void timingMessage(char *prefix, long long count, char *message, + long ms, char *units) +{ +double perSecond = 0.0; +if (ms > 0) + perSecond = 1000.0 * count / ms; + +verbose(1, "# %s: %lld %s @ %ld ms -> %.2f %s\n", prefix, count, message, ms, perSecond, units); +} + + static struct hash *readRanges(char *bedFile) /* Read bed and return it as a hash keyed by chromName * with binKeeper values. (from: src/hg/bedIntersect/bedIntersec.c) */ { struct hash *hash = newHash(0); /* key is chromName, value is binkeeper data */ struct lineFile *lf = lineFileOpen(bedFile, TRUE); char *row[3]; while (lineFileNextRow(lf, row, 3)) { struct binKeeper *bk; struct bed3 *item; struct hashEl *hel = hashLookup(hash, row[0]); if (hel == NULL) { @@ -209,41 +230,41 @@ v = ((v & high8bits) >> 8) | ((v & low8bits) << 8); v = ((v & high4bits) >> 4) | ((v & low4bits) << 4); v = ((v & high2bits) >> 2) | ((v & low2bits) << 2); return v; } // static long long revComp(long long val) static struct crisprList *generateKmers(struct dnaSeq *seq) { struct crispr *crisprSet = NULL; struct crisprList *returnList = NULL; AllocVar(returnList); returnList->chrom = cloneString(seq->name); returnList->size = seq->size; int i; DNA *dna; -unsigned long long chromPosition = 0; -unsigned long long startGap = 0; -unsigned long long gapCount = 0; +long long chromPosition = 0; +long long startGap = 0; +long long gapCount = 0; int kmerLength = 0; long long kmerVal = 0; long long endsGG = (G_BASE << 2) | G_BASE; long long beginsCC = (long long)((C_BASE << 2) | C_BASE) << 42; long long reverseMask = (long long)0xf << 42; verbose(4, "# endsGG: %032llx\n", endsGG); verbose(4, "# beginsCC: %032llx\n", beginsCC); -verbose(4, "# 46 bits: %032llx\n", (unsigned long long) fortySixBits); +verbose(4, "# 46 bits: %032llx\n", (long long) fortySixBits); dna=seq->dna; for (i=0; i < seq->size; ++i) { int val = orderedNtVal[(int)dna[i]]; if (val >= 0) { kmerVal = fortySixBits & ((kmerVal << 2) | val); ++kmerLength; if (kmerLength > 22) { if (endsGG == (kmerVal & 0xf)) /* check positive strand */ { struct crispr *oneCrispr = NULL; AllocVar(oneCrispr); @@ -262,105 +283,104 @@ slAddHead(&crisprSet, oneCrispr); } } } // if (val >= 0) else { ++gapCount; startGap = chromPosition; /* skip all N's == any value not = 0,1,2,3 */ while ( ((i+1) < seq->size) && (orderedNtVal[(int)dna[i+1]] < 0)) { ++chromPosition; ++i; } if (startGap != chromPosition) - verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+"); + verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+"); else - verbose(4, "#GAP %s\t%llu\t%llu\t%llu\t%llu\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+"); + verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+"); kmerLength = 0; /* reset, start over */ kmerVal = 0; } // else if (val >= 0) ++chromPosition; } // for (i=0; i < seq->size; ++i) // slReverse(&crisprSet); // save time, order not important at this time returnList->chromCrisprs = crisprSet; returnList->crisprCount = slCount(crisprSet); return returnList; } // static struct crisprList *generateKmers(struct dnaSeq *seq) static void showCounts(struct crisprList *all) /* everything has been scanned and counted, print out all the data */ { FILE *bedFH = NULL; if (bedFileOut) bedFH = mustOpen(bedFileOut, "w"); verbose(1, "#\tprint out all data\n"); struct crisprList *list; -unsigned long long totalChecked = 0; +long long totalChecked = 0; verbose(1, "# %d number of chromosomes to display\n", slCount(all)); for (list = all; list; list = list->next) { struct crispr *c = NULL; struct crispr *next = NULL; - verbose(1, "# crisprs count %llu on chrom %s\n", list->crisprCount, list->chrom); + verbose(1, "# crisprs count %lld on chrom %s\n", list->crisprCount, list->chrom); for (c = list->chromCrisprs; c; c = next) { int negativeOffset = 0; if (c->strand == '-') negativeOffset = 3; - unsigned long long txStart = c->start + negativeOffset; - unsigned long long txEnd = c->start+20 + negativeOffset; + long long txStart = c->start + negativeOffset; + long long txEnd = c->start+20 + negativeOffset; if (bedFH) - fprintf(bedFH, "%s\t%llu\t%llu\t%s\t%d\t%c\t%llu\t%llu\t%s\t%d\t%d\t%d\t%d\n", list->chrom, c->start, c->start+23, kmerValToString(c, 3), c->offBy0, c->strand, txStart, txEnd, kmerPAMString(c), c->offBy1, c->offBy2, c->offBy3, c->offBy4); - - verbose(4, "%s\t%llu\t%llu\t%s\t%d\t%c\t%llu\t%llu\t%d\t%d\t%d\t%d\n", list->chrom, c->start, c->start+23, kmerValToString(c, 3), c->offBy0, c->strand, txStart, txEnd, c->offBy1, c->offBy2, c->offBy3, c->offBy4); + fprintf(bedFH, "%s\t%lld\t%lld\t%s\t%d\t%c\t%lld\t%lld\t%s\t%d\t%d\t%d\t%d\n", list->chrom, c->start, c->start+23, kmerValToString(c, 3), c->offBy0, c->strand, txStart, txEnd, kmerPAMString(c), c->offBy1, c->offBy2, c->offBy3, c->offBy4); if (c->offBy0) - verbose(3, "# identical: %d %s:%llu %c\t%s\n", c->offBy0, list->chrom, c->start, c->strand, kmerValToString(c, 3)); + verbose(3, "# identical: %d %s:%lld %c\t%s\n", c->offBy0, list->chrom, c->start, c->strand, kmerValToString(c, 3)); next = c->next; ++totalChecked; } } -verbose(1, "# total crisprs displayed: %llu\n", totalChecked); +verbose(1, "# total crisprs displayed: %lld\n", totalChecked); +if (bedFH) + carefulClose(&bedFH); } // static void showCounts(struct crisprList *all) -static unsigned long long countOffTargets(char *chrom, struct crispr *c, struct crisprList *all) +static long long countOffTargets(char *chrom, struct crispr *c, struct crisprList *all) /* given one crispr c, scan against all others to find off targets */ { struct crisprList *listPtr; -unsigned long long totalCompares = 0; +long long totalCompares = 0; for (listPtr = all; listPtr; listPtr = listPtr->next) { struct crispr *crisprPtr = NULL; struct crispr *next = NULL; for (crisprPtr = listPtr->chromCrisprs; crisprPtr; crisprPtr = next) { /* the XOR will determine differences in two sequences * the shift right 6 removes the PAM sequence */ - unsigned long long misMatch = (c->sequence ^ crisprPtr->sequence) >> 6; + long long misMatch = (c->sequence ^ crisprPtr->sequence) >> 6; if (! misMatch) /* no misMatch, identical crisprs */ { c->offBy0 += 1; crisprPtr->offBy0 += 1; -// verbose(4, "# identical: %d %s:%llu %c == %s:%llu %c\t%s == %s\n", c->offBy0, chrom, c->start, c->strand, listPtr->chrom, crisprPtr->start, crisprPtr->strand, kmerValToString(c, 0), kmerValToString(crisprPtr, 0)); } else { - if (verboseLevel() > 3) + if (verboseLevel() > 2) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch > 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); switch(bitsOn) { case 1: c->offBy1 += 1; crisprPtr->offBy1 += 1; break; case 2: c->offBy2 += 1; crisprPtr->offBy2 += 1; @@ -369,83 +389,83 @@ c->offBy3 += 1; crisprPtr->offBy3 += 1; break; case 4: c->offBy4 += 1; crisprPtr->offBy4 += 1; break; default: break; } } } ++totalCompares; next = crisprPtr->next; } } -// verbose(3,"#\ttotal comparisons %llu\n", totalCompares); return totalCompares; } // static void countOffTargets( . . . ) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of crisprs */ { verbose(1, "#\tscanning sequence file: %s\n", inFile); dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; -double perSecond = 0.0; long elapsedMs = 0; long scanStart = clock1000(); -unsigned long long totalCrisprs = 0; +long long totalCrisprs = 0; /* scanning all sequences, setting up crisprs on the listReturn */ while ((seq = dnaLoadNext(dl)) != NULL) { + if (startsWithNoCase("chrUn", seq->name) || + rStringIn("hap", seq->name) || rStringIn("_alt", seq->name) ) + { + verbose(1, "# skip chrom: %s\n", seq->name); + continue; + } + long startTime = clock1000(); struct crisprList *oneList = generateKmers(seq); slAddHead(&listReturn, oneList); - elapsedMs = clock1000() - startTime; totalCrisprs += oneList->crisprCount; - perSecond = 0.0; - if (elapsedMs > 0) - perSecond = 1000.0 * oneList->crisprCount / elapsedMs; - verbose(1, "# %s, %d bases, %llu crisprs @ %ld ms -> %.2f crisprs/sec\n", - seq->name, seq->size, oneList->crisprCount, elapsedMs, perSecond); + elapsedMs = clock1000() - startTime; + timingMessage(seq->name, oneList->crisprCount, "crisprs", elapsedMs, "crisprs/sec"); } elapsedMs = clock1000() - scanStart; -perSecond = 0.0; -if (elapsedMs > 0) - perSecond = 1000.0 * totalCrisprs / elapsedMs; -verbose(1, "# %llu total crisprs @ %ld ms -> %.2f crisprs/sec\n", totalCrisprs, elapsedMs, perSecond); - +timingMessage("scanSequence", totalCrisprs, "total crisprs", elapsedMs, "crisprs/sec"); return listReturn; } /* static crisprList *scanSequence(char *inFile) */ static struct crisprList *rangeExtraction(struct crisprList *all) /* given ranges in global rangesHash, construct new list of crisprs that * have any type of overlap with the ranges, also extract those items from * the all list. Returns new list. */ { struct crisprList *listReturn = NULL; struct crisprList *listPtr = NULL; int inputChromCount = slCount(all); -unsigned long long returnListCrisprCount = 0; -unsigned long long examinedCrisprCount = 0; +long long returnListCrisprCount = 0; +long long examinedCrisprCount = 0; + +long elapsedMs = 0; +long scanStart = clock1000(); for (listPtr = all; listPtr; listPtr = listPtr->next) { struct crispr *c = NULL; struct binKeeper *bk = hashFindVal(rangesHash, listPtr->chrom); struct crispr *newCrispr = NULL; if (bk != NULL) { struct crispr *prevCrispr = NULL; struct crispr *next = NULL; for (c = listPtr->chromCrisprs; c; c = next) { ++examinedCrisprCount; struct binElement *hitList = NULL; next = c->next; // remember before perhaps lost @@ -468,193 +488,303 @@ slFreeList(&hitList); } } if (newCrispr) { struct crisprList *newItem = NULL; AllocVar(newItem); newItem->chrom = cloneString(listPtr->chrom); newItem->size = listPtr->size; newItem->chromCrisprs = newCrispr; newItem->crisprCount = slCount(newCrispr); slAddHead(&listReturn, newItem); } } // for (listPtr = all; listPtr; listPtr = listPtr->next) -verbose(1, "# range limits, input %d chromosomes, return %d chromosomes\n", +elapsedMs = clock1000() - scanStart; +verbose(1, "# range scan, input %d chromosomes, return %d chromosomes\n", inputChromCount, slCount(listReturn)); -verbose(1, "# range limits, input %llu crisprs, return %llu crisprs\n", - examinedCrisprCount, returnListCrisprCount); +timingMessage("range scan", examinedCrisprCount, "examined crisprs", elapsedMs, "crisprs/sec"); +timingMessage("range scan", returnListCrisprCount, "returned crisprs", elapsedMs, "crisprs/sec"); return listReturn; } // static crisprList *rangeExtraction(crisprList *all) static struct crisprList *queryVsAll(struct crisprList *query, struct crisprList *target) /* run the query crisprs list against the target list */ { struct crisprList *listReturn = NULL; -unsigned long long totalCrisprsQuery = 0; -unsigned long long totalCompares = 0; +long long totalCrisprsQuery = 0; +long long totalCompares = 0; int targetChrCount = slCount(target); int queryChrCount = slCount(query); verbose(1, "# queryVsAll: target %d chromosomes vs. query %d chromosomes\n", targetChrCount, queryChrCount); struct crisprList *listPtr = NULL; long processStart = clock1000(); long elapsedMs = 0; -double perSecond = 0.0; for (listPtr = query; listPtr; listPtr = listPtr->next) { struct crispr *c = NULL; totalCrisprsQuery += listPtr->crisprCount; - verbose(1, "# queryVsAll: crispr count: %llu on %s\n", listPtr->crisprCount, listPtr->chrom); - int crisprsDone = 0; + verbose(1, "# queryVsAll: crispr count: %lld on %s\n", listPtr->crisprCount, listPtr->chrom); + long long crisprsDone = 0; for (c = listPtr->chromCrisprs; c; c = c->next) { - verbose(4, "%s\t%s\t%llu\t%c\n", kmerValToString(c, 0), listPtr->chrom, c->start, c->strand); + verbose(4, "%s\t%s\t%lld\t%c\n", kmerValToString(c, 0), listPtr->chrom, c->start, c->strand); totalCompares += countOffTargets(listPtr->chrom, c, target); ++crisprsDone; + if (crisprsDone < 9) + { elapsedMs = clock1000() - processStart; -perSecond = 0.0; -if (elapsedMs > 0) -perSecond = 1000.0 * crisprsDone / elapsedMs; -verbose(1, "# queryVsAll: processed %d crisprs @ %ld ms -> %.2f crisprs/sec\n", crisprsDone, elapsedMs, perSecond); -verbose(1, "# %d %s:%llu %c %s, offBy0: %d\n", crisprsDone, listPtr->chrom, c->start, c->strand, kmerValToString(c, 0), c->offBy0); -perSecond = 0.0; -if (elapsedMs > 0) -perSecond = 1000.0 * totalCompares / elapsedMs; -verbose(1, "# queryVsAll: processed %d crisprs total compares %llu @ %ld ms -> %.2f crisprs/sec\n", crisprsDone, totalCompares, elapsedMs, perSecond); + timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec"); + } } } // for (listPtr = target; listPtr; listPtr = nextChr) elapsedMs = clock1000() - processStart; -perSecond = 0.0; -if (elapsedMs > 0) - perSecond = 1000.0 * totalCrisprsQuery / elapsedMs; -verbose(1, "# queryVsAll: %llu total crisprs processed @ %ld ms -> %.2f crisprs/sec\n", totalCrisprsQuery, elapsedMs, perSecond); -perSecond = 0.0; -if (elapsedMs > 0) - perSecond = 1000.0 * totalCompares / elapsedMs; -verbose(1, "# queryVsAll: %llu total comparisons @ %ld ms -> %.2f crisprs/sec\n", totalCompares, elapsedMs, perSecond); +timingMessage("queryVsAll", totalCrisprsQuery, "crisprs processed", elapsedMs, "crisprs/sec"); +timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec"); return listReturn; } static struct crisprList *allVsAll(struct crisprList *all, struct crisprList *addTo) /* run the all list against itself, add to 'addTo' if given */ { struct crisprList *listReturn = NULL; if (addTo) listReturn = addTo; -unsigned long long totalCrisprsIn = 0; -unsigned long long totalCrisprsOut = 0; -unsigned long long totalCompares = 0; +long long totalCrisprsIn = 0; +long long totalCrisprsOut = 0; +long long totalCompares = 0; int chrCount = slCount(all); verbose(1, "# allVsAll: crispr list, scanning %d chromosomes, now processing . . .\n", chrCount); struct crisprList *listPtr = NULL; struct crisprList *nextChr = NULL; long processStart = clock1000(); double perSecond = 0.0; long elapsedMs = 0; for (listPtr = all; listPtr; listPtr = nextChr) { struct crispr *newCrispr = NULL; struct crispr *c = NULL; totalCrisprsIn += listPtr->crisprCount; // to verify same in and out - verbose(1, "# allVsAll: crispr count: %llu on %s\n", listPtr->crisprCount, listPtr->chrom); + verbose(1, "# allVsAll: crispr count: %lld on %s\n", listPtr->crisprCount, listPtr->chrom); struct crispr *next; int crisprsDone = 0; for (c = listPtr->chromCrisprs; c; c = next) { - verbose(4, "%s\t%s\t%llu\t%c\n", kmerValToString(c, 0), listPtr->chrom, c->start, c->strand); + verbose(4, "%s\t%s\t%lld\t%c\n", kmerValToString(c, 0), listPtr->chrom, c->start, c->strand); next = c->next; // remember before lost c->next = NULL; // taking this one out of list listPtr->chromCrisprs = next; // this first item removed from list if ((verboseLevel() > 2) && next) totalCompares += countOffTargets(listPtr->chrom, c, all); slAddHead(&newCrispr, c); // constructing new list ++crisprsDone; elapsedMs = clock1000() - processStart; perSecond = 0.0; if (elapsedMs > 0) perSecond = 1000.0 * crisprsDone / elapsedMs; verbose(1, "# allVsAll: processed %d crisprs @ %ld ms -> %.2f crisprs/sec\n", crisprsDone, elapsedMs, perSecond); -verbose(1, "# %d %s:%llu %c %s, offBy0: %d\n", crisprsDone, listPtr->chrom, c->start, c->strand, kmerValToString(c, 0), c->offBy0); +verbose(1, "# %d %s:%lld %c %s, offBy0: %d\n", crisprsDone, listPtr->chrom, c->start, c->strand, kmerValToString(c, 0), c->offBy0); perSecond = 0.0; if (elapsedMs > 0) perSecond = 1000.0 * totalCompares / elapsedMs; -verbose(1, "# allVsAll: processed %d crisprs total compares %llu @ %ld ms -> %.2f crisprs/sec\n", crisprsDone, totalCompares, elapsedMs, perSecond); +verbose(1, "# allVsAll: processed %d crisprs total compares %lld @ %ld ms -> %.2f crisprs/sec\n", crisprsDone, totalCompares, elapsedMs, perSecond); } nextChr = listPtr->next; // remember before lost listPtr->next = NULL; // taking out of list listPtr->chromCrisprs = newCrispr; // the new crispr list listPtr->crisprCount = slCount(newCrispr); // the new crispr list totalCrisprsOut += listPtr->crisprCount; // to verify correct all = nextChr; // removes this one from list slAddHead(&listReturn, listPtr); } // for (listPtr = all; listPtr; listPtr = nextChr) if (slCount(listReturn) != chrCount) verbose(1, "#\tERROR: transferred crispr list chrom count %d != beginning count %d\n", slCount(listReturn), chrCount); if (totalCrisprsIn != totalCrisprsOut) - verbose(1, "#\tERROR: initial crispr list count %llu != output count %llu\n", totalCrisprsIn, totalCrisprsOut); + verbose(1, "#\tERROR: initial crispr list count %lld != output count %lld\n", totalCrisprsIn, totalCrisprsOut); elapsedMs = clock1000() - processStart; perSecond = 0.0; if (elapsedMs > 0) perSecond = 1000.0 * totalCrisprsIn / elapsedMs; -verbose(1, "# %llu total crisprs processed @ %ld ms -> %.2f crisprs/sec\n", totalCrisprsIn, elapsedMs, perSecond); +verbose(1, "# %lld total crisprs processed @ %ld ms -> %.2f crisprs/sec\n", totalCrisprsIn, elapsedMs, perSecond); perSecond = 0.0; if (elapsedMs > 0) perSecond = 1000.0 * totalCompares / elapsedMs; -verbose(1, "# %llu total comparisons @ %ld ms -> %.2f crisprs/sec\n", totalCompares, elapsedMs, perSecond); +verbose(1, "# %lld total comparisons @ %ld ms -> %.2f crisprs/sec\n", totalCompares, elapsedMs, perSecond); return listReturn; +} // static struct crisprList *allVsAll(struct crisprList *all, + // struct crisprList *addTo) + +static struct crisprList *readKmers(char *fileIn) +/* read in kmer list from 'fileIn', return list structure */ +{ +verbose(1, "# reading crisprs from: %s\n", fileIn); +struct crisprList *listReturn = NULL; +struct lineFile *lf = lineFileOpen(fileIn, TRUE); +char *row[10]; +int wordCount = 0; +long long crisprsInput = 0; + +long startMs = clock1000(); + +while (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row))) ) + { + if (3 != wordCount) + errAbort("expecing three words at line %d in file %s, found: %d", + lf->lineIx, fileIn, wordCount); + struct crisprList *newItem = NULL; + AllocVar(newItem); + newItem->chrom = cloneString(row[0]); + newItem->crisprCount = sqlLongLong(row[1]); + newItem->size = sqlLongLong(row[2]); + newItem->chromCrisprs = NULL; + slAddHead(&listReturn, newItem); + long long verifyCount = 0; + while ( (verifyCount < newItem->crisprCount) && (0 < (wordCount = lineFileChopNextTab(lf, row, ArraySize(row)))) ) + { + if (3 != wordCount) + errAbort("expecing three words at line %d in file %s, found: %d", + lf->lineIx, fileIn, wordCount); + ++verifyCount; + struct crispr *oneCrispr = NULL; + AllocVar(oneCrispr); + oneCrispr->sequence = sqlLongLong(row[0]); + oneCrispr->start = sqlLongLong(row[1]); + oneCrispr->strand = row[2][0]; + oneCrispr->offBy0 = 0; + oneCrispr->offBy1 = 0; + oneCrispr->offBy2 = 0; + oneCrispr->offBy3 = 0; + oneCrispr->offBy4 = 0; + slAddHead(&newItem->chromCrisprs, oneCrispr); + } + if (verifyCount != newItem->crisprCount) + errAbort("expecting %lld kmer items at line %d in file %s, found: %lld", + newItem->crisprCount, lf->lineIx, fileIn, verifyCount); + crisprsInput += verifyCount; + } + +lineFileClose(&lf); + +long elapsedMs = clock1000() - startMs; +timingMessage("readKmers", crisprsInput, "crisprs read in", elapsedMs, "crisprs/sec"); + +return listReturn; +} // static struct crisprList *readKmers(char *fileIn) + +static void writeKmers(struct crisprList *all, char *fileOut) +/* write kmer list 'all' to 'fileOut' */ +{ +FILE *fh = mustOpen(fileOut, "w"); +struct crisprList *list = NULL; +long long crisprsWritten = 0; + +long startMs = clock1000(); + +slReverse(&all); +for (list = all; list; list = list->next) + { + fprintf(fh, "%s\t%lld\t%d\n", list->chrom, list->crisprCount, list->size); + struct crispr *c = NULL; + slReverse(&list->chromCrisprs); + for (c = list->chromCrisprs; c; c = c->next) + { + fprintf(fh, "%lld\t%lld\t%c\n", c->sequence, + c->start, c->strand); + ++crisprsWritten; + } } +carefulClose(&fh); + +long elapsedMs = clock1000() - startMs; +timingMessage("writeKmers", crisprsWritten, "crisprs written", elapsedMs, "crisprs/sec"); +} // static void writeKmers(struct crisprList *all, char *fileOut) static void crisprKmers(char *sequence) /* crisprKmers - find and annotate crispr sequences. */ { -struct crisprList *allCrisprs = scanSequence(sequence); struct crisprList *queryCrisprs = NULL; struct crisprList *countedCrisprs = NULL; +struct crisprList *allCrisprs = NULL; + + +if (loadKmers) + allCrisprs = readKmers(loadKmers); +else + allCrisprs = scanSequence(sequence); + +#ifdef NOT +// timing walking through the linked lists +long startMs = clock1000(); +long long kmerCount = 0; +struct crisprList *list = NULL; +for (list = allCrisprs; list; list = list->next) + kmerCount += slCount(list->chromCrisprs); +long elapsedMs = clock1000() - startMs; +timingMessage("slCount", kmerCount, "kmers counted", elapsedMs, "kmers/sec"); +startMs = clock1000(); +slReverse(&allCrisprs); +for (list = allCrisprs; list; list = list->next) + slReverse(&list->chromCrisprs); +elapsedMs = clock1000() - startMs; +timingMessage("slReverse", kmerCount, "kmers reversed", elapsedMs, "kmers/sec"); + +return; +#endif + +if (dumpKmers) + { + writeKmers(allCrisprs, dumpKmers); + return; + } /* processing those crisprs */ if (verboseLevel() > 1) { /* if ranges have been specified, construct queryList of kmers to measure */ if (rangesHash) { queryCrisprs = rangeExtraction(allCrisprs); /* first run up query vs. all, no list mucking about */ countedCrisprs = queryVsAll(queryCrisprs, allCrisprs); /* then run up query vs. itself, add to countedCrisprs */ countedCrisprs = allVsAll(queryCrisprs, countedCrisprs); } else countedCrisprs = allVsAll(allCrisprs, NULL); showCounts(countedCrisprs); } } // static void crisprKmers(char *sequence) int main(int argc, char *argv[]) /* Process command line, initialize translation arrays and call the process */ { optionInit(&argc, argv, options); if (argc != 2) usage(); bedFileOut = optionVal("bed", bedFileOut); +dumpKmers = optionVal("dumpKmers", dumpKmers); +loadKmers = optionVal("loadKmers", loadKmers); ranges = optionVal("ranges", ranges); if (ranges) rangesHash = readRanges(ranges); initOrderedNtVal(); /* set up orderedNtVal[] */ crisprKmers(argv[1]); +if (verboseLevel() > 1) + printVmPeak(); return 0; }