95a4ab24a874500faece4f5e71815b950697afb3 hiram Mon Jul 10 00:16:59 2017 -0700 adding some fake color to the bed output to test runtime on some larger ranges refs #18969 diff --git src/utils/crisprKmers/crisprKmers.c src/utils/crisprKmers/crisprKmers.c index 6a0595b..d4ea4f0 100644 --- src/utils/crisprKmers/crisprKmers.c +++ src/utils/crisprKmers/crisprKmers.c @@ -119,31 +119,30 @@ bases[C_BASE] = 'C'; bases[G_BASE] = 'G'; bases[T_BASE] = 'T'; } // static void initOrderedNtVal() static void timingMessage(char *prefix, long long count, char *message, long ms, char *units) { double perSecond = 0.0; if (ms > 0) perSecond = 1000.0 * count / ms; verbose(1, "# %s: %lld %s @ %ld ms -> %.2f %s\n", prefix, count, message, ms, perSecond, units); } - static struct hash *readRanges(char *bedFile) /* Read bed and return it as a hash keyed by chromName * with binKeeper values. (from: src/hg/bedIntersect/bedIntersec.c) */ { struct hash *hash = newHash(0); /* key is chromName, value is binkeeper data */ struct lineFile *lf = lineFileOpen(bedFile, TRUE); char *row[3]; while (lineFileNextRow(lf, row, 3)) { struct binKeeper *bk; struct bed3 *item; struct hashEl *hel = hashLookup(hash, row[0]); if (hel == NULL) { @@ -232,54 +231,60 @@ * The 'val ^ fortySixBits' does the complement, the shifting and * masking does the reversing. */ register long long v = (val ^ fortySixBits) << 18; v = ((v & high32bits) >> 32) | ((v & low32bits) << 32); v = ((v & high16bits) >> 16) | ((v & low16bits) << 16); v = ((v & high8bits) >> 8) | ((v & low8bits) << 8); v = ((v & high4bits) >> 4) | ((v & low4bits) << 4); v = ((v & high2bits) >> 2) | ((v & low2bits) << 2); return v; } // static long long revComp(long long val) static void copyToArray(struct crisprList *list) /* copy the crispr list data into arrays */ { +long startTime = clock1000(); struct crisprList *cl = NULL; +long long itemsCopied = 0; + for (cl = list; cl; cl = cl->next) { size_t memSize = cl->crisprCount * sizeof(long long); cl->sequence = (long long *)needLargeMem(memSize); cl->start = (long long *)needLargeMem(memSize); memSize = cl->crisprCount * sizeof(char); cl->strand = (char *)needLargeMem(memSize); memSize = 5 * sizeof(int *); cl->offBy = (int **)needLargeMem(memSize); memSize = cl->crisprCount * sizeof(int); int r = 0; for (r = 0; r < 5; ++r) cl->offBy[r] = (int *)needLargeZeroedMem(memSize); long long i = 0; struct crispr *c = NULL; for (c = cl->chromCrisprs; c; c = c->next) { + ++itemsCopied; cl->sequence[i] = c->sequence; cl->start[i] = c->start; cl->strand[i++] = c->strand; } } +long elapsedMs = clock1000() - startTime; +timingMessage("copyToArray:", itemsCopied, "items copied", elapsedMs, "items/sec"); } // static void copyToArray(struct crisprList *list) */ static struct crisprList *generateKmers(struct dnaSeq *seq) { struct crispr *crisprSet = NULL; struct crisprList *returnList = NULL; AllocVar(returnList); returnList->chrom = cloneString(seq->name); returnList->size = seq->size; int i; DNA *dna; long long chromPosition = 0; long long startGap = 0; long long gapCount = 0; int kmerLength = 0; @@ -334,65 +339,112 @@ if (startGap != chromPosition) verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition, gapCount, chromPosition-startGap, "+"); else verbose(4, "#GAP %s\t%lld\t%lld\t%lld\t%lld\t%s\n", seq->name, startGap, chromPosition+1, gapCount, 1+chromPosition-startGap, "+"); kmerLength = 0; /* reset, start over */ kmerVal = 0; } // else if (val >= 0) ++chromPosition; } // for (i=0; i < seq->size; ++i) // slReverse(&crisprSet); // save time, order not important at this time returnList->chromCrisprs = crisprSet; returnList->crisprCount = slCount(crisprSet); return returnList; } // static struct crisprList *generateKmers(struct dnaSeq *seq) +static char *itemColor(int **offBy, long long index) +{ +static char *notUnique = "150,150,150"; +// static char *notUniqueAlt = "150,150,150"; +static char *twoMany = "120,120,120"; +// static char *twoManyAlt = "120,120,120"; +static char *highScore = "0,255,0"; // > 70 +static char *mediumScore = "128,128,0"; // > 50 +static char *lowScore = "255,0,0"; // >= 50 +// static char *lowestScore = "80,80,80"; // < 50 +// static char *lowestScoreAlt = "80,80,80"; // < 50 +if (offBy[0][index]) + return notUnique; +else + { + int offSum = offBy[1][index] + offBy[2][index] + offBy[3][index] + + offBy[4][index]; + if (offSum < 100) + return highScore; + else if (offSum < 150) + return mediumScore; + else if (offSum < 250) + return lowScore; + else + return twoMany; + } +} +// sample of offBy sums from the chrM vs. hg38 run: +// Q1 91.000000 +// median 150.000000 +// Q3 251.000000 +// average 238.378477 +// min 4.000000 +// max 9023.000000 +// count 2193 +// total 522764.000000 +// standard deviation 424.309155 + static void countsOutput(struct crisprList *all) /* everything has been scanned and counted, print out all the data from arrays*/ { +long startTime = clock1000(); +long long itemsOutput = 0; + FILE *bedFH = NULL; if (bedFileOut) bedFH = mustOpen(bedFileOut, "w"); struct crisprList *list; long long totalOut = 0; for (list = all; list; list = list->next) { long long c = 0; for (c = 0; c < list->crisprCount; ++c) { + ++itemsOutput; int negativeOffset = 0; if (list->strand[c] == '-') negativeOffset = 3; long long txStart = list->start[c] + negativeOffset; long long txEnd = txStart + 20; int totalOffs = list->offBy[0][c] + list->offBy[1][c] + list->offBy[2][c] + list->offBy[3][c] + list->offBy[4][c]; + char *color = itemColor(list->offBy, c); + if (0 == totalOffs) verbose(1, "# PERFECT score %s:%lld %c\t%s\n", list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3)); if (bedFH) - fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%d\t%d\t%d\t%d\t%s\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], list->strand[c], txStart, txEnd, kmerPAMString(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], kmerValToString(list->sequence[c], 3)); + fprintf(bedFH, "%s\t%lld\t%lld\t%d,%d,%d,%d,%d\t%d\t%c\t%lld\t%lld\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n", list->chrom, list->start[c], list->start[c]+23, list->offBy[0][c], list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c], list->offBy[0][c], list->strand[c], txStart, txEnd, color, color, color, kmerValToString(list->sequence[c], 3), kmerPAMString(list->sequence[c]), list->offBy[1][c], list->offBy[2][c], list->offBy[3][c], list->offBy[4][c]); if (list->offBy[0][c]) verbose(3, "# array identical: %d %s:%lld %c\t%s\n", list->offBy[0][c], list->chrom, list->start[c], list->strand[c], kmerValToString(list->sequence[c], 3)); ++totalOut; } } if (bedFH) carefulClose(&bedFH); + +long elapsedMs = clock1000() - startTime; +timingMessage("copyToArray:", itemsOutput, "items output", elapsedMs, "items/sec"); } // static void countsOutput(struct crisprList *all) static struct crisprList *scanSequence(char *inFile) /* scan the given file, return list of crisprs */ { verbose(1, "#\tscanning sequence file: %s\n", inFile); dnaUtilOpen(); struct dnaLoad *dl = dnaLoadOpen(inFile); struct dnaSeq *seq; struct crisprList *listReturn = NULL; long elapsedMs = 0; long scanStart = clock1000(); long long totalCrisprs = 0; @@ -428,155 +480,158 @@ struct crisprList *all = *allReference; struct crisprList *listReturn = NULL; struct crisprList *list = NULL; int inputChromCount = slCount(all); long long returnListCrisprCount = 0; long long examinedCrisprCount = 0; struct crisprList *prevChromList = NULL; long elapsedMs = 0; long scanStart = clock1000(); struct crisprList *nextList = NULL; for (list = all; list; list = nextList) { nextList = list->next; // remember before perhaps lost + long long beforeCrisprCount = list->crisprCount; examinedCrisprCount += list->crisprCount; struct crispr *c = NULL; struct binKeeper *bk = hashFindVal(rangesHash, list->chrom); struct crispr *newCrispr = NULL; if (bk != NULL) { struct crispr *prevCrispr = NULL; struct crispr *next = NULL; for (c = list->chromCrisprs; c; c = next) { struct binElement *hitList = NULL; next = c->next; // remember before perhaps lost int start = c->start; if (c->strand == '-') start += 2; int end = start + 20; hitList = binKeeperFind(bk, start, end); if (hitList) { - ++returnListCrisprCount; if (prevCrispr) // remove this one from the all list prevCrispr->next = next; else list->chromCrisprs = next; // removing the first one c->next = NULL; slAddHead(&newCrispr, c); // constructing new list } else prevCrispr = c; // remains on all list slFreeList(&hitList); } } if (newCrispr) { struct crisprList *newItem = NULL; AllocVar(newItem); newItem->chrom = cloneString(list->chrom); newItem->size = list->size; newItem->chromCrisprs = newCrispr; newItem->crisprCount = slCount(newCrispr); + returnListCrisprCount += newItem->crisprCount; slAddHead(&listReturn, newItem); if (NULL == list->chromCrisprs) // all have been removed for this chrom { if (prevChromList) // remove it from the chrom list prevChromList->next = nextList; else all = nextList; // removing the first one } else { list->crisprCount = slCount(list->chromCrisprs); - verbose(1, "# range scan same chrom list still has %lld crisprs on chrom %s\n", - list->crisprCount, list->chrom); + long long removedCrisprs = beforeCrisprCount - list->crisprCount; + verbose(1, "# range scan chrom %s had %lld crisprs, removed %lld leaving %lld target\n", + list->chrom, beforeCrisprCount, removedCrisprs, list->crisprCount); } } prevChromList = list; } // for (list = all; list; list = list->next) elapsedMs = clock1000() - scanStart; verbose(1, "# range scanning %d chroms, return %d selected chroms, leaving %d chroms\n", inputChromCount, slCount(listReturn), slCount(all)); +long long targetCrisprCount = examinedCrisprCount - returnListCrisprCount; timingMessage("range scan", examinedCrisprCount, "examined crisprs", elapsedMs, "crisprs/sec"); -timingMessage("range scan", returnListCrisprCount, "returned crisprs", elapsedMs, "crisprs/sec"); +timingMessage("range scan", targetCrisprCount, "remaining target crisprs", elapsedMs, "crisprs/sec"); +timingMessage("range scan", returnListCrisprCount, "returned query crisprs", elapsedMs, "crisprs/sec"); if (NULL == all) { allReference = NULL; // they have all been removed verbose(1, "# range scan, every single chrom has been removed\n"); } else if (*allReference != all) { verbose(1, "# range scan, first chrom list has been moved from %p to %p\n", (void *)*allReference, (void *)all); *allReference = all; } return listReturn; } // static crisprList *rangeExtraction(crisprList *all) static void recordOffTargets(struct crisprList *query, struct crisprList *target, int bitsOn, long long qIndex, long long tIndex) /* bitsOn is from 1 to 4, record this match when less than 10 count */ { if (offFile) { if (query->offBy[bitsOn][qIndex] < 10) {/* needs to be two prints as the kmerValToString return is confused */ - fprintf(offFile, "%s:%lld %c %s %#020llx\t", query->chrom, query->start[qIndex], - query->strand[qIndex], kmerValToString(query->sequence[qIndex], 3), -(unsigned long long)((query->sequence[qIndex] & 0x3fffffffffc0)>>6)); - fprintf(offFile, "%s:%lld %c %s %#020llx\t%d\n", target->chrom, + fprintf(offFile, "%s:%lld %c %s\t", query->chrom, query->start[qIndex], + query->strand[qIndex], kmerValToString(query->sequence[qIndex], 3)); + fprintf(offFile, "%s:%lld %c %s\t%d\n", target->chrom, target->start[tIndex], target->strand[tIndex], - kmerValToString(target->sequence[tIndex], 3), -(unsigned long long)((target->sequence[tIndex] & 0x3fffffffffc0)>>6), bitsOn); - } + kmerValToString(target->sequence[tIndex], 3), bitsOn); } } +} // static void recordOffTargets(struct crisprList *query, + // struct crisprList *target, int bitsOn, long long qIndex, + // long long tIndex) -static void queryVsAll(struct crisprList *query, - struct crisprList *target) +static void queryVsAll(struct crisprList *query, struct crisprList *target) /* run the query crisprs list against the target list in the array structures */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; long long totalCrisprsTarget = 0; long long totalCompares = 0; long processStart = clock1000(); long elapsedMs = 0; for (qList = query; qList; qList = qList->next) { long long qCount = 0; totalCrisprsQuery += qList->crisprCount; verbose(1, "# queryVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); for (qCount = 0; qCount < qList->crisprCount; ++qCount) { struct crisprList *tList = NULL; for (tList = target; tList; tList = tList->next) { long long tCount = 0; totalCompares += tList->crisprCount; - if (0 == qCount) -{ -totalCrisprsTarget += tList->crisprCount; -verbose(1, "# queryVsAll %lld target crisprs on chrom %s %lld targets %lld compares\n", tList->crisprCount, tList->chrom, totalCrisprsTarget, totalCompares); -} +// if (0 == qCount) +// { +// totalCrisprsTarget += tList->crisprCount; +// verbose(1, "# queryVsAll %lld target crisprs on chrom %s %lld targets %lld compares\n", tList->crisprCount, tList->chrom, totalCrisprsTarget, totalCompares); +// } for (tCount = 0; tCount < tList->crisprCount; ++tCount) { /* the XOR will determine differences in two sequences * the shift right 6 removes the PAM sequence */ long long misMatch = (qList->sequence[qCount] ^ tList->sequence[tCount]) >> 6; if (misMatch) { /* possible misMatch bit values: 01 10 11 * turn those three values into just: 01 */ misMatch = (misMatch | (misMatch >> 1)) & 0x5555555555; int bitsOn = _mm_popcnt_u64(misMatch); if (bitsOn < 5) @@ -592,31 +647,31 @@ tList->offBy[0][tCount] += 1; } } // for (tCount = 0; tCount < tList->crisprCount; ++tCount) } // for (tList = target; tList; tList = tList->next) } // for (qCount = 0; qCount < qList->crisprCount; ++qCount) } // for (qList = query; qList; qList = qList->next) verbose(1, "# done with scanning, check timing\n"); elapsedMs = clock1000() - processStart; timingMessage("queryVsAll", totalCrisprsQuery, "query crisprs processed", elapsedMs, "crisprs/sec"); timingMessage("queryVsAll", totalCrisprsTarget, "vs target crisprs", elapsedMs, "crisprs/sec"); timingMessage("queryVsAll", totalCompares, "total comparisons", elapsedMs, "compares/sec"); } /* static struct crisprList *queryVsAll(struct crisprList *query, struct crisprList *target) */ static void allVsAll(struct crisprList *all) -/* run this 'all' list vs. itself with arrays */ +/* run this 'all' list vs. itself avoiding self to self comparisons */ { struct crisprList *qList = NULL; long long totalCrisprsQuery = 0; long long totalCrisprsCompare = 0; long processStart = clock1000(); long elapsedMs = 0; /* query runs through all chroms */ for (qList = all; qList; qList = qList->next) { long long qCount = 0; totalCrisprsQuery += qList->crisprCount; verbose(1, "# allVsAll %lld query crisprs on chrom %s\n", qList->crisprCount, qList->chrom); /* query runs through all kmers on this chrom */