efc87af4e5921d966a5bbc9ce8832b7f374f4599 kent Mon Jul 4 17:31:44 2022 -0700 Added file parameter to cacheTwoBitRangesPrintStats. diff --git src/lib/cacheTwoBit.c src/lib/cacheTwoBit.c index d546751..e0e1d0d 100644 --- src/lib/cacheTwoBit.c +++ src/lib/cacheTwoBit.c @@ -1,216 +1,216 @@ /* cacheTwoBit - system for caching open two bit files and ranges of sequences in them */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dnaseq.h" #include "twoBit.h" #include "rangeTree.h" #include "cacheTwoBit.h" struct cacheTwoBitSeq /* Cached ranges of single sequence in a twoBit file */ { struct cacheTwoBitSeq *next; char *seqName; // Name of sequence int seqSize; // Size of sequence struct twoBitFile *tbf; // Associated twoBit file; bool doRc; // If set, cache reverse complement bool doUpper; // If set sequence is upper cased before caching struct rbTree *rangeTree; // Values of this range tree are DNA seqs long basesQueried; // Total bases read from cache long basesRead; // Total bases read by cache int queryCount; // Number of queries made }; struct cacheTwoBitUrl /* An open two bit file from the internet and a hash of sequences in it */ { struct cacheTwoBitUrl *next; /* Next in list */ char *url; /* Name of URL hashed */ struct twoBitFile *tbf; /* Open two bit file */ struct hash *seqHash; /* Hash of cacheTwoBitSeq */ struct hash *rcSeqHash; /* Hash of reverse-complemented cacheTwoBitSeq */ struct cacheTwoBitSeq *seqList; /* List of sequences accessed */ }; struct cacheTwoBitRanges *cacheTwoBitRangesNew(boolean doUpper) /* Create a new cache for ranges or complete sequence in two bit files */ { struct cacheTwoBitRanges *cache; AllocVar(cache); cache->urlHash = hashNew(0); cache->doUpper = doUpper; return cache; } struct cacheTwoBitSeq *cacheTwoBitSeqNew(struct cacheTwoBitUrl *cacheUrl, char *seqName, boolean doRc, boolean doUpper) /* Create a new cacheTwoBitSeq structure */ { struct cacheTwoBitSeq *ctbSeq; AllocVar(ctbSeq); ctbSeq->seqName = cloneString(seqName); ctbSeq->seqSize = twoBitSeqSize(cacheUrl->tbf, seqName); ctbSeq->doRc = doRc; ctbSeq->doUpper = doUpper; ctbSeq->tbf = cacheUrl->tbf; ctbSeq->rangeTree = rangeTreeNew(); return ctbSeq; } static struct dnaSeq *cacheTwoBitSeqFetch(struct cacheTwoBitSeq *ctbSeq, int start, int end, int *retOffset) /* Get dna sequence from file */ { if (start == 0 && end == 0) /* Spacial case, we read whole seq */ end = twoBitSeqSize(ctbSeq->tbf, ctbSeq->seqName); int size = end-start; if (size <= 0) errAbort("start >= end in cacheTwoBitSeqFetch. start %d, end %d\n", start, end); long offset = 0; struct dnaSeq *seq = NULL; ctbSeq->basesQueried += size; ctbSeq->queryCount += 1; struct rbTree *rangeTree = ctbSeq->rangeTree; struct range *enclosing = rangeTreeFindEnclosing(rangeTree, start, end); if (enclosing != NULL) { seq = enclosing->val; offset = enclosing->start; // fprintf(stderr, "reusing %d %s:%d+%d\n", ctbSeq->doRc, ctbSeq->seqName, start, size); } else { struct range *overlappingList = rangeTreeAllOverlapping(rangeTree, start, end); /* Expand start end so it includes range of all overlapping */ struct range *range; for (range = overlappingList; range != NULL; range = range->next) { if (start > range->start) start = range->start; if (end < range->end) end = range->end; } // fprintf(stderr, "fetching new %d %s:%d+%d\n", ctbSeq->doRc, ctbSeq->seqName, start, size); int rcStart = start, rcEnd = end; if (ctbSeq->doRc) { reverseIntRange(&rcStart, &rcEnd, ctbSeq->seqSize); } seq = twoBitReadSeqFrag(ctbSeq->tbf, ctbSeq->seqName, rcStart, rcEnd); if (seq != NULL) { if (ctbSeq->doUpper) toUpperN(seq->dna, seq->size); if (ctbSeq->doRc) { reverseComplement(seq->dna, seq->size); } rangeTreeAddVal(rangeTree, start, end, seq, NULL); ctbSeq->basesRead += size; offset = start; } } *retOffset = offset; return seq; } static struct dnaSeq *cacheTwoBitRangesFetchOrNot(struct cacheTwoBitRanges *cacheAll, char *url, char *seqName, int start, int end, boolean doRc, boolean missOk, int *retOffset) /* fetch a sequence from a 2bit. Caches open two bit files and sequence in * both forward and reverse strand */ { /* Init static url hash */ struct hash *urlHash = cacheAll->urlHash; // hash of open files struct cacheTwoBitUrl *cacheUrl = hashFindVal(urlHash, url); if (cacheUrl == NULL) { AllocVar(cacheUrl); cacheUrl->url = cloneString(url); cacheUrl->tbf = twoBitOpen(url); cacheUrl->seqHash = hashNew(0); cacheUrl->rcSeqHash = hashNew(0); hashAdd(urlHash, url, cacheUrl); slAddHead(&cacheAll->urlList, cacheUrl); } if (missOk && !twoBitHasSeq(cacheUrl->tbf, seqName)) { return NULL; } struct hash *seqHash = (doRc ? cacheUrl->rcSeqHash : cacheUrl->seqHash); struct cacheTwoBitSeq *ctbSeq = hashFindVal(seqHash, seqName); if (ctbSeq == NULL) { ctbSeq = cacheTwoBitSeqNew(cacheUrl, seqName, doRc, cacheAll->doUpper); hashAdd(seqHash, seqName, ctbSeq); slAddHead(&cacheUrl->seqList, ctbSeq); } struct dnaSeq *seq = NULL; if (start < end || !missOk) seq = cacheTwoBitSeqFetch(ctbSeq, start, end, retOffset); return seq; } struct dnaSeq *cacheTwoBitRangesFetch(struct cacheTwoBitRanges *cacheAll, char *url, char *seqName, int start, int end, boolean doRc, int *retOffset) /* fetch a sequence from a 2bit. Caches open two bit files and sequence in * both forward and reverse strand */ { return cacheTwoBitRangesFetchOrNot(cacheAll, url, seqName, start, end, doRc, FALSE, retOffset); } struct dnaSeq *cacheTwoBitRangesMayFetch(struct cacheTwoBitRanges *cacheAll, char *url, char *seqName, int start, int end, boolean doRc, int *retOffset) /* Fetch a sequence from a twoBit cache. The result in retOffset is where the return dnaSeq * sits within the named sequence, the whole of which is stored in the subtracted * associated twoBit file. Do not free the returned sequence. Returns NULL if sequence not * found */ { return cacheTwoBitRangesFetchOrNot(cacheAll, url, seqName, start, end, doRc, TRUE, retOffset); } -void cacheTwoBitRangesPrintStats(struct cacheTwoBitRanges *cache) +void cacheTwoBitRangesPrintStats(struct cacheTwoBitRanges *cache, FILE *f) /* print cache statistics - Debugging routine */ { -printf("caching %d twoBit files\n", slCount(cache->urlList)); +fprintf(f, "caching %d twoBit files\n", slCount(cache->urlList)); struct cacheTwoBitUrl *cachedUrl; int totalSeq = 0; int totalRanges = 0; long basesQueried = 0; // Total bases read from cache long basesRead = 0; // Total bases read by cache int queryCount = 0; for (cachedUrl = cache->urlList; cachedUrl != NULL; cachedUrl = cachedUrl->next) { - printf("%s has %d + strand, %d minus strand sequences cached\n", + fprintf(f, "%s has %d + strand, %d minus strand sequences cached\n", cachedUrl->url, cachedUrl->seqHash->elCount, cachedUrl->rcSeqHash->elCount); struct cacheTwoBitSeq *ctbSeq; for (ctbSeq = cachedUrl->seqList; ctbSeq != NULL; ctbSeq = ctbSeq->next) { - printf(" %s %c strand\n", ctbSeq->seqName, ctbSeq->doRc ? '=' : '+'); + fprintf(f, " %s %c strand\n", ctbSeq->seqName, ctbSeq->doRc ? '-' : '+'); totalSeq += 1; totalRanges += ctbSeq->rangeTree->n; basesQueried += ctbSeq->basesQueried; basesRead += ctbSeq->basesRead; queryCount += ctbSeq->queryCount; struct range *range = rangeTreeList(ctbSeq->rangeTree); for ( ; range != NULL; range = range->next) { - printf(" %d %d\n", range->start, range->end); + fprintf(f, " %d start %d size\n", range->start, range->end - range->start); } } } -printf("total sequences cached %d in %d ranges covering %d queries\n", +fprintf(f, "total sequences cached %d in %d ranges covering %d queries\n", totalSeq, totalRanges, queryCount); -printf("basesRead %ld bases queried %ld\n", basesRead, basesQueried); +fprintf(f, "basesRead %ld bases queried %ld\n", basesRead, basesQueried); }