f8458ca93f0c0abe5456789dd7a9618d63e55e5b kent Sun Jul 3 15:01:40 2022 -0700 Fixing bug on reverse strand and improving cache behavior. Making it so you can ask for a cached sequence without aborting if it doesn't exist. diff --git src/lib/cacheTwoBit.c src/lib/cacheTwoBit.c index 1c2c8f8..d546751 100644 --- src/lib/cacheTwoBit.c +++ src/lib/cacheTwoBit.c @@ -4,151 +4,192 @@ #include "linefile.h" #include "hash.h" #include "options.h" #include "dnaseq.h" #include "twoBit.h" #include "rangeTree.h" #include "cacheTwoBit.h" struct cacheTwoBitSeq /* Cached ranges of single sequence in a twoBit file */ { struct cacheTwoBitSeq *next; char *seqName; // Name of sequence int seqSize; // Size of sequence struct twoBitFile *tbf; // Associated twoBit file; - boolean doRc; // If set, cache reverse complement + bool doRc; // If set, cache reverse complement + bool doUpper; // If set sequence is upper cased before caching struct rbTree *rangeTree; // Values of this range tree are DNA seqs long basesQueried; // Total bases read from cache long basesRead; // Total bases read by cache int queryCount; // Number of queries made }; struct cacheTwoBitUrl /* An open two bit file from the internet and a hash of sequences in it */ { struct cacheTwoBitUrl *next; /* Next in list */ char *url; /* Name of URL hashed */ struct twoBitFile *tbf; /* Open two bit file */ struct hash *seqHash; /* Hash of cacheTwoBitSeq */ struct hash *rcSeqHash; /* Hash of reverse-complemented cacheTwoBitSeq */ struct cacheTwoBitSeq *seqList; /* List of sequences accessed */ }; -struct cacheTwoBitRanges *cacheTwoBitRangesNew() + +struct cacheTwoBitRanges *cacheTwoBitRangesNew(boolean doUpper) /* Create a new cache for ranges or complete sequence in two bit files */ { struct cacheTwoBitRanges *cache; AllocVar(cache); cache->urlHash = hashNew(0); +cache->doUpper = doUpper; return cache; } struct cacheTwoBitSeq *cacheTwoBitSeqNew(struct cacheTwoBitUrl *cacheUrl, - char *seqName, boolean doRc) + char *seqName, boolean doRc, boolean doUpper) /* Create a new cacheTwoBitSeq structure */ { struct cacheTwoBitSeq *ctbSeq; AllocVar(ctbSeq); ctbSeq->seqName = cloneString(seqName); ctbSeq->seqSize = twoBitSeqSize(cacheUrl->tbf, seqName); ctbSeq->doRc = doRc; +ctbSeq->doUpper = doUpper; ctbSeq->tbf = cacheUrl->tbf; ctbSeq->rangeTree = rangeTreeNew(); return ctbSeq; } static struct dnaSeq *cacheTwoBitSeqFetch(struct cacheTwoBitSeq *ctbSeq, int start, int end, int *retOffset) /* Get dna sequence from file */ { +if (start == 0 && end == 0) /* Spacial case, we read whole seq */ + end = twoBitSeqSize(ctbSeq->tbf, ctbSeq->seqName); + int size = end-start; if (size <= 0) errAbort("start >= end in cacheTwoBitSeqFetch. start %d, end %d\n", start, end); long offset = 0; struct dnaSeq *seq = NULL; ctbSeq->basesQueried += size; ctbSeq->queryCount += 1; -if (ctbSeq->doRc) - reverseIntRange(&start, &end, ctbSeq->seqSize); struct rbTree *rangeTree = ctbSeq->rangeTree; struct range *enclosing = rangeTreeFindEnclosing(rangeTree, start, end); if (enclosing != NULL) { seq = enclosing->val; offset = enclosing->start; + // fprintf(stderr, "reusing %d %s:%d+%d\n", ctbSeq->doRc, ctbSeq->seqName, start, size); } else { struct range *overlappingList = rangeTreeAllOverlapping(rangeTree, start, end); /* Expand start end so it includes range of all overlapping */ struct range *range; for (range = overlappingList; range != NULL; range = range->next) { if (start > range->start) start = range->start; if (end < range->end) end = range->end; } - seq = twoBitReadSeqFrag(ctbSeq->tbf, ctbSeq->seqName, start, end); + + // fprintf(stderr, "fetching new %d %s:%d+%d\n", ctbSeq->doRc, ctbSeq->seqName, start, size); + int rcStart = start, rcEnd = end; + if (ctbSeq->doRc) + { + reverseIntRange(&rcStart, &rcEnd, ctbSeq->seqSize); + } + seq = twoBitReadSeqFrag(ctbSeq->tbf, ctbSeq->seqName, rcStart, rcEnd); if (seq != NULL) { + if (ctbSeq->doUpper) + toUpperN(seq->dna, seq->size); if (ctbSeq->doRc) + { reverseComplement(seq->dna, seq->size); - rangeTreeAddVal(rangeTree, start, end, seq, NULL); } + rangeTreeAddVal(rangeTree, start, end, seq, NULL); ctbSeq->basesRead += size; offset = start; } + } *retOffset = offset; return seq; } -struct dnaSeq *cacheTwoBitRangesFetch(struct cacheTwoBitRanges *cacheAll, char *url, char *seqName, - int start, int end, boolean doRc, int *retOffset) +static struct dnaSeq *cacheTwoBitRangesFetchOrNot(struct cacheTwoBitRanges *cacheAll, + char *url, char *seqName, int start, int end, boolean doRc, boolean missOk, int *retOffset) /* fetch a sequence from a 2bit. Caches open two bit files and sequence in * both forward and reverse strand */ { /* Init static url hash */ struct hash *urlHash = cacheAll->urlHash; // hash of open files struct cacheTwoBitUrl *cacheUrl = hashFindVal(urlHash, url); if (cacheUrl == NULL) { AllocVar(cacheUrl); cacheUrl->url = cloneString(url); cacheUrl->tbf = twoBitOpen(url); cacheUrl->seqHash = hashNew(0); cacheUrl->rcSeqHash = hashNew(0); hashAdd(urlHash, url, cacheUrl); slAddHead(&cacheAll->urlList, cacheUrl); } +if (missOk && !twoBitHasSeq(cacheUrl->tbf, seqName)) + { + return NULL; + } struct hash *seqHash = (doRc ? cacheUrl->rcSeqHash : cacheUrl->seqHash); struct cacheTwoBitSeq *ctbSeq = hashFindVal(seqHash, seqName); if (ctbSeq == NULL) { - ctbSeq = cacheTwoBitSeqNew(cacheUrl, seqName, doRc); + ctbSeq = cacheTwoBitSeqNew(cacheUrl, seqName, doRc, cacheAll->doUpper); hashAdd(seqHash, seqName, ctbSeq); slAddHead(&cacheUrl->seqList, ctbSeq); } -struct dnaSeq *seq = cacheTwoBitSeqFetch(ctbSeq, start, end, retOffset); +struct dnaSeq *seq = NULL; +if (start < end || !missOk) + seq = cacheTwoBitSeqFetch(ctbSeq, start, end, retOffset); return seq; } +struct dnaSeq *cacheTwoBitRangesFetch(struct cacheTwoBitRanges *cacheAll, + char *url, char *seqName, int start, int end, boolean doRc, int *retOffset) +/* fetch a sequence from a 2bit. Caches open two bit files and sequence in + * both forward and reverse strand */ +{ +return cacheTwoBitRangesFetchOrNot(cacheAll, url, seqName, start, end, doRc, FALSE, retOffset); +} + +struct dnaSeq *cacheTwoBitRangesMayFetch(struct cacheTwoBitRanges *cacheAll, + char *url, char *seqName, int start, int end, boolean doRc, int *retOffset) +/* Fetch a sequence from a twoBit cache. The result in retOffset is where the return dnaSeq + * sits within the named sequence, the whole of which is stored in the subtracted + * associated twoBit file. Do not free the returned sequence. Returns NULL if sequence not + * found */ +{ +return cacheTwoBitRangesFetchOrNot(cacheAll, url, seqName, start, end, doRc, TRUE, retOffset); +} + void cacheTwoBitRangesPrintStats(struct cacheTwoBitRanges *cache) /* print cache statistics - Debugging routine */ { printf("caching %d twoBit files\n", slCount(cache->urlList)); struct cacheTwoBitUrl *cachedUrl; int totalSeq = 0; int totalRanges = 0; long basesQueried = 0; // Total bases read from cache long basesRead = 0; // Total bases read by cache int queryCount = 0; for (cachedUrl = cache->urlList; cachedUrl != NULL; cachedUrl = cachedUrl->next) { printf("%s has %d + strand, %d minus strand sequences cached\n", cachedUrl->url, cachedUrl->seqHash->elCount, cachedUrl->rcSeqHash->elCount);