6546198dd3ce4f12b81225c5093163f3a176b5e3 angie Tue Mar 13 14:17:15 2018 -0700 Adding twoBitSeqWindow{New,Free} for seqWindow on 2bit files. diff --git src/hg/lib/seqWindow.c src/hg/lib/seqWindow.c index 930e9d1..286abdc 100644 --- src/hg/lib/seqWindow.c +++ src/hg/lib/seqWindow.c @@ -1,141 +1,213 @@ /* seqWindow -- generic interface & implementations for fetching subranges of a sequence */ /* Copyright (C) 2017 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hdb.h" #include "seqWindow.h" static void seqWindowUpdateRangeAndSeq(struct seqWindow *seqWin, char *seqName, uint start, uint end, char *seq) /* Update seqWin's window coordinates and sequence -- all fetch methods need to do this. * seq is *not* cloned, don't free it afterward! */ { if (!sameOk(seqName, seqWin->seqName)) { freeMem(seqWin->seqName); seqWin->seqName = cloneString(seqName); } seqWin->start = start; seqWin->end = end; freeMem(seqWin->seq); if (seq) touppers(seq); seqWin->seq = seq; } static void seqWindowFreeShared(struct seqWindow **pSw) /* Free the parts of seqWindow that are shared by all implementations. * This *does not* zero out *pSw like a normal Free would, so that implementations * can free up the implementation-specific parts of *pSw */ { if (pSw && *pSw) { struct seqWindow *sw = (struct seqWindow *)*pSw; freeMem(sw->seqName); freeMem(sw->seq); } } struct chromSeqWindow /* seqWindow for chrom sequence from a genome database */ { struct seqWindow sw; // generic interface char *db; // db from which this can fetch sequence }; #define CHROMSEQ_CACHE_FUDGE 4096 static void chromSeqFetch(struct seqWindow *seqWin, char *chrom, uint start, uint end) /* seqWindow fetch method for updating window with new location & sequence if window does not * already cover the requested location. */ { struct chromSeqWindow *csw = (struct chromSeqWindow *)seqWin; boolean sameChrom = sameOk(seqWin->seqName, chrom); if (!sameChrom || start < seqWin->start || end > seqWin->end) { // We must fetch new sequence. Expand range by CHROMSEQ_CACHE_FUDGE so if we get // successive requests for nearby sequences, we won't have to fetch sequence as often. int chromSize = hChromSize(csw->db, chrom); if (start > chromSize) errAbort("chromSeqFetch: start (%u) is out of range for %s %s (length %d)", start, csw->db, chrom, chromSize); uint bufStart = (start > CHROMSEQ_CACHE_FUDGE) ? start - CHROMSEQ_CACHE_FUDGE : 0; uint bufEnd = end + CHROMSEQ_CACHE_FUDGE; // Tolerate & clip ranges that extend past the end of the sequence if (bufEnd > chromSize) bufEnd = chromSize; struct dnaSeq *dnaSeq = hChromSeq(csw->db, chrom, bufStart, bufEnd); if (dnaSeq) { bufEnd = bufStart + dnaSeq->size; // should be unnecessary but just in case seqWindowUpdateRangeAndSeq(seqWin, chrom, bufStart, bufEnd, dnaSeqCannibalize(&dnaSeq)); } else { // No sequence for chrom errAbort("chromSeqFetch: unable to get sequence for %s [%d,%d)", chrom, start, end); } } } struct seqWindow *chromSeqWindowNew(char *db, char *chrom, uint start, uint end) /* Return a new seqWindow that can fetch uppercase sequence from the chrom sequences in db. * If chrom is non-NULL and end > start then load sequence from that range; if chrom is non-NULL * and start == end == 0 then fetch entire chrom. */ { struct chromSeqWindow *csw; AllocVar(csw); csw->sw.fetch = chromSeqFetch; csw->db = cloneString(db); if (start > end) errAbort("chromSeqWindowNew: start (%u) should be <= end (%u)", start, end); if (chrom != NULL) chromSeqFetch((struct seqWindow *)csw, chrom, start, end); return (struct seqWindow *)csw; } void chromSeqWindowFree(struct seqWindow **pSw) /* Free a chromSeqWindow. */ { if (pSw && *pSw) { seqWindowFreeShared(pSw); struct chromSeqWindow *csw = (struct chromSeqWindow *)*pSw; freeMem(csw->db); freez(pSw); } } static void memSeqFetch(struct seqWindow *seqWin, char *acc, uint start, uint end) /* No changes, we have what we have. */ { if (!sameOk(seqWin->seqName, acc)) errAbort("memSeqFetch: sequence name '%s' requested from window on '%s'", acc, seqWin->seqName); } struct seqWindow *memSeqWindowNew(char *acc, char *seq) /* Return a new seqWindow copying this sequence already in memory. */ { struct seqWindow *sw; AllocVar(sw); sw->seqName = cloneString(acc); sw->seq = cloneString(seq ? seq : ""); touppers(sw->seq); sw->start = 0; sw->end = strlen(sw->seq); sw->fetch = memSeqFetch; return sw; } void memSeqWindowFree(struct seqWindow **pSw) /* Free a seqWindow that was created by memSeqWindowNew. */ { if (pSw && *pSw) { seqWindowFreeShared(pSw); // No extra stuff for memSeqWindow freez(pSw); } } + +struct twoBitSeqWindow +/* seqWindow for twoBit file */ + { + struct seqWindow sw; // generic interface + struct twoBitFile *tbf; // twoBitFile from which this can fetch sequence + }; + +#define TWOBITSEQ_CACHE_FUDGE 4096 + +static void twoBitSeqFetch(struct seqWindow *seqWin, char *chrom, uint start, uint end) +/* seqWindow fetch method for updating window with new location & sequence if window does not + * already cover the requested location. */ +{ +struct twoBitSeqWindow *tsw = (struct twoBitSeqWindow *)seqWin; +boolean sameChrom = sameOk(seqWin->seqName, chrom); +if (!sameChrom || start < seqWin->start || end > seqWin->end) + { + // We must fetch new sequence. Expand range by CHROMSEQ_CACHE_FUDGE so if we get + // successive requests for nearby sequences, we won't have to fetch sequence as often. + int chromSize = twoBitSeqSize(tsw->tbf, chrom); + if (start > chromSize) + errAbort("twoBitSeqFetch: start (%u) is out of range for %s %s (length %d)", + start, tsw->tbf->fileName, chrom, chromSize); + if (start == 0 && end == 0) + end = chromSize; + uint bufStart = (start > CHROMSEQ_CACHE_FUDGE) ? start - CHROMSEQ_CACHE_FUDGE : 0; + uint bufEnd = end + CHROMSEQ_CACHE_FUDGE; + // Tolerate & clip ranges that extend past the end of the sequence + if (bufEnd > chromSize) + bufEnd = chromSize; + struct dnaSeq *dnaSeq = twoBitReadSeqFragLower(tsw->tbf, chrom, bufStart, bufEnd); + if (dnaSeq) + { + bufEnd = bufStart + dnaSeq->size; // should be unnecessary but just in case + seqWindowUpdateRangeAndSeq(seqWin, chrom, bufStart, bufEnd, dnaSeqCannibalize(&dnaSeq)); + } + else + { + // No sequence for chrom + errAbort("twoBitSeqFetch: unable to get sequence for %s [%d,%d)", chrom, start, end); + } + } +} + +struct seqWindow *twoBitSeqWindowNew(char *twoBitFileName, char *chrom, uint start, uint end) +/* Return a new seqWindow that can fetch uppercase sequence from twoBitFileName. + * If chrom is non-NULL and end > start then load sequence from that range; if chrom is non-NULL + * and start == end == 0 then fetch entire chrom. */ +{ +struct twoBitSeqWindow *tsw; +AllocVar(tsw); +tsw->sw.fetch = twoBitSeqFetch; +tsw->tbf = twoBitOpen(twoBitFileName); +if (start > end) + errAbort("twoBitSeqWindowNew: start (%u) should be <= end (%u)", start, end); +if (chrom != NULL) + twoBitSeqFetch((struct seqWindow *)tsw, chrom, start, end); +return (struct seqWindow *)tsw; +} + +void twoBitSeqWindowFree(struct seqWindow **pSw) +/* Free a twoBitSeqWindow. */ +{ +if (pSw && *pSw) + { + seqWindowFreeShared(pSw); + struct twoBitSeqWindow *tsw = (struct twoBitSeqWindow *)*pSw; + twoBitClose(&tsw->tbf); + freez(pSw); + } +}