3c9f824a9764c5eed782ae3f07146aadecec74f0 kate Mon Nov 27 15:07:15 2017 -0800 Cleanup to ease some fixes to longTabix. diff --git src/lib/annoStreamLongTabix.c src/lib/annoStreamLongTabix.c new file mode 100644 index 0000000..4ca44a7 --- /dev/null +++ src/lib/annoStreamLongTabix.c @@ -0,0 +1,208 @@ +/* annoStreamLongTabix -- subclass of annoStreamer for longTabix files */ + +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "annoStreamLongTabix.h" +#include "twoBit.h" +#include "bedTabix.h" +#include "sqlNum.h" + +struct annoStreamLongTabix + { + struct annoStreamer streamer; // Parent class members & methods + // Private members + char *asWords[6]; // Current row of longTabix with genotypes squashed for autoSql + struct bedTabixFile *btf; // longTabix parsed header and file object + int numFileCols; // Number of columns in longTabix file. + int maxRecords; // Maximum number of annoRows to return. + int recordCount; // Number of annoRows we have returned so far. + boolean eof; // True when we have hit end of file or maxRecords + }; + +static void asxSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd) +/* Set region and reset internal state. */ +{ +annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd); +struct annoStreamLongTabix *self = (struct annoStreamLongTabix *)vSelf; +self->eof = FALSE; +if (chrom != NULL) + { + // If this region is not in longTabix index, set self->eof so we won't keep grabbing rows + // from the old position. + boolean gotRegion = lineFileSetTabixRegion(self->btf->lf, chrom, regionStart, regionEnd); + if (! gotRegion) + self->eof = TRUE; + } +} + +static char *asxGetHeader(struct annoStreamer *vSelf) +/* Return longTabix header (e.g. for use by formatter) */ +{ +#ifdef NOTNOW +struct annoStreamLongTabix *self = (struct annoStreamLongTabix *)vSelf; +return cloneString(self->btf->headerString); +#endif +return NULL; +} + +static char **nextRowRaw(struct annoStreamLongTabix *self) +/* Get the next longTabix record and put the row text into autoSql words. + * Return pointer to self->asWords if we get a row, otherwise NULL. */ +{ +char *words[self->numFileCols]; +int wordCount; +if ((wordCount = lineFileChop(self->btf->lf, words)) <= 0) + return NULL; +lineFileExpectWords(self->btf->lf, self->numFileCols, wordCount); +int i; +// First 6 columns are always in the longTabix file: +for (i = 0; i < 6; i++) + { + freeMem(self->asWords[i]); + self->asWords[i] = cloneString(words[i]); + } +//self->record = vcfRecordFromRow(self->btf, words); +return self->asWords; +} + +#ifdef NOTNOW +static char *getProperChromName(struct annoStreamLongTabix *self, char *vcfChrom) +/* We tolerate chr-less chrom names in longTabix and BAM ("1" for "chr1" etc); to avoid + * confusing the rest of the system, return the chr-ful version if it exists. */ +{ +char *name = hashFindVal(self->chromNameHash, vcfChrom); +if (name == NULL) + { + name = vcfChrom; + struct twoBitFile *tbf = self->streamer.assembly->tbf; + char buf[256]; + if (! twoBitIsSequence(tbf, vcfChrom)) + { + safef(buf, sizeof(buf), "chr%s", vcfChrom); + if (twoBitIsSequence(tbf, buf)) + name = buf; + } + name = lmCloneString(self->chromNameHash->lm, name); + hashAdd(self->chromNameHash, vcfChrom, name); + } +return name; +} +#endif + +static char **nextRowUnfiltered(struct annoStreamLongTabix *self, char *minChrom, uint minEnd) +/* Get the next longTabix record and put the row text into autoSql words. + * Return pointer to self->asWords if we get a row, otherwise NULL. */ +{ +struct annoStreamer *sSelf = (struct annoStreamer *)self; +char *regionChrom = sSelf->chrom; +uint regionStart = sSelf->regionStart; +if (minChrom != NULL) + { + if (regionChrom == NULL) + { + regionChrom = minChrom; + regionStart = minEnd; + } + else + { + regionStart = max(regionStart, minEnd); + } + } +char **words = nextRowRaw(self); +if (regionChrom != NULL && words != NULL) + { + //char *rowChrom = getProperChromName(self, words[0]); + char *rowChrom = cloneString(words[0]); + if (strcmp(rowChrom, regionChrom) < 0) + { + uint regionEnd = sSelf->regionEnd; + if (minChrom != NULL && sSelf->chrom == NULL) + regionEnd = annoAssemblySeqSize(sSelf->assembly, minChrom); + // If lineFileSetTabixRegion fails, just keep the current file position + // -- hopefully we'll just be skipping to the next row after region{Chrom,Start,End}. + lineFileSetTabixRegion(self->btf->lf, regionChrom, regionStart, regionEnd); + } + } +if (words != NULL) + self->recordCount++; +if (words == NULL || (self->maxRecords > 0 && self->recordCount >= self->maxRecords)) + self->eof = TRUE; +return words; +} + +static struct annoRow *nextRowFiltered(struct annoStreamLongTabix *self, char *minChrom, uint minEnd, + struct lm *callerLm) +/* Get the next record that passes our filters. */ +{ +char **words = nextRowUnfiltered(self, minChrom, minEnd); +if (words == NULL) + return NULL; +// Skip past any left-join failures until we get a right-join failure, a passing row, or EOF. +struct annoStreamer *sSelf = (struct annoStreamer *)self; +boolean rightFail = FALSE; +while (annoFilterRowFails(sSelf->filters, words, sSelf->numCols, &rightFail)) + { + if (rightFail) + break; + words = nextRowUnfiltered(self, minChrom, minEnd); + if (words == NULL) + return NULL; + } +//struct vcfRecord *rec = self->record; +//char *chrom = getProperChromName(self, rec->chrom); +return annoRowFromStringArray(words[0], sqlUnsigned(words[1]), sqlUnsigned(words[2]), + rightFail, words, sSelf->numCols, callerLm); +} + + +static struct annoRow *asxNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, + struct lm *callerLm) +/* Return an annoRow encoding the next longTabix record, or NULL if there are no more items. + * Use queues to save indels aside until we get to the following base, because longTabix's + * indel encoding starts one base to the left of the actual indel. Thus, sorted longTabix might + * not be sorted in our internal coords, but it won't be off by more than one base. */ +{ +struct annoStreamLongTabix *self = (struct annoStreamLongTabix *)sSelf; +if (minChrom != NULL && sSelf->chrom != NULL && differentString(minChrom, sSelf->chrom)) + errAbort("annoStreamLongTabix %s: nextRow minChrom='%s' but region chrom='%s'", + sSelf->name, minChrom, sSelf->chrom); +if (self->eof) + return NULL; +struct annoRow *nextRow = NULL; +if ((nextRow = nextRowFiltered(self, minChrom, minEnd, callerLm)) != NULL) + return nextRow; +return NULL; +} + + +static void asxClose(struct annoStreamer **pVSelf) +/* Close longTabix file and free self. */ +{ +if (pVSelf == NULL) + return; +struct annoStreamLongTabix *self = *(struct annoStreamLongTabix **)pVSelf; +bedTabixFileClose(&(self->btf)); +annoStreamerFree(pVSelf); +} + +struct annoStreamer *annoStreamLongTabixNew(char *fileOrUrl, struct annoAssembly *aa, int maxRecords) +/* Create an annoStreamer (subclass) object from a longTabix indexed tab file */ +{ +struct bedTabixFile *btf = bedTabixFileMayOpen(fileOrUrl, NULL, 0, 0); + +struct annoStreamLongTabix *self; +AllocVar(self); +struct annoStreamer *streamer = &(self->streamer); +struct asObject *asObj = longTabixAsObj(); +annoStreamerInit(streamer, aa, asObj, fileOrUrl); +streamer->rowType = arWords; +streamer->setRegion = asxSetRegion; +streamer->getHeader = asxGetHeader; +streamer->nextRow = asxNextRow; +streamer->close = asxClose; +self->btf = btf; +self->numFileCols = 6; +self->maxRecords = maxRecords; +return (struct annoStreamer *)self; +}