3c9f824a9764c5eed782ae3f07146aadecec74f0 kate Mon Nov 27 15:07:15 2017 -0800 Cleanup to ease some fixes to longTabix. diff --git src/lib/annoStreamTabix.c src/lib/annoStreamTabix.c deleted file mode 100644 index 5ea068a..0000000 --- src/lib/annoStreamTabix.c +++ /dev/null @@ -1,228 +0,0 @@ -/* annoStreamTabix -- subclass of annoStreamer for VCF files */ - -/* Copyright (C) 2014 The Regents of the University of California - * See README in this or parent directory for licensing information. */ - -#include "annoStreamTabix.h" -#include "twoBit.h" -#include "bedTabix.h" -#include "sqlNum.h" - -static char *longTabixAutoSqlString = -"table longTabix\n" -"\"Long Range Tabix file\"\n" -" (\n" -" string chrom; \"Reference sequence chromosome or scaffold\"\n" -" uint chromStart; \"Start position in chromosome\"\n" -" uint chromEnd; \"End position in chromosome\"\n" -" string interactingRegion; \"(e.g. chrX:123-456,3.14, where chrX:123-456 is the coordinate of the mate, and 3.14 is the score of the interaction)\"\n" -" uint id; \"Unique Id\"\n" -" char[1] strand; \"+ or -\"\n" -" )\n" -; - -struct asObject *longTabixAsObj() -// Return asObject describing fields of longTabix file -{ -return asParseText(longTabixAutoSqlString); -} - -struct annoStreamTabix - { - struct annoStreamer streamer; // Parent class members & methods - // Private members - char *asWords[6]; // Current row of VCF with genotypes squashed for autoSql - struct bedTabixFile *btf; // VCF parsed header and file object - int numFileCols; // Number of columns in tabix file. - int maxRecords; // Maximum number of annoRows to return. - int recordCount; // Number of annoRows we have returned so far. - boolean eof; // True when we have hit end of file or maxRecords - }; - - -static void asxSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd) -/* Set region and reset internal state. */ -{ -annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd); -struct annoStreamTabix *self = (struct annoStreamTabix *)vSelf; -self->eof = FALSE; -if (chrom != NULL) - { - // If this region is not in tabix index, set self->eof so we won't keep grabbing rows - // from the old position. - boolean gotRegion = lineFileSetTabixRegion(self->btf->lf, chrom, regionStart, regionEnd); - if (! gotRegion) - self->eof = TRUE; - } -} - -static char *asxGetHeader(struct annoStreamer *vSelf) -/* Return VCF header (e.g. for use by formatter) */ -{ -#ifdef NOTNOW -struct annoStreamTabix *self = (struct annoStreamTabix *)vSelf; -return cloneString(self->btf->headerString); -#endif -return NULL; -} - -static char **nextRowRaw(struct annoStreamTabix *self) -/* Get the next VCF record and put the row text into autoSql words. - * Return pointer to self->asWords if we get a row, otherwise NULL. */ -{ -char *words[self->numFileCols]; -int wordCount; -if ((wordCount = lineFileChop(self->btf->lf, words)) <= 0) - return NULL; -lineFileExpectWords(self->btf->lf, self->numFileCols, wordCount); -int i; -// First 6 columns are always in the VCF file: -for (i = 0; i < 6; i++) - { - freeMem(self->asWords[i]); - self->asWords[i] = cloneString(words[i]); - } -//self->record = vcfRecordFromRow(self->btf, words); -return self->asWords; -} - -#ifdef NOTNOW -static char *getProperChromName(struct annoStreamTabix *self, char *vcfChrom) -/* We tolerate chr-less chrom names in VCF and BAM ("1" for "chr1" etc); to avoid - * confusing the rest of the system, return the chr-ful version if it exists. */ -{ -char *name = hashFindVal(self->chromNameHash, vcfChrom); -if (name == NULL) - { - name = vcfChrom; - struct twoBitFile *tbf = self->streamer.assembly->tbf; - char buf[256]; - if (! twoBitIsSequence(tbf, vcfChrom)) - { - safef(buf, sizeof(buf), "chr%s", vcfChrom); - if (twoBitIsSequence(tbf, buf)) - name = buf; - } - name = lmCloneString(self->chromNameHash->lm, name); - hashAdd(self->chromNameHash, vcfChrom, name); - } -return name; -} -#endif - -static char **nextRowUnfiltered(struct annoStreamTabix *self, char *minChrom, uint minEnd) -/* Get the next VCF record and put the row text into autoSql words. - * Return pointer to self->asWords if we get a row, otherwise NULL. */ -{ -struct annoStreamer *sSelf = (struct annoStreamer *)self; -char *regionChrom = sSelf->chrom; -uint regionStart = sSelf->regionStart; -if (minChrom != NULL) - { - if (regionChrom == NULL) - { - regionChrom = minChrom; - regionStart = minEnd; - } - else - { - regionStart = max(regionStart, minEnd); - } - } -char **words = nextRowRaw(self); -if (regionChrom != NULL && words != NULL) - { - //char *rowChrom = getProperChromName(self, words[0]); - char *rowChrom = cloneString(words[0]); - if (strcmp(rowChrom, regionChrom) < 0) - { - uint regionEnd = sSelf->regionEnd; - if (minChrom != NULL && sSelf->chrom == NULL) - regionEnd = annoAssemblySeqSize(sSelf->assembly, minChrom); - // If lineFileSetTabixRegion fails, just keep the current file position - // -- hopefully we'll just be skipping to the next row after region{Chrom,Start,End}. - lineFileSetTabixRegion(self->btf->lf, regionChrom, regionStart, regionEnd); - } - } -if (words != NULL) - self->recordCount++; -if (words == NULL || (self->maxRecords > 0 && self->recordCount >= self->maxRecords)) - self->eof = TRUE; -return words; -} - -static struct annoRow *nextRowFiltered(struct annoStreamTabix *self, char *minChrom, uint minEnd, - struct lm *callerLm) -/* Get the next record that passes our filters. */ -{ -char **words = nextRowUnfiltered(self, minChrom, minEnd); -if (words == NULL) - return NULL; -// Skip past any left-join failures until we get a right-join failure, a passing row, or EOF. -struct annoStreamer *sSelf = (struct annoStreamer *)self; -boolean rightFail = FALSE; -while (annoFilterRowFails(sSelf->filters, words, sSelf->numCols, &rightFail)) - { - if (rightFail) - break; - words = nextRowUnfiltered(self, minChrom, minEnd); - if (words == NULL) - return NULL; - } -//struct vcfRecord *rec = self->record; -//char *chrom = getProperChromName(self, rec->chrom); -return annoRowFromStringArray(words[0], sqlUnsigned(words[1]), sqlUnsigned(words[2]), - rightFail, words, sSelf->numCols, callerLm); -} - - -static struct annoRow *asxNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, - struct lm *callerLm) -/* Return an annoRow encoding the next VCF record, or NULL if there are no more items. - * Use queues to save indels aside until we get to the following base, because VCF's - * indel encoding starts one base to the left of the actual indel. Thus, sorted VCF might - * not be sorted in our internal coords, but it won't be off by more than one base. */ -{ -struct annoStreamTabix *self = (struct annoStreamTabix *)sSelf; -if (minChrom != NULL && sSelf->chrom != NULL && differentString(minChrom, sSelf->chrom)) - errAbort("annoStreamTabix %s: nextRow minChrom='%s' but region chrom='%s'", - sSelf->name, minChrom, sSelf->chrom); -if (self->eof) - return NULL; -struct annoRow *nextRow = NULL; -if ((nextRow = nextRowFiltered(self, minChrom, minEnd, callerLm)) != NULL) - return nextRow; -return NULL; -} - - -static void asxClose(struct annoStreamer **pVSelf) -/* Close VCF file and free self. */ -{ -if (pVSelf == NULL) - return; -struct annoStreamTabix *self = *(struct annoStreamTabix **)pVSelf; -bedTabixFileClose(&(self->btf)); -annoStreamerFree(pVSelf); -} - -struct annoStreamer *annoStreamTabixNew(char *fileOrUrl, struct annoAssembly *aa, int maxRecords) -/* Create an annoStreamer (subclass) object from a tabix indexed tab file */ -{ -struct bedTabixFile *btf = bedTabixFileMayOpen(fileOrUrl, NULL, 0, 0); - -struct annoStreamTabix *self; -AllocVar(self); -struct annoStreamer *streamer = &(self->streamer); -struct asObject *asObj = longTabixAsObj(); -annoStreamerInit(streamer, aa, asObj, fileOrUrl); -streamer->rowType = arWords; -streamer->setRegion = asxSetRegion; -streamer->getHeader = asxGetHeader; -streamer->nextRow = asxNextRow; -streamer->close = asxClose; -self->btf = btf; -self->numFileCols = 6; -self->maxRecords = maxRecords; -return (struct annoStreamer *)self; -}