043ea3a998c9846a647f87ae14d4f27bc01af3a6 angie Wed May 28 13:59:55 2014 -0700 When position was set to a chrom not present in a VCF+tabix file, annoStreamVcf just kept reading rows out of the file instead of paying attention to the return value of lineFileSetTabixRegion. This inefficiency led to carefulMem running into its 6G limit. Fix: if lineFileSetTabixRegion fails in asvSetRegion, set self->eof to prevent reading out rows from the wrong position. If we're trying to skip ahead to minChrom using lineFileSetTabixRegion in nextRowUnfiltered, and it fails, just keep current position in case the file has more chroms after minChrom. fixes #13343 diff --git src/lib/annoStreamVcf.c src/lib/annoStreamVcf.c index 2c15149..2d60f07 100644 --- src/lib/annoStreamVcf.c +++ src/lib/annoStreamVcf.c @@ -20,34 +20,40 @@ struct annoRow *nextPosQ; // FIFO (max len=1) for stashing row while draining indelQ struct lm *qLm; // Local mem for saving rows in Q's (callerLm disappears) int numFileCols; // Number of columns in VCF file. int maxRecords; // Maximum number of annoRows to return. int recordCount; // Number of annoRows we have returned so far. boolean isTabix; // True if we are accessing compressed VCF via tabix index boolean eof; // True when we have hit end of file or maxRecords }; static void asvSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd) /* Set region -- and free current sqlResult if there is one. */ { annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd); struct annoStreamVcf *self = (struct annoStreamVcf *)vSelf; -if (self->isTabix) - lineFileSetTabixRegion(self->vcff->lf, chrom, regionStart, regionEnd); self->indelQ = self->nextPosQ = NULL; self->eof = FALSE; +if (self->isTabix) + { + // If this region is not in tabix index, set self->eof so we won't keep grabbing rows + // from the old position. + boolean gotRegion = lineFileSetTabixRegion(self->vcff->lf, chrom, regionStart, regionEnd); + if (! 
gotRegion) + self->eof = TRUE; + } } static char *asvGetHeader(struct annoStreamer *vSelf) /* Return VCF header (e.g. for use by formatter) */ { struct annoStreamVcf *self = (struct annoStreamVcf *)vSelf; return cloneString(self->vcff->headerString); } static char **nextRowRaw(struct annoStreamVcf *self) /* Get the next VCF record and put the row text into autoSql words. * Return pointer to self->asWords if we get a row, otherwise NULL. */ { char *words[self->numFileCols]; int wordCount; @@ -122,30 +128,32 @@ } else { regionStart = max(regionStart, minEnd); } } char **words = nextRowRaw(self); if (regionChrom != NULL && words != NULL) { char *rowChrom = getProperChromName(self, words[0]); if (self->isTabix && strcmp(rowChrom, regionChrom) < 0) { uint regionEnd = sSelf->regionEnd; if (minChrom != NULL && sSelf->chrom == NULL) regionEnd = annoAssemblySeqSize(sSelf->assembly, minChrom); + // If lineFileSetTabixRegion fails, just keep the current file position + // -- hopefully we'll just be skipping to the next row after region{Chrom,Start,End}. lineFileSetTabixRegion(self->vcff->lf, regionChrom, regionStart, regionEnd); } while (words != NULL && (strcmp(rowChrom, regionChrom) < 0 || (sameString(rowChrom, regionChrom) && self->record->chromEnd < regionStart))) words = nextRowRaw(self); } // Tabix doesn't give us any rows past end of region, but if not using tabix, // detect when we're past end of region: if (words != NULL && !self->isTabix && sSelf->chrom != NULL && self->record->chromStart > sSelf->regionEnd) { words = NULL; self->record = NULL; }