40ed3e546ef868bffe4c338d93d6528138ecfc44 angie Wed Sep 30 09:25:12 2015 -0700 Add test cases (and bug fixes!) to make sure that we get the desired behavior when an insertion is adjacent to a non-insertion, and when an insertion falls at the start or end of the search region (if applicable): - Include insertions that fall at the start or end of the search region - If the primary row is an insertion, keep secondary non-insertion rows to the left and right. - If the primary row is a non-insertion, keep secondary insertion rows at its start and end. (i.e. keep insertions at boundaries -- but don't let any non-insertions slip through) The VCF logic is more complicated because VCF indels always include an extra base to the left, so they appear to start before they actually do, and can be interspersed with non-indels that start there. diff --git src/lib/annoStreamVcf.c src/lib/annoStreamVcf.c index 2c652b5..cdd2a81 100644 --- src/lib/annoStreamVcf.c +++ src/lib/annoStreamVcf.c @@ -75,30 +75,32 @@ dyStringClear(self->dyGt); for (i = 0; i < self->vcff->genotypeCount; i++) { if (i > 0) dyStringAppendC(self->dyGt, '\t'); dyStringAppend(self->dyGt, words[9+i]); } self->asWords[9] = self->dyGt->string; } else { self->asWords[8] = ""; self->asWords[9] = ""; } self->record = vcfRecordFromRow(self->vcff, words); +vcfRecordTrimIndelLeftBase(self->record); +vcfRecordTrimAllelesRight(self->record); return self->asWords; } static char *getProperChromName(struct annoStreamVcf *self, char *vcfChrom) /* We tolerate chr-less chrom names in VCF and BAM ("1" for "chr1" etc); to avoid * confusing the rest of the system, return the chr-ful version if it exists. */ { char *name = hashFindVal(self->chromNameHash, vcfChrom); if (name == NULL) { name = vcfChrom; struct twoBitFile *tbf = self->streamer.assembly->tbf; char buf[256]; if (! twoBitIsSequence(tbf, vcfChrom)) { @@ -132,37 +134,47 @@ } } char **words = nextRowRaw(self); if (regionChrom != NULL && words != NULL) { char *rowChrom = getProperChromName(self, words[0]); if (self->isTabix && strcmp(rowChrom, regionChrom) < 0) { uint regionEnd = sSelf->regionEnd; if (minChrom != NULL && sSelf->chrom == NULL) regionEnd = annoAssemblySeqSize(sSelf->assembly, minChrom); // If lineFileSetTabixRegion fails, just keep the current file position // -- hopefully we'll just be skipping to the next row after region{Chrom,Start,End}. lineFileSetTabixRegion(self->vcff->lf, regionChrom, regionStart, regionEnd); } + // Skip to next row if this is on a previous chromosome, or is on regionChrom but + // ends before regionStart or ends at regionStart and is not an insertion. + struct vcfRecord *rec = self->record; while (words != NULL && (strcmp(rowChrom, regionChrom) < 0 || - (sameString(rowChrom, regionChrom) && self->record->chromEnd < regionStart))) + (sameString(rowChrom, regionChrom) && + (rec->chromEnd < regionStart || + (rec->chromStart != rec->chromEnd && rec->chromEnd == regionStart))))) + { words = nextRowRaw(self); + rec = self->record; + } } // Tabix doesn't give us any rows past end of region, but if not using tabix, -// detect when we're past end of region: +// detect when we're past end of region. It's possible for a non-indel to precede +// an insertion that has the same VCF start coord but is actually to its left, +// so we can't just quit as soon as we see anything with chromStart == regionEnd. if (words != NULL && !self->isTabix && sSelf->chrom != NULL && self->record->chromStart > sSelf->regionEnd) { words = NULL; self->record = NULL; } if (words != NULL) self->recordCount++; if (words == NULL || (self->maxRecords > 0 && self->recordCount >= self->maxRecords)) self->eof = TRUE; return words; } static struct annoRow *nextRowFiltered(struct annoStreamVcf *self, char *minChrom, uint minEnd, struct lm *callerLm) @@ -171,32 +183,30 @@ char **words = nextRowUnfiltered(self, minChrom, minEnd); if (words == NULL) return NULL; // Skip past any left-join failures until we get a right-join failure, a passing row, or EOF. struct annoStreamer *sSelf = (struct annoStreamer *)self; boolean rightFail = FALSE; while (annoFilterRowFails(sSelf->filters, words, sSelf->numCols, &rightFail)) { if (rightFail) break; words = nextRowUnfiltered(self, minChrom, minEnd); if (words == NULL) return NULL; } struct vcfRecord *rec = self->record; -vcfRecordTrimIndelLeftBase(rec); -vcfRecordTrimAllelesRight(rec); char *chrom = getProperChromName(self, rec->chrom); return annoRowFromStringArray(chrom, rec->chromStart, rec->chromEnd, rightFail, words, sSelf->numCols, callerLm); } INLINE boolean vcfAnnoRowIsIndel(struct annoRow *row) /* If vcf row's start is one base to the left of row->start, it's an indel whose start * coord and alleles required translation into our representation. */ { char **words = (char **)(row->data); uint vcfStart = atoll(words[1]); return (row->start == vcfStart); // 0-based row->start, 1-based vcfStart } INLINE void nextPosQShouldBeEmpty(struct annoStreamVcf *self) @@ -210,31 +220,31 @@ INLINE struct annoRow *asvCloneForQ(struct annoStreamVcf *self, struct annoRow *row) /* Rows that we store in our queues have to be clone with our own qLm for persistence * across calls to asvNextRow. */ { if (self->indelQ == NULL && self->nextPosQ == NULL && lmSize(self->qLm) > MAX_QLM_MEM) { lmCleanup(&self->qLm); self->qLm = lmInit(0); } struct annoStreamer *sSelf = (struct annoStreamer *)self; return annoRowClone(row, sSelf->rowType, sSelf->numCols, self->qLm); } INLINE struct annoRow *asvPopQ(struct annoRow **pQ, struct annoStreamer *sSelf, struct lm *callerLm) -/* Return the row at the head of the indel queue, cloned with caller's localmem. */ +/* Return the row at the head of the given queue, cloned with caller's localmem. */ { return annoRowClone(slPopHead(pQ), sSelf->rowType, sSelf->numCols, callerLm); } static struct annoRow *asvNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, struct lm *callerLm) /* Return an annoRow encoding the next VCF record, or NULL if there are no more items. * Use queues to save indels aside until we get to the following base, because VCF's * indel encoding starts one base to the left of the actual indel. Thus, sorted VCF might * not be sorted in our internal coords, but it won't be off by more than one base. */ { struct annoStreamVcf *self = (struct annoStreamVcf *)sSelf; if (minChrom != NULL && sSelf->chrom != NULL && differentString(minChrom, sSelf->chrom)) errAbort("annoStreamVcf %s: nextRow minChrom='%s' but region chrom='%s'", sSelf->name, minChrom, sSelf->chrom); @@ -251,30 +261,37 @@ { // Another indel at the next position -- move it from nextPosQ to indelQ // and keep searching below. self->indelQ = slPopHead(&self->nextPosQ); nextPosQShouldBeEmpty(self); } else return asvPopQ(&self->nextPosQ, sSelf, callerLm); } else // eof return NULL; } struct annoRow *nextRow = NULL; while ((nextRow = nextRowFiltered(self, minChrom, minEnd, callerLm)) != NULL) { + // nextRowUnfiltered may have to let a substitution past the end of the range + // slip through because the next line(s) of VCF may have an insertion touching the range + // that we can't discard. Catch the non-insertions here. + if (sSelf->chrom != NULL && + nextRow->start == sSelf->regionEnd && nextRow->start != nextRow->end) + continue; + // nextRow has been allocated with callerLm; if we save it for later, we need a clone in qLm. // If we're popping a row from qLm, we need to clone back to the (probably new) callerLm. if (vcfAnnoRowIsIndel(nextRow)) { if (self->indelQ != NULL) { // Coords are apples-to-apples (both indels), look for strict ordering: if (self->indelQ->start < nextRow->start || differentString(self->indelQ->chrom, nextRow->chrom)) { // Another indel, but at some subsequent base -- store in nextPosQ & pop indelQ nextPosQShouldBeEmpty(self); self->nextPosQ = asvCloneForQ(self, nextRow); return asvPopQ(&self->indelQ, sSelf, callerLm); }