ebc84c82794070f9999daf3bf8c5de7f407d5818 angie Fri Dec 2 15:35:06 2011 -0800 Feature #3707 (VCF+tabix support in hgTables): Brooke reported out-of-memory conditions in notes 11 & 12. Fix: add an optional threshold on the number of records to retrieve in vcfFileMayOpen and vcfTabixFileMayOpen. diff --git src/lib/vcf.c src/lib/vcf.c index e53a6e9..7b22ff1 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -609,104 +609,107 @@ } } continue; } *eq = '\0'; el->key = vcfFilePooledStr(vcff, elStr); enum vcfInfoType type = typeForInfoKey(vcff, el->key); char *valStr = eq+1; el->count = parseInfoValue(record, el->key, type, valStr, &(el->values)); if (el->count >= VCF_MAX_INFO) vcfFileErr(vcff, "A single element of the INFO column has at least %d values; " "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO); } } -static void vcfParseData(struct vcfFile *vcff) +static void vcfParseData(struct vcfFile *vcff, int maxRecords) /* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned * at the beginning of a data row, parse and store all data rows from lineFile. */ { if (vcff == NULL) return; -int expected = 8; +int recCount = 0, expected = 8; if (vcff->genotypeCount > 0) expected = 9 + vcff->genotypeCount; char *words[VCF_MAX_COLUMNS]; int wordCount; while ((wordCount = lineFileChop(vcff->lf, words)) > 0) { + if (maxRecords >= 0 && recCount >= maxRecords) + break; lineFileExpectWords(vcff->lf, expected, wordCount); struct vcfRecord *record; AllocVar(record); record->file = vcff; record->chrom = vcfFilePooledStr(vcff, words[0]); record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1; // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn. 
record->chromEnd = record->chromStart+1; record->name = vcfFilePooledStr(vcff, words[2]); parseRefAndAlt(vcff, record, words[3], words[4]); record->qual = vcfFilePooledStr(vcff, words[5]); parseFilterColumn(vcff, record, words[6]); parseInfoColumn(vcff, record, words[7]); if (vcff->genotypeCount > 0) { record->format = vcfFilePooledStr(vcff, words[8]); record->genotypeUnparsedStrings = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; // Don't bother actually parsing all these until & unless we need the info: for (i = 0; i < vcff->genotypeCount; i++) record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]); } slAddHead(&(vcff->records), record); + recCount++; } slReverse(&(vcff->records)); lineFileClose(&(vcff->lf)); } -struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr) +struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, int maxRecords) /* Parse a VCF file into a vcfFile object. If maxErr not zero, then * continue to parse until this number of error have been reached. A maxErr * less than zero does not stop and reports all errors. */ { struct lineFile *lf = NULL; if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) || startsWith("https://", fileOrUrl)) lf = netLineFileOpen(fileOrUrl); else lf = lineFileMayOpen(fileOrUrl, TRUE); struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); -vcfParseData(vcff); +vcfParseData(vcff, maxRecords); return vcff; } struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, - int maxErr) + int maxErr, int maxRecords) /* Parse header and rows within the given position range from a VCF file that has been * compressed and indexed by tabix into a vcfFile object; return NULL if or if file has * no items in range. * If maxErr not zero, then continue to parse until this number of error have been reached. * A maxErr less than zero does not stop and reports all errors. 
*/ { struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE); struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); if (vcff == NULL) return NULL; if (isNotEmpty(chrom) && start != end) { if (lineFileSetTabixRegion(lf, chrom, start, end)) - vcfParseData(vcff); + vcfParseData(vcff, maxRecords); } return vcff; } void vcfFileFree(struct vcfFile **pVcff) /* Free a vcfFile object. */ { if (pVcff == NULL || *pVcff == NULL) return; struct vcfFile *vcff = *pVcff; freez(&(vcff->headerString)); hashFree(&(vcff->pool)); hashFree(&(vcff->byName)); lineFileClose(&(vcff->lf)); freez(pVcff);