4a2bc0c295f5cd80b8f51959e4cec463047e62d7 angie Sun Dec 4 23:17:29 2011 -0800

Feature #3707 (VCF in hgTables): when calling vcf.c's addFilteredBedsOnRegion in a loop, apply maxOut cumulatively, instead of separately for each region. Unfortunately, some higher-level code calls vcfGetFilteredBedsOnRegions with one region at a time, so it doesn't know it's running in a loop. The check for maxOut needs to move up to the top level (e.g. doSummaryStats), or VCF needs its own top-level routines, or a true genome-wide query needs to be possible in hgTables.

diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c
index d73da5e..2929c86 100644
--- src/hg/hgTables/vcf.c
+++ src/hg/hgTables/vcf.c
@@ -236,30 +236,32 @@
 struct region *region, *regionList = getRegions();
 int maxOut = bigFileMaxOutput();
 // Include the header, absolutely necessary for VCF parsing.
 boolean printedHeader = FALSE;
 // Temporary storage for row-ification:
 struct dyString *dyAlt = newDyString(1024);
 struct dyString *dyFilter = newDyString(1024);
 struct dyString *dyInfo = newDyString(1024);
 struct dyString *dyGt = newDyString(1024);
 struct vcfRecord *rec;
 for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
     {
     char *fileName = vcfFileName(table, conn, region->chrom);
     struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
                                                100, maxOut);
+    if (vcff == NULL)
+        noWarnAbort();
     // If we are outputting all fields, but this VCF has no genotype info, omit the
     // genotype columns from output:
     if (allFields && vcff->genotypeCount == 0)
         fieldCount = VCFDATALINE_NUM_COLS - 2;
     if (!printedHeader)
         {
         fprintf(f, "%s", vcff->headerString);
         if (filter)
             fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
         printedHeader = TRUE;
         }
     char *row[VCFDATALINE_NUM_COLS];
     char numBuf[VCF_NUM_BUF_SIZE];
     for (rec = vcff->records; rec != NULL && (maxOut > 0); rec = rec->next)
         {
@@ -285,97 +287,102 @@
     vcfFileFree(&vcff);
     freeMem(fileName);
     }
 if (maxOut == 0)
     warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
 /* Clean up and exit. */
 dyStringFree(&dyAlt);
 dyStringFree(&dyFilter);
 dyStringFree(&dyInfo);
 dyStringFree(&dyGt);
 hashFree(&fieldHash);
 freeMem(fieldArray);
 freeMem(columnArray);
 }
 
 static void addFilteredBedsOnRegion(char *fileName, struct region *region, char *table,
                                     struct asFilter *filter, struct lm *bedLm,
-                                    struct bed **pBedList, struct hash *idHash)
+                                    struct bed **pBedList, struct hash *idHash, int *pMaxOut)
 /* Add relevant beds in reverse order to pBedList */
 {
-int maxOut = bigFileMaxOutput();
 struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
-                                           100, maxOut);
+                                           100, *pMaxOut);
+if (vcff == NULL)
+    noWarnAbort();
 struct lm *lm = lmInit(0);
 char *row[VCFDATALINE_NUM_COLS];
 char numBuf[VCF_NUM_BUF_SIZE];
 // Temporary storage for row-ification:
 struct dyString *dyAlt = newDyString(1024);
 struct dyString *dyFilter = newDyString(1024);
 struct dyString *dyInfo = newDyString(1024);
 struct dyString *dyGt = newDyString(1024);
 struct vcfRecord *rec;
 for (rec = vcff->records; rec != NULL; rec = rec->next)
     {
     vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row);
     if (asFilterOnRow(filter, row))
         {
         if ((idHash != NULL) && (hashLookup(idHash, rec->name) == NULL))
             continue;
         struct bed *bed;
         lmAllocVar(bedLm, bed);
         bed->chrom = lmCloneString(bedLm, region->chrom);
         bed->chromStart = rec->chromStart;
         bed->chromEnd = rec->chromEnd;
         bed->name = lmCloneString(bedLm, rec->name);
         slAddHead(pBedList, bed);
         }
+    (*pMaxOut)--;
     }
 dyStringFree(&dyAlt);
 dyStringFree(&dyFilter);
 dyStringFree(&dyInfo);
 dyStringFree(&dyGt);
 lmCleanup(&lm);
 vcfFileFree(&vcff);
 }
 
 struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn,
         char *db, char *table, struct region *regionList, struct lm *lm,
         int *retFieldCount)
 /* Get list of beds from VCF, in all regions, that pass filtering. */
 {
+int maxOut = bigFileMaxOutput();
 /* Figure out vcf file name get column info and filter. */
 struct asObject *as = vcfAsObj();
 struct asFilter *filter = asFilterFromCart(cart, db, table, as);
 struct hash *idHash = identifierHash(db, table);
 
 /* Get beds a region at a time. */
 struct bed *bedList = NULL;
 struct region *region;
 for (region = regionList; region != NULL; region = region->next)
     {
     char *fileName = vcfFileName(table, conn, region->chrom);
-    addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash);
+    addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut);
     freeMem(fileName);
+    if (maxOut <= 0)
+        break;  //#*** need to warn here
     }
 slReverse(&bedList);
 return bedList;
 }
 
 struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count)
 /* Return some semi-random IDs from a VCF file. */
 {
 /* Read 10000 items from vcf file, or if they ask for a big list, then 4x what they ask for. */
 char *fileName = vcfFileName(table, conn, NULL);
 struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
 if (lf == NULL)
-    errAbort("%s", "");
+    noWarnAbort();
 int orderedCount = count * 4;
 if (orderedCount < 10000)
     orderedCount = 10000;
 struct slName *idList = NULL;
 char *words[4];
 int i;
 for (i = 0; i < orderedCount && lineFileChop(lf, words); i++)
     slAddHead(&idList, slNameNew(words[2]));
 lineFileClose(&lf);
 /* Shuffle list and trim it to count if necessary. */
 shuffleList(&idList, 1);
 struct slName *sl;
 for (sl = idList, i = 0; sl != NULL; i++, sl = sl->next, i++)
     {
     if (i+1 >= count)
@@ -415,31 +422,31 @@
 for (col = as->columnList; col != NULL; col = col->next)
     {
     hPrintf("<TR><TD><TT>%s</TT></TD>", col->name);
     hPrintf("<TD>%s</TD></TR>", col->comment);
     ++colCount;
     }
 hTableEnd();
 
 /* Put up another section with sample rows. */
 webNewSection("Sample Rows");
 hTableStart();
 
 /* Fetch sample rows. */
 struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
 if (lf == NULL)
-    errAbort("%s", "");
+    noWarnAbort();
 char *row[VCF_MAX_SCHEMA_COLS];
 int i;
 for (i = 0; i < 10; i++)
     {
     int colCount = lineFileChop(lf, row);
     int colIx;
     if (i == 0)
         {
         // Print field names as column headers, using colCount to compute genotype span
         hPrintf("<TR>");
         for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount; colIx++, col = col->next)
             {
             if (sameString("genotypes", col->name) && colCount > colIx+1)
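
To make the intent of the patch easier to see outside of hgTables, here is a minimal standalone sketch of the pattern being introduced: the caller owns a single output budget and passes it by pointer, so every per-region call decrements the same counter and the loop can stop once it is spent, instead of each call starting from a fresh bigFileMaxOutput() limit. This is not kent code; the names (struct Region, emitRegionRows) and the row counts are made up for illustration.

/* cumulative_budget_demo.c -- illustrative sketch only, not hgTables code. */
#include <stdio.h>

struct Region
    {
    struct Region *next;   /* next region in the list */
    const char *chrom;     /* chromosome name */
    int rowCount;          /* rows this region would emit if unlimited */
    };

static void emitRegionRows(struct Region *region, int *pMaxOut)
/* Emit rows for one region, decrementing the shared budget *pMaxOut for each row
 * and stopping as soon as the budget reaches zero. */
{
int i;
for (i = 0;  i < region->rowCount && *pMaxOut > 0;  i++)
    {
    printf("%s\trow %d\n", region->chrom, i);
    (*pMaxOut)--;
    }
}

int main(void)
{
struct Region r3 = { NULL, "chr3", 4 };
struct Region r2 = { &r3,  "chr2", 4 };
struct Region r1 = { &r2,  "chr1", 4 };
int maxOut = 10;           /* one cumulative limit for the whole region list */
struct Region *region;
for (region = &r1;  region != NULL;  region = region->next)
    {
    emitRegionRows(region, &maxOut);
    if (maxOut <= 0)
        {
        fprintf(stderr, "Reached output limit; stopping before the remaining regions.\n");
        break;
        }
    }
return 0;
}

With the old per-call behavior, each region would have received its own limit of 10 and printed all 12 rows; with the shared counter, output stops after 10 rows total, which is what applying maxOut cumulatively across regions means here.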