4a2bc0c295f5cd80b8f51959e4cec463047e62d7 angie Sun Dec 4 23:17:29 2011 -0800

Feature #3707 (VCF in hgTables): when calling vcf.c's addFilteredBedsOnRegion in a loop, apply maxOut cumulatively, instead of separately for each region. Unfortunately, some higher-level code calls vcfGetFilteredBedsOnRegions with one region at a time, so it doesn't know it's running in a loop. The check for maxOut needs to move up to the top level (e.g. doSummaryStats), or VCF needs its own top-level routines, or a true genome-wide query needs to be possible in hgTables.

diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c
index d73da5e..2929c86 100644
--- src/hg/hgTables/vcf.c
+++ src/hg/hgTables/vcf.c
@@ -236,30 +236,32 @@
 struct region *region, *regionList = getRegions();
 int maxOut = bigFileMaxOutput();
 // Include the header, absolutely necessary for VCF parsing.
 boolean printedHeader = FALSE;
 // Temporary storage for row-ification:
 struct dyString *dyAlt = newDyString(1024);
 struct dyString *dyFilter = newDyString(1024);
 struct dyString *dyInfo = newDyString(1024);
 struct dyString *dyGt = newDyString(1024);
 struct vcfRecord *rec;
 for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
     {
     char *fileName = vcfFileName(table, conn, region->chrom);
     struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
                                                100, maxOut);
+    if (vcff == NULL)
+        noWarnAbort();
     // If we are outputting all fields, but this VCF has no genotype info, omit the
     // genotype columns from output:
     if (allFields && vcff->genotypeCount == 0)
         fieldCount = VCFDATALINE_NUM_COLS - 2;
     if (!printedHeader)
         {
         fprintf(f, "%s", vcff->headerString);
         if (filter)
             fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
         printedHeader = TRUE;
         }
     char *row[VCFDATALINE_NUM_COLS];
     char numBuf[VCF_NUM_BUF_SIZE];
     for (rec = vcff->records; rec != NULL && (maxOut > 0); rec = rec->next)
         {
@@ -285,97 +287,102 @@
     vcfFileFree(&vcff);
     freeMem(fileName);
     }
 if (maxOut == 0)
     warn("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
 /* Clean up and exit. */
 dyStringFree(&dyAlt);
 dyStringFree(&dyFilter);
 dyStringFree(&dyInfo);
 dyStringFree(&dyGt);
 hashFree(&fieldHash);
 freeMem(fieldArray);
 freeMem(columnArray);
 }
 
 static void addFilteredBedsOnRegion(char *fileName, struct region *region, char *table,
                                     struct asFilter *filter, struct lm *bedLm,
-                                    struct bed **pBedList, struct hash *idHash)
+                                    struct bed **pBedList, struct hash *idHash, int *pMaxOut)
 /* Add relevant beds in reverse order to pBedList */
 {
-int maxOut = bigFileMaxOutput();
 struct vcfFile *vcff = vcfTabixFileMayOpen(fileName, region->chrom, region->start, region->end,
-                                           100, maxOut);
+                                           100, *pMaxOut);
+if (vcff == NULL)
+    noWarnAbort();
 struct lm *lm = lmInit(0);
 char *row[VCFDATALINE_NUM_COLS];
 char numBuf[VCF_NUM_BUF_SIZE];
 // Temporary storage for row-ification:
 struct dyString *dyAlt = newDyString(1024);
 struct dyString *dyFilter = newDyString(1024);
 struct dyString *dyInfo = newDyString(1024);
 struct dyString *dyGt = newDyString(1024);
 struct vcfRecord *rec;
 for (rec = vcff->records; rec != NULL; rec = rec->next)
     {
     vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row);
     if (asFilterOnRow(filter, row))
         {
         if ((idHash != NULL) && (hashLookup(idHash, rec->name) == NULL))
             continue;
         struct bed *bed;
         lmAllocVar(bedLm, bed);
         bed->chrom = lmCloneString(bedLm, region->chrom);
         bed->chromStart = rec->chromStart;
         bed->chromEnd = rec->chromEnd;
         bed->name = lmCloneString(bedLm, rec->name);
         slAddHead(pBedList, bed);
         }
+    (*pMaxOut)--;
     }
 dyStringFree(&dyAlt);
 dyStringFree(&dyFilter);
 dyStringFree(&dyInfo);
 dyStringFree(&dyGt);
 lmCleanup(&lm);
 vcfFileFree(&vcff);
 }
 
 struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn,
         char *db, char *table, struct region *regionList, struct lm *lm,
         int *retFieldCount)
 /* Get list of beds from VCF, in all regions, that pass filtering. */
 {
+int maxOut = bigFileMaxOutput();
 /* Figure out vcf file name get column info and filter. */
 struct asObject *as = vcfAsObj();
 struct asFilter *filter = asFilterFromCart(cart, db, table, as);
 struct hash *idHash = identifierHash(db, table);
 
 /* Get beds a region at a time. */
 struct bed *bedList = NULL;
 struct region *region;
 for (region = regionList; region != NULL; region = region->next)
     {
     char *fileName = vcfFileName(table, conn, region->chrom);
-    addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash);
+    addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList, idHash, &maxOut);
     freeMem(fileName);
+    if (maxOut <= 0)
+        break;  //#*** need to warn here
     }
 slReverse(&bedList);
 return bedList;
 }
 
 struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count)
 /* Return some semi-random IDs from a VCF file. */
 {
 /* Read 10000 items from vcf file, or if they ask for a big list, then 4x what they ask for. */
 char *fileName = vcfFileName(table, conn, NULL);
 struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
 if (lf == NULL)
-    errAbort("%s", "");
+    noWarnAbort();
 int orderedCount = count * 4;
 if (orderedCount < 10000)
     orderedCount = 10000;
 struct slName *idList = NULL;
 char *words[4];
 int i;
 for (i = 0; i < orderedCount && lineFileChop(lf, words); i++)
     slAddHead(&idList, slNameNew(words[2]));
 lineFileClose(&lf);
 /* Shuffle list and trim it to count if necessary. */
 shuffleList(&idList, 1);
 struct slName *sl;
 for (sl = idList, i = 0; sl != NULL; i++, sl = sl->next, i++)
     {
     if (i+1 >= count)
@@ -415,31 +422,31 @@
 for (col = as->columnList; col != NULL; col = col->next)
     {
     hPrintf("<TR><TD><TT>%s</TT></TD>", col->name);
     hPrintf("<TD>%s</TD></TR>", col->comment);
     ++colCount;
     }
 hTableEnd();
 
 /* Put up another section with sample rows. */
 webNewSection("Sample Rows");
 hTableStart();
 
 /* Fetch sample rows. */
 struct lineFile *lf = lineFileTabixMayOpen(fileName, TRUE);
 if (lf == NULL)
-    errAbort("%s", "");
+    noWarnAbort();
 char *row[VCF_MAX_SCHEMA_COLS];
 int i;
 for (i = 0; i < 10; i++)
     {
     int colCount = lineFileChop(lf, row);
     int colIx;
     if (i == 0)
         {
         // Print field names as column headers, using colCount to compute genotype span
         hPrintf("<TR>");
         for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount; colIx++, col = col->next)
             {
             if (sameString("genotypes", col->name) && colCount > colIx+1)
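
To make the intent of the patch easier to see outside of hgTables, here is a minimal standalone sketch of the pattern being introduced: the caller owns a single output budget and passes it by pointer, so every per-region call decrements the same counter and the loop can stop once it is spent, instead of each call starting from a fresh bigFileMaxOutput() limit. This is not kent code; the names (struct Region, emitRegionRows) and the row counts are made up for illustration.

/* cumulative_budget_demo.c -- illustrative sketch only, not hgTables code. */
#include <stdio.h>

struct Region
    {
    struct Region *next;   /* next region in the list */
    const char *chrom;     /* chromosome name */
    int rowCount;          /* rows this region would emit if unlimited */
    };

static void emitRegionRows(struct Region *region, int *pMaxOut)
/* Emit rows for one region, decrementing the shared budget *pMaxOut for each row
 * and stopping as soon as the budget reaches zero. */
{
int i;
for (i = 0;  i < region->rowCount && *pMaxOut > 0;  i++)
    {
    printf("%s\trow %d\n", region->chrom, i);
    (*pMaxOut)--;
    }
}

int main(void)
{
struct Region r3 = { NULL, "chr3", 4 };
struct Region r2 = { &r3,  "chr2", 4 };
struct Region r1 = { &r2,  "chr1", 4 };
int maxOut = 10;           /* one cumulative limit for the whole region list */
struct Region *region;
for (region = &r1;  region != NULL;  region = region->next)
    {
    emitRegionRows(region, &maxOut);
    if (maxOut <= 0)
        {
        fprintf(stderr, "Reached output limit; stopping before the remaining regions.\n");
        break;
        }
    }
return 0;
}

With the old per-call behavior, each region would have received its own limit of 10 and printed all 12 rows; with the shared counter, output stops after 10 rows total, which is what applying maxOut cumulatively across regions means here.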