ebc84c82794070f9999daf3bf8c5de7f407d5818
angie
  Fri Dec 2 15:35:06 2011 -0800
Feature #3707 (VCF+tabix support in hgTables): Brooke reported out-of-memconditions in notes 11 & 12.  Fix: add an optional threshold on the number
of records to retrieve in vcfTabixFileMayOpen.

diff --git src/lib/vcf.c src/lib/vcf.c
index e53a6e9..7b22ff1 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -609,104 +609,107 @@
 		}
 	    }
 	continue;
 	}
     *eq = '\0';
     el->key = vcfFilePooledStr(vcff, elStr);
     enum vcfInfoType type = typeForInfoKey(vcff, el->key);
     char *valStr = eq+1;
     el->count = parseInfoValue(record, el->key, type, valStr, &(el->values));
     if (el->count >= VCF_MAX_INFO)
 	vcfFileErr(vcff, "A single element of the INFO column has at least %d values; "
 	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
     }
 }
 
-static void vcfParseData(struct vcfFile *vcff)
+static void vcfParseData(struct vcfFile *vcff, int maxRecords)
 /* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned
  * at the beginning of a data row, parse and store all data rows from lineFile. */
 {
 if (vcff == NULL)
     return;
-int expected = 8;
+int recCount = 0, expected = 8;
 if (vcff->genotypeCount > 0)
     expected = 9 + vcff->genotypeCount;
 char *words[VCF_MAX_COLUMNS];
 int wordCount;
 while ((wordCount = lineFileChop(vcff->lf, words)) > 0)
     {
+    if (maxRecords >= 0 && recCount >= maxRecords)
+	break;
     lineFileExpectWords(vcff->lf, expected, wordCount);
     struct vcfRecord *record;
     AllocVar(record);
     record->file = vcff;
     record->chrom = vcfFilePooledStr(vcff, words[0]);
     record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;
     // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn.
     record->chromEnd = record->chromStart+1;
     record->name = vcfFilePooledStr(vcff, words[2]);
     parseRefAndAlt(vcff, record, words[3], words[4]);
     record->qual = vcfFilePooledStr(vcff, words[5]);
     parseFilterColumn(vcff, record, words[6]);
     parseInfoColumn(vcff, record, words[7]);
     if (vcff->genotypeCount > 0)
 	{
 	record->format = vcfFilePooledStr(vcff, words[8]);
 	record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
 						       vcff->genotypeCount * sizeof(char *));
 	int i;
 	// Don't bother actually parsing all these until & unless we need the info:
 	for (i = 0;  i < vcff->genotypeCount;  i++)
 	    record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
 	}
     slAddHead(&(vcff->records), record);
+    recCount++;
     }
 slReverse(&(vcff->records));
 lineFileClose(&(vcff->lf));
 }
 
-struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr)
+struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, int maxRecords)
 /* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
  * continue to parse until this number of error have been reached.  A maxErr
  * less than zero does not stop and reports all errors. */
 {
 struct lineFile *lf = NULL;
 if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) ||
     startsWith("https://", fileOrUrl))
     lf = netLineFileOpen(fileOrUrl);
 else
     lf = lineFileMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
-vcfParseData(vcff);
+vcfParseData(vcff, maxRecords);
 return vcff;
 }
 
 struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
-				    int maxErr)
+				    int maxErr, int maxRecords)
 /* Parse header and rows within the given position range from a VCF file that has been
  * compressed and indexed by tabix into a vcfFile object; return NULL if or if file has
  * no items in range.
  * If maxErr not zero, then continue to parse until this number of error have been reached.
  * A maxErr less than zero does not stop and reports all errors. */
 {
 struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (vcff == NULL)
     return NULL;
 if (isNotEmpty(chrom) && start != end)
     {
     if (lineFileSetTabixRegion(lf, chrom, start, end))
-	vcfParseData(vcff);
+	vcfParseData(vcff, maxRecords);
     }
 return vcff;
 }
 
 void vcfFileFree(struct vcfFile **pVcff)
 /* Free a vcfFile object. */
 {
 if (pVcff == NULL || *pVcff == NULL)
     return;
 struct vcfFile *vcff = *pVcff;
 freez(&(vcff->headerString));
 hashFree(&(vcff->pool));
 hashFree(&(vcff->byName));
 lineFileClose(&(vcff->lf));
 freez(pVcff);