70d4208efa11f262ba5591198214c9ccdc6b54ae
angie
  Mon Aug 22 22:24:05 2011 -0700
Feature #3707 (VCF+tabix support in hgTables):Basic hgTables support for VCF, modeled after BAM code.  Tested all fields,
selected fields, bed output; filter, intersection, schema
on 3 flavors of vcfTabix:
ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20101123/interim_phase1_release/ALL.chr17.phase1.projectConsensus.genotypes.vcf.gz
ftp://ftp-trace.ncbi.nlm.nih.gov/1000genomes/ftp/release/20100804/AFR.dindel.20100804.sites.vcf.gz
ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/v4.0/ByChromosomeNoGeno/00-All.vcf.gz

diff --git src/lib/vcf.c src/lib/vcf.c
index 8b610ba..35e834a 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -256,32 +256,30 @@
 }
 
 static void parseMetadataLine(struct vcfFile *vcff, char *line)
 /* Parse a VCF header line beginning with "##" that defines a metadata. */
 {
 char *ptr = line;
 if (ptr == NULL && !startsWith(ptr, "##"))
     errAbort("Bad line passed to parseMetadataLine");
 ptr += 2;
 char *firstEq = strchr(ptr, '=');
 if (firstEq == NULL)
     {
     vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line);
     return;
     }
-// Every metadata line is saved here:
-hashAddN(vcff->metaDataHash, ptr, (firstEq - ptr), vcfFileCloneStr(vcff, firstEq+1));
 regmatch_t substrs[8];
 // Some of the metadata lines are crucial for parsing the rest of the file:
 if (startsWith("##fileformat=", line) || startsWith("##format", line))
     {
     if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs)))
 	{
 	// substrs[2] is major version #, substrs[3] is set only if there is a minor version,
 	// and substrs[4] is the minor version #.
 	vcff->majorVersion = atoi(line + substrs[2].rm_so);
 	if (substrs[3].rm_so != -1)
 	    vcff->minorVersion = atoi(line + substrs[4].rm_so);
 	}
     else
 	vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"",
 		   fileformatRegex, line);
@@ -373,68 +371,75 @@
     if (wordCount < 10)
 	vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?");
     vcff->genotypeCount = (wordCount - 9);
     vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *));
     int i;
     for (i = 9;  i < wordCount;  i++)
 	vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]);
     }
 }
 
 static struct vcfFile *vcfFileNew()
 /* Return a new, empty vcfFile object. */
 {
 struct vcfFile *vcff = NULL;
 AllocVar(vcff);
-vcff->metaDataHash = hashNew(0);
 vcff->pool = hashNew(0);
 return vcff;
 }
 
 static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr)
 /* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
  * continue to parse until this number of error have been reached.  A maxErr
  * less than zero does not stop and reports all errors. */
 {
 initVcfSpecInfoDefs();
 initVcfSpecGtFormatDefs();
 if (lf == NULL)
     return NULL;
 struct vcfFile *vcff = vcfFileNew();
 vcff->lf = lf;
 vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName);
 vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr;
 
+struct dyString *dyHeader = dyStringNew(1024);
 char *line = NULL;
 // First, metadata lines beginning with "##":
 while (lineFileNext(lf, &line, NULL) && startsWith("##", line))
+    {
+    dyStringAppend(dyHeader, line);
+    dyStringAppendC(dyHeader, '\n');
     parseMetadataLine(vcff, line);
+    }
 slReverse(&(vcff->infoDefs));
 slReverse(&(vcff->filterDefs));
 slReverse(&(vcff->gtFormatDefs));
 // Did we get the bare minimum VCF header with supported version?
 if (vcff->majorVersion == 0)
     vcfFileErr(vcff, "missing ##fileformat= header line?  Assuming 4.1.");
 if ((vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) &&
     (vcff->majorVersion != 3))
     vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1",
 	       vcff->majorVersion, vcff->minorVersion);
 // Next, one header line beginning with single "#" that names the columns:
 if (line == NULL)
     // EOF after metadata
     return vcff;
+dyStringAppend(dyHeader, line);
+dyStringAppendC(dyHeader, '\n');
 parseColumnHeaderRow(vcff, line);
+vcff->headerString = dyStringCannibalize(&dyHeader);
 return vcff;
 }
 
 
 #define VCF_MAX_INFO 512
 
 static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt)
 /* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns.
  * Use the length of the reference sequence to set record->chromEnd.
  * Note: this trashes the alt argument, since this is expected to be its last use. */
 {
 char *altAlleles[VCF_MAX_INFO];
 int altCount = chopCommas(alt, altAlleles);
 record->alleleCount = 1 + altCount;
 record->alleles = vcfFileAlloc(vcff, record->alleleCount * sizeof(record->alleles[0]));
@@ -658,33 +663,33 @@
     return NULL;
 if (isNotEmpty(chrom) && start != end)
     {
     if (lineFileSetTabixRegion(lf, chrom, start, end))
 	vcfParseData(vcff);
     }
 return vcff;
 }
 
 void vcfFileFree(struct vcfFile **pVcff)
 /* Free a vcfFile object. */
 {
 if (pVcff == NULL || *pVcff == NULL)
     return;
 struct vcfFile *vcff = *pVcff;
+freez(&(vcff->headerString));
 hashFree(&(vcff->pool));
 hashFree(&(vcff->byName));
-hashFree(&(vcff->metaDataHash));
 lineFileClose(&(vcff->lf));
 freez(pVcff);
 }
 
 const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId)
 /* Return all records with name=variantId, or NULL if not found. */
 {
 struct vcfRecord *varList = NULL;
 if (vcff->byName == NULL)
     {
     vcff->byName = hashNew(0);
     struct vcfRecord *rec;
     for (rec = vcff->records;  rec != NULL;  rec = rec->next)
 	{
 	if (sameString (rec->name, variantId))