491395482b817cfe82cbdef4dd7f8ee0723c1a4a
angie
  Wed Mar 23 21:01:49 2011 -0700
Feature #2822, #2823 (VCF customFactory + track handler):Added a new track type, vcfTabix, with handlers in hgTracks and hgc
and a customFactory.  It is a new bigDataUrl type of track; the
remote VCF file must be compressed and indexed by tabix, so like
BAM a separate index file is required.  If the VCF file has
genotypes, then each sample's two haplotypes are graphed in a
line, with one line per sample.  Otherwise, alleles and counts
(if available) are drawn using Belinda's pgSnp methods.
The source code has to be compiled with USE_TABIX=1 (which is
automatically set for us by common.mk when it finds the local
installation) in order for the CGIs to recognize the track type.

diff --git src/lib/vcf.c src/lib/vcf.c
index b583131..ea175e5 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -1,776 +1,801 @@
 /* VCF: Variant Call Format, version 4.0 / 4.1
  * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40
  * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
  */
 
 #include "common.h"
+#include "errabort.h"
 #include <limits.h>
 #include "localmem.h"
 #include "net.h"
 #include "regexHelper.h"
 #include "vcf.h"
 
 /* Reserved but optional INFO keys: */
 const char *vcfInfoAncestralAllele = "AA";
 const char *vcfInfoPerAlleleGtCount = "AC";	// allele count in genotypes, for each ALT allele,
 						// in the same order as listed
 const char *vcfInfoAlleleFrequency = "AF";	// allele frequency for each ALT allele in the same
 						// order as listed: use this when estimated from
 						// primary data, not called genotypes
 const char *vcfInfoNumAlleles = "AN";		// total number of alleles in called genotypes
 const char *vcfInfoBaseQuality = "BQ";		// RMS base quality at this position
 const char *vcfInfoCigar = "CIGAR";		// cigar string describing how to align an
 						// alternate allele to the reference allele
 const char *vcfInfoIsDbSnp = "DB";		// dbSNP membership
 const char *vcfInfoDepth = "DP";		// combined depth across samples, e.g. DP=154
 const char *vcfInfoEnd = "END";			// end position of the variant described in this
 						// record (esp. for CNVs)
 const char *vcfInfoIsHapMap2 = "H2";		// membership in hapmap2
 const char *vcfInfoIsHapMap3 = "H3";		// membership in hapmap3
 const char *vcfInfoIs1000Genomes = "1000G";	// membership in 1000 Genomes
 const char *vcfInfoMappingQuality = "MQ";	// RMS mapping quality, e.g. MQ=52
 const char *vcfInfoMapQual0Count = "MQ0";	// number of MAPQ == 0 reads covering this record
 const char *vcfInfoNumSamples = "NS";		// Number of samples with data
 const char *vcfInfoStrandBias = "SB";		// strand bias at this position
 const char *vcfInfoIsSomatic = "SOMATIC";	// indicates that the record is a somatic mutation,
 						// for cancer genomics
 const char *vcfInfoIsValidated = "VALIDATED";	// validated by follow-up experiment
 
 /* Reserved but optional per-genotype keys: */
 const char *vcfGtGenotype = "GT";	// Integer allele indices separated by "/" (unphased)
 					// or "|" (phased). Allele values are 0 for
 					// reference allele, 1 for the first allele in ALT,
 					// 2 for the second allele in ALT and so on.
 const char *vcfGtDepth = "DP";		// Read depth at this position for this sample
 const char *vcfGtFilter = "FT";		// Analogous to variant's FILTER field
 const char *vcfGtLikelihoods = "GL";	// Three floating point log10-scaled likelihoods for
 					// AA,AB,BB genotypes where A=ref and B=alt;
 					// not applicable if site is not biallelic.
 const char *vcfGtPhred = "PL";		// Phred-scaled genotype likelihoods rounded to closest int
 const char *vcfGtConditionalQual = "GQ";// Conditional genotype quality
 					// i.e. phred quality -10log_10 P(genotype call is wrong,
 					// conditioned on the site's being variant)
 const char *vcfGtHaplotypeQualities = "HQ";	// two phred qualities comma separated
 const char *vcfGtPhaseSet = "PS";	// Set of phased genotypes to which this genotype belongs
 const char *vcfGtPhasingQuality = "PQ";	// Phred-scaled P(alleles ordered wrongly in heterozygote)
 const char *vcfGtExpectedAltAlleleCount = "EC";	// Typically used in association analyses
 
 
 // Make definitions of reserved INFO and genotype keys, in case people take them for
 // granted and don't make ##INFO headers for them:
 static struct vcfInfoDef *vcfSpecInfoDefs = NULL;
 static struct vcfInfoDef *vcfSpecGtFormatDefs = NULL;
 
 static void addInfoDef(struct vcfInfoDef **pList,
 		       const char *key, int fieldCount, enum vcfInfoType type, char *description)
 /* Allocate and initialize an info def and add it to pList. */
 {
 struct vcfInfoDef *def;
 AllocVar(def);
 def->key = (char *)key;
 def->fieldCount = fieldCount;
 def->type = type;
 def->description = description;
 slAddHead(pList, def);
 }
 
 static void initVcfSpecInfoDefs()
 /* Make linked list of INFO defs reserved in the spec. */
 {
 if (vcfSpecInfoDefs != NULL)
     return;
 addInfoDef(&vcfSpecInfoDefs, vcfInfoAncestralAllele, 1, vcfInfoString, "Ancestral allele");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoPerAlleleGtCount, 1, vcfInfoInteger,
 	   "Allele count in genotypes, for each ALT allele, in the same order as listed");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoAlleleFrequency, -1, vcfInfoFloat,
 	   "Allele frequency for each ALT allele in the same order as listed: "
 	   "use this when estimated from primary data, not called genotypes");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoNumAlleles, 1, vcfInfoInteger,
 	   "Total number of alleles in called genotypes");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoBaseQuality, 1, vcfInfoFloat,
 	   "RMS base quality at this position");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoCigar, 1, vcfInfoString,
 	   "CIGAR string describing how to align an alternate allele to the reference allele");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsDbSnp, 0, vcfInfoFlag, "dbSNP membership");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoDepth, 1, vcfInfoString,
 	   "Combined depth across samples, e.g. DP=154");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoEnd, 1, vcfInfoInteger,
 	   "End position of the variant described in this record "
 	   "(especially for structural variants)");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsHapMap2, 1, vcfInfoFlag, "Membership in HapMap 2");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsHapMap3, 1, vcfInfoFlag, "Membership in HapMap 3");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIs1000Genomes, 1, vcfInfoFlag, "Membership in 1000 Genomes");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoMappingQuality, 1, vcfInfoFloat,
 	   "RMS mapping quality, e.g. MQ=52");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoMapQual0Count, 1, vcfInfoInteger,
 	   "Number of MAPQ == 0 reads covering this record");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoNumSamples, 1, vcfInfoInteger, "Number of samples with data");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoStrandBias, 1, vcfInfoString, "Strand bias at this position");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsSomatic, 1, vcfInfoFlag,
 	   "Indicates that the record is a somatic mutation, for cancer genomics");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsValidated, 1, vcfInfoFlag,
 	   "Validated by follow-up experiment");
 }
 
 
 static void initVcfSpecGtFormatDefs()
 /* Make linked list of genotype info defs reserved in spec. */
 {
 if (vcfSpecGtFormatDefs != NULL)
     return;
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtGenotype, 1, vcfInfoString,
 	   "Integer allele indices separated by \"/\" (unphased) "
 	   "or \"|\" (phased). Allele values are 0 for "
 	   "reference allele, 1 for the first allele in ALT, "
 	   "2 for the second allele in ALT and so on, or \".\" for unknown");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtDepth, 1, vcfInfoInteger,
 	   "Read depth at this position for this sample");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtFilter, 1, vcfInfoString,
 	   "PASS to indicate that all filters have been passed, "
 	   "a semi-colon separated list of codes for filters that fail, "
 	   "or \".\" to indicate that filters have not been applied");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtLikelihoods, -1, vcfInfoFloat,
 	   "Genotype likelihoods comprised of comma separated floating point "
 	   "log10-scaled likelihoods for all possible genotypes given the set "
 	   "of alleles defined in the REF and ALT fields. ");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhred, -1, vcfInfoInteger,
 	   "Phred-scaled genotype likelihoods rounded to the closest integer "
 	   "(and otherwise defined precisely as the genotype likelihoods (GL) field)");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtConditionalQual, -1, vcfInfoFloat,
 	   "phred-scaled genotype posterior probabilities "
 	   "(and otherwise defined precisely as the genotype likelihoods (GL) field)"
 	   "; intended to store imputed genotype probabilities");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtHaplotypeQualities, 2, vcfInfoFloat,
 	   "Two comma-separated phred-scaled haplotype qualities");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhaseSet, 1, vcfInfoFloat,
 	   "A set of phased genotypes to which this genotype belongs");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhasingQuality, 1, vcfInfoFloat,
 	   "Phasing quality, the phred-scaled probability that alleles are ordered "
 	   "incorrectly in a heterozygote (against all other members in the phase set)");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtExpectedAltAlleleCount, 1, vcfInfoFloat,
 	   "Expected alternate allele count (typically used in association analyses)");
 }
 
 static bool vcfFileStopDueToErrors(struct vcfFile *vcff)
 /* determine if we should stop due to the number of errors */
 {
 return vcff->errCnt > vcff->maxErr;
 }
 
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 #if defined(__GNUC__)
 __attribute__((format(printf, 2, 3)))
 #endif
 ;
 
-static void vaVcfFileErr(struct vcfFile *vcff, char *format, va_list args)
-/* Print error message to error file, abort if max errors have been reached */
-{
-if (vcff->lf != NULL)
-    fprintf(vcff->errFh, "%s:%d: ", vcff->lf->fileName, vcff->lf->lineIx);
-vfprintf(vcff->errFh, format, args);
-fprintf(vcff->errFh, "\n");
-vcff->errCnt++;
-if (vcfFileStopDueToErrors(vcff))
-    errAbort("VCF: %d parser errors", vcff->errCnt);
-}
-
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
-/* Print error message and abort */
+/* Send error message to errabort stack's warn handler and abort */
 {
 va_list args;
 va_start(args, format);
-vaVcfFileErr(vcff, format, args);
+char formatPlus[1024];
+sprintf(formatPlus, "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format);
+vaWarn(formatPlus, args);
 va_end(args);
+if (vcfFileStopDueToErrors(vcff))
+    errAbort("VCF: %d parser errors, quitting", vcff->errCnt);
 }
 
 static void *vcfFileAlloc(struct vcfFile *vcff, size_t size)
 /* allocate memory from the memory pool */
 {
 return lmAlloc(vcff->pool->lm, size);
 }
 
 static char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size)
 /* allocate memory for a string and copy it */
 {
 return lmCloneStringZ(vcff->pool->lm, str, size);
 }
 
 static char *vcfFileCloneStr(struct vcfFile *vcff, char *str)
 /* allocate memory for a string and copy it */
 {
 return vcfFileCloneStrZ(vcff, str, strlen(str));
 }
 
 static char *vcfFileCloneSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
 /* Allocate memory for and copy a substring of line. */
 {
 return vcfFileCloneStrZ(vcff, line+substr.rm_so, (substr.rm_eo - substr.rm_so));
 }
 
 #define vcfFileCloneVar(var) lmCloneMem(vcff->pool->lm, var, sizeof(var));
 
 static char *vcfFilePooledStr(struct vcfFile *vcff, char *str)
 /* allocate memory for a string from the shared string pool */
 {
 return hashStoreName(vcff->pool, str);
 }
 
 static enum vcfInfoType vcfInfoTypeFromSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
 /* Translate substring of line into vcfInfoType or complain. */
 {
 char typeWord[16];
 int substrLen = substr.rm_eo - substr.rm_so;
 if (substrLen > sizeof(typeWord) - 1)
     {
     vcfFileErr(vcff, "substring passed to vcfInfoTypeFromSubstr is too long.");
     return vcfInfoNoType;
     }
 safencpy(typeWord, sizeof(typeWord), line + substr.rm_so, substrLen);
 if (sameString("Integer", typeWord))
     return vcfInfoInteger;
 if (sameString("Float", typeWord))
     return vcfInfoFloat;
 if (sameString("Flag", typeWord))
     return vcfInfoFlag;
 if (sameString("Character", typeWord))
     return vcfInfoCharacter;
 if (sameString("String", typeWord))
     return vcfInfoString;
 vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line);
 return vcfInfoNoType;
 }
 
 // Regular expressions to check format and extract information from header lines:
-static const char *fileformatRegex = "^##fileformat=VCFv([0-9]+)(\\.([0-9]+))?$";
+static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$";
 static const char *infoOrFormatRegex =
     "^##(INFO|FORMAT)="
     "<ID=([A-Za-z0-9]+),"
     "Number=(\\.|[0-9]+),"
     "Type=([A-Za-z]+),"
     "Description=\"([^\"]+)\">$";
 static const char *filterOrAltRegex =
     "^##(FILTER|ALT)="
     "<ID=([A-Za-z0-9]+),"
     "Description=\"([^\"]+)\">$";
 
 INLINE void nonAsciiWorkaround(char *line)
 // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files:
 {
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\"");
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\"");
 }
 
 static void parseMetadataLine(struct vcfFile *vcff, char *line)
 /* Parse a VCF header line beginning with "##" that defines a metadata. */
 {
 char *ptr = line;
 if (ptr == NULL && !startsWith(ptr, "##"))
     errAbort("Bad line passed to parseMetadataLine");
 ptr += 2;
 char *firstEq = strchr(ptr, '=');
 if (firstEq == NULL)
     {
     vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line);
     return;
     }
 // Every metadata line is saved here:
 hashAddN(vcff->metaDataHash, ptr, (firstEq - ptr), vcfFileCloneStr(vcff, firstEq+1));
 regmatch_t substrs[8];
 // Some of the metadata lines are crucial for parsing the rest of the file:
-if (startsWith("##fileformat=", line))
+if (startsWith("##fileformat=", line) || startsWith("##format", line))
     {
     if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs)))
 	{
-	// substrs[1] is major version #, substrs[2] is set only if there is a minor version,
-	// and substrs[3] is the minor version #.
-	vcff->majorVersion = atoi(line + substrs[1].rm_so);
-	if (substrs[2].rm_so != -1)
-	    vcff->minorVersion = atoi(line + substrs[3].rm_so);
+	// substrs[2] is major version #, substrs[3] is set only if there is a minor version,
+	// and substrs[4] is the minor version #.
+	vcff->majorVersion = atoi(line + substrs[2].rm_so);
+	if (substrs[3].rm_so != -1)
+	    vcff->minorVersion = atoi(line + substrs[4].rm_so);
 	}
     else
 	vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"",
 		   fileformatRegex, line);
     }
 else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line))
     {
     boolean isInfo = startsWith("##INFO=", line);
     nonAsciiWorkaround(line);
     if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)))
 	// substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description.
 	{
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	if (sameString(def->key, "."))
 	    def->fieldCount = -1;
 	else
 	    def->fieldCount = atoi(line + substrs[3].rm_so);
 	def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
 	slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
 	}
     else
 	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
 		   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, line);
     }
 else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line))
     {
     boolean isFilter = startsWith("##FILTER", line);
     if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)))
 	{
 	// substrs[2] is ID/key, substrs[3] is Description.
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[3]);
 	slAddHead((isFilter ? &(vcff->filterDefs) : &(vcff->altDefs)), def);
 	}
     else
 	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
 		   (isFilter ? "FILTER" : "ALT"), filterOrAltRegex, line);
     }
 }
 
-static void expectColumnName(struct vcfFile *vcff, char *expected, char *words[], int ix)
+static void expectColumnName2(struct vcfFile *vcff, char *exp1, char *exp2, char *words[], int ix)
 /* Every file must include a header naming the columns, though most column names are
  * fixed; make sure the names of fixed columns are as expected. */
 {
-if (! sameString(expected, words[ix]))
+if (! sameString(exp1, words[ix]))
+    {
+    if (exp2 == NULL)
     vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" but got \"%s\"",
-	       ix+1, expected, words[ix]);
+		   ix+1, exp1, words[ix]);
+    else if (! sameString(exp2, words[ix]))
+	vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\"  or \"%s\" "
+		   "but got \"%s\"", ix+1, exp1, exp2, words[ix]);
+    }
 }
 
+#define expectColumnName(vcff, exp, words, ix) expectColumnName2(vcff, exp, NULL, words, ix)
+
 // There might be a whole lot of genotype columns...
 #define VCF_MAX_COLUMNS 16 * 1024
 
 static void parseColumnHeaderRow(struct vcfFile *vcff, char *line)
 /* Make sure column names are as we expect, and store genotype sample IDs if any are given. */
 {
 if (line[0] != '#')
     {
     vcfFileErr(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), "
 	       "not \"%s\"", line);
     lineFileReuse(vcff->lf);
     return;
     }
 char *words[VCF_MAX_COLUMNS];
 int wordCount = chopLine(line+1, words);
 if (wordCount >= VCF_MAX_COLUMNS)
     vcfFileErr(vcff, "header contains at least %d columns; "
 	       "VCF_MAX_COLUMNS may need to be increased in vcf.c!", VCF_MAX_COLUMNS);
 expectColumnName(vcff, "CHROM", words, 0);
 expectColumnName(vcff, "POS", words, 1);
 expectColumnName(vcff, "ID", words, 2);
 expectColumnName(vcff, "REF", words, 3);
 expectColumnName(vcff, "ALT", words, 4);
-expectColumnName(vcff, "QUAL", words, 5);
+expectColumnName2(vcff, "QUAL", "PROB", words, 5);
 expectColumnName(vcff, "FILTER", words, 6);
 expectColumnName(vcff, "INFO", words, 7);
 if (wordCount > 8)
     {
     expectColumnName(vcff, "FORMAT", words, 8);
     if (wordCount < 10)
 	vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?");
     vcff->genotypeCount = (wordCount - 9);
     vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *));
     int i;
     for (i = 9;  i < wordCount;  i++)
 	vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]);
     }
 }
 
 static struct vcfFile *vcfFileNew()
 /* Return a new, empty vcfFile object. */
 {
 struct vcfFile *vcff = NULL;
 AllocVar(vcff);
 vcff->metaDataHash = hashNew(0);
 vcff->pool = hashNew(0);
 return vcff;
 }
 
-static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr, FILE *errFh)
+static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr)
 /* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
  * continue to parse until this number of error have been reached.  A maxErr
- * less than zero does not stop and reports all errors. Write errors to errFh,
- * if NULL, use stderr. */
+ * less than zero does not stop and reports all errors. */
 {
 initVcfSpecInfoDefs();
 initVcfSpecGtFormatDefs();
 if (lf == NULL)
     return NULL;
 struct vcfFile *vcff = vcfFileNew();
 vcff->lf = lf;
 vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName);
-vcff->errFh = (errFh != NULL) ? errFh : stderr;
 vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr;
 
 char *line = NULL;
 // First, metadata lines beginning with "##":
 while (lineFileNext(lf, &line, NULL) && startsWith("##", line))
     parseMetadataLine(vcff, line);
 slReverse(&(vcff->infoDefs));
 slReverse(&(vcff->filterDefs));
 slReverse(&(vcff->gtFormatDefs));
 // Did we get the bare minimum VCF header with supported version?
 if (vcff->majorVersion == 0)
     vcfFileErr(vcff, "missing ##fileformat= header line?  Assuming 4.1.");
-if (vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1))
-    vcfFileErr(vcff, "VCFv%d.%d not supported -- only 4.0 or 4.1",
+if ((vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) &&
+    (vcff->majorVersion != 3))
+    vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1",
 	       vcff->majorVersion, vcff->minorVersion);
 // Next, one header line beginning with single "#" that names the columns:
 if (line == NULL)
     // EOF after metadata
     return vcff;
 parseColumnHeaderRow(vcff, line);
 return vcff;
 }
 
 
-static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, char *key)
-/* Look up the type of INFO component key, in the definitions from the header,
- * and failing that, from the keys reserved in the spec. */
+struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key)
+/* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */
  {
  struct vcfInfoDef *def;
  // I expect there to be fairly few definitions (less than a dozen) so
  // I'm just doing a linear search not hash:
  for (def = vcff->infoDefs;  def != NULL;  def = def->next)
      {
     if (sameString(key, def->key))
-	return def->type;
+	return def;
     }
 for (def = vcfSpecInfoDefs;  def != NULL;  def = def->next)
     {
     if (sameString(key, def->key))
-	return def->type;
+	return def;
     }
+return NULL;
+}
+
+static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, const char *key)
+/* Look up the type of INFO component key, in the definitions from the header,
+ * and failing that, from the keys reserved in the spec. */
+{
+struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key);
+if (def == NULL)
+    {
 vcfFileErr(vcff, "There is no INFO header defining \"%s\"", key);
 // default to string so we can display value as-is:
 return vcfInfoString;
 }
+return def->type;
+}
 
 #define VCF_MAX_INFO 512
 
 int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type, char *valStr,
 		   union vcfDatum **pData)
 /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */
 {
 char *valWords[VCF_MAX_INFO];
 int count = chopCommas(valStr, valWords);
 struct vcfFile *vcff = record->file;
 union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum));
 int j;
 for (j = 0;  j < count;  j++)
     switch (type)
 	{
 	case vcfInfoInteger:
 	    data[j].datInt = atoi(valWords[j]);
 	    break;
 	case vcfInfoFloat:
 	    data[j].datFloat = atof(valWords[j]);
 	    break;
 	case vcfInfoFlag:
-	    // No data value
+	    // Flag key might have a value in older VCFs e.g. 3.2's DB=0, DB=1
+	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
 	    break;
 	case vcfInfoCharacter:
 	    data[j].datChar = valWords[j][0];
 	    break;
 	case vcfInfoString:
 	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
 	    break;
 	default:
 	    errAbort("invalid vcfInfoType (uninitialized?) %d", type);
 	    break;
 	}
 // If END is given, use it as chromEnd:
 if (sameString(infoKey, vcfInfoEnd))
     record->chromEnd = data[0].datInt;
 *pData = data;
 return count;
 }
 
 static void parseInfoColumn(struct vcfFile *vcff, struct vcfRecord *record, char *string)
 /* Translate string into array of vcfInfoElement. */
 {
 char *elWords[VCF_MAX_INFO];
 record->infoCount = chopByChar(string, ';', elWords, ArraySize(elWords));
 if (record->infoCount >= VCF_MAX_INFO)
     vcfFileErr(vcff, "INFO column contains at least %d elements; "
 	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
 record->infoElements = vcfFileAlloc(vcff, record->infoCount * sizeof(struct vcfInfoElement));
 int i;
 for (i = 0;  i < record->infoCount;  i++)
     {
     char *elStr = elWords[i];
     char *eq = strchr(elStr, '=');
     struct vcfInfoElement *el = &(record->infoElements[i]);
     if (eq == NULL)
 	{
 	el->key = vcfFilePooledStr(vcff, elStr);
 	enum vcfInfoType type = typeForInfoKey(vcff, el->key);
 	if (type != vcfInfoFlag)
 	    {
 	    vcfFileErr(vcff, "Missing = after key in INFO element: \"%s\" (type=%d)",
 		       elStr, type);
 	    if (type == vcfInfoString)
 		el->values[i].datString = vcfFilePooledStr(vcff, "");
 	    }
 	continue;
 	}
     *eq = '\0';
     el->key = vcfFilePooledStr(vcff, elStr);
     enum vcfInfoType type = typeForInfoKey(vcff, el->key);
     char *valStr = eq+1;
     el->count = parseInfoValue(record, el->key, type, valStr, &(el->values));
     if (el->count >= VCF_MAX_INFO)
 	vcfFileErr(vcff, "A single element of the INFO column has at least %d values; "
 	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
     }
 }
 
 static void vcfParseData(struct vcfFile *vcff)
 /* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned
  * at the beginning of a data row, parse and store all data rows from lineFile. */
 {
 if (vcff == NULL)
     return;
 int expected = 8;
 if (vcff->genotypeCount > 0)
     expected = 9 + vcff->genotypeCount;
 char *words[VCF_MAX_COLUMNS];
 int wordCount;
 while ((wordCount = lineFileChop(vcff->lf, words)) > 0)
     {
     lineFileExpectWords(vcff->lf, expected, wordCount);
     struct vcfRecord *record;
     AllocVar(record);
     record->file = vcff;
     record->chrom = vcfFilePooledStr(vcff, words[0]);
     record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;
     // chromEnd may be modified by parseInfoColumn, if INFO column includes END.
     record->chromEnd = record->chromStart + 1;
     record->name = vcfFilePooledStr(vcff, words[2]);
     record->ref = vcfFilePooledStr(vcff, words[3]);
     record->alt = vcfFilePooledStr(vcff, words[4]);
     record->qual = atof(words[5]); //#*** qual can be "." so we need to represent that
     record->filter = vcfFilePooledStr(vcff, words[6]);
     parseInfoColumn(vcff, record, words[7]);
     if (vcff->genotypeCount > 0)
 	{
 	record->format = vcfFilePooledStr(vcff, words[8]);
 	record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
 						       vcff->genotypeCount * sizeof(char *));
 	int i;
 	// Don't bother actually parsing all these until & unless we need the info:
 	for (i = 0;  i < vcff->genotypeCount;  i++)
 	    record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
 	}
     slAddHead(&(vcff->records), record);
     }
 slReverse(&(vcff->records));
 lineFileClose(&(vcff->lf));
 }
 
-struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, FILE *errFh)
+struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr)
 /* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
  * continue to parse until this number of error have been reached.  A maxErr
- * less than zero does not stop and reports all errors. Write errors to errFh,
- * if NULL, use stderr. */
+ * less than zero does not stop and reports all errors. */
 {
 struct lineFile *lf = NULL;
 if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) ||
     startsWith("https://", fileOrUrl))
     lf = netLineFileOpen(fileOrUrl);
 else
     lf = lineFileMayOpen(fileOrUrl, TRUE);
-struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr, errFh);
+struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 vcfParseData(vcff);
 return vcff;
 }
 
 struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
-				    int maxErr, FILE *errFh)
+				    int maxErr)
 /* Parse header and rows within the given position range from a VCF file that has been
  * compressed and indexed by tabix into a vcfFile object; return NULL if or if file has
  * no items in range.
  * If maxErr not zero, then continue to parse until this number of error have been reached.
- * A maxErr less than zero does not stop and reports all errors. Write errors to errFh,
- * if NULL, use stderr. */
+ * A maxErr less than zero does not stop and reports all errors. */
 {
 struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE);
-struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr, errFh);
+struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (vcff == NULL)
     return NULL;
-if (! lineFileSetTabixRegion(lf, chrom, start, end))
-    // No items in region
-    return vcff;
+if (isNotEmpty(chrom) && start != end)
+    {
+    if (lineFileSetTabixRegion(lf, chrom, start, end))
 vcfParseData(vcff);
+    }
 return vcff;
 }
 
 void vcfFileFree(struct vcfFile **pVcff)
 /* Free a vcfFile object. */
 {
 if (pVcff == NULL || *pVcff == NULL)
     return;
 struct vcfFile *vcff = *pVcff;
 hashFree(&(vcff->pool));
 hashFree(&(vcff->byName));
 hashFree(&(vcff->metaDataHash));
 lineFileClose(&(vcff->lf));
 freez(pVcff);
 }
 
 const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId)
 /* Return all records with name=variantId, or NULL if not found. */
 {
 struct vcfRecord *varList = NULL;
 if (vcff->byName == NULL)
     {
     vcff->byName = hashNew(0);
     struct vcfRecord *rec;
     for (rec = vcff->records;  rec != NULL;  rec = rec->next)
 	{
 	if (sameString (rec->name, variantId))
 	    {
 	    // Make shallow copy of rec so we can alter ->next:
 	    struct vcfRecord *newRec = vcfFileCloneVar(rec);
 	    slAddHead(&varList, newRec);
 	    }
 	hashAdd(vcff->byName, rec->name, rec);
 	}
     slReverse(&varList);
     }
 else
     {
     struct hashEl *hel = hashLookup(vcff->byName, variantId);
     while (hel != NULL)
 	{
 	if (sameString(hel->name, variantId))
 	    {
 	    struct vcfRecord *rec = hel->val;
 	    struct vcfRecord *newRec = vcfFileCloneVar(rec);
 	    slAddHead(&varList, newRec);
 	    }
 	hel = hel->next;
 	}
     // Don't reverse varList -- hash element list was already reversed
     }
 return varList;
 }
 
 const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key)
 /* Find an INFO element, or NULL. */
 {
 int i;
 for (i = 0;  i < record->infoCount;  i++)
     {
     if (sameString(key, record->infoElements[i].key))
 	return &(record->infoElements[i]);
     }
 return NULL;
 }
 
-static enum vcfInfoType typeForGtFormat(struct vcfFile *vcff, char *key)
-/* Look up the type of FORMAT component key, in the definitions from the header,
+struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key)
+/* Look up the type of genotype FORMAT component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 {
 struct vcfInfoDef *def;
 // I expect there to be fairly few definitions (less than a dozen) so
 // I'm just doing a linear search not hash:
 for (def = vcff->gtFormatDefs;  def != NULL;  def = def->next)
     {
     if (sameString(key, def->key))
-	return def->type;
+	return def;
     }
 for (def = vcfSpecGtFormatDefs;  def != NULL;  def = def->next)
     {
     if (sameString(key, def->key))
-	return def->type;
+	return def;
+    }
+return NULL;
     }
+
+static enum vcfInfoType typeForGtFormat(struct vcfFile *vcff, const char *key)
+/* Look up the type of FORMAT component key, in the definitions from the header,
+ * and failing that, from the keys reserved in the spec. */
+{
+struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, key);
+if (def == NULL)
+    {
 vcfFileErr(vcff, "There is no FORMAT header defining \"%s\"", key);
 // default to string so we can display value as-is:
 return vcfInfoString;
 }
+return def->type;
+}
 
 #define VCF_MAX_FORMAT VCF_MAX_INFO
 #define VCF_MAX_FORMAT_LEN (VCF_MAX_FORMAT * 4)
 
-static void parseGenotypes(struct vcfRecord *record)
+void vcfParseGenotypes(struct vcfRecord *record)
 /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[].
  * This destroys genotypesUnparsedStrings. */
 {
+if (record->genotypeUnparsedStrings == NULL)
+    return;
 struct vcfFile *vcff = record->file;
 record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype));
 char format[VCF_MAX_FORMAT_LEN];
 safecpy(format, sizeof(format), record->format);
 char *formatWords[VCF_MAX_FORMAT];
 int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords));
 if (formatWordCount >= VCF_MAX_FORMAT)
     vcfFileErr(vcff, "The FORMAT column has at least %d words; "
 	       "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT);
 if (differentString(formatWords[0], vcfGtGenotype))
     vcfFileErr(vcff, "FORMAT column should begin with \"%s\" but begins with \"%s\"",
 	       vcfGtGenotype, formatWords[0]);
 int i;
 for (i = 0;  i < vcff->genotypeCount;  i++)
     {
     char *string = record->genotypeUnparsedStrings[i];
     struct vcfGenotype *gt = &(record->genotypes[i]);
     // Each genotype can have multiple :-separated info elements:
     char *gtWords[VCF_MAX_FORMAT];
     int gtWordCount = chopByChar(string, ':', gtWords, ArraySize(gtWords));
     if (gtWordCount != formatWordCount)
 	vcfFileErr(vcff, "The FORMAT column has %d words but the genotype column for %s "
 		   "has %d words", formatWordCount, vcff->genotypeIds[i], gtWordCount);
     if (gtWordCount > formatWordCount)
 	gtWordCount = formatWordCount;
     gt->id = vcff->genotypeIds[i];
     gt->infoCount = gtWordCount;
     gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement));
     int j;
     for (j = 0;  j < gtWordCount;  j++)
 	{
 	// Special parsing of genotype:
 	if (sameString(formatWords[j], vcfGtGenotype))
 	    {
 	    char *genotype = gtWords[j];
 	    char *sep = strchr(genotype, '|');
 	    if (sep != NULL)
 		gt->isPhased = TRUE;
 	    else
 		sep = strchr(genotype, '/');
 	    gt->hapIxA = atoi(genotype);
 	    if (sep == NULL)
 		gt->isHaploid = TRUE;
 	    else
 		gt->hapIxB = atoi(sep+1);
 	    }
 	struct vcfInfoElement *el = &(gt->infoElements[j]);
-	el->key = formatWords[j];
+	el->key = vcfFilePooledStr(vcff, formatWords[j]);
 	enum vcfInfoType type = typeForGtFormat(vcff, formatWords[j]);
 	el->count = parseInfoValue(record, formatWords[j], type, gtWords[j], &(el->values));
 	if (el->count >= VCF_MAX_INFO)
 	    vcfFileErr(vcff, "A single element of the genotype column for \"%s\" "
 		       "has at least %d values; "
 		       "VCF_MAX_INFO may need to be increased in vcf.c!",
 		       gt->id, VCF_MAX_INFO);
 	}
 
     }
 record->genotypeUnparsedStrings = NULL;
 }
 
 const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId)
-/* Find the genotype and associated info for the individual, or return NULL. */
+/* Find the genotype and associated info for the individual, or return NULL.
+ * This calls vcfParseGenotypes if it has not already been called. */
 {
-if (sampleId == NULL)
-    return NULL;
 struct vcfFile *vcff = record->file;
-if (record->genotypeUnparsedStrings != NULL && record->genotypes == NULL)
-    parseGenotypes(record);
+if (sampleId == NULL || vcff->genotypeCount == 0)
+    return NULL;
+vcfParseGenotypes(record);
 int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount);
 if (ix >= 0)
     return &(record->genotypes[ix]);
 return NULL;
 }