19eaeb4d65c6962e516fb23474643ba754c32ea1
angie
  Fri Feb 11 10:52:15 2011 -0800
Feature #2821 (VCF parser): works on a 1000 Genomes pilot release VCF+tabix file with genotypes.[Note: this is a squash of 6 commits, developed off in my vcf branch.]

diff --git src/lib/vcf.c src/lib/vcf.c
new file mode 100644
index 0000000..ff763dc
--- /dev/null
+++ src/lib/vcf.c
@@ -0,0 +1,767 @@
+/* VCF: Variant Call Format, version 4.0 / 4.1
+ * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40
+ * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
+ */
+
+#include "common.h"
+#include <limits.h>
+#include "localmem.h"
+#include "net.h"
+#include "regexHelper.h"
+#include "vcf.h"
+
+/* Reserved but optional INFO keys: */
+const char *vcfInfoAncestralAllele = "AA";
+const char *vcfInfoPerAlleleGtCount = "AC";	// allele count in genotypes, for each ALT allele,
+						// in the same order as listed
+const char *vcfInfoAlleleFrequency = "AF";	// allele frequency for each ALT allele in the same
+						// order as listed: use this when estimated from
+						// primary data, not called genotypes
+const char *vcfInfoNumAlleles = "AN";		// total number of alleles in called genotypes
+const char *vcfInfoBaseQuality = "BQ";		// RMS base quality at this position
+const char *vcfInfoCigar = "CIGAR";		// cigar string describing how to align an
+						// alternate allele to the reference allele
+const char *vcfInfoIsDbSnp = "DB";		// dbSNP membership
+const char *vcfInfoDepth = "DP";		// combined depth across samples, e.g. DP=154
+const char *vcfInfoEnd = "END";			// end position of the variant described in this
+						// record (esp. for CNVs)
+const char *vcfInfoIsHapMap2 = "H2";		// membership in hapmap2
+const char *vcfInfoIsHapMap3 = "H3";		// membership in hapmap3
+const char *vcfInfoIs1000Genomes = "1000G";	// membership in 1000 Genomes
+const char *vcfInfoMappingQuality = "MQ";	// RMS mapping quality, e.g. MQ=52
+const char *vcfInfoMapQual0Count = "MQ0";	// number of MAPQ == 0 reads covering this record
+const char *vcfInfoNumSamples = "NS";		// Number of samples with data
+const char *vcfInfoStrandBias = "SB";		// strand bias at this position
+const char *vcfInfoIsSomatic = "SOMATIC";	// indicates that the record is a somatic mutation,
+						// for cancer genomics
+const char *vcfInfoIsValidated = "VALIDATED";	// validated by follow-up experiment
+
+/* Reserved but optional per-genotype keys: */
+const char *vcfGtGenotype = "GT";	// Integer allele indices separated by "/" (unphased)
+					// or "|" (phased). Allele values are 0 for
+					// reference allele, 1 for the first allele in ALT,
+					// 2 for the second allele in ALT and so on.
+const char *vcfGtDepth = "DP";		// Read depth at this position for this sample
+const char *vcfGtFilter = "FT";		// Analogous to variant's FILTER field
+const char *vcfGtLikelihoods = "GL";	// Three floating point log10-scaled likelihoods for
+					// AA,AB,BB genotypes where A=ref and B=alt;
+					// not applicable if site is not biallelic.
+const char *vcfGtPhred = "PL";		// Phred-scaled genotype likelihoods rounded to closest int
+const char *vcfGtConditionalQual = "GQ";// Conditional genotype quality
+					// i.e. phred quality -10log_10 P(genotype call is wrong,
+					// conditioned on the site's being variant)
+const char *vcfGtHaplotypeQualities = "HQ";	// two phred qualities comma separated
+const char *vcfGtPhaseSet = "PS";	// Set of phased genotypes to which this genotype belongs
+const char *vcfGtPhasingQuality = "PQ";	// Phred-scaled P(alleles ordered wrongly in heterozygote)
+const char *vcfGtExpectedAltAlleleCount = "EC";	// Typically used in association analyses
+
+
+// Make definitions of reserved INFO and genotype keys, in case people take them for
+// granted and don't make ##INFO headers for them:
+static struct vcfInfoDef *vcfSpecInfoDefs = NULL;
+static struct vcfInfoDef *vcfSpecGtFormatDefs = NULL;
+
+static void addInfoDef(struct vcfInfoDef **pList,
+		       const char *key, int fieldCount, enum vcfInfoType type, char *description)
+/* Allocate and initialize an info def and add it to pList. */
+{
+struct vcfInfoDef *def;
+AllocVar(def);
+def->key = (char *)key;
+def->fieldCount = fieldCount;
+def->type = type;
+def->description = description;
+slAddHead(pList, def);
+}
+
+static void initVcfSpecInfoDefs()
+/* Make linked list of INFO defs reserved in the spec. */
+{
+if (vcfSpecInfoDefs != NULL)
+    return;
+addInfoDef(&vcfSpecInfoDefs, vcfInfoAncestralAllele, 1, vcfInfoString, "Ancestral allele");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoPerAlleleGtCount, 1, vcfInfoInteger,
+	   "Allele count in genotypes, for each ALT allele, in the same order as listed");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoAlleleFrequency, -1, vcfInfoFloat,
+	   "Allele frequency for each ALT allele in the same order as listed: "
+	   "use this when estimated from primary data, not called genotypes");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoNumAlleles, 1, vcfInfoInteger,
+	   "Total number of alleles in called genotypes");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoBaseQuality, 1, vcfInfoFloat,
+	   "RMS base quality at this position");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoCigar, 1, vcfInfoString,
+	   "CIGAR string describing how to align an alternate allele to the reference allele");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoIsDbSnp, 0, vcfInfoFlag, "dbSNP membership");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoDepth, 1, vcfInfoString,
+	   "Combined depth across samples, e.g. DP=154");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoEnd, 1, vcfInfoInteger,
+	   "End position of the variant described in this record "
+	   "(especially for structural variants)");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoIsHapMap2, 1, vcfInfoFlag, "Membership in HapMap 2");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoIsHapMap3, 1, vcfInfoFlag, "Membership in HapMap 3");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoIs1000Genomes, 1, vcfInfoFlag, "Membership in 1000 Genomes");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoMappingQuality, 1, vcfInfoFloat,
+	   "RMS mapping quality, e.g. MQ=52");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoMapQual0Count, 1, vcfInfoInteger,
+	   "Number of MAPQ == 0 reads covering this record");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoNumSamples, 1, vcfInfoInteger, "Number of samples with data");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoStrandBias, 1, vcfInfoString, "Strand bias at this position");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoIsSomatic, 1, vcfInfoFlag,
+	   "Indicates that the record is a somatic mutation, for cancer genomics");
+addInfoDef(&vcfSpecInfoDefs, vcfInfoIsValidated, 1, vcfInfoFlag,
+	   "Validated by follow-up experiment");
+}
+
+
+static void initVcfSpecGtFormatDefs()
+/* Make linked list of genotype info defs reserved in spec. */
+{
+if (vcfSpecGtFormatDefs != NULL)
+    return;
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtGenotype, 1, vcfInfoString,
+	   "Integer allele indices separated by \"/\" (unphased) "
+	   "or \"|\" (phased). Allele values are 0 for "
+	   "reference allele, 1 for the first allele in ALT, "
+	   "2 for the second allele in ALT and so on, or \".\" for unknown");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtDepth, 1, vcfInfoInteger,
+	   "Read depth at this position for this sample");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtFilter, 1, vcfInfoString,
+	   "PASS to indicate that all filters have been passed, "
+	   "a semi-colon separated list of codes for filters that fail, "
+	   "or \".\" to indicate that filters have not been applied");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtLikelihoods, -1, vcfInfoFloat,
+	   "Genotype likelihoods comprised of comma separated floating point "
+	   "log10-scaled likelihoods for all possible genotypes given the set "
+	   "of alleles defined in the REF and ALT fields. ");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhred, -1, vcfInfoInteger,
+	   "Phred-scaled genotype likelihoods rounded to the closest integer "
+	   "(and otherwise defined precisely as the genotype likelihoods (GL) field)");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtConditionalQual, -1, vcfInfoFloat,
+	   "phred-scaled genotype posterior probabilities "
+	   "(and otherwise defined precisely as the genotype likelihoods (GL) field)"
+	   "; intended to store imputed genotype probabilities");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtHaplotypeQualities, 2, vcfInfoFloat,
+	   "Two comma-separated phred-scaled haplotype qualities");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhaseSet, 1, vcfInfoFloat,
+	   "A set of phased genotypes to which this genotype belongs");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhasingQuality, 1, vcfInfoFloat,
+	   "Phasing quality, the phred-scaled probability that alleles are ordered "
+	   "incorrectly in a heterozygote (against all other members in the phase set)");
+addInfoDef(&vcfSpecGtFormatDefs, vcfGtExpectedAltAlleleCount, 1, vcfInfoFloat,
+	   "Expected alternate allele count (typically used in association analyses)");
+}
+
+static bool vcfFileStopDueToErrors(struct vcfFile *vcff)
+/* determine if we should stop due to the number of errors */
+{
+return vcff->errCnt > vcff->maxErr;
+}
+
+static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
+#if defined(__GNUC__)
+__attribute__((format(printf, 2, 3)))
+#endif
+;
+
+static void vaVcfFileErr(struct vcfFile *vcff, char *format, va_list args)
+/* Print error message to error file, abort if max errors have been reached */
+{
+if (vcff->lf != NULL)
+    fprintf(vcff->errFh, "%s:%d: ", vcff->lf->fileName, vcff->lf->lineIx);
+vfprintf(vcff->errFh, format, args);
+fprintf(vcff->errFh, "\n");
+vcff->errCnt++;
+if (vcfFileStopDueToErrors(vcff))
+    errAbort("VCF: %d parser errors", vcff->errCnt);
+}
+
+static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
+/* Print error message and abort */
+{
+va_list args;
+va_start(args, format);
+vaVcfFileErr(vcff, format, args);
+va_end(args);
+}
+
+static void *vcfFileAlloc(struct vcfFile *vcff, size_t size)
+/* allocate memory from the memory pool */
+{
+return lmAlloc(vcff->pool->lm, size);
+}
+
+static char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size)
+/* allocate memory for a string and copy it */
+{
+return lmCloneStringZ(vcff->pool->lm, str, size);
+}
+
+static char *vcfFileCloneStr(struct vcfFile *vcff, char *str)
+/* allocate memory for a string and copy it */
+{
+return vcfFileCloneStrZ(vcff, str, strlen(str));
+}
+
+static char *vcfFileCloneSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
+/* Allocate memory for and copy a substring of line. */
+{
+return vcfFileCloneStrZ(vcff, line+substr.rm_so, (substr.rm_eo - substr.rm_so));
+}
+
+#define vcfFileCloneVar(var) lmCloneMem(vcff->pool->lm, var, sizeof(var));
+
+static char *vcfFilePooledStr(struct vcfFile *vcff, char *str)
+/* allocate memory for a string from the shared string pool */
+{
+return hashStoreName(vcff->pool, str);
+}
+
+static enum vcfInfoType vcfInfoTypeFromSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
+/* Translate substring of line into vcfInfoType or complain. */
+{
+char typeWord[16];
+int substrLen = substr.rm_eo - substr.rm_so;
+if (substrLen > sizeof(typeWord) - 1)
+    {
+    vcfFileErr(vcff, "substring passed to vcfInfoTypeFromSubstr is too long.");
+    return vcfInfoNoType;
+    }
+safencpy(typeWord, sizeof(typeWord), line + substr.rm_so, substrLen);
+if (sameString("Integer", typeWord))
+    return vcfInfoInteger;
+if (sameString("Float", typeWord))
+    return vcfInfoFloat;
+if (sameString("Flag", typeWord))
+    return vcfInfoFlag;
+if (sameString("Character", typeWord))
+    return vcfInfoCharacter;
+if (sameString("String", typeWord))
+    return vcfInfoString;
+vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line);
+return vcfInfoNoType;
+}
+
+// Regular expressions to check format and extract information from header lines:
+static const char *fileformatRegex = "^##fileformat=VCFv([0-9]+)(\\.([0-9]+))?$";
+static const char *infoOrFormatRegex =
+    "^##(INFO|FORMAT)="
+    "<ID=([A-Za-z0-9]+),"
+    "Number=(\\.|[0-9]+),"
+    "Type=([A-Za-z]+),"
+    "Description=\"([^\"]+)\">$";
+static const char *filterOrAltRegex =
+    "^##(FILTER|ALT)="
+    "<ID=([A-Za-z0-9]+),"
+    "Description=\"([^\"]+)\">$";
+
+static void parseMetadataLine(struct vcfFile *vcff, char *line)
+/* Parse a VCF header line beginning with "##" that defines a metadata. */
+{
+char *ptr = line;
+if (ptr == NULL && !startsWith(ptr, "##"))
+    errAbort("Bad line passed to parseMetadataLine");
+ptr += 2;
+char *firstEq = strchr(ptr, '=');
+if (firstEq == NULL)
+    {
+    vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line);
+    return;
+    }
+// Every metadata line is saved here:
+hashAddN(vcff->metaDataHash, ptr, (firstEq - ptr), vcfFileCloneStr(vcff, firstEq+1));
+regmatch_t substrs[8];
+// Some of the metadata lines are crucial for parsing the rest of the file:
+if (startsWith("##fileformat=", line))
+    {
+    if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs)))
+	{
+	// substrs[1] is major version #, substrs[2] is set only if there is a minor version,
+	// and substrs[3] is the minor version #.
+	vcff->majorVersion = atoi(line + substrs[1].rm_so);
+	if (substrs[2].rm_so != -1)
+	    vcff->minorVersion = atoi(line + substrs[3].rm_so);
+	}
+    else
+	vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"",
+		   fileformatRegex, line);
+    }
+else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line))
+    {
+    boolean isInfo = startsWith("##INFO=", line);
+    if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)))
+	// substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description.
+	{
+	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
+	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
+	if (sameString(def->key, "."))
+	    def->fieldCount = -1;
+	else
+	    def->fieldCount = atoi(line + substrs[3].rm_so);
+	def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
+	def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
+	slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
+	}
+    else
+	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
+		   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, line);
+    }
+else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line))
+    {
+    boolean isFilter = startsWith("##FILTER", line);
+    if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)))
+	{
+	// substrs[2] is ID/key, substrs[3] is Description.
+	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
+	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
+	def->description = vcfFileCloneSubstr(vcff, line, substrs[3]);
+	slAddHead((isFilter ? &(vcff->filterDefs) : &(vcff->altDefs)), def);
+	}
+    else
+	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
+		   (isFilter ? "FILTER" : "ALT"), filterOrAltRegex, line);
+    }
+}
+
+static void expectColumnName(struct vcfFile *vcff, char *expected, char *words[], int ix)
+/* Every file must include a header naming the columns, though most column names are
+ * fixed; make sure the names of fixed columns are as expected. */
+{
+if (! sameString(expected, words[ix]))
+    vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" but got \"%s\"",
+	       ix+1, expected, words[ix]);
+}
+
+// There might be a whole lot of genotype columns...
+#define VCF_MAX_COLUMNS 16 * 1024
+
+static void parseColumnHeaderRow(struct vcfFile *vcff, char *line)
+/* Make sure column names are as we expect, and store genotype sample IDs if any are given. */
+{
+if (line[0] != '#')
+    {
+    vcfFileErr(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), "
+	       "not \"%s\"", line);
+    lineFileReuse(vcff->lf);
+    return;
+    }
+char *words[VCF_MAX_COLUMNS];
+int wordCount = chopLine(line+1, words);
+if (wordCount >= VCF_MAX_COLUMNS)
+    vcfFileErr(vcff, "header contains at least %d columns; "
+	       "VCF_MAX_COLUMNS may need to be increased in vcf.c!", VCF_MAX_COLUMNS);
+expectColumnName(vcff, "CHROM", words, 0);
+expectColumnName(vcff, "POS", words, 1);
+expectColumnName(vcff, "ID", words, 2);
+expectColumnName(vcff, "REF", words, 3);
+expectColumnName(vcff, "ALT", words, 4);
+expectColumnName(vcff, "QUAL", words, 5);
+expectColumnName(vcff, "FILTER", words, 6);
+expectColumnName(vcff, "INFO", words, 7);
+if (wordCount > 8)
+    {
+    expectColumnName(vcff, "FORMAT", words, 8);
+    if (wordCount < 10)
+	vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?");
+    vcff->genotypeCount = (wordCount - 9);
+    vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *));
+    int i;
+    for (i = 9;  i < wordCount;  i++)
+	vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]);
+    }
+}
+
+static struct vcfFile *vcfFileNew()
+/* Return a new, empty vcfFile object. */
+{
+struct vcfFile *vcff = NULL;
+AllocVar(vcff);
+vcff->metaDataHash = hashNew(0);
+vcff->pool = hashNew(0);
+return vcff;
+}
+
+static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr, FILE *errFh)
+/* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
+ * continue to parse until this number of error have been reached.  A maxErr
+ * less than zero does not stop and reports all errors. Write errors to errFh,
+ * if NULL, use stderr. */
+{
+initVcfSpecInfoDefs();
+initVcfSpecGtFormatDefs();
+if (lf == NULL)
+    return NULL;
+struct vcfFile *vcff = vcfFileNew();
+vcff->lf = lf;
+vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName);
+vcff->errFh = (errFh != NULL) ? errFh : stderr;
+vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr;
+
+char *line = NULL;
+// First, metadata lines beginning with "##":
+while (lineFileNext(lf, &line, NULL) && startsWith("##", line))
+    parseMetadataLine(vcff, line);
+slReverse(&(vcff->infoDefs));
+slReverse(&(vcff->filterDefs));
+slReverse(&(vcff->gtFormatDefs));
+// Did we get the bare minimum VCF header with supported version?
+if (vcff->majorVersion == 0)
+    vcfFileErr(vcff, "missing ##fileformat= header line?  Assuming 4.1.");
+if (vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1))
+    vcfFileErr(vcff, "VCFv%d.%d not supported -- only 4.0 or 4.1",
+	       vcff->majorVersion, vcff->minorVersion);
+// Next, one header line beginning with single "#" that names the columns:
+if (line == NULL)
+    // EOF after metadata
+    return vcff;
+parseColumnHeaderRow(vcff, line);
+return vcff;
+}
+
+
+static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, char *key)
+/* Look up the type of INFO component key, in the definitions from the header,
+ * and failing that, from the keys reserved in the spec. */
+ {
+ struct vcfInfoDef *def;
+ // I expect there to be fairly few definitions (less than a dozen) so
+ // I'm just doing a linear search not hash:
+ for (def = vcff->infoDefs;  def != NULL;  def = def->next)
+     {
+    if (sameString(key, def->key))
+	return def->type;
+    }
+for (def = vcfSpecInfoDefs;  def != NULL;  def = def->next)
+    {
+    if (sameString(key, def->key))
+	return def->type;
+    }
+vcfFileErr(vcff, "There is no INFO header defining \"%s\"", key);
+// default to string so we can display value as-is:
+return vcfInfoString;
+}
+
+#define VCF_MAX_INFO 512
+
+int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type, char *valStr,
+		   union vcfDatum **pData)
+/* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */
+{
+char *valWords[VCF_MAX_INFO];
+int count = chopCommas(valStr, valWords);
+struct vcfFile *vcff = record->file;
+union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum));
+int j;
+for (j = 0;  j < count;  j++)
+    switch (type)
+	{
+	case vcfInfoInteger:
+	    data[j].datInt = atoi(valWords[j]);
+	    break;
+	case vcfInfoFloat:
+	    data[j].datFloat = atof(valWords[j]);
+	    break;
+	case vcfInfoFlag:
+	    // No data value
+	    break;
+	case vcfInfoCharacter:
+	    data[j].datChar = valWords[j][0];
+	    break;
+	case vcfInfoString:
+	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
+	    break;
+	default:
+	    errAbort("invalid vcfInfoType (uninitialized?) %d", type);
+	    break;
+	}
+// If END is given, use it as chromEnd:
+if (sameString(infoKey, vcfInfoEnd))
+    record->chromEnd = data[0].datInt;
+*pData = data;
+return count;
+}
+
+static void parseInfoColumn(struct vcfFile *vcff, struct vcfRecord *record, char *string)
+/* Translate string into array of vcfInfoElement. */
+{
+char *elWords[VCF_MAX_INFO];
+record->infoCount = chopByChar(string, ';', elWords, ArraySize(elWords));
+if (record->infoCount >= VCF_MAX_INFO)
+    vcfFileErr(vcff, "INFO column contains at least %d elements; "
+	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
+record->infoElements = vcfFileAlloc(vcff, record->infoCount * sizeof(struct vcfInfoElement));
+int i;
+for (i = 0;  i < record->infoCount;  i++)
+    {
+    char *elStr = elWords[i];
+    char *eq = strchr(elStr, '=');
+    struct vcfInfoElement *el = &(record->infoElements[i]);
+    if (eq == NULL)
+	{
+	el->key = vcfFilePooledStr(vcff, elStr);
+	enum vcfInfoType type = typeForInfoKey(vcff, el->key);
+	if (type != vcfInfoFlag)
+	    {
+	    vcfFileErr(vcff, "Missing = after key in INFO element: \"%s\" (type=%d)",
+		       elStr, type);
+	    if (type == vcfInfoString)
+		el->values[i].datString = vcfFilePooledStr(vcff, "");
+	    }
+	continue;
+	}
+    *eq = '\0';
+    el->key = vcfFilePooledStr(vcff, elStr);
+    enum vcfInfoType type = typeForInfoKey(vcff, el->key);
+    char *valStr = eq+1;
+    el->count = parseInfoValue(record, el->key, type, valStr, &(el->values));
+    if (el->count >= VCF_MAX_INFO)
+	vcfFileErr(vcff, "A single element of the INFO column has at least %d values; "
+	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
+    }
+}
+
+static void vcfParseData(struct vcfFile *vcff)
+/* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned
+ * at the beginning of a data row, parse and store all data rows from lineFile. */
+{
+if (vcff == NULL)
+    return;
+int expected = 8;
+if (vcff->genotypeCount > 0)
+    expected = 9 + vcff->genotypeCount;
+char *words[VCF_MAX_COLUMNS];
+int wordCount;
+while ((wordCount = lineFileChop(vcff->lf, words)) > 0)
+    {
+    lineFileExpectWords(vcff->lf, expected, wordCount);
+    struct vcfRecord *record;
+    AllocVar(record);
+    record->file = vcff;
+    record->chrom = vcfFilePooledStr(vcff, words[0]);
+    record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;
+    // chromEnd may be modified by parseInfoColumn, if INFO column includes END.
+    record->chromEnd = record->chromStart + 1;
+    record->name = vcfFilePooledStr(vcff, words[2]);
+    record->ref = vcfFilePooledStr(vcff, words[3]);
+    record->alt = vcfFilePooledStr(vcff, words[4]);
+    record->qual = atof(words[5]); //#*** qual can be "." so we need to represent that
+    record->filter = vcfFilePooledStr(vcff, words[6]);
+    parseInfoColumn(vcff, record, words[7]);
+    if (vcff->genotypeCount > 0)
+	{
+	record->format = vcfFilePooledStr(vcff, words[8]);
+	record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
+						       vcff->genotypeCount * sizeof(char *));
+	int i;
+	// Don't bother actually parsing all these until & unless we need the info:
+	for (i = 0;  i < vcff->genotypeCount;  i++)
+	    record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
+	}
+    slAddHead(&(vcff->records), record);
+    }
+slReverse(&(vcff->records));
+lineFileClose(&(vcff->lf));
+}
+
+struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, FILE *errFh)
+/* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
+ * continue to parse until this number of error have been reached.  A maxErr
+ * less than zero does not stop and reports all errors. Write errors to errFh,
+ * if NULL, use stderr. */
+{
+struct lineFile *lf = NULL;
+if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) ||
+    startsWith("https://", fileOrUrl))
+    lf = netLineFileOpen(fileOrUrl);
+else
+    lf = lineFileMayOpen(fileOrUrl, TRUE);
+struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr, errFh);
+vcfParseData(vcff);
+return vcff;
+}
+
+struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
+				    int maxErr, FILE *errFh)
+/* Parse header and rows within the given position range from a VCF file that has been
+ * compressed and indexed by tabix into a vcfFile object; return NULL if or if file has
+ * no items in range.
+ * If maxErr not zero, then continue to parse until this number of error have been reached.
+ * A maxErr less than zero does not stop and reports all errors. Write errors to errFh,
+ * if NULL, use stderr. */
+{
+struct lineFile *lf = lineFileOnTabix(fileOrUrl, TRUE);
+struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr, errFh);
+if (vcff == NULL)
+    return NULL;
+if (! lineFileSetTabixRegion(lf, chrom, start, end))
+    return NULL;
+vcfParseData(vcff);
+return vcff;
+}
+
+void vcfFileFree(struct vcfFile **pVcff)
+/* Free a vcfFile object. */
+{
+if (pVcff == NULL || *pVcff == NULL)
+    return;
+struct vcfFile *vcff = *pVcff;
+hashFree(&(vcff->pool));
+hashFree(&(vcff->byName));
+hashFree(&(vcff->metaDataHash));
+lineFileClose(&(vcff->lf));
+freez(pVcff);
+}
+
+const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId)
+/* Return all records with name=variantId, or NULL if not found. */
+{
+struct vcfRecord *varList = NULL;
+if (vcff->byName == NULL)
+    {
+    vcff->byName = hashNew(0);
+    struct vcfRecord *rec;
+    for (rec = vcff->records;  rec != NULL;  rec = rec->next)
+	{
+	if (sameString (rec->name, variantId))
+	    {
+	    // Make shallow copy of rec so we can alter ->next:
+	    struct vcfRecord *newRec = vcfFileCloneVar(rec);
+	    slAddHead(&varList, newRec);
+	    }
+	hashAdd(vcff->byName, rec->name, rec);
+	}
+    slReverse(&varList);
+    }
+else
+    {
+    struct hashEl *hel = hashLookup(vcff->byName, variantId);
+    while (hel != NULL)
+	{
+	if (sameString(hel->name, variantId))
+	    {
+	    struct vcfRecord *rec = hel->val;
+	    struct vcfRecord *newRec = vcfFileCloneVar(rec);
+	    slAddHead(&varList, newRec);
+	    }
+	hel = hel->next;
+	}
+    // Don't reverse varList -- hash element list was already reversed
+    }
+return varList;
+}
+
+const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key)
+/* Find an INFO element, or NULL. */
+{
+int i;
+for (i = 0;  i < record->infoCount;  i++)
+    {
+    if (sameString(key, record->infoElements[i].key))
+	return &(record->infoElements[i]);
+    }
+return NULL;
+}
+
+static enum vcfInfoType typeForGtFormat(struct vcfFile *vcff, char *key)
+/* Look up the type of FORMAT component key, in the definitions from the header,
+ * and failing that, from the keys reserved in the spec. */
+{
+struct vcfInfoDef *def;
+// I expect there to be fairly few definitions (less than a dozen) so
+// I'm just doing a linear search not hash:
+for (def = vcff->gtFormatDefs;  def != NULL;  def = def->next)
+    {
+    if (sameString(key, def->key))
+	return def->type;
+    }
+for (def = vcfSpecGtFormatDefs;  def != NULL;  def = def->next)
+    {
+    if (sameString(key, def->key))
+	return def->type;
+    }
+vcfFileErr(vcff, "There is no FORMAT header defining \"%s\"", key);
+// default to string so we can display value as-is:
+return vcfInfoString;
+}
+
+#define VCF_MAX_FORMAT VCF_MAX_INFO
+#define VCF_MAX_FORMAT_LEN (VCF_MAX_FORMAT * 4)
+
+static void parseGenotypes(struct vcfRecord *record)
+/* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[].
+ * This destroys genotypesUnparsedStrings. */
+{
+struct vcfFile *vcff = record->file;
+record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype));
+char format[VCF_MAX_FORMAT_LEN];
+safecpy(format, sizeof(format), record->format);
+char *formatWords[VCF_MAX_FORMAT];
+int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords));
+if (formatWordCount >= VCF_MAX_FORMAT)
+    vcfFileErr(vcff, "The FORMAT column has at least %d words; "
+	       "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT);
+if (differentString(formatWords[0], vcfGtGenotype))
+    vcfFileErr(vcff, "FORMAT column should begin with \"%s\" but begins with \"%s\"",
+	       vcfGtGenotype, formatWords[0]);
+int i;
+for (i = 0;  i < vcff->genotypeCount;  i++)
+    {
+    char *string = record->genotypeUnparsedStrings[i];
+    struct vcfGenotype *gt = &(record->genotypes[i]);
+    // Each genotype can have multiple :-separated info elements:
+    char *gtWords[VCF_MAX_FORMAT];
+    int gtWordCount = chopByChar(string, ':', gtWords, ArraySize(gtWords));
+    if (gtWordCount != formatWordCount)
+	vcfFileErr(vcff, "The FORMAT column has %d words but the genotype column for %s "
+		   "has %d words", formatWordCount, vcff->genotypeIds[i], gtWordCount);
+    if (gtWordCount > formatWordCount)
+	gtWordCount = formatWordCount;
+    gt->id = vcff->genotypeIds[i];
+    gt->infoCount = gtWordCount;
+    gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement));
+    int j;
+    for (j = 0;  j < gtWordCount;  j++)
+	{
+	// Special parsing of genotype:
+	if (sameString(formatWords[j], vcfGtGenotype))
+	    {
+	    char *genotype = gtWords[j];
+	    char *sep = strchr(genotype, '|');
+	    if (sep != NULL)
+		gt->isPhased = TRUE;
+	    else
+		sep = strchr(genotype, '/');
+	    gt->hapIxA = atoi(genotype);
+	    if (sep == NULL)
+		gt->isHaploid = TRUE;
+	    else
+		gt->hapIxB = atoi(sep+1);
+	    }
+	struct vcfInfoElement *el = &(gt->infoElements[j]);
+	el->key = formatWords[j];
+	enum vcfInfoType type = typeForGtFormat(vcff, formatWords[j]);
+	el->count = parseInfoValue(record, formatWords[j], type, gtWords[j], &(el->values));
+	if (el->count >= VCF_MAX_INFO)
+	    vcfFileErr(vcff, "A single element of the genotype column for \"%s\" "
+		       "has at least %d values; "
+		       "VCF_MAX_INFO may need to be increased in vcf.c!",
+		       gt->id, VCF_MAX_INFO);
+	}
+
+    }
+record->genotypeUnparsedStrings = NULL;
+}
+
+const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId)
+/* Find the genotype and associated info for the individual, or return NULL. */
+{
+if (sampleId == NULL)
+    return NULL;
+struct vcfFile *vcff = record->file;
+if (record->genotypeUnparsedStrings != NULL && record->genotypes == NULL)
+    parseGenotypes(record);
+int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount);
+if (ix >= 0)
+    return &(record->genotypes[ix]);
+return NULL;
+}
+