ad3a176457ad1d21a0fedc47f349ec3484e751f1
angie
  Fri Feb 9 16:27:51 2018 -0800
Cleaning up some old ugliness about the size parameter to isAllDna and isAllNt.
hgc's printSnpAlignment code that parsed snpNNN.fa was using lineSize as length
but lineSize is length+1.  Then isAllDna was written with "i<size-1" as the
loop test instead of "i < size".  I didn't fix that properly when I separated
out isAllNt from isAllDna.
Later, I (re?)discovered that isAllNt needed length+1 as its size and just
added some FIXME comments.  Thanks Brian R for prodding me to actually fix it.
refs #20895

diff --git src/lib/vcf.c src/lib/vcf.c
index f980d12..246faaf 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -1,1603 +1,1598 @@
 /* VCF: Variant Call Format, version 4.0 / 4.1
  * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40
  * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
  */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "dnautil.h"
 #include "errAbort.h"
 #include <limits.h>
 #include "localmem.h"
 #include "net.h"
 #include "regexHelper.h"
 #include "vcf.h"
 
 /* Reserved but optional INFO keys: */
 const char *vcfInfoAncestralAllele = "AA";
 const char *vcfInfoPerAlleleGtCount = "AC";	// allele count in genotypes, for each ALT allele,
 						// in the same order as listed
 const char *vcfInfoAlleleFrequency = "AF";	// allele frequency for each ALT allele in the same
 						// order as listed: use this when estimated from
 						// primary data, not called genotypes
 const char *vcfInfoNumAlleles = "AN";		// total number of alleles in called genotypes
 const char *vcfInfoBaseQuality = "BQ";		// RMS base quality at this position
 const char *vcfInfoCigar = "CIGAR";		// cigar string describing how to align an
 						// alternate allele to the reference allele
 const char *vcfInfoIsDbSnp = "DB";		// dbSNP membership
 const char *vcfInfoDepth = "DP";		// combined depth across samples, e.g. DP=154
 const char *vcfInfoEnd = "END";			// end position of the variant described in this
 						// record (esp. for CNVs)
 const char *vcfInfoIsHapMap2 = "H2";		// membership in hapmap2
 const char *vcfInfoIsHapMap3 = "H3";		// membership in hapmap3
 const char *vcfInfoIs1000Genomes = "1000G";	// membership in 1000 Genomes
 const char *vcfInfoMappingQuality = "MQ";	// RMS mapping quality, e.g. MQ=52
 const char *vcfInfoMapQual0Count = "MQ0";	// number of MAPQ == 0 reads covering this record
 const char *vcfInfoNumSamples = "NS";		// Number of samples with data
 const char *vcfInfoStrandBias = "SB";		// strand bias at this position
 const char *vcfInfoIsSomatic = "SOMATIC";	// indicates that the record is a somatic mutation,
 						// for cancer genomics
 const char *vcfInfoIsValidated = "VALIDATED";	// validated by follow-up experiment
 
 /* Reserved but optional per-genotype keys: */
 const char *vcfGtGenotype = "GT";	// Integer allele indices separated by "/" (unphased)
 					// or "|" (phased). Allele values are 0 for
 					// reference allele, 1 for the first allele in ALT,
 					// 2 for the second allele in ALT and so on.
 const char *vcfGtDepth = "DP";		// Read depth at this position for this sample
 const char *vcfGtFilter = "FT";		// Analogous to variant's FILTER field
 const char *vcfGtLikelihoods = "GL";	// Floating point log10-scaled likelihoods for genotypes.
 					// If bi-allelic, order of genotypes is AA,AB,BB
 					// where A=ref and B=alt.
 					// If tri-allelic, AA,AB,BB,AC,BC,CC.
 const char *vcfGtPhred = "PL";		// Phred-scaled genotype likelihoods rounded to closest int
 					// Same ordering as GL
 const char *vcfGtConditionalQual = "GQ";// Conditional genotype quality
 					// i.e. phred quality -10log_10 P(genotype call is wrong,
 					// conditioned on the site's being variant)
 const char *vcfGtHaplotypeQualities = "HQ";	// two phred qualities comma separated
 const char *vcfGtPhaseSet = "PS";	// Set of phased genotypes to which this genotype belongs
 const char *vcfGtPhasingQuality = "PQ";	// Phred-scaled P(alleles ordered wrongly in heterozygote)
 const char *vcfGtExpectedAltAlleleCount = "EC";	// Typically used in association analyses
 
 
 // Make definitions of reserved INFO and genotype keys, in case people take them for
 // granted and don't make ##INFO headers for them:
 static struct vcfInfoDef *vcfSpecInfoDefs = NULL;
 static struct vcfInfoDef *vcfSpecGtFormatDefs = NULL;
 
 static void addInfoDef(struct vcfInfoDef **pList,
 		       const char *key, int fieldCount, enum vcfInfoType type, char *description)
 /* Allocate and initialize an info def and add it to pList. */
 {
 struct vcfInfoDef *def;
 AllocVar(def);
 def->key = (char *)key;
 def->fieldCount = fieldCount;
 def->type = type;
 def->description = description;
 slAddHead(pList, def);
 }
 
 static void initVcfSpecInfoDefs()
 /* Make linked list of INFO defs reserved in the spec. */
 {
 if (vcfSpecInfoDefs != NULL)
     return;
 addInfoDef(&vcfSpecInfoDefs, vcfInfoAncestralAllele, 1, vcfInfoString, "Ancestral allele");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoPerAlleleGtCount, 1, vcfInfoInteger,
 	   "Allele count in genotypes, for each ALT allele, in the same order as listed");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoAlleleFrequency, -1, vcfInfoFloat,
 	   "Allele frequency for each ALT allele in the same order as listed: "
 	   "use this when estimated from primary data, not called genotypes");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoNumAlleles, 1, vcfInfoInteger,
 	   "Total number of alleles in called genotypes");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoBaseQuality, 1, vcfInfoFloat,
 	   "RMS base quality at this position");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoCigar, 1, vcfInfoString,
 	   "CIGAR string describing how to align an alternate allele to the reference allele");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsDbSnp, 0, vcfInfoFlag, "dbSNP membership");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoDepth, 1, vcfInfoString,
 	   "Combined depth across samples, e.g. DP=154");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoEnd, 1, vcfInfoInteger,
 	   "End position of the variant described in this record "
 	   "(especially for structural variants)");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsHapMap2, 1, vcfInfoFlag, "Membership in HapMap 2");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsHapMap3, 1, vcfInfoFlag, "Membership in HapMap 3");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIs1000Genomes, 1, vcfInfoFlag, "Membership in 1000 Genomes");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoMappingQuality, 1, vcfInfoFloat,
 	   "RMS mapping quality, e.g. MQ=52");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoMapQual0Count, 1, vcfInfoInteger,
 	   "Number of MAPQ == 0 reads covering this record");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoNumSamples, 1, vcfInfoInteger, "Number of samples with data");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoStrandBias, 1, vcfInfoString, "Strand bias at this position");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsSomatic, 1, vcfInfoFlag,
 	   "Indicates that the record is a somatic mutation, for cancer genomics");
 addInfoDef(&vcfSpecInfoDefs, vcfInfoIsValidated, 1, vcfInfoFlag,
 	   "Validated by follow-up experiment");
 }
 
 
 static void initVcfSpecGtFormatDefs()
 /* Make linked list of genotype info defs reserved in spec. */
 {
 if (vcfSpecGtFormatDefs != NULL)
     return;
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtGenotype, 1, vcfInfoString,
 	   "Integer allele indices separated by \"/\" (unphased) "
 	   "or \"|\" (phased). Allele values are 0 for "
 	   "reference allele, 1 for the first allele in ALT, "
 	   "2 for the second allele in ALT and so on, or \".\" for unknown");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtDepth, 1, vcfInfoInteger,
 	   "Read depth at this position for this sample");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtFilter, 1, vcfInfoString,
 	   "PASS to indicate that all filters have been passed, "
 	   "a semi-colon separated list of codes for filters that fail, "
 	   "or \".\" to indicate that filters have not been applied");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtLikelihoods, -1, vcfInfoFloat,
 	   "Genotype likelihoods comprised of comma separated floating point "
 	   "log10-scaled likelihoods for all possible genotypes given the set "
 	   "of alleles defined in the REF and ALT fields. ");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhred, -1, vcfInfoInteger,
 	   "Phred-scaled genotype likelihoods rounded to the closest integer "
 	   "(and otherwise defined precisely as the genotype likelihoods (GL) field)");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtConditionalQual, -1, vcfInfoFloat,
 	   "phred-scaled genotype posterior probabilities "
 	   "(and otherwise defined precisely as the genotype likelihoods (GL) field)"
 	   "; intended to store imputed genotype probabilities");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtHaplotypeQualities, 2, vcfInfoFloat,
 	   "Two comma-separated phred-scaled haplotype qualities");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhaseSet, 1, vcfInfoFloat,
 	   "A set of phased genotypes to which this genotype belongs");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtPhasingQuality, 1, vcfInfoFloat,
 	   "Phasing quality, the phred-scaled probability that alleles are ordered "
 	   "incorrectly in a heterozygote (against all other members in the phase set)");
 addInfoDef(&vcfSpecGtFormatDefs, vcfGtExpectedAltAlleleCount, 1, vcfInfoFloat,
 	   "Expected alternate allele count (typically used in association analyses)");
 }
 
 static bool vcfFileStopDueToErrors(struct vcfFile *vcff)
 /* determine if we should stop due to the number of errors */
 {
 return vcff->errCnt > vcff->maxErr;
 }
 
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 #if defined(__GNUC__)
 __attribute__((format(printf, 2, 3)))
 #endif
 ;
 
 static void vaVcfWarn(struct vcfFile *vcff, char *format, va_list args)
 /* Add a little bit of info like file position to warning */
 {
 if (vcff->lf != NULL)
     {
     char formatPlus[1024];
     safef(formatPlus, sizeof(formatPlus), 
 	"%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format);
     vaWarn(formatPlus, args);
     }
 else
     vaWarn(format, args);
 }
 
 static void vcfFileWarn(struct vcfFile *vcff, char *format, ...)
 /* Send error message to errabort stack's warn handler and abort */
 {
 va_list args;
 va_start(args, format);
 vaVcfWarn(vcff, format, args);
 }
 
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 /* Send error message to errabort stack's warn handler and abort */
 {
 vcff->errCnt++;
 if (vcff->maxErr == VCF_IGNORE_ERRS)
     return;
 va_list args;
 va_start(args, format);
 vaVcfWarn(vcff, format, args);
 if (vcfFileStopDueToErrors(vcff))
     errAbort("VCF: %d parser errors, quitting", vcff->errCnt);
 }
 
 static void *vcfFileAlloc(struct vcfFile *vcff, size_t size)
 /* Use vcff's local mem to allocate memory. */
 {
 return lmAlloc( vcfFileLm(vcff), size);
 }
 
 INLINE char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size)
 /* Use vcff's local mem to allocate memory for a string and copy it. */
 {
 return lmCloneStringZ( vcfFileLm(vcff), str, size);
 }
 
 INLINE char *vcfFileCloneStr(struct vcfFile *vcff, char *str)
 /* Use vcff's local mem to allocate memory for a string and copy it. */
 {
 return vcfFileCloneStrZ(vcff, str, strlen(str));
 }
 
 INLINE char *vcfFileCloneSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
 /* Allocate memory for and copy a substring of line. */
 {
 return vcfFileCloneStrZ(vcff, line+substr.rm_so, (substr.rm_eo - substr.rm_so));
 }
 
 #define vcfFileCloneVar(vcff, var) lmCloneMem( vcfFileLm(vcff), var, sizeof(var))
 
 char *vcfFilePooledStr(struct vcfFile *vcff, char *str)
 /* Allocate memory for a string from vcff's shared string pool. */
 {
 return hashStoreName(vcff->pool, str);  // Always stored in main pool, not reuse pool
 }
 
 static enum vcfInfoType vcfInfoTypeFromSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
 /* Translate substring of line into vcfInfoType or complain. */
 {
 char typeWord[16];
 int substrLen = substr.rm_eo - substr.rm_so;
 if (substrLen > sizeof(typeWord) - 1)
     {
     vcfFileErr(vcff, "substring passed to vcfInfoTypeFromSubstr is too long.");
     return vcfInfoString;
     }
 safencpy(typeWord, sizeof(typeWord), line + substr.rm_so, substrLen);
 if (sameString("Integer", typeWord))
     return vcfInfoInteger;
 if (sameString("Float", typeWord))
     return vcfInfoFloat;
 if (sameString("Flag", typeWord))
     return vcfInfoFlag;
 if (sameString("Character", typeWord))
     return vcfInfoCharacter;
 if (sameString("String", typeWord))
     return vcfInfoString;
 vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line);
 return vcfInfoString;
 }
 
 // Regular expressions to check format and extract information from header lines:
 static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$";
 static const char *infoOrFormatRegex =
     "^##(INFO|FORMAT)="
     "<ID=([\\.+A-Za-z0-9_:-]+),"
     "Number=([\\.AGR]|[0-9-]+),"
     "Type=([A-Za-z]+),"
     "Description=\"?(.*)\"?>$";
 static const char *filterOrAltRegex =
     "^##(FILTER|ALT)="
     "<ID=([^,]+),"
     "(Description|Type)=\"?(.*)\"?>$";
 // VCF version 3.3 was different enough to warrant separate regexes:
 static const char *infoOrFormatRegex3_3 =
     "^##(INFO|FORMAT)="
     "([A-Za-z0-9_:-]+),"
     "(\\.|A|G|[0-9-]+),"
     "([A-Za-z]+),"
     "\"?(.*)\"?$";
 static const char *filterRegex3_3 =
     "^##(FILTER)="
     "([^,]+),"
     "()\"?(.*)\"?$";
 
 INLINE void nonAsciiWorkaround(char *line)
 // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files:
 {
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\"");
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\"");
 }
 
 static void parseMetadataLine(struct vcfFile *vcff, char *line)
 /* Parse a VCF header line beginning with "##" that defines a metadata. */
 {
 char *ptr = line;
 if (ptr == NULL && !startsWith(ptr, "##"))
     errAbort("Bad line passed to parseMetadataLine");
 ptr += 2;
 char *firstEq = strchr(ptr, '=');
 if (firstEq == NULL)
     {
     if (vcff->majorVersion > 4 ||
 	(vcff->majorVersion == 4 && vcff->minorVersion > 0))
 	vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line);
     return;
     }
 regmatch_t substrs[8];
 // Some of the metadata lines are crucial for parsing the rest of the file:
 if (startsWith("##fileformat=", line) || startsWith("##format", line))
     {
     if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs)))
 	{
 	// substrs[2] is major version #, substrs[3] is set only if there is a minor version,
 	// and substrs[4] is the minor version #.
 	vcff->majorVersion = atoi(line + substrs[2].rm_so);
 	if (substrs[3].rm_so != -1)
 	    vcff->minorVersion = atoi(line + substrs[4].rm_so);
 	}
     else
 	vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"",
 		   fileformatRegex, line);
     }
 else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line))
     {
     boolean isInfo = startsWith("##INFO=", line);
     nonAsciiWorkaround(line);
     if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)) ||
 	regexMatchSubstr(line, infoOrFormatRegex3_3, substrs, ArraySize(substrs)))
 	// substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description.
 	{
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	char *number = vcfFileCloneSubstr(vcff, line, substrs[3]);
 	if (sameString(number, ".") || sameString(number, "A") || sameString(number, "G"))
 	    // A is #alts which varies line-to-line; "G" is #genotypes which we haven't
 	    // yet seen.  Why is there a G here -- shouldn't such attributes go in the
 	    // genotype columns?
 	    def->fieldCount = -1;
 	else
 	    def->fieldCount = atoi(number);
 	def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
 	// greedy regex pulls in end quote, trim if found:
 	if (line[substrs[5].rm_eo-1] == '"')
 	    line[substrs[5].rm_eo-1] = '\0';
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
         char *p = NULL;
         if ((p = strstr(def->description, "\",Source=\"")) ||
             (p = strstr(def->description, "\",Version=\"")))
             *p = '\0';
 	slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
 	}
     else
 	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/ or /%s/: \"%s\"",
 		   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, infoOrFormatRegex3_3, line);
     }
 else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line))
     {
     boolean isFilter = startsWith("##FILTER", line);
     if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)) ||
 	regexMatchSubstr(line, filterRegex3_3, substrs, ArraySize(substrs)))
 	{
 	// substrs[2] is ID/key, substrs[4] is Description.
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	// greedy regex pulls in end quote, trim if found:
 	if (line[substrs[4].rm_eo-1] == '"')
 	    line[substrs[4].rm_eo-1] = '\0';
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[4]);
 	slAddHead((isFilter ? &(vcff->filterDefs) : &(vcff->altDefs)), def);
 	}
     else
 	{
 	if (isFilter)
 	    vcfFileErr(vcff, "##FILTER line does not match expected pattern /%s/ or /%s/: \"%s\"",
 		       filterOrAltRegex, filterRegex3_3, line);
 	else
 	    vcfFileErr(vcff, "##ALT line does not match expected pattern /%s/: \"%s\"",
 		       filterOrAltRegex, line);
 	}
     }
 }
 
 static void expectColumnName2(struct vcfFile *vcff, char *exp1, char *exp2, char *words[], int ix)
 /* Every file must include a header naming the columns, though most column names are
  * fixed; make sure the names of fixed columns are as expected. */
 {
 if (! sameString(exp1, words[ix]))
     {
     if (exp2 == NULL)
 	vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" but got \"%s\"",
 		   ix+1, exp1, words[ix]);
     else if (! sameString(exp2, words[ix]))
 	vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\"  or \"%s\" "
 		   "but got \"%s\"", ix+1, exp1, exp2, words[ix]);
     }
 }
 
 #define expectColumnName(vcff, exp, words, ix) expectColumnName2(vcff, exp, NULL, words, ix)
 
 // There might be a whole lot of genotype columns...
 #define VCF_MAX_COLUMNS 16 * 1024
 
 char *vcfDefaultHeader = "#CHROM POS ID REF ALT QUAL FILTER INFO";
 /* Default header if we have none. */
 
 static void parseColumnHeaderRow(struct vcfFile *vcff, char *line)
 /* Make sure column names are as we expect, and store genotype sample IDs if any are given. */
 {
 char *words[VCF_MAX_COLUMNS];
 int wordCount = chopLine(line+1, words);
 if (wordCount >= VCF_MAX_COLUMNS)
     vcfFileErr(vcff, "header contains at least %d columns; "
 	       "VCF_MAX_COLUMNS may need to be increased in vcf.c!", VCF_MAX_COLUMNS);
 expectColumnName(vcff, "CHROM", words, 0);
 expectColumnName(vcff, "POS", words, 1);
 expectColumnName(vcff, "ID", words, 2);
 expectColumnName(vcff, "REF", words, 3);
 expectColumnName(vcff, "ALT", words, 4);
 expectColumnName2(vcff, "QUAL", "PROB", words, 5);
 expectColumnName(vcff, "FILTER", words, 6);
 expectColumnName(vcff, "INFO", words, 7);
 if (wordCount > 8)
     {
     expectColumnName(vcff, "FORMAT", words, 8);
     if (wordCount < 10)
 	vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?");
     vcff->genotypeCount = (wordCount - 9);
     vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *));
     int i;
     for (i = 9;  i < wordCount;  i++)
 	vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]);
     }
 }
 
 struct vcfFile *vcfFileNew()
 /* Return a new, empty vcfFile object. */
 {
 struct vcfFile *vcff = NULL;
 AllocVar(vcff);
 vcff->pool = hashNew(0);
 vcff->reusePool = NULL;  // Must explicitly request a separate record pool
 return vcff;
 }
 
 void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize)
 // Creates a separate memory pool for records.  Establishing this pool allows
 // using vcfFileFlushRecords to abandon previously read records and free
 // the associated memory. Very useful when reading an entire file in batches.
 {
 assert(vcff->reusePool == NULL); // don't duplicate this
 vcff->reusePool = lmInit(initialSize);
 }
 
 void vcfFileFlushRecords(struct vcfFile *vcff)
 // Abandons all previously read vcff->records and flushes the reuse pool (if it exists).
 // USE WITH CAUTION.  All previously allocated record pointers are now invalid.
 {
 if (vcff->reusePool != NULL)
     {
     size_t poolSize = lmSize(vcff->reusePool);
     //if (poolSize > (48 * 1024 * 1024))
     //    printf("\nReuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),poolSize);
     lmCleanup(&vcff->reusePool);
     vcff->reusePool = lmInit(poolSize);
     }
 vcff->records = NULL;
 }
 
 static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr)
 /* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
  * continue to parse until this number of error have been reached.  A maxErr
  * less than zero does not stop and reports all errors.
  * Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 initVcfSpecInfoDefs();
 initVcfSpecGtFormatDefs();
 if (lf == NULL)
     return NULL;
 struct vcfFile *vcff = vcfFileNew();
 vcff->lf = lf;
 vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName);
 vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr;
 
 struct dyString *dyHeader = dyStringNew(1024);
 char *line = NULL;
 // First, metadata lines beginning with "##":
 while (lineFileNext(lf, &line, NULL) && startsWith("##", line))
     {
     dyStringAppend(dyHeader, line);
     dyStringAppendC(dyHeader, '\n');
     parseMetadataLine(vcff, line);
     }
 slReverse(&(vcff->infoDefs));
 slReverse(&(vcff->filterDefs));
 slReverse(&(vcff->gtFormatDefs));
 // Did we get the bare minimum VCF header with supported version?
 if (vcff->majorVersion == 0)
     {
     vcfFileWarn(vcff, "missing ##fileformat= header line?  Assuming 4.1.");
     vcff->majorVersion = 4;
     vcff->minorVersion = 1;
     }
 if ((vcff->majorVersion != 4 || vcff->minorVersion < 0 || vcff->minorVersion > 2) &&
     (vcff->majorVersion != 3))
     vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0, v4.1 or v4.2",
 	       vcff->majorVersion, vcff->minorVersion);
 // Next, one header line beginning with single "#" that names the columns:
 if (line == NULL)
     // EOF after metadata
     return vcff;
 char headerLineBuf[256];
 if (line[0] != '#')
     {
     lineFileReuse(lf);
     vcfFileWarn(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), "
 	       "assuming default VCF 4.1 columns");
     safef(headerLineBuf, sizeof(headerLineBuf), "%s", vcfDefaultHeader);
     line = headerLineBuf;
     }
 dyStringAppend(dyHeader, line);
 dyStringAppendC(dyHeader, '\n');
 parseColumnHeaderRow(vcff, line);
 vcff->headerString = dyStringCannibalize(&dyHeader);
 return vcff;
 }
 
 struct vcfFile *vcfFileFromHeader(char *name, char *headerString, int maxErr)
 /* Parse the VCF header string into a vcfFile object with no rows.
  * name is for error reporting.
  * If maxErr is non-negative then continue to parse until maxErr+1 errors have been found.
  * A maxErr less than zero does not stop and reports all errors.
  * Set maxErr to VCF_IGNORE_ERRS for silence. */
 {
 struct lineFile *lf = lineFileOnString(name, TRUE, cloneString(headerString));
 return vcfFileHeaderFromLineFile(lf, maxErr);
 }
 
 #define VCF_MAX_INFO (4*1024)
 
 static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt)
 /* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns.
  * Use the length of the reference sequence to set record->chromEnd.
  * Note: this trashes the alt argument, since this is expected to be its last use. */
 {
 char *altAlleles[VCF_MAX_INFO];
 int altCount = chopCommas(alt, altAlleles);
 record->alleleCount = 1 + altCount;
 record->alleles = vcfFileAlloc(vcff, record->alleleCount * sizeof(record->alleles[0]));
 record->alleles[0] = vcfFilePooledStr(vcff, ref);
 int i;
 for (i = 0;  i < altCount;  i++)
     record->alleles[1+i] = vcfFilePooledStr(vcff, altAlleles[i]);
 int refLen = strlen(ref);
 if (refLen == dnaFilteredSize(ref))
     record->chromEnd = record->chromStart + refLen;
 }
 
 static void parseFilterColumn(struct vcfFile *vcff, struct vcfRecord *record, char *filterStr)
 /* Transform ;-separated filter codes into count + string array. */
 {
 // We don't want to modify something allocated with vcfFilePooledStr because that uses
 // hash element names for storage!  So don't make a vcfFilePooledStr copy of filterStr and
 // chop that; instead, chop a temp string and pool the words separately.
 static struct dyString *tmp = NULL;
 if (tmp == NULL)
     tmp = dyStringNew(0);
 dyStringClear(tmp);
 dyStringAppend(tmp, filterStr);
 record->filterCount = countChars(filterStr, ';') + 1;
 record->filters = vcfFileAlloc(vcff, record->filterCount * sizeof(char **));
 (void)chopByChar(tmp->string, ';', record->filters, record->filterCount);
 int i;
 for (i = 0;  i < record->filterCount;  i++)
     record->filters[i] = vcfFilePooledStr(vcff, record->filters[i]);
 }
 
 struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key)
 /* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */
 {
 struct vcfInfoDef *def;
 // I expect there to be fairly few definitions (less than a dozen) so
 // I'm just doing a linear search not hash:
 for (def = vcff->infoDefs;  def != NULL;  def = def->next)
     {
     if (sameString(key, def->key))
 	return def;
     }
 for (def = vcfSpecInfoDefs;  def != NULL;  def = def->next)
     {
     if (sameString(key, def->key))
 	return def;
     }
 return NULL;
 }
 
 static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, const char *key)
 /* Look up the type of INFO component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 {
 struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key);
 return def ? def->type : vcfInfoString;
 }
 
 static int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type,
 			  char *valStr, union vcfDatum **pData, bool **pMissingData)
 /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */
 {
 char *valWords[VCF_MAX_INFO];
 int count = chopCommas(valStr, valWords);
 struct vcfFile *vcff = record->file;
 union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum));
 bool *missingData = vcfFileAlloc(vcff, count * sizeof(*missingData));
 int j;
 for (j = 0;  j < count;  j++)
     {
     if (type != vcfInfoString && type != vcfInfoCharacter && sameString(valWords[j], "."))
 	missingData[j] = TRUE;
     switch (type)
 	{
 	case vcfInfoInteger:
 	    data[j].datInt = atoi(valWords[j]);
 	    break;
 	case vcfInfoFloat:
 	    data[j].datFloat = atof(valWords[j]);
 	    break;
 	case vcfInfoFlag:
 	    // Flag key might have a value in older VCFs e.g. 3.2's DB=0, DB=1
 	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
 	    break;
 	case vcfInfoCharacter:
 	    data[j].datChar = valWords[j][0];
 	    break;
 	case vcfInfoString:
 	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
 	    break;
 	default:
 	    errAbort("invalid vcfInfoType (uninitialized?) %d", type);
 	    break;
 	}
     }
 // If END is given, use it as chromEnd:
 if (sameString(infoKey, vcfInfoEnd))
     record->chromEnd = data[0].datInt;
 *pData = data;
 *pMissingData = missingData;
 return count;
 }
 
 static void parseInfoColumn(struct vcfFile *vcff, struct vcfRecord *record, char *string)
 /* Translate string into array of vcfInfoElement. */
 {
 if (sameString(string, "."))
     {
     record->infoCount = 0;
     return;
     }
 char *elWords[VCF_MAX_INFO];
 record->infoCount = chopByChar(string, ';', elWords, ArraySize(elWords));
 if (record->infoCount >= VCF_MAX_INFO)
     vcfFileErr(vcff, "INFO column contains at least %d elements; "
 	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
 record->infoElements = vcfFileAlloc(vcff, record->infoCount * sizeof(struct vcfInfoElement));
 char *emptyString = vcfFilePooledStr(vcff, "");
 int i;
 for (i = 0;  i < record->infoCount;  i++)
     {
     char *elStr = elWords[i];
     char *eq = strchr(elStr, '=');
     struct vcfInfoElement *el = &(record->infoElements[i]);
     if (eq == NULL)
 	{
 	el->key = vcfFilePooledStr(vcff, elStr);
 	enum vcfInfoType type = typeForInfoKey(vcff, el->key);
 	if (type != vcfInfoFlag)
 	    {
 	    struct vcfInfoDef *def = vcfInfoDefForKey(vcff, el->key);
 	    // Complain only if we are expecting a particular number of values for this keyword:
 	    if (def != NULL && def->fieldCount >= 0)
 		vcfFileErr(vcff, "Missing = after key in INFO element: \"%s\" (type=%d)",
 			   elStr, type);
 	    if (type == vcfInfoString)
 		{
 		el->values = vcfFileAlloc(vcff, sizeof(union vcfDatum));
 		el->values[0].datString = emptyString;
 		}
 	    }
 	continue;
 	}
     *eq = '\0';
     el->key = vcfFilePooledStr(vcff, elStr);
     enum vcfInfoType type = typeForInfoKey(vcff, el->key);
     char *valStr = eq+1;
     el->count = parseInfoValue(record, el->key, type, valStr, &(el->values), &(el->missingData));
     if (el->count >= VCF_MAX_INFO)
 	vcfFileErr(vcff, "A single element of the INFO column has at least %d values; "
 	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
     }
 }
 
 struct vcfRecord *vcfRecordFromRow(struct vcfFile *vcff, char **words)
 /* Parse words from a VCF data line into a VCF record structure. */
 {
 struct vcfRecord *record = vcfFileAlloc(vcff, sizeof(struct vcfRecord));
 record->file = vcff;
 record->chrom = vcfFilePooledStr(vcff, words[0]);
 record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;
 // chromEnd may be overwritten by parseRefAndAlt and parseInfoColumn.
 record->chromEnd = record->chromStart+1;
 record->name = vcfFilePooledStr(vcff, words[2]);
 parseRefAndAlt(vcff, record, words[3], words[4]);
 record->qual = vcfFilePooledStr(vcff, words[5]);
 parseFilterColumn(vcff, record, words[6]);
 parseInfoColumn(vcff, record, words[7]);
 if (vcff->genotypeCount > 0)
     {
     record->format = vcfFilePooledStr(vcff, words[8]);
     record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
 						   vcff->genotypeCount * sizeof(char *));
     int i;
     // Don't bother actually parsing all these until & unless we need the info:
     for (i = 0;  i < vcff->genotypeCount;  i++)
 	record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
     }
 return record;
 }
 
 static int checkWordCount(struct vcfFile *vcff, char **words, int wordCount)
 // Compensate for error in 1000 Genomes Phase 1 file
 // ALL.chr21.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
 // which has some lines that have an extra "\t" at the end of line,
 // causing the wordCount to be too high by 1:
 {
 int expected = 8;
 if (vcff->genotypeCount > 0)
     expected = 9 + vcff->genotypeCount;
 if (wordCount == expected+1 && words[expected][0] == '\0')
     wordCount--;
 lineFileExpectWords(vcff->lf, expected, wordCount);
 return wordCount;
 }
 
 struct vcfRecord *vcfNextRecord(struct vcfFile *vcff)
 /* Parse the words in the next line from vcff into a vcfRecord. Return NULL at end of file.
  * Note: this does not store record in vcff->records! */
 {
 char *words[VCF_MAX_COLUMNS];
 int wordCount;
 if ((wordCount = lineFileChopTab(vcff->lf, words)) <= 0)
     return NULL;
 wordCount = checkWordCount(vcff, words, wordCount);
 return vcfRecordFromRow(vcff, words);
 }
 
 static boolean noAltAllele(char **alleles, int alleleCount)
 /* Return true if there is no alternate allele (missing value ".") or the given alternate allele
  * is the same as the reference allele. */
 {
 return (alleleCount == 2 &&
         (sameString(alleles[0], alleles[1]) || sameString(".", alleles[1])));
 }
 
 static boolean allelesHavePaddingBase(char **alleles, int alleleCount)
 /* Examine alleles to see if they either a) all start with the same base or
  * b) include a symbolic or 0-length allele.  In either of those cases, there
  * must be an initial padding base that we'll need to trim from non-symbolic
  * alleles. */
 {
 if (sameString(alleles[0], "-"))
     return FALSE;
 else if (noAltAllele(alleles, alleleCount))
     // Don't trim assertion of no change (ref == alt)
     return FALSE;
 boolean hasPaddingBase = TRUE;
 char firstBase = '\0';
-if (isAllNt(alleles[0], strlen(alleles[0])
-            +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
+if (isAllNt(alleles[0], strlen(alleles[0])))
     firstBase = alleles[0][0];
 int i;
 for (i = 1;  i < alleleCount;  i++)
     {
     if (sameString(alleles[i], "-"))
         {
         hasPaddingBase = FALSE;
         break;
         }
-    else if (isAllNt(alleles[i], strlen(alleles[i])
-                     +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
+    else if (isAllNt(alleles[i], strlen(alleles[i])))
 	{
 	if (firstBase == '\0')
 	    firstBase = alleles[i][0];
 	if (alleles[i][0] != firstBase)
 	    // Different first base implies unpadded alleles.
 	    hasPaddingBase = FALSE;
 	}
     else if (sameString(alleles[i], "<X>") || sameString(alleles[i], "<*>"))
         {
         // Special case for samtools mpileup "<X>" or gVCF "<*>" (no alternate allele observed) --
         // being symbolic doesn't make this an indel and ref base is not necessarily padded.
         hasPaddingBase = FALSE;
         }
     else
 	{
 	// Symbolic ALT allele: Indel, so REF must have the padding base.
 	hasPaddingBase = TRUE;
 	break;
 	}
     }
 return hasPaddingBase;
 }
 
 unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec)
 /* For indels, VCF includes the left neighboring base; for example, if the alleles are
  * AA/- following a G base, then the VCF record will start one base to the left and have
  * "GAA" and "G" as the alleles.  Also, if any alt allele is symbolic (e.g. <DEL>) then
  * the ref allele must have a padding base.
  * That is not nice for display for two reasons:
  * 1. Indels appear one base wider than their dbSNP entries.
  * 2. In pgSnp display mode, the two alleles are always the same color.
  * However, for hgTracks' mapBox we need the correct chromStart for identifying the
  * record in hgc -- so return the original chromStart. */
 {
 unsigned int chromStartOrig = rec->chromStart;
 struct vcfFile *vcff = rec->file;
 if (rec->alleleCount > 1)
     {
     boolean hasPaddingBase = allelesHavePaddingBase(rec->alleles, rec->alleleCount);
     if (hasPaddingBase)
 	{
 	rec->chromStart++;
 	int i;
 	for (i = 0;  i < rec->alleleCount;  i++)
 	    {
 	    if (rec->alleles[i][1] == '\0')
 		rec->alleles[i] = vcfFilePooledStr(vcff, "-");
-	    else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i])
-                             +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
+	    else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i])))
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1);
 	    else // don't trim first character of symbolic allele
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]);
 	    }
 	}
     }
 return chromStartOrig;
 }
 
 static boolean allEndsGEStartsAndIdentical(char **starts, char **ends, int count)
 /* Given two arrays with <count> elements, return true if all strings in ends[] are
  * greater than or equal to the corresponding strings in starts[], and all ends[]
  * have the same char. */
 {
 int i;
 char refEnd = ends[0][0];
 for (i = 0;  i < count;  i++)
     {
     if (ends[i] < starts[i] || ends[i][0] != refEnd)
 	return FALSE;
     }
 return TRUE;
 }
 
 static int countIdenticalBasesRight(char **alleles, int alCount)
 /* Return the number of bases that are identical at the end of each allele (usually 0). */
 {
 if (noAltAllele(alleles, alCount))
     // Don't trim assertion of no change (ref == alt)
     return 0;
 char *alleleEnds[alCount];
 int i;
 for (i = 0;  i < alCount;  i++)
     {
     int alLen = strlen(alleles[i]);
     // If any allele is symbolic, don't try to trim.
     if (sameString(alleles[i], "-") ||
-        !isAllNt(alleles[i], alLen
-                 +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
+        !isAllNt(alleles[i], alLen))
 	return 0;
     alleleEnds[i] = alleles[i] + alLen-1;
     }
 int trimmedBases = 0;
 while (allEndsGEStartsAndIdentical(alleles, alleleEnds, alCount))
     {
     trimmedBases++;
     // Trim identical last base of alleles and move alleleEnds[] items back.
     for (i = 0;  i < alCount;  i++)
 	alleleEnds[i]--;
     }
 return trimmedBases;
 }
 
 unsigned int vcfRecordTrimAllelesRight(struct vcfRecord *rec)
 /* Some tools output indels with extra base to the right, for example ref=ACC, alt=ACCC
  * which should be ref=A, alt=AC.  When the extra bases make the variant extend from an
  * intron (or gap) into an exon, it can cause a false appearance of a frameshift.
  * To avoid this, when all alleles have identical base(s) at the end, trim all of them,
  * and update rec->chromEnd.
  * For hgTracks' mapBox we need the correct chromStart for identifying the record in hgc,
  * so return the original chromEnd. */
 {
 unsigned int chromEndOrig = rec->chromEnd;
 int alCount = rec->alleleCount;
 char **alleles = rec->alleles;
 int trimmedBases = countIdenticalBasesRight(alleles, alCount);
 if (trimmedBases > 0)
     {
     struct vcfFile *vcff = rec->file;
     // Allocate new pooled strings for the trimmed alleles.
     int i;
     for (i = 0;  i < alCount;  i++)
 	{
 	int alLen = strlen(alleles[i]); // alLen >= trimmedBases > 0
 	char trimmed[alLen+1];
 	safencpy(trimmed, sizeof(trimmed), alleles[i], alLen - trimmedBases);
 	if (isEmpty(trimmed))
 	    safencpy(trimmed, sizeof(trimmed), "-", 1);
 	alleles[i] = vcfFilePooledStr(vcff, trimmed);
 	}
     rec->chromEnd -= trimmedBases;
     }
 return chromEndOrig;
 }
 
 static boolean chromsMatch(char *chromA, char *chromB)
 // Return TRUE if chromA and chromB are non-NULL and identical, possibly ignoring
 // "chr" at the beginning of one but not the other.
 {
 if (chromA == NULL || chromB == NULL)
     return FALSE;
 char *chromAPlus = startsWith("chr", chromA) ? chromA+3 : chromA;
 char *chromBPlus = startsWith("chr", chromB) ? chromB+3 : chromB;
 return sameString(chromAPlus, chromBPlus);
 }
 
 static struct vcfRecord *vcfParseData(struct vcfFile *vcff, char *chrom, int start, int end,
 				      int maxRecords)
 // Given a vcfFile into which the header has been parsed, and whose
 // lineFile is positioned at the beginning of a data row, parse and
 // return all data rows (in region, if chrom is non-NULL) from lineFile,
 // up to maxRecords.
 {
 if (vcff == NULL)
     return NULL;
 int recCount = 0;
 struct vcfRecord *records = NULL;
 struct vcfRecord *record;
 while ((record = vcfNextRecord(vcff)) != NULL)
     {
     if (maxRecords >= 0 && recCount >= maxRecords)
         break;
     if (chrom == NULL)
 	{
 	slAddHead(&records, record);
 	recCount++;
 	}
     else if (chromsMatch(chrom, record->chrom))
 	{
 	if (end > 0 && record->chromStart >= end)
 	    break;
 	else if (record->chromEnd > start)
 	    {
 	    slAddHead(&records, record);
 	    recCount++;
 	    }
 	}
     }
 slReverse(&records);
 return records;
 }
 
 struct vcfFile *vcfFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
 			       int maxErr, int maxRecords, boolean parseAll)
 /* Open fileOrUrl and parse VCF header; return NULL if unable.
  * If chrom is non-NULL, scan past any variants that precede {chrom, chromStart}.
  * Note: this is very inefficient -- it's better to use vcfTabix if possible!
  * If parseAll, then read in all lines in region, parse and store in
  * vcff->records; if maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
  * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 struct lineFile *lf = NULL;
 if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) ||
     startsWith("https://", fileOrUrl))
     lf = netLineFileOpen(fileOrUrl);
 else
     lf = lineFileMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (vcff && chrom != NULL)
     {
     char *line = NULL;
     while (lineFileNextReal(vcff->lf, &line))
 	{
 	char lineCopy[strlen(line)+1];
 	safecpy(lineCopy, sizeof(lineCopy), line);
 	char *words[VCF_MAX_COLUMNS];
 	int wordCount = chopTabs(lineCopy, words);
 	wordCount = checkWordCount(vcff, words, wordCount);
 	struct vcfRecord *record = vcfRecordFromRow(vcff, words);
 	if (chromsMatch(chrom, record->chrom))
 	    {
 	    if (record->chromEnd < start)
 		continue;
 	    else
 		{
 		lineFileReuse(vcff->lf);
 		break;
 		}
 	    }
 	}
     }
 if (vcff && parseAll)
     {
     vcff->records = vcfParseData(vcff, chrom, start, end, maxRecords);
     lineFileClose(&(vcff->lf)); // Not sure why it is closed.  Angie?
     }
 return vcff;
 }
 
 struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
 				    int maxErr, int maxRecords)
 /* Open a VCF file that has been compressed and indexed by tabix and
  * parse VCF header, or return NULL if unable.  If chrom is non-NULL,
  * seek to the position range and parse all lines in range into
  * vcff->records.  If maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
  * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 return vcfTabixFileAndIndexMayOpen(fileOrUrl, NULL, chrom, start, end, maxErr, maxRecords);
 }
 
 struct vcfFile *vcfTabixFileAndIndexMayOpen(char *fileOrUrl, char *tbiFileOrUrl, char *chrom, int start, int end,
 				    int maxErr, int maxRecords)
 /* Open a VCF file that has been compressed and indexed by tabix and
  * parse VCF header, or return NULL if unable. tbiFileOrUrl can be NULL.
  * If chrom is non-NULL, seek to the position range and parse all lines in
  * range into vcff->records.  If maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
  * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 struct lineFile *lf = lineFileTabixAndIndexMayOpen(fileOrUrl, tbiFileOrUrl, TRUE);
 if (lf == NULL)
     return NULL;
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (vcff == NULL)
     return NULL;
 if (isNotEmpty(chrom) && start != end)
     {
     if (lineFileSetTabixRegion(lf, chrom, start, end))
         vcff->records = vcfParseData(vcff, NULL, 0, 0, maxRecords);
     lineFileClose(&(vcff->lf)); // file is all read in so we close it
     }
 
 return vcff;
 }
 
 int vcfRecordCmp(const void *va, const void *vb)
 /* Compare to sort based on position. */
 {
 const struct vcfRecord *a = *((struct vcfRecord **)va);
 const struct vcfRecord *b = *((struct vcfRecord **)vb);
 int dif;
 dif = strcmp(a->chrom, b->chrom);
 if (dif == 0)
     dif = a->chromStart - b->chromStart;
 if (dif == 0)
     dif = a->chromEnd - b->chromEnd; // shortest first
 if (dif == 0)
     dif = strcmp(a->name, b->name);  // finally by name
 return dif;
 }
 
 int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end,
                       int maxErr, int maxRecords)
 // Reads a batch of records from an opened and indexed VCF file, adding them to
 // vcff->records and returning the count of new records added in this batch.
 // Note: vcff->records will continue to be sorted, even if batches are loaded
 // out of order.  Additionally, resulting vcff->records will contain no duplicates
 // so returned count refects only the new records added, as opposed to all records
 // in range.  If maxErr >= zero, then continue to parse until there are maxErr+1
 // errors.  A maxErr less than zero does not stop and reports all errors.  Set
 // maxErr to VCF_IGNORE_ERRS for silence.
 {
 int oldCount = slCount(vcff->records);
 
 if (lineFileSetTabixRegion(vcff->lf, chrom, start, end))
     {
     struct vcfRecord *records = vcfParseData(vcff, NULL, 0, 0, maxRecords);
     if (records)
         {
         struct vcfRecord *lastRec = vcff->records;
         if (lastRec == NULL)
             vcff->records = records;
         else
             {
             // Considered just asserting the batches were in order, but a problem may
             // result when non-overlapping location windows pick up the same long variant.
             slSortMergeUniq(&(vcff->records), records, vcfRecordCmp, NULL);
             }
         }
     }
 
 return slCount(vcff->records) - oldCount;
 }
 
 void vcfFileFree(struct vcfFile **pVcff)
 /* Free a vcfFile object. */
 {
 if (pVcff == NULL || *pVcff == NULL)
     return;
 struct vcfFile *vcff = *pVcff;
 if (vcff->maxErr == VCF_IGNORE_ERRS && vcff->errCnt > 0)
     {
     vcff->maxErr++; // Never completely silent!  Go ahead and report how many errs were detected
     vcfFileErr(vcff,"Closing with %d errors.",vcff->errCnt);
     }
 freez(&(vcff->headerString));
 hashFree(&(vcff->pool));
 if (vcff->reusePool)
     lmCleanup(&vcff->reusePool);
 hashFree(&(vcff->byName));
 lineFileClose(&(vcff->lf));
 freez(pVcff);
 }
 
 const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId)
 /* Return all records with name=variantId, or NULL if not found. */
 {
 struct vcfRecord *varList = NULL;
 if (vcff->byName == NULL)
     {
     vcff->byName = hashNew(0);
     struct vcfRecord *rec;
     for (rec = vcff->records;  rec != NULL;  rec = rec->next)
 	{
 	if (sameString(rec->name, variantId))
 	    {
 	    // Make shallow copy of rec so we can alter ->next:
 	    struct vcfRecord *newRec = vcfFileCloneVar(vcff,rec);
 	    slAddHead(&varList, newRec);
 	    }
 	hashAdd(vcff->byName, rec->name, rec);
 	}
     slReverse(&varList);
     }
 else
     {
     struct hashEl *hel = hashLookup(vcff->byName, variantId);
     while (hel != NULL)
 	{
 	if (sameString(hel->name, variantId))
 	    {
 	    struct vcfRecord *rec = hel->val;
 	    struct vcfRecord *newRec = vcfFileCloneVar(vcff,rec);
 	    slAddHead(&varList, newRec);
 	    }
 	hel = hel->next;
 	}
     // Don't reverse varList -- hash element list was already reversed
     }
 return varList;
 }
 
 const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key)
 /* Find an INFO element, or NULL. */
 {
 int i;
 for (i = 0;  i < record->infoCount;  i++)
     {
     if (sameString(key, record->infoElements[i].key))
 	return &(record->infoElements[i]);
     }
 return NULL;
 }
 
 struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key)
 /* Look up the type of genotype FORMAT component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 {
 struct vcfInfoDef *def;
 // I expect there to be fairly few definitions (less than a dozen) so
 // I'm just doing a linear search not hash:
 for (def = vcff->gtFormatDefs;  def != NULL;  def = def->next)
     {
     if (sameString(key, def->key))
 	return def;
     }
 for (def = vcfSpecGtFormatDefs;  def != NULL;  def = def->next)
     {
     if (sameString(key, def->key))
 	return def;
     }
 return NULL;
 }
 
 static enum vcfInfoType typeForGtFormat(struct vcfFile *vcff, const char *key)
 /* Look up the type of FORMAT component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 {
 struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, key);
 return def ? def->type : vcfInfoString;
 }
 
 static void parseGt(char *genotype, struct vcfGenotype *gt)
 /* Parse genotype, which should be something like "0/0", "0/1", "1|0" or "0/." into gt fields. */
 {
 char *sep = strchr(genotype, '|');
 if (sep != NULL)
     gt->isPhased = TRUE;
 else
     sep = strchr(genotype, '/');
 if (genotype[0] == '.')
     gt->hapIxA = -1;
 else
     gt->hapIxA = atoi(genotype);
 if (sep == NULL)
     gt->isHaploid = TRUE;
 else if (sep[1] == '.')
     gt->hapIxB = -1;
 else
     gt->hapIxB = atoi(sep+1);
 }
 
 static void parseSgtAsGt(struct vcfRecord *rec, struct vcfGenotype *gt)
 /* Parse SGT to normal and tumor genotypes */
 {
 const struct vcfInfoElement *sgtEl = vcfRecordFindInfo(rec, "SGT");
 if (sgtEl)
     {
     char *val = sgtEl->values[0].datString;
     // set hapIxA and hapIxB where 0 = ref, 1 = alt
     if (startsWith("ref->", val))
         {
         //indel, use 0/0 for normal and 0/1 for tumor
         if (sameString(gt->id, "NORMAL"))
             gt->hapIxA = gt->hapIxB = 0;
         else if (sameString(gt->id, "TUMOR"))
             {
             gt->hapIxA = 0;
             gt->hapIxB = 1;
             }
         }
     else
         {
         if (sameString(gt->id, "NORMAL"))
             {
             char str[2];
             safef(str, sizeof(str), "%c", val[0]);
             if (sameString(rec->alleles[0], str))
                 gt->hapIxA = 0;
             else if (sameString(rec->alleles[1], str))
                 gt->hapIxA = 1;
             else
                 gt->hapIxA = -1;
             safef(str, sizeof(str), "%c", val[1]);
             if (sameString(rec->alleles[1], str))
                 gt->hapIxB = 1;
             else if (sameString(rec->alleles[0], str))
                 gt->hapIxB = 0;
             else
                 gt->hapIxB = -1;
             }
         else if (sameString(gt->id, "TUMOR"))
             {
             char str[2];
             safef(str, sizeof(str), "%c", val[4]);
             if (sameString(rec->alleles[0], str))
                 gt->hapIxA = 0;
             else if (sameString(rec->alleles[1], str))
                 gt->hapIxA = 1;
             else
                 gt->hapIxA = -1;
             safef(str, sizeof(str), "%c", val[5]);
             if (sameString(rec->alleles[1], str))
                 gt->hapIxB = 1;
             else if (sameString(rec->alleles[0], str))
                 gt->hapIxB = 0;
             else
                 gt->hapIxB = -1;
             }
         }
     }
 }
 
 static void parsePlAsGt(char *pl, struct vcfGenotype *gt)
 /* Parse PL value, which is usually a list of 3 numbers, one of which is 0 (the selected haplotype),
  * into gt fields. */
 {
 char plCopy[strlen(pl)+1];
 safecpy(plCopy, sizeof(plCopy), pl);
 char *words[32];
 int wordCount = chopCommas(plCopy, words);
 if (wordCount > 0 && sameString(words[0], "0"))
     {
     // genotype is AA (ref/ref)
     gt->hapIxA = 0;
     gt->hapIxB = 0;
     }
 else if (wordCount > 1 && sameString(words[1], "0"))
     {
     // genotype is AB (ref/alt)
     gt->hapIxA = 0;
     gt->hapIxB = 1;
     }
 else if (wordCount > 2 && sameString(words[2], "0"))
     {
     // genotype is BB (alt/alt)
     gt->hapIxA = 1;
     gt->hapIxB = 1;
     }
 else if (wordCount > 3 && sameString(words[3], "0"))
     {
     // genotype is AC (ref/alt1)
     gt->hapIxA = 0;
     gt->hapIxB = 2;
     }
 else if (wordCount > 4 && sameString(words[4], "0"))
     {
     // genotype is BC (alt/alt1)
     gt->hapIxA = 1;
     gt->hapIxB = 2;
     }
 else if (wordCount > 5 && sameString(words[5], "0"))
     {
     // genotype is CC (alt1/alt1)
     gt->hapIxA = 2;
     gt->hapIxB = 2;
     }
 else
     {
     // too fancy, treat as "./."
     gt->hapIxA = -1;
     gt->hapIxB = -1;
     }
 }
 
 #define VCF_MAX_FORMAT VCF_MAX_INFO
 #define VCF_MAX_FORMAT_LEN (VCF_MAX_FORMAT * 4)
 
 void vcfParseGenotypes(struct vcfRecord *record)
 /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[].
  * This destroys genotypesUnparsedStrings. */
 {
 if (record->genotypeUnparsedStrings == NULL)
     return;
 struct vcfFile *vcff = record->file;
 record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype));
 char format[VCF_MAX_FORMAT_LEN];
 safecpy(format, sizeof(format), record->format);
 char *formatWords[VCF_MAX_FORMAT];
 int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords));
 if (formatWordCount >= VCF_MAX_FORMAT)
     {
     vcfFileErr(vcff, "The FORMAT column has at least %d words; "
 	       "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT);
     formatWordCount = VCF_MAX_FORMAT;
     }
 int i;
 // Store the pooled format word pointers and associated types for use in inner loop below.
 enum vcfInfoType formatTypes[VCF_MAX_FORMAT];
 for (i = 0;  i < formatWordCount;  i++)
     {
     formatTypes[i] = typeForGtFormat(vcff, formatWords[i]);
     formatWords[i] = vcfFilePooledStr(vcff, formatWords[i]);
     }
 for (i = 0;  i < vcff->genotypeCount;  i++)
     {
     char *string = record->genotypeUnparsedStrings[i];
     struct vcfGenotype *gt = &(record->genotypes[i]);
     // Each genotype can have multiple :-separated info elements:
     char *gtWords[VCF_MAX_FORMAT];
     int gtWordCount = chopByChar(string, ':', gtWords, ArraySize(gtWords));
     if (gtWordCount != formatWordCount)
 	vcfFileErr(vcff, "The FORMAT column has %d words but the genotype column for %s "
 		   "has %d words", formatWordCount, vcff->genotypeIds[i], gtWordCount);
     if (gtWordCount > formatWordCount)
 	gtWordCount = formatWordCount;
     gt->id = vcff->genotypeIds[i];
     gt->infoCount = gtWordCount;
     gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement));
     gt->hapIxA = gt->hapIxB = -1;
     boolean foundGT = FALSE;
     int j;
     for (j = 0;  j < gtWordCount;  j++)
 	{
 	// Special parsing of genotype:
 	if (sameString(formatWords[j], vcfGtGenotype))
 	    {
             parseGt(gtWords[j], gt);
             foundGT = TRUE;
 	    }
         else if (!foundGT && sameString(formatWords[j], vcfGtPhred))
             {
             parsePlAsGt(gtWords[j], gt);
             foundGT = TRUE;
             }
 	struct vcfInfoElement *el = &(gt->infoElements[j]);
 	el->key = formatWords[j];
 	el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j],
 				   &(el->values), &(el->missingData));
 	if (el->count >= VCF_MAX_INFO)
 	    vcfFileErr(vcff, "A single element of the genotype column for \"%s\" "
 		       "has at least %d values; "
 		       "VCF_MAX_INFO may need to be increased in vcf.c!",
 		       gt->id, VCF_MAX_INFO);
 	}
     if (!foundGT)   //try SGT
         {
         parseSgtAsGt(record, gt);
         if (gt->hapIxA != -1)
             foundGT = TRUE;
         }
     if (i == 0 && !foundGT)
         vcfFileErr(vcff,
                    "Genotype FORMAT column includes neither GT nor PL; unable to parse genotypes.");
     }
 record->genotypeUnparsedStrings = NULL;
 }
 
 const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId)
 /* Find the genotype and associated info for the individual, or return NULL.
  * This calls vcfParseGenotypes if it has not already been called. */
 {
 struct vcfFile *vcff = record->file;
 if (sampleId == NULL || vcff->genotypeCount == 0)
     return NULL;
 vcfParseGenotypes(record);
 int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount);
 if (ix >= 0)
     return &(record->genotypes[ix]);
 return NULL;
 }
 
 const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key)
 /* Find the genotype infoElement for key, or return NULL. */
 {
 int i;
 for (i = 0;  i < gt->infoCount;  i++)
     if (sameString(key, gt->infoElements[i].key))
         return &gt->infoElements[i];
 return NULL;
 }
 
 static char *vcfDataLineAutoSqlString =
         "table vcfDataLine"
         "\"The fields of a Variant Call Format data line\""
         "    ("
         "    string chrom;      \"An identifier from the reference genome\""
         "    uint pos;          \"The reference position, with the 1st base having position 1\""
         "    string id;         \"Semi-colon separated list of unique identifiers where available\""
         "    string ref;                \"Reference base(s)\""
         "    string alt;                \"Comma separated list of alternate non-reference alleles "
                                          "called on at least one of the samples\""
         "    string qual;       \"Phred-scaled quality score for the assertion made in ALT. i.e. "
                                  "give -10log_10 prob(call in ALT is wrong)\""
         "    string filter;     \"PASS if this position has passed all filters. Otherwise, a "
                                   "semicolon-separated list of codes for filters that fail\""
         "    string info;       \"Additional information encoded as a semicolon-separated series "
                                  "of short keys with optional comma-separated values\""
         "    string format;     \"If genotype columns are specified in header, a "
                                  "semicolon-separated list of of short keys starting with GT\""
         "    string genotypes;  \"If genotype columns are specified in header, a tab-separated "
                                  "set of genotype column values; each value is a colon-separated "
                                  "list of values corresponding to keys in the format column\""
         "    )";
 
 struct asObject *vcfAsObj()
 // Return asObject describing fields of VCF
 {
 return asParseText(vcfDataLineAutoSqlString);
 }
 
 char *vcfGetSlashSepAllelesFromWords(char **words, struct dyString *dy)
 /* Overwrite dy with a /-separated allele string from VCF words,
  * skipping the extra initial base that VCF requires for indel alleles if necessary,
  * and trimming identical bases at the ends of all alleles if there are any.
  * Return dy->string for convenience. */
 {
 // VCF reference allele gets its own column:
 char *refAllele = words[3];
 char *altAlleles = words[4];
 // Make a vcfRecord-like allele array (ref in [0], alts after) so we can check for padding base:
 int alCount = 1 + countChars(altAlleles, ',') + 1;
 char *alleles[alCount];
 alleles[0] = refAllele;
 char altAlCopy[strlen(altAlleles)+1];
 safecpy(altAlCopy, sizeof(altAlCopy), altAlleles);
 chopByChar(altAlCopy, ',', &(alleles[1]), alCount-1);
 int i;
 if (allelesHavePaddingBase(alleles, alCount))
     {
     // Skip padding base (unless we have a symbolic allele):
     for (i = 0;  i < alCount;  i++)
-	if (isAllNt(alleles[i], strlen(alleles[i])
-                    +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
+	if (isAllNt(alleles[i], strlen(alleles[i])))
 	    alleles[i]++;
     }
 // Having dealt with left padding base, now look for identical bases on the right:
 int trimmedBases = countIdenticalBasesRight(alleles, alCount);
 // Build a /-separated allele string, trimming bases on the right if necessary:
 dyStringClear(dy);
 if (noAltAllele(alleles, alCount))
     alCount = 1;
 for (i = 0;  i < alCount;  i++)
     {
     char *allele = alleles[i];
     if (!sameString(allele, "."))
         {
         if (i != 0)
             {
             if (sameString(alleles[0], allele))
                 continue;
             dyStringAppendC(dy, '/');
             }
         if (allele[trimmedBases] == '\0')
             dyStringAppendC(dy, '-');
         else
             dyStringAppendN(dy, allele, strlen(allele)-trimmedBases);
         }
     }
 return dy->string;
 }
 
 static void vcfWriteWordArrayWithSep(FILE *f, int count, char **words, char sep)
 /* Write words joined by sep to f (or, if count is zero, ".").  */
 {
 if (count < 1)
     fputc('.', f);
 else
     {
     fputs(words[0], f);
     int i;
     for (i = 1;  i < count;  i++)
         {
         fputc(sep, f);
         fputs(words[i], f);
         }
     }
 }
 
 static void vcfWriteInfo(FILE *f, struct vcfRecord *rec)
 /* Write rec->infoElements to f. */
 {
 if (rec->infoCount < 1)
     fputc('.', f);
 else
     {
     int i, j;
     for (i = 0;  i < rec->infoCount;  i++)
         {
         struct vcfInfoElement *info = &(rec->infoElements[i]);
         enum vcfInfoType type = typeForInfoKey(rec->file, info->key);
         if (i > 0)
             fputc(';', f);
         fputs(info->key, f);
         for (j = 0;  j < info->count;  j++)
             {
             union vcfDatum datum = info->values[j];
             switch (type)
                 {
                 case vcfInfoInteger:
                     fprintf(f, "=%d", datum.datInt);
                     break;
                 case vcfInfoFloat:
                     fprintf(f, "=%lf", datum.datFloat);
                     break;
                 case vcfInfoFlag:
                     // Flag key might have a value in older VCFs e.g. 3.2's DB=0, DB=1
                     if (isNotEmpty(datum.datString))
                         fprintf(f, "=%s", datum.datString);
                     break;
                 case vcfInfoCharacter:
                     fprintf(f, "%c", datum.datChar);
                     break;
                 case vcfInfoString:
                     fputc('=', f);
                     if (isNotEmpty(datum.datString))
                         fputs(datum.datString, f);
                     break;
                 default:
                     vcfFileErr(rec->file, "invalid vcfInfoType (uninitialized?) %d", type);
                     break;
                 }
             }
         }
     }
 }
 
 void vcfRecordWriteNoGt(FILE *f, struct vcfRecord *rec)
 /* Write the first 8 columns of VCF rec to f.  Genotype data will be ignored if present. */
 {
 fprintf(f, "%s\t%d\t%s\t%s\t", rec->chrom, rec->chromStart+1, rec->name, rec->alleles[0]);
 // Alternate alleles start at [1]
 vcfWriteWordArrayWithSep(f, rec->alleleCount-1, &(rec->alleles[1]), ',');
 fputc('\t', f);
 fputs(rec->qual, f);
 fputc('\t', f);
 vcfWriteWordArrayWithSep(f, rec->filterCount, rec->filters, ';');
 fputc('\t', f);
 vcfWriteInfo(f, rec);
 fputc('\n', f);
 }