66cfabe5850f11b52956866d59e9c55ecaa799ce
angie
  Thu Apr 24 15:03:57 2025 -0700
Fix a couple things found by valgrind.  Using memmove instead of safecpy when ranges can overlap is safer, but using safecpy didn't seem to be causing bugs.  However, failing to clip to length of codonNew when uppercasing caused memory corruption that caused eventual failure reported in #35577.  valgrind rules.

diff --git src/hg/cgilib/annoFormatVep.c src/hg/cgilib/annoFormatVep.c
index 9941e60cc6c..4ee3c32ad9d 100644
--- src/hg/cgilib/annoFormatVep.c
+++ src/hg/cgilib/annoFormatVep.c
@@ -1,1625 +1,1634 @@
 /* annoFormatVep -- write functional predictions in the same format as Ensembl's
  * Variant Effect Predictor to fileName, interpreting input rows according to config.
  * See http://uswest.ensembl.org/info/docs/variation/vep/vep_formats.html */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "annoFormatVep.h"
 #include "annoGratorGpVar.h"
 #include "annoGratorQuery.h"
 #include "annoStreamDbPslPlus.h"
 #include "asParse.h"
 #include "bigGenePred.h"
 #include "dystring.h"
 #include "genePred.h"
 #include "gpFx.h"
 #include "hdb.h"
 #include "hgHgvs.h"
 #include "htmshell.h"
 #include "pgSnp.h"
 #include "portable.h"
 #include "vcf.h"
 #include <time.h>
 
 struct annoFormatVepExtraItem
     // A single input column whose value should be placed in the Extras output column,
     // identified by tag.
     {
     struct annoFormatVepExtraItem *next;
     char *tag;					// Keyword to use in extras column (tag=value;)
     char *description;				// Text description for output header
     int rowIx;					// Offset of column in row from data source
 						// (N/A for wig sources)
     boolean isRegulatory;			// TRUE if overlap implies regulatory_region
     boolean isBoolean;				// TRUE if we don't need to print out the value,
 						// only whether it is found
     };
 
 struct annoFormatVep;
 
 struct annoFormatVepExtraSource
     // A streamer or grator that supplies at least one value for Extras output column.
     {
     struct annoFormatVepExtraSource *next;
     struct annoStreamer *source;		// streamer or grator: same pointers as below
     struct annoFormatVepExtraItem *items;	// one or more columns of source and their tags
 
     void (*printExtra)(struct annoFormatVepExtraSource *self,
                        struct annoFormatVep *afVep, struct annoFormatVepExtraItem *extraItem,
 		       struct annoRow *extraRows, struct annoRow *gpvRow, boolean *pGotExtra);
     /* Method for printing items from this source; pGotExtra is in/out to keep track of
      * whether any call has actually printed something yet. */
     };
 
 struct annoFormatVepConfig
     // Describe the primary source and grators (whose rows must be delivered in this order)
     // that provide data for VEP output columns.
     {
     struct annoStreamer *variantSource;		// Primary source: variants
     char *variantDescription;			// Description to appear in output header
     struct annoStreamer *gpVarSource;		// annoGratorGpVar makes the core predictions
     char *gpVarDescription;			// Description to appear in output header
     struct annoStreamer *snpSource;		// Latest dbSNP provides IDs of known variants
     char *snpDescription;			// Description to appear in output header
     struct annoFormatVepExtraSource *extraSources;	// Everything else that may be tacked on
     };
 
 enum afVariantDataType
     // What type of primary data (variants) are we integrating with?
     {
     afvdtInvalid,                       // uninitialized/unknown
     afvdtVcf,                           // primary data source is VCF
     afvdtPgSnpTable,                    // primary data source is pgSnp table with bin in .as
     afvdtPgSnpFile                      // primary data source is pgSnp file, no bin
     };
 
 struct annoFormatVep
 // Subclass of annoFormatter that writes VEP-equivalent output to a file.
     {
     struct annoFormatter formatter;	// superclass / external interface
     struct annoFormatVepConfig *config;	// Description of input sources and values for Extras col
     char *fileName;			// Output filename
     FILE *f;				// Output file handle
     struct lm *lm;			// localmem for scratch storage
     struct dyString *dyScratch;		// dyString for local temporary use
     struct annoAssembly *assembly;	// Reference assembly (for sequence)
     int lmRowCount;			// counter for periodic localmem cleanup
     int varNameIx;			// Index of name column from variant source, or -1 if N/A
     int varAllelesIx;			// Index of alleles column from variant source, or -1
     int txNameIx;                       // Index of transcript name from (big)genePred/PSL
     int geneNameIx;			// Index of gene name from (big)genePred/PSL+ if included
     int hgvsGIx;			// Index of HGVS g. column from annoGratorGpVar
     int hgvsCNIx;			// Index of HGVS c./n. column from annoGratorGpVar
     int hgvsPIx;			// Index of HGVS p. column from annoGratorGpVar
     int snpNameIx;			// Index of name column from dbSNP source, or -1
     boolean needHeader;			// TRUE if we should print out the header
     enum afVariantDataType variantType; // Are variants VCF or a flavor of pgSnp?
     boolean doHtml;			// TRUE if we should include html tags & make a <table>.
     struct seqWindow *gSeqWin;          // genomic sequence fetcher for HGVS term generation
     boolean hgvsMakeG;                  // Generate genomic (g.) HGVS terms only if this is set
     boolean hgvsBreakDelIns;            // Include deleted sequence (not only ins) e.g. delGGinsAT
     };
 
 
 INLINE void afVepLineBreak(FILE *f, boolean doHtml)
 /* Depending on whether we're printing HTML, print either a newline or <BR>. */
 {
 if (doHtml)
     fputs("<BR>", f);
 fputc('\n', f);
 }
 
 INLINE void afVepStartRow(FILE *f, boolean doHtml)
 /* If we're printing HTML, print a <TR><TD>. */
 {
 if (doHtml)
     fputs("<TR><TD>", f);
 }
 
 INLINE void afVepNextColumn(FILE *f, boolean doHtml)
 /* Depending on whether we're printing HTML, print either a tab or </TD><TD>. */
 {
 if (doHtml)
     fputs("</TD><TD>", f);
 else
     fputc('\t', f);
 }
 
 INLINE void afVepEndRow(FILE *f, boolean doHtml)
 /* If we're printing HTML, print a </TD></TR>, otherwise, just a newline. */
 {
 if (doHtml)
     fputs("</TD></TR>\n", f);
 else
     fputc('\n', f);
 }
 
 INLINE void afVepPuts(char *text, struct annoFormatVep *self)
 /* Write text to self->f.  If we're printing HTML, write encoded text, otherwise just write text. */
 {
 if (self->doHtml)
     htmTextOut(self->f, text);
 else
     fputs(text, self->f);
 }
 
 static void afVepPrintf(struct annoFormatVep *self, char *format, ...)
 /*  Printf to self->f, but if self->doHtml then encode before writing out. */
 #ifdef __GNUC__
 __attribute__((format(printf, 2, 3)))
 #endif
     ;
 
 static void afVepPrintf(struct annoFormatVep *self, char *format, ...)
 /*  Printf to self->f, but if self->doHtml then encode before writing out. */
 {
 va_list args;
 va_start(args, format);
 if (self->doHtml)
     vaHtmlFprintf(self->f, format, args);
 else
     vfprintf(self->f, format, args);
 va_end(args);
 }
 
 static void afVepPrintHeaderExtraTags(struct annoFormatVep *self, char *bStart, char *bEnd)
 /* For each extra column described in config, write out its tag and a brief description.
  * bStart and bEnd are for bold output in HTML, or if not HTML, starting the line with "## ". */
 {
 struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc;
 if (extras == NULL)
     return;
 fprintf(self->f, "%sKeys for Extra column items:%s", bStart, bEnd);
 afVepLineBreak(self->f, self->doHtml);
 for (extraSrc = extras;  extraSrc != NULL;  extraSrc = extraSrc->next)
     {
     struct annoFormatVepExtraItem *extraItem;
     for (extraItem = extraSrc->items;  extraItem != NULL;  extraItem = extraItem->next)
 	if (isNotEmpty(extraItem->tag))
 	    {
 	    fprintf(self->f, "%s%s%s: %s", bStart, extraItem->tag, bEnd, extraItem->description);
 	    afVepLineBreak(self->f, self->doHtml);
 	    }
     }
 }
 
 static void afVepPrintHeaderDate(FILE *f, boolean doHtml)
 /* VEP header includes a date formatted like "2012-06-16 16:09:38" */
 {
 long now = clock1();
 struct tm *tm = localtime(&now);
 fprintf(f, "## Output produced at %d-%02d-%02d %02d:%02d:%02d",
 	1900 + tm->tm_year, 1 + tm->tm_mon, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec);
 afVepLineBreak(f, doHtml);
 }
 
 static char *columnLabels[] = { "Uploaded Variation",
 				"Location",
 				"Allele",
 				"Gene",
 				"Feature",
 				"Feature type",
 				"Consequence",
 				"Position in cDNA",
 				"Position in CDS",
 				"Position in protein",
 				"Amino acid change",
 				"Codon change",
 				"Co-located Variation",
 				"Extra",
 				NULL };
 
 static void afVepPrintColumnLabels(struct annoFormatVep *self)
 /* If we're printing HTML, begin a table and use TH for labels; otherwise just
  * print out a tab-sep text line with labels.  VEP doesn't begin this line with a #! */
 {
 char **pLabel = columnLabels;
 if (self->doHtml)
     {
     fputs("<BR><TABLE class='stdTbl'>\n<TR><TH>", self->f);
     fputs(*pLabel++, self->f);
     while (*pLabel != NULL)
 	{
 	fputs("</TH><TH>", self->f);
 	fputs(*pLabel++, self->f);
 	}
     fputs("</TH></TR>\n", self->f);
     }
 else
     {
     fputs(*pLabel++, self->f);
     while (*pLabel != NULL)
 	{
 	fputc('\t', self->f);
 	fputs(*pLabel++, self->f);
 	}
     fputc('\n', self->f);
     }
 }
 
 static void afVepPrintHeader(struct annoFormatVep *self, char *db)
 /* Print a header that looks almost like a VEP header. */
 {
 FILE *f = self->f;
 boolean doHtml = self->doHtml;
 char *bStart = doHtml ? "<B>" : "## ";
 char *bEnd = doHtml ? "</B>" : "";
 if (!doHtml)
     {
     // Suppress these lines from HTML output -- IMO they're better suited for a file header:
     fputs("## ENSEMBL VARIANT EFFECT PREDICTOR format (UCSC Variant Annotation Integrator)", f);
     afVepLineBreak(f, doHtml);
     afVepPrintHeaderDate(f, doHtml);
     fprintf(f, "## Connected to UCSC database %s", db);
     afVepLineBreak(f, doHtml);
     }
 struct annoFormatVepConfig *config = self->config;
 fprintf(f, "%sVariants:%s ", bStart, bEnd);
 afVepPrintf(self, "%s", config->variantDescription);
 if (! strstr(config->variantSource->name, "/trash/"))
     fprintf(f, " (%s)", config->variantSource->name);
 afVepLineBreak(f, doHtml);
 fprintf(f, "%sTranscripts:%s %s (%s)", bStart, bEnd,
 	config->gpVarDescription, config->gpVarSource->name);
 afVepLineBreak(f, doHtml);
 if (config->snpSource != NULL)
     {
     fprintf(f, "%sdbSNP:%s %s (%s)", bStart, bEnd,
 	    config->snpDescription, config->snpSource->name);
     afVepLineBreak(f, doHtml);
     }
 afVepPrintHeaderExtraTags(self, bStart, bEnd);
 afVepPrintColumnLabels(self);
 self->needHeader = FALSE;
 }
 
 static void afVepInitialize(struct annoFormatter *fSelf, struct annoStreamer *primarySource,
 			    struct annoStreamer *integrators)
 /* Print header, regardless of whether we get any data after this. */
 {
 struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
 if (self->needHeader)
     afVepPrintHeader(self, primarySource->assembly->name);
 }
 
 static void afVepComment(struct annoFormatter *fSelf, char *content)
 /* Print out a comment, either starting with "# " or as a warnBox depending on doHtml. */
 {
 if (strchr(content, '\n'))
     errAbort("afVepComment: no multi-line input");
 struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
 if (self->doHtml)
     warn("%s", content);
 else
     fprintf(self->f, "# %s\n", content);
 }
 
 static void compressDashes(char *string)
 /* If string has a run of '-' characters, turn it into single '-'. */
 {
 char *p = string;
 while ((p = strchr(p, '-')) != NULL)
     {
     char *end = p+1;
     while (*end == '-')
 	end++;
     if (end > p+1)
 	memmove(p+1, end, strlen(end)+1);
     p++;
     }
 }
 
 static char *afVepGetAlleles(struct annoFormatVep *self, struct annoRow *varRow)
 /* Return a string with slash-separated alleles.  Result is alloc'd by self->lm. */
 {
 char **varWords = (char **)(varRow->data);
 char *alleles = NULL;
 if (self->variantType == afvdtVcf)
     alleles = lmCloneString(self->lm, vcfGetSlashSepAllelesFromWords(varWords, self->dyScratch));
 else if (self->varAllelesIx >= 0)
     alleles = lmCloneString(self->lm, varWords[self->varAllelesIx]);
 else
     errAbort("annoFormatVep: afVepSetConfig didn't specify how to get alleles");
 compressDashes(alleles);
 return alleles;
 }
 
 static void afVepPrintNameAndLoc(struct annoFormatVep *self, struct annoRow *varRow)
 /* Print variant name and position in genome. */
 {
 char **varWords = (char **)(varRow->data);
 uint start1Based = varRow->start + 1;
 // Use variant name if available, otherwise construct an identifier:
 if (self->varNameIx >= 0 && !sameString(varWords[self->varNameIx], "."))
     afVepPuts(varWords[self->varNameIx], self);
 else
     {
     char *alleles = afVepGetAlleles(self, varRow);
     afVepPrintf(self, "%s_%u_%s", varRow->chrom, start1Based, alleles);
     }
 afVepNextColumn(self->f, self->doHtml);
 // Location is chr:start for single-base, chr:start-end for indels:
 if (varRow->end == start1Based)
     afVepPrintf(self, "%s:%u", varRow->chrom, start1Based);
 else if (start1Based > varRow->end)
     afVepPrintf(self, "%s:%u-%u", varRow->chrom, varRow->end, start1Based);
 else
     afVepPrintf(self, "%s:%u-%u", varRow->chrom, start1Based, varRow->end);
 afVepNextColumn(self->f, self->doHtml);
 }
 
 INLINE void afVepPrintPlaceholders(FILE *f, int count, boolean doHtml)
 /* VEP uses "-" for N/A.  Sometimes there are several consecutive N/A columns.
  * Count = 0 means print "-" with no tab. Count > 0 means print that many "-" columns. */
 {
 if (count == 0)
     fputc('-', f);
 else
     {
     int i;
     for (i = 0;  i < count;  i++)
 	{
 	fputc('-', f);
 	afVepNextColumn(f, doHtml);
 	}
     }
 }
 
 #define afVepPrintPlaceholder(f, doHtml) afVepPrintPlaceholders(f, 1, doHtml)
 
 INLINE char *placeholderForEmpty(char *val)
 /* Return VEP placeholder "-" if val is empty, otherwise return val. */
 {
 return (isEmpty(val) ? "-" : val);
 }
 
 
 static void afVepPrintGene(struct annoFormatVep *self, struct annoRow *gpvRow)
 /* If the genePred portion of gpvRow contains a gene name (in addition to transcript name),
  * print it out; otherwise print a placeholder. */
 {
 if (self->geneNameIx >= 0)
     {
     char **words = (char **)(gpvRow->data);
     afVepPuts(placeholderForEmpty(words[self->geneNameIx]), self);
     afVepNextColumn(self->f, self->doHtml);
     }
 else
     afVepPrintPlaceholder(self->f, self->doHtml);
 }
 
 static void limitLength(char *seq, int baseCount, char *unit)
 /* If seq is longer than an abbreviated version of itself, change it to the abbreviated version. */
 {
 if (isEmpty(seq))
     return;
 int len = strlen(seq);
 const int elipsLen = 3;
 char lengthNote[512];
 safef(lengthNote, sizeof(lengthNote), "(%d %s)", len, unit);
 if (len > 2*baseCount + elipsLen + strlen(lengthNote))
     {
     // First baseCount bases, then "...":
     int offset = baseCount;
     safecpy(seq+offset, len+1-offset, "...");
     offset += elipsLen;
     // then last baseCount bases:
-    safecpy(seq+offset, len+1-offset, seq+len-baseCount);
+    memmove(seq+offset, seq+len-baseCount, baseCount);
     offset += baseCount;
     // then lengthNote:
     safecpy(seq+offset, len+1-offset, lengthNote);
     }
 }
 
 static void tweakStopCodonAndLimitLength(char *aaSeq, char *codonSeq)
 /* If aa from gpFx has a stop 'Z', replace it with "*".
  * If the strings are very long, truncate with a note about how long they are. */
 {
 char *earlyStop = strchr(aaSeq, 'Z');
 if (earlyStop)
     {
     earlyStop[0] = '*';
     earlyStop[1] = '\0';
     }
 limitLength(aaSeq, 5, "aa");
 limitLength(codonSeq, 12, "nt");
 }
 
 INLINE char *dashForEmpty(char *s)
 /* Represent empty alleles in insertions/deletions by the customary "-". */
 {
 if (isEmpty(s))
     return "-";
 else
     return s;
 }
 
 static void afVepPrintPredictions(struct annoFormatVep *self, struct annoRow *varRow,
 				  struct annoRow *gpvRow, struct gpFx *gpFx)
 /* Print VEP columns computed by annoGratorGpVar (or placeholders) */
 {
 // variant allele used to calculate the consequence (or first alternate allele)
 char *abbrevAllele = cloneString(gpFx->gAllele);
 limitLength(abbrevAllele, 24, "nt");
 afVepPuts(placeholderForEmpty(abbrevAllele), self);
 afVepNextColumn(self->f, self->doHtml);
 // ID of affected gene
 afVepPrintGene(self, gpvRow);
 // ID of feature
 afVepPrintf(self, "%s", placeholderForEmpty(gpFx->transcript));
 afVepNextColumn(self->f, self->doHtml);
 // type of feature {Transcript, RegulatoryFeature, MotifFeature}
 if (gpFx->soNumber == intergenic_variant)
     afVepPrintPlaceholder(self->f, self->doHtml);
 else
     {
     fputs("Transcript", self->f);
     afVepNextColumn(self->f, self->doHtml);
     }
 // consequence: SO term e.g. splice_region_variant
 afVepPuts(soTermToString(gpFx->soNumber), self);
 afVepNextColumn(self->f, self->doHtml);
 if (gpFx->detailType == codingChange)
     {
     struct codingChange *change = &(gpFx->details.codingChange);
     uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt);
     boolean isInsertion = (refLen == 0);
     boolean isDeletion = altLen < refLen;
     if (isInsertion)
 	{
 	fprintf(self->f, "%u-%u", change->cDnaPosition, change->cDnaPosition+1);
 	afVepNextColumn(self->f, self->doHtml);
 	fprintf(self->f, "%u-%u", change->cdsPosition, change->cdsPosition+1);
 	}
     else if (isDeletion && refLen > 1)
 	{
 	fprintf(self->f, "%u-%u", change->cDnaPosition+1, change->cDnaPosition+refLen);
 	afVepNextColumn(self->f, self->doHtml);
 	fprintf(self->f, "%u-%u", change->cdsPosition+1, change->cdsPosition+refLen);
 	}
     else
 	{
 	fprintf(self->f, "%u", change->cDnaPosition+1);
 	afVepNextColumn(self->f, self->doHtml);
 	fprintf(self->f, "%u", change->cdsPosition+1);
 	}
     afVepNextColumn(self->f, self->doHtml);
     fprintf(self->f, "%u", change->pepPosition+1);
     afVepNextColumn(self->f, self->doHtml);
     int variantFrame = change->cdsPosition % 3;
     strLower(change->codonOld);
     int upLen = min(strlen(change->codonOld+variantFrame), refLen);
     toUpperN(change->codonOld+variantFrame, upLen);
     strLower(change->codonNew);
     int alleleLength = altLen;
     // watch out for symbolic alleles [actually now that we're using txAlt we shouldn't see these]:
     if (sameString(change->txAlt, "<X>") || sameString(change->txAlt, "<*>"))
         alleleLength = upLen;
     else if (startsWith("<", change->txAlt))
         alleleLength = 0;
+    int codonNewLen = strlen(change->codonNew);
+    if (codonNewLen < variantFrame + alleleLength)
+        {
+        // This can happen when an insertion introduces a stop codon -- codonNew is shorter than
+        // we would otherwise expect.
+        alleleLength = codonNewLen - variantFrame;
+        if (alleleLength < 0)
+            alleleLength = 0;
+        }
     toUpperN(change->codonNew+variantFrame, alleleLength);
     tweakStopCodonAndLimitLength(change->aaOld, change->codonOld);
     tweakStopCodonAndLimitLength(change->aaNew, change->codonNew);
     fprintf(self->f, "%s/%s", dashForEmpty(change->aaOld), dashForEmpty(change->aaNew));
     afVepNextColumn(self->f, self->doHtml);
     fprintf(self->f, "%s/%s", dashForEmpty(change->codonOld), dashForEmpty(change->codonNew));
     afVepNextColumn(self->f, self->doHtml);
     }
 else if (gpFx->detailType == nonCodingExon)
     {
     struct nonCodingExon *change = &(gpFx->details.nonCodingExon);
     uint refLen = strlen(change->txRef), altLen = strlen(change->txAlt);
     boolean isInsertion = (refLen == 0);
     boolean isDeletion = altLen < refLen;
     int cDnaPosition = change->cDnaPosition;
     if (isInsertion)
 	fprintf(self->f, "%u-%u", cDnaPosition, cDnaPosition+1);
     else if (isDeletion)
 	fprintf(self->f, "%u-%u", cDnaPosition+1, cDnaPosition+refLen);
     else
 	fprintf(self->f, "%u", cDnaPosition+1);
     afVepNextColumn(self->f, self->doHtml);
     // Coding effect columns (except for cDnaPosition) are N/A:
     afVepPrintPlaceholders(self->f, 4, self->doHtml);
     }
 else
     // Coding effect columns are N/A:
     afVepPrintPlaceholders(self->f, 5, self->doHtml);
 freez(&abbrevAllele);
 }
 
 static void afVepPrintExistingVar(struct annoFormatVep *self, struct annoRow *varRow,
 				  struct annoStreamRows gratorData[], int gratorCount)
 /* Print existing variant ID (or placeholder) */
 {
 if (self->snpNameIx >= 0)
     {
     if (gratorCount < 2 || gratorData[1].streamer != self->config->snpSource)
 	errAbort("annoFormatVep: config error, snpSource is not where expected");
     struct annoRow *snpRows = gratorData[1].rowList, *row;
     if (snpRows != NULL)
 	{
 	int count = 0;
 	for (row = snpRows;  row != NULL;  row = row->next)
 	    {
 	    char **snpWords = (char **)(row->data);
 	    if (row->start == varRow->start && row->end == varRow->end)
 		{
 		if (count > 0)
 		    fputc(',', self->f);
 		afVepPrintf(self, "%s", snpWords[self->snpNameIx]);
 		count++;
 		}
 	    }
 	if (count == 0)
 	    afVepPrintPlaceholder(self->f, self->doHtml);
 	else
 	    afVepNextColumn(self->f, self->doHtml);
 	}
     else
 	afVepPrintPlaceholder(self->f, self->doHtml);
     }
 else
     afVepPrintPlaceholder(self->f, self->doHtml);
 }
 
 static boolean isCodingSnv(struct annoRow *primaryRow, struct gpFx *gpFx)
 /* Return TRUE if this is a single-nucleotide non-synonymous change. */
 {
 if (gpFx == NULL ||
     gpFx->detailType != codingChange ||
     gpFx->soNumber == synonymous_variant)
     return FALSE;
 char *txRef = gpFx->details.codingChange.txRef;
 char *txAlt = gpFx->details.codingChange.txAlt;
 if (txRef == NULL || txAlt == NULL ||
     strlen(txRef) != 1 || sameString(txRef, "-") ||
     strlen(txAlt) != 1 || sameString(txAlt, "-"))
     return FALSE;
 return TRUE;
 }
 
 static int commaSepFindIx(char *item, char *s)
 /* Treating comma-separated, non-NULL s as an array of words,
  * return the index of item in the array, or -1 if item is not in array. */
 {
 int itemLen = strlen(item);
 int ix = 0;
 char *p = strchr(s, ',');
 while (p != NULL)
     {
     int elLen = (p - s);
     if (elLen == itemLen && strncmp(s, item, itemLen) == 0)
 	return ix;
     s = p+1;
     p = strchr(s, ',');
     ix++;
     }
 if (strlen(s) == itemLen && strncmp(s, item, itemLen) == 0)
     return ix;
 return -1;
 }
 
 static int commaSepFindIntIx(int item, char *s)
 /* Treating comma-separated, non-NULL s as an array of words that encode integers,
  * return the index of item in the array, or -1 if item is not in array. */
 {
 char itemString[64];
 safef(itemString, sizeof(itemString), "%d", item);
 return commaSepFindIx(itemString, s);
 }
 
 static char *commaSepWordFromIxOrWhole(int ix, char *s, struct lm *lm)
 /* Treating comma-separated, non-NULL s as an array of words,
  * return the word at ix in the array -- but if s has no commas,
  * just return (lmCloned) s.  errAborts if s is comma-sep but ix is out of bounds. */
 {
 int i = 0;
 char *p = strchr(s, ',');
 if (p == NULL)
     return lmCloneString(lm, s);
 while (p != NULL)
     {
     if (i == ix)
 	return lmCloneStringZ(lm, s, p-s);
     s = p+1;
     p = strchr(s, ',');
     i++;
     }
 if (i == ix)
     return lmCloneString(lm, s);
 errAbort("commaSepWordFromIxOrWhole: Bad index %d for string '%s'", ix, s);
 return NULL;
 }
 
 INLINE void afVepNewExtra(struct annoFormatVep *self, boolean *pGotExtra)
 /* If we already printed an extra item, print the extra column's separator; set pGotExtra. */
 {
 assert(pGotExtra);
 if (*pGotExtra)
     {
     if (self->doHtml)
 	fputs("; ", self->f);
     else
 	fputc(';', self->f);
     }
 *pGotExtra = TRUE;
 }
 
 static void afVepPrintDbNsfpSift(struct annoFormatVep *self,
 				 struct annoFormatVepExtraSource *extraSrc,
 				 struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
 				 boolean *pGotExtra)
 /* Match the allele from gpFx to the per-allele scores in row from dbNsfpSift. */
 {
 // Look up column indices only once:
 static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix;
 if (ensTxIdIx == -1)
     {
     struct asColumn *columns = extraSrc->source->asObj->columnList;
     ensTxIdIx = asColumnMustFindIx(columns, "ensTxId");
     altAl1Ix = asColumnMustFindIx(columns, "altAl1");
     score1Ix = asColumnMustFindIx(columns, "score1");
     altAl2Ix = asColumnMustFindIx(columns, "altAl2");
     score2Ix = asColumnMustFindIx(columns, "score2");
     altAl3Ix = asColumnMustFindIx(columns, "altAl3");
     score3Ix = asColumnMustFindIx(columns, "score3");
     }
 
 struct annoRow *row;
 for (row = extraRows;  row != NULL;  row = row->next)
     {
     // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx
     // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame)
     char **words = row->data;
     if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") &&
 	commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0)
 	continue;
     struct annoFormatVepExtraItem *extraItem = extraSrc->items;
     char *scoreStr = NULL;
     if (sameString(gpFx->gAllele, words[altAl1Ix]))
 	scoreStr = words[score1Ix];
     else if (sameString(gpFx->gAllele, words[altAl2Ix]))
 	scoreStr = words[score2Ix];
     else if (sameString(gpFx->gAllele, words[altAl3Ix]))
 	scoreStr = words[score3Ix];
     if (isNotEmpty(scoreStr) && differentString(scoreStr, "."))
 	{
         double score = atof(scoreStr);
         char prediction = '?';
         if (score < 0.05)
             prediction = 'D';
         else
             prediction = 'T';
 	afVepNewExtra(self, pGotExtra);
 	fprintf(self->f, "%s=%c(%s)", extraItem->tag, prediction, scoreStr);
 	}
     }
 }
 
 static void afVepPrintDbNsfpPolyPhen2(struct annoFormatVep *self,
 				      struct annoFormatVepExtraSource *extraSrc,
 				      struct annoRow *extraRows, struct gpFx *gpFx,
 				      boolean *pGotExtra)
 /* Match the allele from gpFx to the per-allele scores in each row from dbNsfpPolyPhen2. */
 {
 // Look up column indices only once:
 static int aaPosIx=-1,
     altAl1Ix, hDivScore1Ix, hDivPred1Ix, hVarScore1Ix, hVarPred1Ix,
     altAl2Ix, hDivScore2Ix, hDivPred2Ix, hVarScore2Ix, hVarPred2Ix,
     altAl3Ix, hDivScore3Ix, hDivPred3Ix, hVarScore3Ix, hVarPred3Ix;
 if (aaPosIx == -1)
     {
     struct asColumn *columns = extraSrc->source->asObj->columnList;
     aaPosIx = asColumnMustFindIx(columns, "uniProtAaPos");
     altAl1Ix = asColumnMustFindIx(columns, "altAl1");
     hDivScore1Ix = asColumnMustFindIx(columns, "hDivScore1");
     hDivPred1Ix = asColumnMustFindIx(columns, "hDivPred1");
     hVarScore1Ix = asColumnMustFindIx(columns, "hVarScore1");
     hVarPred1Ix = asColumnMustFindIx(columns, "hVarPred1");
     altAl2Ix = asColumnMustFindIx(columns, "altAl2");
     hDivScore2Ix = asColumnMustFindIx(columns, "hDivScore2");
     hDivPred2Ix = asColumnMustFindIx(columns, "hDivPred2");
     hVarScore2Ix = asColumnMustFindIx(columns, "hVarScore2");
     hVarPred2Ix = asColumnMustFindIx(columns, "hVarPred2");
     altAl3Ix = asColumnMustFindIx(columns, "altAl3");
     hDivScore3Ix = asColumnMustFindIx(columns, "hDivScore3");
     hDivPred3Ix = asColumnMustFindIx(columns, "hDivPred3");
     hVarScore3Ix = asColumnMustFindIx(columns, "hVarScore3");
     hVarPred3Ix = asColumnMustFindIx(columns, "hVarPred3");
     }
 
 struct codingChange *cc = &(gpFx->details.codingChange);
 int count = 0;
 struct annoRow *row;
 for (row = extraRows;  row != NULL;  row = row->next)
     {
     char **words = row->data;
 //#*** polyphen2 can actually have multiple scores/preds for the same pepPosition...
 //#*** so what we really should do is loop on comma-sep words[aaPosIx] and print scores/preds
 //#*** whenever pepPosition matches.
     int txIx = commaSepFindIntIx(cc->pepPosition+1, words[aaPosIx]);
     if (txIx < 0)
 	continue;
     struct annoFormatVepExtraItem *extraItem = extraSrc->items;
     for (;  extraItem != NULL;  extraItem = extraItem->next)
 	{
 	boolean isHdiv = (stringIn("HDIV", extraItem->tag) != NULL);
 	int predIx = -1, scoreIx = -1;
 	if (sameString(gpFx->gAllele, words[altAl1Ix]))
 	    {
 	    predIx = isHdiv ? hDivPred1Ix : hVarPred1Ix;
 	    scoreIx = isHdiv ? hDivScore1Ix : hVarScore1Ix;
 	    }
 	else if (sameString(gpFx->gAllele, words[altAl2Ix]))
 	    {
 	    predIx = isHdiv ? hDivPred2Ix : hVarPred2Ix;
 	    scoreIx = isHdiv ? hDivScore2Ix : hVarScore2Ix;
 	    }
 	else if (sameString(gpFx->gAllele, words[altAl3Ix]))
 	    {
 	    predIx = isHdiv ? hDivPred3Ix : hVarPred3Ix;
 	    scoreIx = isHdiv ? hDivScore3Ix : hVarScore3Ix;
 	    }
 	char *pred = (predIx < 0) ? NULL : words[predIx];
 	if (pred == NULL || sameString(pred, "."))
 	    continue;
 	pred = commaSepWordFromIxOrWhole(txIx, pred, self->lm);
 	if (isNotEmpty(pred))
 	    {
 	    if (count == 0)
 		{
 		afVepNewExtra(self, pGotExtra);
 		fprintf(self->f, "%s=", extraItem->tag);
 		}
 	    else
 		fputc(',', self->f);
 	    char *score = (scoreIx < 0) ? "?" : commaSepWordFromIxOrWhole(txIx, words[scoreIx], self->lm);
 	    fprintf(self->f, "%s(%s)", pred, score);
 	    count++;
 	    }
 	}
     }
 }
 
 static void afVepPrintDbNsfpMutationTA(struct annoFormatVep *self,
 				       struct annoFormatVepExtraSource *extraSrc,
 				       struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
 				       boolean *pGotExtra)
 /* Match the allele from gpFx to the per-allele scores in row from dbNsfpMutationTaster
  * or dbNsfpMutationAssessor -- they have identical column indices. */
 {
 // Look up column indices only once:
 static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix,
     altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix;
 if (ensTxIdIx == -1)
     {
     struct asColumn *columns = extraSrc->source->asObj->columnList;
     ensTxIdIx = asColumnMustFindIx(columns, "ensTxId");
     altAl1Ix = asColumnMustFindIx(columns, "altAl1");
     score1Ix = asColumnMustFindIx(columns, "score1");
     pred1Ix = asColumnMustFindIx(columns, "pred1");
     altAl2Ix = asColumnMustFindIx(columns, "altAl2");
     score2Ix = asColumnMustFindIx(columns, "score2");
     pred2Ix = asColumnMustFindIx(columns, "pred2");
     altAl3Ix = asColumnMustFindIx(columns, "altAl3");
     score3Ix = asColumnMustFindIx(columns, "score3");
     pred3Ix = asColumnMustFindIx(columns, "pred3");
     }
 
 struct annoRow *row;
 for (row = extraRows;  row != NULL;  row = row->next)
     {
     // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx
     // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame)
     char **words = row->data;
     if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") &&
 	commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0)
 	continue;
     struct annoFormatVepExtraItem *extraItem = extraSrc->items;
     char *score = NULL, *pred = NULL;
     if (sameString(gpFx->gAllele, words[altAl1Ix]))
 	{
 	score = words[score1Ix];
 	pred = words[pred1Ix];
 	}
     else if (sameString(gpFx->gAllele, words[altAl2Ix]))
 	{
 	score = words[score2Ix];
 	pred = words[pred2Ix];
 	}
     else if (sameString(gpFx->gAllele, words[altAl3Ix]))
 	{
 	score = words[score3Ix];
 	pred = words[pred3Ix];
 	}
     if (isNotEmpty(score) && differentString(score, "."))
 	{
 	afVepNewExtra(self, pGotExtra);
 	if (isEmpty(pred))
 	    pred = "?";
 	fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score);
 	}
     }
 }
 
 static void afVepPrintDbNsfpLrt(struct annoFormatVep *self,
 				struct annoFormatVepExtraSource *extraSrc,
 				struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
 				boolean *pGotExtra)
 /* Match the allele from gpFx to the per-allele scores in row from dbNsfpLrt --
  * it also has omega{1,2,3} columns, but I'm not sure what those mean so am leaving out for now. */
 {
 // Look up column indices only once:
 static int ensTxIdIx=-1, altAl1Ix, score1Ix, pred1Ix,
     altAl2Ix, score2Ix, pred2Ix, altAl3Ix, score3Ix, pred3Ix;
 if (ensTxIdIx == -1)
     {
     struct asColumn *columns = extraSrc->source->asObj->columnList;
     ensTxIdIx = asColumnMustFindIx(columns, "ensTxId");
     altAl1Ix = asColumnMustFindIx(columns, "altAl1");
     score1Ix = asColumnMustFindIx(columns, "score1");
     pred1Ix = asColumnMustFindIx(columns, "pred1");
     altAl2Ix = asColumnMustFindIx(columns, "altAl2");
     score2Ix = asColumnMustFindIx(columns, "score2");
     pred2Ix = asColumnMustFindIx(columns, "pred2");
     altAl3Ix = asColumnMustFindIx(columns, "altAl3");
     score3Ix = asColumnMustFindIx(columns, "score3");
     pred3Ix = asColumnMustFindIx(columns, "pred3");
     }
 
 struct annoRow *row;
 for (row = extraRows;  row != NULL;  row = row->next)
     {
     // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx
     // (but handle rare cases where dbNsfpSeqChange has "." for ensTxId, lame)
     char **words = row->data;
     if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") &&
 	commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0)
 	continue;
     struct annoFormatVepExtraItem *extraItem = extraSrc->items;
     char *score = NULL, *pred = NULL;
     if (sameString(gpFx->gAllele, words[altAl1Ix]))
 	{
 	score = words[score1Ix];
 	pred = words[pred1Ix];
 	}
     else if (sameString(gpFx->gAllele, words[altAl2Ix]))
 	{
 	score = words[score2Ix];
 	pred = words[pred2Ix];
 	}
     else if (sameString(gpFx->gAllele, words[altAl3Ix]))
 	{
 	score = words[score3Ix];
 	pred = words[pred3Ix];
 	}
     if (isNotEmpty(score) && differentString(score, "."))
 	{
 	afVepNewExtra(self, pGotExtra);
 	if (isEmpty(pred))
 	    pred = "?";
 	fprintf(self->f, "%s=%s(%s)", extraItem->tag, pred, score);
 	}
     }
 }
 
 static void afVepPrintDbNsfpVest(struct annoFormatVep *self,
                                  struct annoFormatVepExtraSource *extraSrc,
                                  struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
                                  boolean *pGotExtra)
 /* Match the allele from gpFx to the per-allele scores in row from dbNsfpVest */
 {
 // TODO: also compare var[123] protein change columns to gpFx to make sure we have the right one
 // Look up column indices only once:
 static int ensTxIdIx=-1, altAl1Ix, score1Ix, altAl2Ix, score2Ix, altAl3Ix, score3Ix;
 if (ensTxIdIx == -1)
     {
     struct asColumn *columns = extraSrc->source->asObj->columnList;
     ensTxIdIx = asColumnMustFindIx(columns, "ensTxId");
     altAl1Ix = asColumnMustFindIx(columns, "altAl1");
     score1Ix = asColumnMustFindIx(columns, "score1");
     altAl2Ix = asColumnMustFindIx(columns, "altAl2");
     score2Ix = asColumnMustFindIx(columns, "score2");
     altAl3Ix = asColumnMustFindIx(columns, "altAl3");
     score3Ix = asColumnMustFindIx(columns, "score3");
     }
 
 struct annoRow *row;
 for (row = extraRows;  row != NULL;  row = row->next)
     {
     // Skip this row unless it contains the ensTxId found by getDbNsfpEnsTx
     char **words = row->data;
     if (differentString(ensTxId, ".") && differentString(words[ensTxIdIx], ".") &&
 	commaSepFindIx(ensTxId, words[ensTxIdIx]) < 0)
 	continue;
     struct annoFormatVepExtraItem *extraItem = extraSrc->items;
     char *score = NULL;
     if (sameString(gpFx->gAllele, words[altAl1Ix]))
 	score = words[score1Ix];
     else if (sameString(gpFx->gAllele, words[altAl2Ix]))
 	score = words[score2Ix];
     else if (sameString(gpFx->gAllele, words[altAl3Ix]))
 	score = words[score3Ix];
     if (isNotEmpty(score) && differentString(score, "."))
 	{
 	afVepNewExtra(self, pGotExtra);
 	fprintf(self->f, "%s=%s", extraItem->tag, score);
 	}
     }
 }
 
 static void afVepPrintDbNsfpInterPro(struct annoFormatVep *self,
 				     struct annoFormatVepExtraSource *extraSrc,
 				     struct annoRow *extraRows, struct gpFx *gpFx, char *ensTxId,
 				     boolean *pGotExtra)
 /* Print out any overlapping (comma-sep list, HTML-encoded ',' and '=') InterPro domains. */
 {
 // Look up column indices only once:
 static int domainsIx = -1;
 if (domainsIx == -1)
     {
     struct asColumn *columns = extraSrc->source->asObj->columnList;
     domainsIx = asColumnMustFindIx(columns, "domains");
     }
 
 struct annoRow *row;
 for (row = extraRows;  row != NULL;  row = row->next)
     {
     char **words = row->data;
     struct annoFormatVepExtraItem *extraItem = extraSrc->items;
     char *domains = words[domainsIx];
     int lastC = strlen(domains) - 1;
     if (domains[lastC] == ',')
 	domains[lastC] = '\0';
     if (isNotEmpty(domains) && differentString(domains, "."))
 	{
 	afVepNewExtra(self, pGotExtra);
 	fprintf(self->f, "%s=%s", extraItem->tag, domains);
 	}
     }
 }
 
 static boolean allelesAgree(char altNt, char altAa, char **words)
 /* Return TRUE if dbNsfpSeqChange words have altAa associated with altNt. */
 {
 //#*** TODO: handle stop codon representation
 if ((altNt == words[11][0] && altAa == words[12][0]) ||
     (altNt == words[13][0] && altAa == words[14][0]) ||
     (altNt == words[15][0] && altAa == words[16][0]))
     return TRUE;
 return FALSE;
 }
 
 static struct annoRow *getRowsFromSource(struct annoStreamer *src,
 					 struct annoStreamRows gratorData[], int gratorCount)
 /* Search gratorData for src, and return its rows when found. */
 {
 int i;
 for (i = 0;  i < gratorCount;  i++)
     {
     if (gratorData[i].streamer == src)
 	return gratorData[i].rowList;
     }
 errAbort("annoFormatVep: Can't find source %s in gratorData", src->name);
 return NULL;
 }
 
 static char *getDbNsfpEnsTx(struct annoFormatVep *self, struct gpFx *gpFx,
 			    struct annoStreamRows *gratorData, int gratorCount)
 /* Find the Ensembl transcript ID, if any, for which dbNsfp has results consistent
  * with gpFx. */
 {
 int i;
 for (i = 0;  i < gratorCount;  i++)
     {
     struct annoStreamer *source = gratorData[i].streamer;
     if (!sameString(source->asObj->name, "dbNsfpSeqChange")) //#*** need metadata!
 	continue;
     struct codingChange *cc = &(gpFx->details.codingChange);
     struct annoRow *extraRows = getRowsFromSource(source, gratorData, gratorCount);
     struct annoRow *row;
     for (row = extraRows;  row != NULL;  row = row->next)
 	{
 	char **words = row->data;
 	if (strcasecmp(cc->codonOld, words[7]) != 0)
 	    continue;
 	if (!allelesAgree(gpFx->gAllele[0], cc->aaNew[0], words))
 	    continue;
 	int txIx = commaSepFindIntIx(cc->pepPosition+1, words[10]);
 	if (txIx >= 0)
 	    {
 	    if (sameString(words[4], "."))
 		return ".";
 	    char *ensTxId = commaSepWordFromIxOrWhole(txIx, words[4], self->lm);
 	    return ensTxId;
 	    }
 	}
     break;
     }
 return NULL;
 }
 
 static void afVepPrintExtrasDbNsfp(struct annoFormatVep *self, struct annoRow *varRow,
 				   struct annoRow *gpvRow, struct gpFx *gpFx,
 				   struct annoStreamRows gratorData[], int gratorCount,
 				   boolean *pGotExtra)
 /* Print the Extra column's tag=value; components from dbNSFP data, if we have any. */
 {
 // dbNSFP has data only for coding non-synonymous single-nucleotide changes:
 if (!isCodingSnv(varRow, gpFx))
     return;
 // Does dbNsfpSeqChange have a coding change consistent with gpFx?:
 char *ensTxId = getDbNsfpEnsTx(self, gpFx, gratorData, gratorCount);
 // Now cycle through selected dbNsfp* sources, printing out scores for ensTxId:
 struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc;
 for (extraSrc = extras;  extraSrc != NULL;  extraSrc = extraSrc->next)
     {
     char *asObjName = extraSrc->source->asObj->name;
     if (!startsWith("dbNsfp", asObjName))
 	continue;
     struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount);
     if (sameString(asObjName, "dbNsfpPolyPhen2"))
 	{
 	// PolyPhen2 is based on UniProt proteins, not GENCODE/Ensembl transcripts,
 	// so ensTxId doesn't apply.
 	afVepPrintDbNsfpPolyPhen2(self, extraSrc, extraRows, gpFx, pGotExtra);
 	if (ensTxId == NULL)
 	    break; // all done now, no need to keep looking
 	}
     else if (ensTxId != NULL)
 	{
 	if (sameString(asObjName, "dbNsfpSift"))
 	    afVepPrintDbNsfpSift(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
 	else if (sameString(asObjName, "dbNsfpMutationTaster") ||
 		 sameString(asObjName, "dbNsfpMutationAssessor"))
 	    afVepPrintDbNsfpMutationTA(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
 	else if (sameString(asObjName, "dbNsfpLrt"))
 	    afVepPrintDbNsfpLrt(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
         else if (sameString(asObjName, "dbNsfpVest"))
 	    afVepPrintDbNsfpVest(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
 	else if (sameString(asObjName, "dbNsfpInterPro"))
 	    afVepPrintDbNsfpInterPro(self, extraSrc, extraRows, gpFx, ensTxId, pGotExtra);
 	else if (!sameString(asObjName, "dbNsfpSeqChange"))
 	    errAbort("Unrecognized asObj->name '%s' from dbNSFP source '%s'",
 		     asObjName, extraSrc->source->name);
 	}
     }
 }
 
 static void afVepPrintExtrasHgvs(struct annoFormatVep *self, struct annoRow *gpvRow,
                                  boolean *pGotExtra)
 /* If not empty, print HGVS terms (last column of gpvRow). */
 {
 char **row = gpvRow->data;
 if (isNotEmpty(row[self->hgvsGIx]))
     {
     afVepNewExtra(self, pGotExtra);
     fprintf(self->f, "HGVSG=%s", row[self->hgvsGIx]);
     }
 if (isNotEmpty(row[self->hgvsCNIx]))
     {
     afVepNewExtra(self, pGotExtra);
     fprintf(self->f, "HGVSCN=%s", row[self->hgvsCNIx]);
     }
 if (isNotEmpty(row[self->hgvsPIx]))
     {
     afVepNewExtra(self, pGotExtra);
     fprintf(self->f, "HGVSP=%s", row[self->hgvsPIx]);
     }
 }
 
 static void afVepPrintExtraWigVec(struct annoFormatVepExtraSource *extraSrc,
                                   struct annoFormatVep *self,
                                   struct annoFormatVepExtraItem *extraItem,
                                   struct annoRow *extraRows, struct annoRow *gpvRow,
                                   boolean *pGotExtra)
 /* Print values from a wig source with type arWigVec (per-base floats) */
 {
 int i;
 struct annoRow *row;
 for (i = 0, row = extraRows;  row != NULL;  i++, row = row->next)
     {
     float *vector = row->data;
     int len = row->end - row->start;
     int j;
     for (j = 0;  j < len;  j++)
 	{
 	if (i+j == 0)
 	    {
 	    afVepNewExtra(self, pGotExtra);
 	    fprintf(self->f, "%s=", extraItem->tag);
 	    }
 	else
 	    fputc(',', self->f);
 	fprintf(self->f, "%g", vector[j]);
 	}
     }
 }
 
 static void afVepPrintExtraWigSingle(struct annoFormatVepExtraSource *extraSrc,
                                      struct annoFormatVep *self,
                                      struct annoFormatVepExtraItem *extraItem,
                                      struct annoRow *extraRows, struct annoRow *gpvRow,
                                      boolean *pGotExtra)
 /* Print values from a wig source with type arWigSingle (one double per row, e.g. an average) */
 {
 struct annoRow *row;
 for (row = extraRows;  row != NULL;  row = row->next)
     {
     double *pValue = row->data;
     if (row == extraRows)
         {
         afVepNewExtra(self, pGotExtra);
         fprintf(self->f, "%s=", extraItem->tag);
         }
     else
         fputc(',', self->f);
     fprintf(self->f, "%g", *pValue);
     }
 }
 
 static void afVepPrintExtraWords(struct annoFormatVepExtraSource *extraSrc,
                                  struct annoFormatVep *self,
 				 struct annoFormatVepExtraItem *extraItem,
 				 struct annoRow *extraRows, struct annoRow *gpvRow,
                                  boolean *pGotExtra)
 /* Print comma-separated values in the specified column from the usual array-of-words. */
 {
 if (extraItem->rowIx < 0)
     errAbort("annoFormatVep: invalid rowIx for tag %s", extraItem->tag);
 char **gpvWords = gpvRow ? gpvRow->data : NULL;
 char *gpvTranscript = gpvWords ? gpvWords[self->txNameIx] : NULL;
 boolean gotGpvTx = FALSE;
 int i;
 struct annoRow *row;
 for (i = 0, row = extraRows;  row != NULL;  row = row->next)
     {
     char **words = row->data;
     char *extraTranscript = words[0];
     // If extraSrc happens to be gpVarSource, use only the row that matches the current
     // transcript, and use it only once.
     if (extraSrc->source == self->config->gpVarSource)
         {
         extraTranscript = words[self->txNameIx];
         if (gpvTranscript == NULL || differentString(extraTranscript, gpvTranscript) || gotGpvTx)
             continue;
         gotGpvTx = TRUE;
         }
     char *val = words[extraItem->rowIx];
     if (isEmpty(val))
         continue;
     // Strip trailing comma if found
     int valLen = strlen(val);
     if (val[valLen-1] == ',')
         val[valLen-1] = '\0';
     if (i == 0)
 	{
 	afVepNewExtra(self, pGotExtra);
 	fprintf(self->f, "%s=", extraItem->tag);
         if (extraItem->isBoolean)
             fputs("YES", self->f);
 	}
     else if (! extraItem->isBoolean)
 	fputc(',', self->f);
     if (! extraItem->isBoolean)
         {
         // Watch out for characters that will mess up parsing of EXTRAS column:
         subChar(val, '=', '_');
         subChar(val, ';', '_');
         afVepPuts(val, self);
         }
     i++;
     }
 }
 
 static void afVepPrintExtrasOther(struct annoFormatVep *self, struct annoRow *varRow,
 				  struct annoRow *gpvRow, struct gpFx *gpFx,
 				  struct annoStreamRows gratorData[], int gratorCount,
 				  boolean includeRegulatory, boolean *pGotExtra)
 /* Print the Extra column's tag=value; components (other than dbNSFP) if we have any. */
 {
 struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc;
 for (extraSrc = extras;  extraSrc != NULL;  extraSrc = extraSrc->next)
     {
     char *asObjName = extraSrc->source->asObj->name;
     if (startsWith("dbNsfp", asObjName))
 	continue;
     struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount);
     if (extraRows != NULL)
 	{
 	struct annoFormatVepExtraItem *extraItem;
 	for (extraItem = extraSrc->items;  extraItem != NULL;  extraItem = extraItem->next)
 	    if (includeRegulatory || ! extraItem->isRegulatory)
 		extraSrc->printExtra(extraSrc, self, extraItem, extraRows, gpvRow, pGotExtra);
 	}
     }
 // VEP automatically adds DISTANCE for upstream/downstream variants
 if (gpFx && (gpFx->soNumber == upstream_gene_variant || gpFx->soNumber == downstream_gene_variant))
     {
     afVepNewExtra(self, pGotExtra);
     // In order to really understand the boundary conditions of Ensembl's DISTANCE function,
     // I'll have to read the code someday.  Upstream deletions on + strand still not handled
     // quite right.
     int ensemblFudge = 0;
     int distance = gpvRow->start - varRow->start + ensemblFudge;
     if (distance < 0)
 	{
 	ensemblFudge = 1;
 	distance = varRow->start - gpvRow->end + ensemblFudge;
 	}
     fprintf(self->f, "DISTANCE=%d", distance);
     }
 boolean includeExonNumber = TRUE;  //#*** optional in VEP
 if (includeExonNumber && gpFx)
     {
     // Add Exon or intron number if applicable
     enum detailType deType = gpFx->detailType;
     int num = -1, count = 0;
     if (deType == codingChange)
         {
 	num = gpFx->details.codingChange.exonNumber;
 	count = gpFx->details.codingChange.exonCount;
         }
     else if (deType == nonCodingExon)
         {
 	num = gpFx->details.nonCodingExon.exonNumber;
 	count = gpFx->details.nonCodingExon.exonCount;
         }
     else if (deType == intron)
         {
 	num = gpFx->details.intron.intronNumber;
         count = gpFx->details.intron.intronCount;
         }
     if (num >= 0)
 	{
 	afVepNewExtra(self, pGotExtra);
         fprintf(self->f, "%s=%d/%d", (deType == intron) ? "INTRON" : "EXON", num+1, count);
         }
     }
 if (!*pGotExtra)
     afVepPrintPlaceholders(self->f, 0, self->doHtml);
 }
 
 static void afVepLmCleanup(struct annoFormatVep *self)
 /* Clean out localmem every 1k rows. */
 {
 self->lmRowCount++;
 if (self->lmRowCount > 1024)
     {
     lmCleanup(&(self->lm));
     self->lm = lmInit(0);
     self->lmRowCount = 0;
     }
 }
 
 static void afVepPrintOneLineGpVar(struct annoFormatVep *self, struct annoStreamRows *varData,
 				   struct annoRow *gpvRow,
 				   struct annoStreamRows gratorData[], int gratorCount)
 /* Print one line of VEP with a coding gene consequence: a variant, an allele,
  * functional consequences of that allele, and whatever else is included in the config. */
 {
 afVepLmCleanup(self);
 struct annoRow *varRow = varData->rowList;
 struct gpFx *gpFx = annoGratorGpVarGpFxFromRow(self->config->gpVarSource, gpvRow, self->lm);
 afVepStartRow(self->f, self->doHtml);
 afVepPrintNameAndLoc(self, varRow);
 afVepPrintPredictions(self, varRow, gpvRow, gpFx);
 afVepPrintExistingVar(self, varRow, gratorData, gratorCount);
 boolean gotExtra = FALSE;
 afVepPrintExtrasDbNsfp(self, varRow, gpvRow, gpFx, gratorData, gratorCount, &gotExtra);
 afVepPrintExtrasHgvs(self, gpvRow, &gotExtra);
 afVepPrintExtrasOther(self, varRow, gpvRow, gpFx, gratorData, gratorCount, FALSE, &gotExtra);
 afVepEndRow(self->f, self->doHtml);
 }
 
 static boolean afVepIsRegulatory(struct annoFormatVep *self,
 				 struct annoStreamRows gratorData[], int gratorCount)
 /* Return TRUE if one of the extraSource items is a regulatory region. */
 {
 struct annoFormatVepExtraSource *extras = self->config->extraSources, *extraSrc;
 for (extraSrc = extras;  extraSrc != NULL;  extraSrc = extraSrc->next)
     {
     struct annoRow *extraRows = getRowsFromSource(extraSrc->source, gratorData, gratorCount);
     if (extraRows != NULL)
 	{
 	struct annoFormatVepExtraItem *extraItem;
 	for (extraItem = extraSrc->items;  extraItem != NULL;  extraItem = extraItem->next)
 	    {
 	    if (extraItem->isRegulatory)
 		return TRUE;
 	    }
 	}
     }
 return FALSE;
 }
 
 static char *afVepGetFirstAltAllele(struct annoFormatVep *self, struct annoRow *varRow)
 /* For VEP we must print out some alternate allele, whether the alternate allele is
  * related to any other info (like coding changes) or not.  Pick the first non-reference
  * allele. */
 {
 int refAlBufSize = varRow->end - varRow->start + 1;
 char refAllele[refAlBufSize];
 annoAssemblyGetSeq(self->assembly, varRow->chrom, varRow->start, varRow->end,
 		   refAllele, sizeof(refAllele));
 struct variant *variant = NULL;
 if (self->variantType == afvdtVcf)
     variant = variantFromVcfAnnoRow(varRow, refAllele, self->lm, self->dyScratch);
 else if (self->variantType == afvdtPgSnpTable)
     variant = variantFromPgSnpAnnoRow(varRow, refAllele, TRUE, self->lm);
 else if (self->variantType == afvdtPgSnpFile)
     variant = variantFromPgSnpAnnoRow(varRow, refAllele, FALSE, self->lm);
 return firstAltAllele(variant->alleles);
 }
 
 static void afVepPrintPredictionsReg(struct annoFormatVep *self, char *alt)
 /* Print VEP allele, placeholder gene and item names, 'RegulatoryFeature',
  * 'regulatory_region_variant', and placeholder coding effect columns. */
 {
 afVepPuts(alt, self);
 afVepNextColumn(self->f, self->doHtml);
 afVepPrintPlaceholders(self->f, 2, self->doHtml);
 fputs("RegulatoryFeature", self->f);
 afVepNextColumn(self->f, self->doHtml);
 fputs(soTermToString(regulatory_region_variant), self->f);
 afVepNextColumn(self->f, self->doHtml);
 // Coding effect columns are N/A:
 afVepPrintPlaceholders(self->f, 5, self->doHtml);
 }
 
 static void afVepPrintExtrasHgvsGOnly(struct annoFormatVep *self, struct annoRow *varRow, char *alt,
                                       boolean *pGotExtra)
 /* Make our own HGVS g. term (when we aren't taking HGVS terms from annoGratorGpVar). */
 {
 if (self->hgvsMakeG)
     {
     struct bed3 *variantBed = (struct bed3 *)varRow;
     char *chromAcc = hRefSeqAccForChrom(self->assembly->name, varRow->chrom);
     char *hgvsG = hgvsGFromVariant(self->gSeqWin, variantBed, alt, chromAcc, self->hgvsBreakDelIns);
     afVepNewExtra(self, pGotExtra);
     fprintf(self->f, "HGVSG=%s", hgvsG);
     freeMem(hgvsG);
     }
 }
 
 static void afVepPrintRegulatory(struct annoFormatVep *self, struct annoStreamRows *varData,
 				 struct annoStreamRows gratorData[], int gratorCount)
 /* If there are EXTRAS column sources that imply regulatory_region_variant, print out
  * lines with that consequence (no gpFx/dbNSFP). */
 {
 if (afVepIsRegulatory(self, gratorData, gratorCount))
     {
     afVepLmCleanup(self);
     struct annoRow *varRow = varData->rowList;
     afVepStartRow(self->f, self->doHtml);
     afVepPrintNameAndLoc(self, varRow);
     char *alt = afVepGetFirstAltAllele(self, varRow);
     afVepPrintPredictionsReg(self, alt);
     afVepPrintExistingVar(self, varRow, gratorData, gratorCount);
     boolean gotExtra = FALSE;
     afVepPrintExtrasHgvsGOnly(self, varRow, alt, &gotExtra);
     afVepPrintExtrasOther(self, varRow, NULL, NULL, gratorData, gratorCount, TRUE, &gotExtra);
     afVepEndRow(self->f, self->doHtml);
     }
 }
 
 static void afVepFormatOne(struct annoFormatter *fSelf, struct annoStreamRows *primaryData,
 			   struct annoStreamRows gratorData[], int gratorCount)
 /* Print one variant's VEP (possibly multiple lines) using collected rows. */
 {
 struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
 struct annoRow *gpVarRows = gratorData[0].rowList, *gpvRow;
 for (gpvRow = gpVarRows;  gpvRow != NULL;  gpvRow = gpvRow->next)
     afVepPrintOneLineGpVar(self, primaryData, gpvRow, gratorData, gratorCount);
 afVepPrintRegulatory(self, primaryData, gratorData, gratorCount);
 }
 
 static void afVepEndHtml(struct annoFormatVep *self)
 {
 fputs("</TABLE><BR>\n", self->f);
 }
 
 static void afVepClose(struct annoFormatter **pFSelf)
 /* Close file handle, free self. */
 {
 if (pFSelf == NULL)
     return;
 struct annoFormatVep *self = *(struct annoFormatVep **)pFSelf;
 if (self->doHtml)
     afVepEndHtml(self);
 freeMem(self->fileName);
 carefulClose(&(self->f));
 lmCleanup(&(self->lm));
 dyStringFree(&(self->dyScratch));
 chromSeqWindowFree(&self->gSeqWin);
 annoFormatterFree(pFSelf);
 }
 
 static void afVepSetConfig(struct annoFormatVep *self, struct annoFormatVepConfig *config)
 /* Check config and figure out where various output columns will come from. */
 {
 self->config = config;
 struct asColumn *varAsColumns = config->variantSource->asObj->columnList;
 self->varNameIx = -1;
 self->varAllelesIx = -1;
 if (asObjectsMatch(config->variantSource->asObj, pgSnpAsObj()))
     self->variantType = afvdtPgSnpTable;
 else if (asObjectsMatch(config->variantSource->asObj, pgSnpFileAsObj()))
     self->variantType = afvdtPgSnpFile;
 else if (asObjectsMatch(config->variantSource->asObj, vcfAsObj()))
     {
     self->variantType = afvdtVcf;
     self->varNameIx = asColumnMustFindIx(varAsColumns, "id");
     }
 else
     errAbort("afVepSetConfig: variant source %s doesn't look like pgSnp or VCF",
 	     config->variantSource->name);
 if (self->variantType == afvdtPgSnpTable || self->variantType == afvdtPgSnpFile)
     {
     // pgSnp's "name" column actually contains slash-separated alleles
     self->varAllelesIx = asColumnMustFindIx(varAsColumns, "name");
     }
 if (config->gpVarSource == NULL)
     errAbort("afVepSetConfig: config must have a gpVarSource");
 else if (! asColumnNamesMatchFirstN(config->gpVarSource->asObj, genePredAsObj(), 10) &&
          ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, bigGenePredAsObj(), 16) &&
          ! asColumnNamesMatchFirstN(config->gpVarSource->asObj, annoStreamDbPslPlusAsObj(),
                                     PSLPLUS_NUM_COLS))
     errAbort("afVepSetConfig: gpVarSource %s doesn't look like genePred or pslPlus",
 	     config->gpVarSource->name);
 // refGene and augmented knownGene have extra name fields that have the HGNC gene symbol:
 struct asColumn *gpvAsColumns = config->gpVarSource->asObj->columnList;
 self->txNameIx = asColumnFindIx(gpvAsColumns, "name");
 if (self->txNameIx < 0)
     self->txNameIx = asColumnFindIx(gpvAsColumns, "qName");
 if (self->txNameIx < 0)
     errAbort("afVepSetConfig: can't find name or qName column in gpVarSource %s",
              config->gpVarSource->name);
 self->geneNameIx = asColumnFindIx(gpvAsColumns, "geneSymbol");
 if (self->geneNameIx < 0)
     self->geneNameIx = asColumnFindIx(gpvAsColumns, "kgXref_geneSymbol");
 if (self->geneNameIx < 0)
     self->geneNameIx = asColumnFindIx(gpvAsColumns, "name2");
 if (self->geneNameIx < 0)
     self->geneNameIx = asColumnFindIx(gpvAsColumns, "proteinID");
 self->hgvsGIx = asColumnMustFindIx(gpvAsColumns, "hgvsG");
 self->hgvsCNIx = asColumnMustFindIx(gpvAsColumns, "hgvsCN");
 self->hgvsPIx = asColumnMustFindIx(gpvAsColumns, "hgvsP");
 if (config->snpSource != NULL)
     {
     struct asColumn *snpAsColumns = config->snpSource->asObj->columnList;
     self->snpNameIx = asColumnMustFindIx(snpAsColumns, "name");
     }
 else
     self->snpNameIx = -1;
 }
 
 struct annoFormatVepConfig *annoFormatVepConfigNew(struct annoStreamer *variantSource,
 						   char *variantDescription,
 						   struct annoStreamer *gpVarSource,
 						   char *gpVarDescription,
 						   struct annoStreamer *snpSource,
 						   char *snpDescription)
 /* Return a basic configuration for VEP output.  variantSource and gpVarSource must be
  * provided; snpSource can be NULL. */
 {
 struct annoFormatVepConfig *config;
 AllocVar(config);
 config->variantSource = variantSource;
 config->variantDescription = cloneString(variantDescription);
 config->gpVarSource = gpVarSource;
 config->gpVarDescription = cloneString(gpVarDescription);
 config->snpSource = snpSource;
 config->snpDescription = cloneString(snpDescription);
 return config;
 }
 
 struct annoFormatter *annoFormatVepNew(char *fileName, boolean doHtml,
 				       struct annoStreamer *variantSource,
 				       char *variantDescription,
 				       struct annoStreamer *gpVarSource,
 				       char *gpVarDescription,
 				       struct annoStreamer *snpSource,
 				       char *snpDescription,
 				       struct annoAssembly *assembly)
 /* Return a formatter that will write functional predictions in the same format as Ensembl's
  * Variant Effect Predictor to fileName (can be "stdout").
  * variantSource and gpVarSource must be provided; snpSource can be NULL. */
 {
 struct annoFormatVep *self;
 AllocVar(self);
 struct annoFormatter *fSelf = &(self->formatter);
 fSelf->getOptions = annoFormatterGetOptions;
 fSelf->setOptions = annoFormatterSetOptions;
 fSelf->initialize = afVepInitialize;
 fSelf->comment = afVepComment;
 fSelf->formatOne = afVepFormatOne;
 fSelf->close = afVepClose;
 self->fileName = cloneString(fileName);
 self->f = mustOpen(fileName, "w");
 self->lm = lmInit(0);
 self->dyScratch = dyStringNew(0);
 self->needHeader = TRUE;
 self->doHtml = doHtml;
 self->assembly = assembly;
 struct annoFormatVepConfig *config = annoFormatVepConfigNew(variantSource, variantDescription,
 							    gpVarSource, gpVarDescription,
 							    snpSource, snpDescription);
 afVepSetConfig(self, config);
 return (struct annoFormatter *)self;
 }
 
 static void afvAddExtraItemMaybeReg(struct annoFormatter *fSelf, struct annoStreamer *extraSource,
 				    char *tag, char *description, char *column, boolean isBoolean,
                                     boolean isReg)
 /* Tell annoFormatVep that it should include the given column of extraSource
  * in the EXTRAS column with tag.  The VEP header will include tag's description.
  * If isReg, overlap implies that the variant has consequence regulatory_region_variant.
  * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored. */
 {
 struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
 struct annoFormatVepExtraSource *src;
 for (src = self->config->extraSources;  src != NULL;  src = src->next)
     if (src->source == extraSource)
 	break;
 if (src == NULL)
     {
     AllocVar(src);
     src->source = extraSource;
     if (extraSource->rowType == arWigVec)
 	src->printExtra = afVepPrintExtraWigVec;
     else if (extraSource->rowType == arWigSingle)
 	src->printExtra = afVepPrintExtraWigSingle;
     else
 	src->printExtra = afVepPrintExtraWords;
     slAddTail(&(self->config->extraSources), src);
     }
 struct annoFormatVepExtraItem *item;
 AllocVar(item);
 item->tag = cloneString(tag);
 item->description = cloneString(description);
 item->rowIx = -1;
 if (isNotEmpty(column))
     item->rowIx = asColumnMustFindIx(extraSource->asObj->columnList, column);
 item->isRegulatory = isReg;
 item->isBoolean = isBoolean;
 slAddTail(&(src->items), item);
 }
 
 void annoFormatVepAddExtraItem(struct annoFormatter *fSelf, struct annoStreamer *extraSource,
 			       char *tag, char *description, char *column, boolean isBoolean)
 /* Tell annoFormatVep that it should include the given column of extraSource
  * in the EXTRAS column with tag.  The VEP header will include tag's description.
  * For some special-cased sources e.g. dbNsfp files, column may be NULL/ignored.
  * If isBoolean, the column's description won't be output, only whether a match was found. */
 {
 afvAddExtraItemMaybeReg(fSelf, extraSource, tag, description, column, isBoolean, FALSE);
 }
 
 void annoFormatVepAddRegulatory(struct annoFormatter *fSelf, struct annoStreamer *regSource,
 				char *tag, char *description, char *column)
 /* Tell annoFormatVep to use the regulatory_region_variant consequence if there is overlap
  * with regSource, and to include the given column of regSource in the EXTRAS column.
  * The VEP header will include tag's description. */
 {
 afvAddExtraItemMaybeReg(fSelf, regSource, tag, description, column, FALSE, TRUE);
 struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
 if (self->gSeqWin == NULL)
     {
     char *db = regSource->assembly->name;
     self->gSeqWin = chromSeqWindowNew(db, NULL, 0, 0);
     }
 }
 
 void annoFormatVepSetHgvsOutOptions(struct annoFormatter *fSelf, uint hgvsOutOptions)
 /* Import the HGVS output options described in hgHgvs.h */
 {
 struct annoFormatVep *self = (struct annoFormatVep *)fSelf;
 // We only do g. terms, for regulatory region variants, so no need for transcript/protein opts.
 if (hgvsOutOptions & HGVS_OUT_G)
     self->hgvsMakeG = TRUE;
 if (hgvsOutOptions & HGVS_OUT_BREAK_DELINS)
     self->hgvsBreakDelIns = TRUE;
 }