675a334117c7a98be59035fdf620ac8ca764fd1e
angie
  Thu Jan 18 12:11:33 2018 -0800
isAllNt ignores last base in string so beware; also, isAllNt returns true for "-" (translates it to 'n') but that means the empty string so exclude it from base-trimming.

diff --git src/lib/vcf.c src/lib/vcf.c
index a2acfa7..183bb82 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -749,38 +749,47 @@
 {
 char *words[VCF_MAX_COLUMNS];
 int wordCount;
 if ((wordCount = lineFileChopTab(vcff->lf, words)) <= 0)
     return NULL;
 wordCount = checkWordCount(vcff, words, wordCount);
 return vcfRecordFromRow(vcff, words);
 }
 
 static boolean allelesHavePaddingBase(char **alleles, int alleleCount)
 /* Examine alleles to see if they either a) all start with the same base or
  * b) include a symbolic or 0-length allele.  In either of those cases, there
  * must be an initial padding base that we'll need to trim from non-symbolic
  * alleles. */
 {
+if (sameString(alleles[0], "-"))
+    return FALSE;
 boolean hasPaddingBase = TRUE;
 char firstBase = '\0';
-if (isAllNt(alleles[0], strlen(alleles[0])))
+if (isAllNt(alleles[0], strlen(alleles[0])
+            +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
     firstBase = alleles[0][0];
 int i;
 for (i = 1;  i < alleleCount;  i++)
     {
-    if (isAllNt(alleles[i], strlen(alleles[i])))
+    if (sameString(alleles[i], "-"))
+        {
+        hasPaddingBase = FALSE;
+        break;
+        }
+    else if (isAllNt(alleles[i], strlen(alleles[i])
+                     +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
 	{
 	if (firstBase == '\0')
 	    firstBase = alleles[i][0];
 	if (alleles[i][0] != firstBase)
 	    // Different first base implies unpadded alleles.
 	    hasPaddingBase = FALSE;
 	}
     else if (sameString(alleles[i], "<X>") || sameString(alleles[i], "<*>"))
         {
         // Special case for samtools mpileup "<X>" or gVCF "<*>" (no alternate allele observed) --
         // being symbolic doesn't make this an indel and ref base is not necessarily padded.
         hasPaddingBase = FALSE;
         }
     else
 	{
@@ -804,31 +813,32 @@
  * record in hgc -- so return the original chromStart. */
 {
 unsigned int chromStartOrig = rec->chromStart;
 struct vcfFile *vcff = rec->file;
 if (rec->alleleCount > 1)
     {
     boolean hasPaddingBase = allelesHavePaddingBase(rec->alleles, rec->alleleCount);
     if (hasPaddingBase)
 	{
 	rec->chromStart++;
 	int i;
 	for (i = 0;  i < rec->alleleCount;  i++)
 	    {
 	    if (rec->alleles[i][1] == '\0')
 		rec->alleles[i] = vcfFilePooledStr(vcff, "-");
-	    else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i])))
+	    else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i])
+                             +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1);
 	    else // don't trim first character of symbolic allele
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]);
 	    }
 	}
     }
 return chromStartOrig;
 }
 
 static boolean allEndsGEStartsAndIdentical(char **starts, char **ends, int count)
 /* Given two arrays with <count> elements, return true if all strings in ends[] are
  * greater than or equal to the corresponding strings in starts[], and all ends[]
  * have the same char. */
 {
 int i;
@@ -838,31 +848,33 @@
     if (ends[i] < starts[i] || ends[i][0] != refEnd)
 	return FALSE;
     }
 return TRUE;
 }
 
 static int countIdenticalBasesRight(char **alleles, int alCount)
 /* Return the number of bases that are identical at the end of each allele (usually 0). */
 {
 char *alleleEnds[alCount];
 int i;
 for (i = 0;  i < alCount;  i++)
     {
     int alLen = strlen(alleles[i]);
     // If any allele is symbolic, don't try to trim.
-    if (!isAllNt(alleles[i], alLen))
+    if (sameString(alleles[i], "-") ||
+        !isAllNt(alleles[i], alLen
+                 +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
 	return 0;
     alleleEnds[i] = alleles[i] + alLen-1;
     }
 int trimmedBases = 0;
 while (allEndsGEStartsAndIdentical(alleles, alleleEnds, alCount))
     {
     trimmedBases++;
     // Trim identical last base of alleles and move alleleEnds[] items back.
     for (i = 0;  i < alCount;  i++)
 	alleleEnds[i]--;
     }
 return trimmedBases;
 }
 
 unsigned int vcfRecordTrimAllelesRight(struct vcfRecord *rec)
@@ -1454,31 +1466,32 @@
 // VCF reference allele gets its own column:
 char *refAllele = words[3];
 char *altAlleles = words[4];
 // Make a vcfRecord-like allele array (ref in [0], alts after) so we can check for padding base:
 int alCount = 1 + countChars(altAlleles, ',') + 1;
 char *alleles[alCount];
 alleles[0] = refAllele;
 char altAlCopy[strlen(altAlleles)+1];
 safecpy(altAlCopy, sizeof(altAlCopy), altAlleles);
 chopByChar(altAlCopy, ',', &(alleles[1]), alCount-1);
 int i;
 if (allelesHavePaddingBase(alleles, alCount))
     {
     // Skip padding base (unless we have a symbolic allele):
     for (i = 0;  i < alCount;  i++)
-	if (isAllNt(alleles[i], strlen(alleles[i])))
+	if (isAllNt(alleles[i], strlen(alleles[i])
+                    +1)) //#*** FIXME isAllNt ignores last base in string!!! always TRUE for len=1
 	    alleles[i]++;
     }
 // Having dealt with left padding base, now look for identical bases on the right:
 int trimmedBases = countIdenticalBasesRight(alleles, alCount);
 // Build a /-separated allele string, trimming bases on the right if necessary:
 dyStringClear(dy);
 for (i = 0;  i < alCount;  i++)
     {
     char *allele = alleles[i];
     if (!sameString(allele, "."))
         {
         if (i > 0)
             dyStringAppendC(dy, '/');
         if (allele[trimmedBases] == '\0')
             dyStringAppendC(dy, '-');