239f153403dc75bf2a57c67ef04dcc7c38e8ecff
angie
  Wed Mar 5 14:03:23 2014 -0800
In addition to trimming the left padding base from VCF indels,now we trim identical bases on the right of all alleles, if there
are any.  I've seen multiple cases of user VCF with indels that
have inflated size due to including right flanking bases; when
trying to determine functional consequences, it's misleading to
have an indel sized larger than it has to be.

diff --git src/lib/vcf.c src/lib/vcf.c
index 0d7bb0b..e229f03 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -755,30 +755,101 @@
 	int i;
 	for (i = 0;  i < rec->alleleCount;  i++)
 	    {
 	    if (rec->alleles[i][1] == '\0')
 		rec->alleles[i] = vcfFilePooledStr(vcff, "-");
 	    else if (isAllNt(rec->alleles[i], strlen(rec->alleles[i])))
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1);
 	    else // don't trim first character of symbolic allele
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]);
 	    }
 	}
     }
 return chromStartOrig;
 }
 
+static boolean allEndsGEStartsAndIdentical(char **starts, char **ends, int count)
+/* Given two arrays with <count> elements, return true if all strings in ends[] are
+ * greater than or equal to the corresponding strings in starts[], and all ends[]
+ * have the same char. */
+{
+int i;
+char refEnd = ends[0][0];
+for (i = 0;  i < count;  i++)
+    {
+    if (ends[i] < starts[i] || ends[i][0] != refEnd)
+	return FALSE;
+    }
+return TRUE;
+}
+
+static int countIdenticalBasesRight(char **alleles, int alCount)
+/* Return the number of bases that are identical at the end of each allele (usually 0). */
+{
+char *alleleEnds[alCount];
+int i;
+for (i = 0;  i < alCount;  i++)
+    {
+    int alLen = strlen(alleles[i]);
+    // If any allele is symbolic, don't try to trim.
+    if (!isAllNt(alleles[i], alLen))
+	return 0;
+    alleleEnds[i] = alleles[i] + alLen-1;
+    }
+int trimmedBases = 0;
+while (allEndsGEStartsAndIdentical(alleles, alleleEnds, alCount))
+    {
+    trimmedBases++;
+    // Trim identical last base of alleles and move alleleEnds[] items back.
+    for (i = 0;  i < alCount;  i++)
+	alleleEnds[i]--;
+    }
+return trimmedBases;
+}
+
+unsigned int vcfRecordTrimAllelesRight(struct vcfRecord *rec)
+/* Some tools output indels with extra base to the right, for example ref=ACC, alt=ACCC
+ * which should be ref=A, alt=AC.  When the extra bases make the variant extend from an
+ * intron (or gap) into an exon, it can cause a false appearance of a frameshift.
+ * To avoid this, when all alleles have identical base(s) at the end, trim all of them,
+ * and update rec->chromEnd.
+ * For hgTracks' mapBox we need the correct chromStart for identifying the record in hgc,
+ * so return the original chromEnd. */
+{
+unsigned int chromEndOrig = rec->chromEnd;
+int alCount = rec->alleleCount;
+char **alleles = rec->alleles;
+int trimmedBases = countIdenticalBasesRight(alleles, alCount);
+if (trimmedBases > 0)
+    {
+    struct vcfFile *vcff = rec->file;
+    // Allocate new pooled strings for the trimmed alleles.
+    int i;
+    for (i = 0;  i < alCount;  i++)
+	{
+	int alLen = strlen(alleles[i]); // alLen >= trimmedBases > 0
+	char trimmed[alLen+1];
+	safencpy(trimmed, sizeof(trimmed), alleles[i], alLen - trimmedBases);
+	if (isEmpty(trimmed))
+	    safencpy(trimmed, sizeof(trimmed), "-", 1);
+	alleles[i] = vcfFilePooledStr(vcff, trimmed);
+	}
+    rec->chromEnd -= trimmedBases;
+    }
+return chromEndOrig;
+}
+
 static boolean chromsMatch(char *chromA, char *chromB)
 // Return TRUE if chromA and chromB are non-NULL and identical, possibly ignoring
 // "chr" at the beginning of one but not the other.
 {
 if (chromA == NULL || chromB == NULL)
     return FALSE;
 char *chromAPlus = startsWith("chr", chromA) ? chromA+3 : chromA;
 char *chromBPlus = startsWith("chr", chromB) ? chromB+3 : chromB;
 return sameString(chromAPlus, chromBPlus);
 }
 
 static struct vcfRecord *vcfParseData(struct vcfFile *vcff, char *chrom, int start, int end,
 				      int maxRecords)
 // Given a vcfFile into which the header has been parsed, and whose
 // lineFile is positioned at the beginning of a data row, parse and
@@ -822,34 +893,33 @@
  * If parseAll, then read in all lines in region, parse and store in
  * vcff->records; if maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
  * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 struct lineFile *lf = NULL;
 if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) ||
     startsWith("https://", fileOrUrl))
     lf = netLineFileOpen(fileOrUrl);
 else
     lf = lineFileMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (vcff && chrom != NULL)
     {
     char *line = NULL;
-    int size = 0;
-    while (lineFileNext(vcff->lf, &line, &size))
+    while (lineFileNextReal(vcff->lf, &line))
 	{
-	char lineCopy[size+1];
+	char lineCopy[strlen(line)+1];
 	safecpy(lineCopy, sizeof(lineCopy), line);
 	char *words[VCF_MAX_COLUMNS];
 	int wordCount = chopTabs(lineCopy, words);
 	int expected = 8;
 	if (vcff->genotypeCount > 0)
 	    expected = 9 + vcff->genotypeCount;
 	lineFileExpectWords(vcff->lf, expected, wordCount);
 	struct vcfRecord *record = vcfRecordFromRow(vcff, words);
 	if (chromsMatch(chrom, record->chrom))
 	    {
 	    if (record->chromEnd < start)
 		continue;
 	    else
 		{
 		lineFileReuse(vcff->lf);
@@ -1160,64 +1230,52 @@
         "    string format;     \"If genotype columns are specified in header, a "
                                  "semicolon-separated list of of short keys starting with GT\""
         "    string genotypes;  \"If genotype columns are specified in header, a tab-separated "
                                  "set of genotype column values; each value is a colon-separated "
                                  "list of values corresponding to keys in the format column\""
         "    )";
 
 struct asObject *vcfAsObj()
 // Return asObject describing fields of VCF
 {
 return asParseText(vcfDataLineAutoSqlString);
 }
 
 char *vcfGetSlashSepAllelesFromWords(char **words, struct dyString *dy)
 /* Overwrite dy with a /-separated allele string from VCF words,
- * skipping the extra initial base that VCF requires for indel alleles if necessary.
+ * skipping the extra initial base that VCF requires for indel alleles if necessary,
+ * and trimming identical bases at the ends of all alleles if there are any.
  * Return dy->string for convenience. */
 {
-dyStringClear(dy);
 // VCF reference allele gets its own column:
 char *refAllele = words[3];
 char *altAlleles = words[4];
 // Make a vcfRecord-like allele array (ref in [0], alts after) so we can check for padding base:
 int alCount = 1 + countChars(altAlleles, ',') + 1;
 char *alleles[alCount];
 alleles[0] = refAllele;
-dyStringAppend(dy, altAlleles);
-chopByChar(dy->string, ',', &(alleles[1]), alCount-1);
-boolean hasPaddingBase = allelesHavePaddingBase(alleles, alCount);
-int offset = hasPaddingBase ? 1 : 0;
-// Build a /-separated allele string, trimming first base if offset is 1:
-dyStringClear(dy);
-if (refAllele[offset] == '\0')
-    dyStringAppendC(dy, '-');
-else
-    dyStringAppend(dy, refAllele+offset);
-if (isNotEmpty(altAlleles) && differentString(altAlleles, "."))
-    {
-    // Read alleles from altAlleles because the contents of alleles[] are trashed
-    // by our reuse of dy.
-    char *p;
-    while ((p = strchr(altAlleles, ',')) != NULL)
+char altAlCopy[strlen(altAlleles)+1];
+safecpy(altAlCopy, sizeof(altAlCopy), altAlleles);
+chopByChar(altAlCopy, ',', &(alleles[1]), alCount-1);
+int i;
+if (allelesHavePaddingBase(alleles, alCount))
     {
-	dyStringAppendC(dy, '/');
-	int len = p - altAlleles;
-	if (len-offset == 0)
-	    dyStringAppendC(dy, '-');
-	else if (isAllNt(altAlleles, len))
-	    dyStringAppendN(dy, altAlleles+offset, len-offset);
-	else
-	    dyStringAppendN(dy, altAlleles, len);  // symbolic
-	altAlleles = p+1;
+    // Skip padding base:
+    for (i = 0;  i < alCount;  i++)
+	alleles[i]++;
     }
+// Having dealt with left padding base, now look for identical bases on the right:
+int trimmedBases = countIdenticalBasesRight(alleles, alCount);
+// Build a /-separated allele string, trimming bases on the right if necessary:
+dyStringClear(dy);
+for (i = 0;  i < alCount;  i++)
+    {
+    if (i > 0)
 	dyStringAppendC(dy, '/');
-    int len = strlen(altAlleles);
-    if (len-offset == 0)
+    char *allele = alleles[i];
+    if (allele[trimmedBases] == '\0')
 	dyStringAppendC(dy, '-');
-    else if (isAllNt(altAlleles, len))
-	dyStringAppendN(dy, altAlleles+offset, len-offset);
     else
-	dyStringAppendN(dy, altAlleles, len);  // symbolic
+	dyStringAppendN(dy, allele, strlen(allele)-trimmedBases);
     }
 return dy->string;
 }