2b5eb866f050d964d8964ec5a84f7b63889cc6b1
angie
  Mon Jun 3 14:39:36 2013 -0700
New CGI, hgVai (Variant Annotation Integrator): simple checklist-style UI by which user can select variants that they have uploaded; gene
predictions to identify which part of a gene, if any, is hit by each
variant; several additional sources of annotations/predictions e.g.
dbNSFP scores and conserved elements/scores; and several filters to
constrain output to the variants most likely to have a functional effect.
Along with the new CGI, there are various lib bugfixes and improvements,
a new hg/lib/tests/ testcase, and some test file changes to accommodate
data updates to both knownGene and the pg* tables in knownGene.
refs #6152

diff --git src/lib/vcf.c src/lib/vcf.c
index 5f55c5d..3f3754a 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -1070,37 +1070,67 @@
         "    string info;       \"Additional information encoded as a semicolon-separated series "
                                  "of short keys with optional comma-separated values\""
         "    string format;     \"If genotype columns are specified in header, a "
                                  "semicolon-separated list of of short keys starting with GT\""
         "    string genotypes;  \"If genotype columns are specified in header, a tab-separated "
                                  "set of genotype column values; each value is a colon-separated "
                                  "list of values corresponding to keys in the format column\""
         "    )";
 
 struct asObject *vcfAsObj()
 // Return asObject describing fields of VCF
 {
 return asParseText(vcfDataLineAutoSqlString);
 }
 
-char *vcfGetSlashSepAllelesFromWords(char **words, struct dyString *dy)
-/* Overwrite dy with a /-separated allele string from VCF words;
- * return dy->string for convenience. */
+char *vcfGetSlashSepAllelesFromWords(char **words, struct dyString *dy,
+				     boolean *retSkippedFirstBase)
+/* Overwrite dy with a /-separated allele string from VCF words,
+ * skipping the extra initial base that VCF requires for indel alleles if necessary.
+ * Return dy->string for convenience. */
 {
 dyStringClear(dy);
 // VCF reference allele gets its own column:
-dyStringAppend(dy, words[3]);
+char *refAllele = words[3];
+char *altAlleles = words[4];
+// First determine whether there is an extra initial base that we need to skip:
+boolean allStartSame = TRUE;
+char *p;
+while ((p = strchr(altAlleles, ',')) != NULL)
+    {
+    if (altAlleles[0] != refAllele[0])
+	allStartSame = FALSE;
+    altAlleles = p+1;
+    }
+if (altAlleles[0] != refAllele[0])
+    allStartSame = FALSE;
+int offset = allStartSame ? 1 : 0;
+if (refAllele[offset] == '\0')
+    dyStringAppendC(dy, '-');
+else
+    dyStringAppend(dy, refAllele+offset);
 // VCF alternate alleles are comma-separated, make them /-separated:
-if (isNotEmpty(words[4]))
+altAlleles = words[4];
+if (isNotEmpty(altAlleles) && differentString(altAlleles, "."))
     {
-    char *altAlleles = words[4], *p;
+    // Now construct the string:
     while ((p = strchr(altAlleles, ',')) != NULL)
 	{
 	dyStringAppendC(dy, '/');
-	dyStringAppendN(dy, altAlleles, p-altAlleles);
+	int len = p - altAlleles - offset;
+	if (len == 0)
+	    dyStringAppendC(dy, '-');
+	else
+	    dyStringAppendN(dy, altAlleles+offset, len);
 	altAlleles = p+1;
 	}
     dyStringAppendC(dy, '/');
-    dyStringAppend(dy, altAlleles);
+    int len = strlen(altAlleles) - offset;
+    if (len == 0)
+	dyStringAppendC(dy, '-');
+    else
+	dyStringAppendN(dy, altAlleles+offset, len);
     }
+if (retSkippedFirstBase)
+    *retSkippedFirstBase = offset;
 return dy->string;
 }