519e0946826199d1d9792fa8df5972843fce021c
angie
  Tue Aug 9 14:39:38 2011 -0700
Feature #2821 (VCF parser): improved representation of alleles:parse ref and comma-sep'd alt allele string into count and array
inside record, so callers don't all have to parse the comma-sep'd
alternate allele string.

diff --git src/hg/lib/pgSnp.c src/hg/lib/pgSnp.c
index 40b44e6..bdb716e 100644
--- src/hg/lib/pgSnp.c
+++ src/hg/lib/pgSnp.c
@@ -678,108 +678,121 @@
 item->alleleFreq = cloneString(row[5]);
 char pattern[128];
 safef(pattern, sizeof(pattern), "^[0-9]+(,[0-9]+){%d}$", item->alleleCount - 1);
 if (! regexMatchNoCase(row[5], pattern))
     lineFileAbort(lf, "invalid allele frequency, %s with count of %d", row[5], item->alleleCount);
 /* scores, comma separated list of numbers with above # of items */
 item->alleleScores = cloneString(row[6]);
 safef(pattern, sizeof(pattern), "^[0-9.]+(,[0-9.]+){%d}$", item->alleleCount - 1);
 if (! regexMatchNoCase(row[6], pattern))
     lineFileAbort(lf, "invalid allele scores, %s with count of %d", row[6], item->alleleCount);
 return item;
 }
 
 #define VCF_MAX_ALLELE_LEN 80
 
-struct pgSnp *pgSnpFromVcfRecord(struct vcfRecord *rec)
-/* Convert VCF rec to pgSnp; don't free rec->file (vcfFile) until
- * you're done with pgSnp because pgSnp points to rec->chrom. */
+static char *alleleCountsFromVcfRecord(struct vcfRecord *rec, int alDescCount)
+/* Build up comma-sep list of per-allele counts, if available, up to alDescCount
+ * which may be less than rec->alleleCount: */
 {
 static struct dyString *dy = NULL;
 if (dy == NULL)
     dy = dyStringNew(0);
 else
     dyStringClear(dy);
-struct pgSnp *pgs;
-AllocVar(pgs);
-pgs->chrom = rec->chrom;
-pgs->chromStart = rec->chromStart;
-pgs->chromEnd = rec->chromEnd;
-// Build up slash-separated allele string from rec->ref + rec->alt:
-dyStringAppend(dy, rec->ref);
-int altCount, i;
-if (sameString(rec->alt, "."))
-    altCount = 0;
-else
-    {
-    char *words[VCF_MAX_ALLELE_LEN/2];
-    char copy[VCF_MAX_ALLELE_LEN+1];
-    strncpy(copy, rec->alt, VCF_MAX_ALLELE_LEN);
-    copy[VCF_MAX_ALLELE_LEN] = '\0';
-    altCount = chopCommas(copy, words);
-    for (i = 0;  i < altCount && dy->stringSize < VCF_MAX_ALLELE_LEN;  i++)
-	dyStringPrintf(dy, "/%s", words[i]);
-    if (i < altCount)
-	altCount = i;
-    }
-pgs->name = cloneStringZ(dy->string, dy->stringSize+1);
-pgs->alleleCount = altCount + 1;
-// Build up comma-sep list of per-allele counts, if available:
 dyStringClear(dy);
-int refAlleleCount = 0;
-boolean gotAltCounts = FALSE;
+int alCounts[VCF_MAX_ALLELE_LEN];
+boolean gotTotalCount = FALSE, gotAltCounts = FALSE;
+int i;
 for (i = 0;  i < rec->infoCount;  i++)
     if (sameString(rec->infoElements[i].key, "AN"))
 	{
-	refAlleleCount = rec->infoElements[i].values[0].datInt;
+	gotTotalCount = TRUE;
+	// Set ref allele to total count, subtract alt counts below.
+	alCounts[0] = rec->infoElements[i].values[0].datInt;
 	break;
 	}
 for (i = 0;  i < rec->infoCount;  i++)
     if (sameString(rec->infoElements[i].key, "AC"))
 	{
-	int alCounts[64];
+	if (rec->infoElements[i].count > 0)
+	    {
+	    gotAltCounts = TRUE;
 	int j;
-	gotAltCounts = (rec->infoElements[i].count > 0);
-	for (j = 0;  j < rec->infoElements[i].count;  j++)
+	    for (j = 0;  j < rec->infoElements[i].count && j < alDescCount-1;  j++)
 	    {
 	    int ac = rec->infoElements[i].values[j].datInt;
-	    if (j < altCount)
 		alCounts[1+j] = ac;
-	    refAlleleCount -= ac;
+		if (gotTotalCount)
+		    alCounts[0] -= ac;
 	    }
-	if (gotAltCounts)
-	    {
-	    while (j++ < altCount)
+	    while (j++ < alDescCount-1)
 		alCounts[1+j] = -1;
-	    alCounts[0] = refAlleleCount;
-	    if (refAlleleCount >= 0)
-		dyStringPrintf(dy, "%d", refAlleleCount);
+	    if (gotTotalCount)
+		dyStringPrintf(dy, "%d", alCounts[0]);
 	    else
 		dyStringAppend(dy, "-1");
-	    for (j = 0;  j < altCount;  j++)
-		if (alCounts[1+j] >= 0)
-		    dyStringPrintf(dy, ",%d", alCounts[1+j]);
+	    for (j = 1;  j < alDescCount;  j++)
+		if (alCounts[j] >= 0)
+		    dyStringPrintf(dy, ",%d", alCounts[j]);
 		else
 		    dyStringAppend(dy, ",-1");
 	    }
 	break;
 	}
-if (refAlleleCount > 0 && !gotAltCounts)
-    dyStringPrintf(dy, "%d", refAlleleCount);
-pgs->alleleFreq = cloneStringZ(dy->string, dy->stringSize+1);
+if (gotTotalCount && !gotAltCounts)
+    dyStringPrintf(dy, "%d", alCounts[0]);
+return cloneStringZ(dy->string, dy->stringSize+1);
+}
+
+struct pgSnp *pgSnpFromVcfRecord(struct vcfRecord *rec)
+/* Convert VCF rec to pgSnp; don't free rec->file (vcfFile) until
+ * you're done with pgSnp because pgSnp points to rec->chrom. */
+{
+static struct dyString *dy = NULL;
+if (dy == NULL)
+    dy = dyStringNew(0);
+else
+    dyStringClear(dy);
+struct pgSnp *pgs;
+AllocVar(pgs);
+pgs->chrom = rec->chrom;
+pgs->chromStart = rec->chromStart;
+pgs->chromEnd = rec->chromEnd;
+// Build up slash-separated allele string from rec->alleles, starting with ref allele:
+dyStringAppend(dy, rec->alleles[0]);
+int alCount = rec->alleleCount, i;
+if (rec->alleleCount == 2 && sameString(rec->alleles[1], "."))
+    // ignore N/A alternate allele
+    alCount = 1;
+else if (rec->alleleCount >= 2)
+    {
+    // append /-sep'd alternate alleles, unless/until it gets too long:
+    for (i = 1;  i < rec->alleleCount;  i++)
+	{
+	if ((dy->stringSize + 1 + strlen(rec->alleles[i])) > VCF_MAX_ALLELE_LEN)
+	    break;
+	dyStringPrintf(dy, "/%s", rec->alleles[i]);
+	}
+    if (i < rec->alleleCount)
+	alCount = i;
+    }
+pgs->name = cloneStringZ(dy->string, dy->stringSize+1);
+pgs->alleleCount = alCount;
+pgs->alleleFreq = alleleCountsFromVcfRecord(rec, alCount);
 // Build up comma-sep list... supposed to be per-allele quality scores but I think
 // the VCF spec only gives us one BQ... for the reference position?  should ask.
 dyStringClear(dy);
 for (i = 0;  i < rec->infoCount;  i++)
     if (sameString(rec->infoElements[i].key, "BQ"))
 	{
 	float qual = rec->infoElements[i].values[0].datFloat;
 	dyStringPrintf(dy, "%.1f", qual);
 	int j;
-	for (j = 0;  j < altCount;  j++)
+	for (j = 1;  j < rec->alleleCount;  j++)
 	    dyStringPrintf(dy, ",%.1f", qual);
 	break;
 	}
 pgs->alleleScores = cloneStringZ(dy->string, dy->stringSize+1);
 return pgs;
 }