dacecbaf00b2193872633dde175c53d263922ff9
angie
  Wed May 11 16:18:38 2011 -0700
Feature #2823 (VCF track handler): request from alpha-test user CarlosBorroto: call Belinda's coding variant functional prediction function
as on the pgSnp details page.

diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c
index 02c2a01..2c6994d 100644
--- src/hg/hgTracks/vcfTrack.c
+++ src/hg/hgTracks/vcfTrack.c
@@ -9,123 +9,39 @@
 #include "hgTracks.h"
 #include "pgSnp.h"
 #include "trashDir.h"
 #include "vcf.h"
 #if (defined USE_TABIX && defined KNETFILE_HOOKS)
 #include "knetUdc.h"
 #include "udc.h"
 #endif//def USE_TABIX && KNETFILE_HOOKS
 
 #ifdef USE_TABIX
 
 //#*** TODO: use trackDb/cart setting or something
 static boolean doHapClusterDisplay = TRUE;
 static boolean colorHapByRefAlt = TRUE;
 
-#define VCF_MAX_ALLELE_LEN 80
-
 static struct pgSnp *vcfFileToPgSnp(struct vcfFile *vcff)
 /* Convert vcff's records to pgSnp; don't free vcff until you're done with pgSnp
  * because it contains pointers into vcff's records' chrom. */
 {
 struct pgSnp *pgsList = NULL;
 struct vcfRecord *rec;
-struct dyString *dy = dyStringNew(0);
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
     {
-    struct pgSnp *pgs;
-    AllocVar(pgs);
-    pgs->chrom = rec->chrom;
-    pgs->chromStart = rec->chromStart;
-    pgs->chromEnd = rec->chromEnd;
-    // Build up slash-separated allele string from rec->ref + rec->alt:
-    dyStringClear(dy);
-    dyStringAppend(dy, rec->ref);
-    int altCount, i;
-    if (sameString(rec->alt, "."))
-	altCount = 0;
-    else
-	{
-	char *words[64]; // we're going to truncate anyway if there are this many alleles!
-	char copy[VCF_MAX_ALLELE_LEN+1];
-	strncpy(copy, rec->alt, VCF_MAX_ALLELE_LEN);
-	copy[VCF_MAX_ALLELE_LEN] = '\0';
-	altCount = chopCommas(copy, words);
-	for (i = 0;  i < altCount && dy->stringSize < VCF_MAX_ALLELE_LEN;  i++)
-	    dyStringPrintf(dy, "/%s", words[i]);
-	if (i < altCount)
-	    altCount = i;
-	}
-    pgs->name = cloneStringZ(dy->string, dy->stringSize+1);
-    pgs->alleleCount = altCount + 1;
-    // Build up comma-sep list of per-allele counts, if available:
-    dyStringClear(dy);
-    int refAlleleCount = 0;
-    boolean gotAltCounts = FALSE;
-    for (i = 0;  i < rec->infoCount;  i++)
-	if (sameString(rec->infoElements[i].key, "AN"))
-	    {
-	    refAlleleCount = rec->infoElements[i].values[0].datInt;
-	    break;
-	    }
-    for (i = 0;  i < rec->infoCount;  i++)
-	if (sameString(rec->infoElements[i].key, "AC"))
-	    {
-	    int alCounts[64];
-	    int j;
-	    gotAltCounts = (rec->infoElements[i].count > 0);
-	    for (j = 0;  j < rec->infoElements[i].count;  j++)
-		{
-		int ac = rec->infoElements[i].values[j].datInt;
-		if (j < altCount)
-		    alCounts[1+j] = ac;
-		refAlleleCount -= ac;
-		}
-	    if (gotAltCounts)
-		{
-		while (j++ < altCount)
-		    alCounts[1+j] = -1;
-		alCounts[0] = refAlleleCount;
-		if (refAlleleCount >= 0)
-		    dyStringPrintf(dy, "%d", refAlleleCount);
-		else
-		    dyStringAppend(dy, "-1");
-		for (j = 0;  j < altCount;  j++)
-		    if (alCounts[1+j] >= 0)
-			dyStringPrintf(dy, ",%d", alCounts[1+j]);
-		    else
-			dyStringAppend(dy, ",-1");
-		}
-	    break;
-	    }
-    if (refAlleleCount > 0 && !gotAltCounts)
-	dyStringPrintf(dy, "%d", refAlleleCount);
-    pgs->alleleFreq = cloneStringZ(dy->string, dy->stringSize+1);
-    // Build up comma-sep list... supposed to be per-allele quality scores but I think
-    // the VCF spec only gives us one BQ... for the reference position?  should ask.
-    dyStringClear(dy);
-    for (i = 0;  i < rec->infoCount;  i++)
-	if (sameString(rec->infoElements[i].key, "BQ"))
-	    {
-	    float qual = rec->infoElements[i].values[0].datFloat;
-	    dyStringPrintf(dy, "%.1f", qual);
-	    int j;
-	    for (j = 0;  j < altCount;  j++)
-		dyStringPrintf(dy, ",%.1f", qual);
-	    break;
-	    }
-    pgs->alleleScores = cloneStringZ(dy->string, dy->stringSize+1);
+    struct pgSnp *pgs = pgSnpFromVcfRecord(rec);
     slAddHead(&pgsList, pgs);
     }
 slReverse(&pgsList);
 return pgsList;
 }
 
 
 // Center-weighted alpha clustering of haplotypes -- see Redmine #3711, #2823 note 7
 // It might be nice to use an allele-frequency representation here instead of [ACGTN] strings
 // with "N" for missing info or differences, but keep it simple.
 
 struct cwaExtraData
 /* Helper data for hacTree clustering of haplotypes by center-weighted alpha distance */
     {
     int center;    // index from which each point's contribution to distance is to be weighted
@@ -144,47 +60,55 @@
     struct hapCluster *next;   // hacTree wants slList of items
     unsigned short *refCounts; // per-variant count of reference alleles observed
     unsigned short *unkCounts; // per-variant count of unknown (or unphased het) alleles
     unsigned short leafCount;  // number of leaves under this node (or 1 if leaf)
     unsigned short gtHapIx;    // if leaf, (genotype index << 1) + hapIx (0 or 1 for diploid)
 };
 
 INLINE boolean isRef(const struct hapCluster *c, int varIx)
 // Return TRUE if the leaves of cluster have at least as many reference alleles
 // as alternate alleles for variant varIx.
 {
 unsigned short altCount = c->leafCount - c->refCounts[varIx] - c->unkCounts[varIx];
 return (c->refCounts[varIx] >= altCount);
 }
 
+INLINE boolean hasUnk(const struct hapCluster *c, int varIx)
+// Return TRUE if at least one haplotype in this cluster has an unknown/unphased value at varIx.
+{
+return (c->unkCounts[varIx] > 0);
+}
+
 static double cwaDistance(const struct slList *item1, const struct slList *item2, void *extraData)
 /* Center-weighted alpha sequence distance function for hacTree clustering of haplotype seqs */
 // This is inner-loop so I am not doing defensive checks.  Caller must ensure:
 // 1. kids's sequences' lengths are both equal to helper->len
 // 2. 0 <= helper->center <= len
 // 3. 0.0 < helper->alpha <= 1.0
 {
 const struct hapCluster *kid1 = (const struct hapCluster *)item1;
 const struct hapCluster *kid2 = (const struct hapCluster *)item2;
 struct cwaExtraData *helper = extraData;
 double distance = 0;
 double weight = 1; // start at center: alpha to the 0th power
 int i;
 for (i=helper->center;  i >= 0;  i--)
     {
     if (isRef(kid1, i) != isRef(kid2, i))
 	distance += weight;
+    else if (hasUnk(kid1, i) != hasUnk(kid2, i))
+	distance += weight/2;
     weight *= helper->alpha;
     }
 weight = helper->alpha; // start at center+1: alpha to the 1st power
 for (i=helper->center+1;  i < helper->len;  i++)
     {
     if (isRef(kid1, i) != isRef(kid2, i))
 	distance += weight;
     weight *= helper->alpha;
     }
 return distance;
 }
 
 static struct hapCluster *lmHapCluster(struct cwaExtraData *helper)
 /* Use localMem to allocate a new cluster of the given len. */
 {