fb0b9a237e0c6936a069af0afe7107db15f822d6
angie
  Wed May 4 15:08:28 2011 -0700
Feature #3711 (center-weighted alpha haplo sorting for vcfTabix):Performance improvement for hacTree: if caller passes in comparison
function, then pre-sort the items and pre-cluster adjacent identical
items before generating pairs.  When many of the inputs are identical,
this greatly reduces the number of pairs that the main clustering
algorithm starts with.  For example, a 1000 Genomes file has
genotypes for 1360 people (2720 haplotypes), and starting with
all pairs of 2720 haps was impossibly slow for hgTracks.  However,
in regions of a few tens of thousands of bases and a few tens of
variants, in practice there's usually less than 100 distinct
haplotypes, which makes it possible to cluster in tenths of seconds
instead of timing out.  The pre-clustering also makes nice balanced
trees; the main clustering step still seems prone to chaining to me,
so there's probably still more room for improvement there.

diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c
index c79ad82..9218d87 100644
--- src/hg/hgTracks/vcfTrack.c
+++ src/hg/hgTracks/vcfTrack.c
@@ -8,30 +8,31 @@
 #include "hdb.h"
 #include "hgTracks.h"
 #include "pgSnp.h"
 #include "trashDir.h"
 #include "vcf.h"
 #if (defined USE_TABIX && defined KNETFILE_HOOKS)
 #include "knetUdc.h"
 #include "udc.h"
 #endif//def USE_TABIX && KNETFILE_HOOKS
 
 #ifdef USE_TABIX
 
 //#*** TODO: use trackDb/cart setting or something
 static boolean boringBed = FALSE;
 static boolean doHapClusterDisplay = TRUE;
+static boolean colorHapByRefAlt = TRUE;
 
 static struct bed4 *vcfFileToBed4(struct vcfFile *vcff)
 /* Convert vcff's records to bed4; don't free vcff until you're done with bed4
  * because bed4 contains pointers into vcff's records' chrom and name. */
 {
 struct bed4 *bedList = NULL;
 struct vcfRecord *rec;
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
     {
     struct bed4 *bed;
     AllocVar(bed);
     bed->chrom = rec->chrom;
     bed->chromStart = rec->chromStart;
     bed->chromEnd = rec->chromEnd;
     bed->name = rec->name;
@@ -128,60 +129,30 @@
 	    {
 	    float qual = rec->infoElements[i].values[0].datFloat;
 	    dyStringPrintf(dy, "%.1f", qual);
 	    int j;
 	    for (j = 0;  j < altCount;  j++)
 		dyStringPrintf(dy, ",%.1f", qual);
 	    break;
 	    }
     pgs->alleleScores = cloneStringZ(dy->string, dy->stringSize+1);
     slAddHead(&pgsList, pgs);
     }
 slReverse(&pgsList);
 return pgsList;
 }
 
-INLINE char *hapIxToAllele(int hapIx, char *refAllele, char *altAlleles[])
-/* Look up allele by index into reference allele and alternate allele(s). */
-{
-return (hapIx == 0) ? refAllele : altAlleles[hapIx-1];
-}
-
-INLINE Color colorFromGt(struct vcfGenotype *gt, int ploidIx, char *refAllele,
-			 char *altAlleles[], int altCount, boolean grayUnphasedHet)
-/* Color allele by base. */
-{
-int hapIx = ploidIx ? gt->hapIxB : gt->hapIxA;
-char *allele = hapIxToAllele(hapIx, refAllele, altAlleles);
-if (gt->isHaploid && hapIx > 0)
-    return shadesOfGray[5];
-if (grayUnphasedHet && !gt->isPhased && gt->hapIxA != gt->hapIxB)
-    return shadesOfGray[5];
-// Copying pgSnp color scheme here, using first base of allele which is not ideal for multibase
-// but allows us to simplify it to 5 colors:
-else if (allele[0] == 'A')
-    return MG_RED;
-else if (allele[0] == 'C')
-    return MG_BLUE;
-else if (allele[0] == 'G')
-    return darkGreenColor;
-else if (allele[0] == 'T')
-    return MG_MAGENTA;
-else
-    return shadesOfGray[5];
-}
-
 
 // Center-weighted alpha clustering of haplotypes -- see Redmine #3711, #2823 note 7
 // It might be nice to use an allele-frequency representation here instead of [ACGTN] strings
 // with "N" for missing info or differences, but keep it simple.
 
 struct cwaExtraData
 /* Helper data for hacTree clustering of haplotypes by center-weighted alpha distance */
     {
     int center;    // index from which each point's contribution to distance is to be weighted
     int len;       // total length of haplotype strings
     double alpha;  // weighting factor for distance from center
     struct lm *localMem;
     };
 
 // This is the representation of a cluster of up to 65,535 haplotypes of equal length,
@@ -254,40 +225,92 @@
 const struct hapCluster *kid1 = (const struct hapCluster *)item1;
 const struct hapCluster *kid2 = (const struct hapCluster *)item2;
 struct cwaExtraData *helper = extraData;
 struct hapCluster *consensus = lmHapCluster(helper);
 consensus->leafCount = kid1->leafCount + kid2->leafCount;
 consensus->gtHapIx = kid1->gtHapIx;
 int i;
 for (i=0;  i < helper->len;  i++)
     {
     consensus->refCounts[i] = kid1->refCounts[i] + kid2->refCounts[i];
     consensus->unkCounts[i] = kid1->unkCounts[i] + kid2->unkCounts[i];
     }
 return (struct slList *)consensus;
 }
 
-static struct hacTree *clusterChroms(const struct vcfFile *vcff)
+INLINE void hapClusterToString(const struct hapCluster *c, struct dyString *dy, int len)
+/* Write a text representation of hapCluster's alleles into dy.  */
+{
+dyStringClear(dy);
+int i;
+for (i=0;  i < len;  i++)
+    dyStringAppendC(dy, (isRef(c, i) ? '0': '1'));
+}
+
+static int cwaCmp(const struct slList *item1, const struct slList *item2, void *extraData)
+/* Convert hapCluster to allele strings for easy comparison by strcmp. */
+{
+const struct hapCluster *c1 = (const struct hapCluster *)item1;
+const struct hapCluster *c2 = (const struct hapCluster *)item2;
+struct cwaExtraData *helper = extraData;
+static struct dyString *dy1 = NULL, *dy2 = NULL;
+if (dy1 == NULL)
+    {
+    dy1 = dyStringNew(0);
+    dy2 = dyStringNew(0);
+    }
+hapClusterToString(c1, dy1, helper->len);
+hapClusterToString(c2, dy2, helper->len);
+return strcmp(dy1->string, dy2->string);
+}
+
+void rSetGtHapOrder(struct hacTree *ht, unsigned short *gtHapOrder, unsigned short *retGtHapEnd)
+/* Traverse hacTree and build an ordered array of genotype + haplotype indices. */
+{
+if (ht->left == NULL && ht->right == NULL)
+    {
+    struct hapCluster *c = (struct hapCluster *)ht->itemOrCluster;
+    gtHapOrder[(*retGtHapEnd)++] = c->gtHapIx;
+    }
+else
+    {
+    struct hapCluster *cL = (struct hapCluster *)ht->left->itemOrCluster;
+    struct hapCluster *cR = (struct hapCluster *)ht->right->itemOrCluster;
+    if (cL->leafCount >= cR->leafCount)
+	{
+	rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd);
+	rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd);
+	}
+    else
+	{
+	rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd);
+	rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd);
+	}
+    }
+}
+
+static unsigned short *clusterChroms(const struct vcfFile *vcff, unsigned short *retGtHapEnd)
 /* Given a bunch of VCF records with phased genotypes, build up one haplotype string
  * per chromosome that is the sequence of alleles in all variants (simplified to one base
  * per variant).  Each individual/sample will have two haplotype strings (unless haploid
  * like Y or male X).  Independently cluster the haplotype strings using hacTree with the 
- * center-weighted alpha functions above.  */
+ * center-weighted alpha functions above. Return an array of genotype+haplotype indices
+ * in the order determined by the hacTree, and set retGtHapEnd to its length/end. */
 {
 int len = slCount(vcff->records);
-// TODO: make this settable by user right-click or some other means of choosing a specific
-// variant or position:
+// Use the median variant in the window as the center; would be even nicer to allow
+// the user to choose a variant (or position) to use as center:
 int center = len / 2;
 // Should alpha depend on len?  Should the penalty drop off with distance?  Seems like
 // straight-up exponential will cause the signal to drop to nothing pretty quickly...
 double alpha = 0.5;
 struct lm *lm = lmInit(0);
 struct cwaExtraData helper = { center, len, alpha, lm };
 int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc.
 int gtCount = vcff->genotypeCount;
 // Make an slList of hapClusters, but allocate in a big block so I can use
 // array indexing.
 struct hapCluster **hapArray = lmAlloc(lm, sizeof(struct hapCluster *) * gtCount * ploidy);
 int i;
 for (i=0;  i < ploidy * gtCount;  i++)
     {
     hapArray[i] = lmHapCluster(&helper);
@@ -333,121 +356,98 @@
 	    }
 	}
     if (haveHaploid)
 	{
 	// Some array items will have an empty cluster for missing hap2 --
 	// trim those from the linked list.
 	struct hapCluster *c = hapArray[0];
 	while (c != NULL && c->next != NULL)
 	    {
 	    if (c->next->leafCount == 0)
 		c->next = c->next->next;
 	    c = c->next;
 	    }
 	}
     }
-return hacTreeFromItems((struct slList *)(hapArray[0]), lm, cwaDistance, cwaMerge, &helper);
+struct hacTree *ht = hacTreeFromItems((struct slList *)(hapArray[0]), lm,
+				      cwaDistance, cwaMerge, cwaCmp, &helper);
+unsigned short *gtHapOrder = needMem(vcff->genotypeCount * 2 * sizeof(unsigned short));
+rSetGtHapOrder(ht, gtHapOrder, retGtHapEnd);
+return gtHapOrder;
 }
 
-INLINE int drawOneHap(struct vcfGenotype *gt, int hapIx,
-		      char *ref, char *altAlleles[], int altCount,
-		      struct hvGfx *hvg, int x1, int y, int w, int itemHeight, int lineHeight)
-/* Draw a base-colored box for genotype[hapIx].  Return the new y offset. */
+INLINE char *hapIxToAllele(int hapIx, char *refAllele, char *altAlleles[])
+/* Look up allele by index into reference allele and alternate allele(s). */
 {
-Color color = colorFromGt(gt, hapIx, ref, altAlleles, altCount, TRUE);
-hvGfxBox(hvg, x1, y, w, itemHeight+1, color);
-y += itemHeight+1;
-return y;
+return (hapIx == 0) ? refAllele : altAlleles[hapIx-1];
 }
 
-
-#ifdef DEBUG
-static void rPrintHapClusterTree(FILE *f, struct hacTree *tree, int len, int level)
-/* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */
-{
-static struct dyString *dy = NULL;
-if (dy == NULL)
-    dy = dyStringNew(0);
-dyStringClear(dy);
-struct hapCluster *c = (struct hapCluster *)(tree->itemOrCluster);
-int i;
-for (i=0;  i < len;  i++)
-    dyStringAppendC(dy, isRef(c, i) ? '0' : '1');
-for (i=0;  i < level;  i++)
-    fputc('0'+i, f);
-if (tree->left == NULL && tree->right == NULL)
+INLINE Color colorFromGt(struct vcfGenotype *gt, int ploidIx, char *refAllele,
+			 char *altAlleles[], int altCount, boolean grayUnphasedHet)
+/* Color allele by base. */
     {
-    fprintf(f, "{%s}", dy->string);
-    return;
-    }
-else if (tree->left == NULL || tree->right == NULL)
-    errAbort("\nHow did we get a node with one NULL kid??");
-fprintf(f, "(%s:%d:\n", dy->string, (int)(tree->childDistance));
-rPrintHapClusterTree(f, tree->left, len, level+1);
-fputs(",\n", f);
-rPrintHapClusterTree(f, tree->right, len, level+1);
-fputc('\n', f);
-for (i=0;  i < level;  i++)
-    fputc('0'+i, f);
-fputs(")", f);
+int hapIx = ploidIx ? gt->hapIxB : gt->hapIxA;
+char *allele = hapIxToAllele(hapIx, refAllele, altAlleles);
+if (gt->isHaploid && hapIx > 0)
+    return shadesOfGray[5];
+if (grayUnphasedHet && !gt->isPhased && gt->hapIxA != gt->hapIxB)
+    return shadesOfGray[5];
+// Copying pgSnp color scheme here, using first base of allele which is not ideal for multibase
+// but allows us to simplify it to 5 colors:
+else if (allele[0] == 'A')
+    return MG_RED;
+else if (allele[0] == 'C')
+    return MG_BLUE;
+else if (allele[0] == 'G')
+    return darkGreenColor;
+else if (allele[0] == 'T')
+    return MG_MAGENTA;
+else
+    return shadesOfGray[5];
 }
 
-void printHapClusterTree(FILE *f, struct hacTree *tree, int len)
-/* Print out cluster as nested-parens with {}'s around leaf nodes. */
+INLINE Color colorFromRefAlt(struct vcfGenotype *gt, int hapIx, boolean grayUnphasedHet)
+/* Color allele red for alternate allele, blue for reference allele. */
 {
-if (tree == NULL)
-    {
-    fputs("Empty tree.\n", f);
-    return;
-    }
-rPrintHapClusterTree(f, tree, len, 0);
-fputc('\n', f);
+if (grayUnphasedHet && !gt->isPhased && gt->hapIxA != gt->hapIxB)
+    return shadesOfGray[5];
+int alIx = hapIx ? gt->hapIxB : gt->hapIxA;
+return alIx ? MG_RED : MG_BLUE;
 }
-#endif//def DEBUG
 
-void rSetGtHapOrder(struct hacTree *ht, unsigned short *gtHapOrder, unsigned short *retGtHapEnd)
-/* Traverse hacTree and build an ordered array of genotype + haplotype indices. */
-{
-if (ht->left == NULL && ht->right == NULL)
-    {
-    struct hapCluster *c = (struct hapCluster *)ht->itemOrCluster;
-    gtHapOrder[(*retGtHapEnd)++] = c->gtHapIx;
-    }
-else
+
+INLINE int drawOneHap(struct vcfGenotype *gt, int hapIx,
+		      char *ref, char *altAlleles[], int altCount,
+		      struct hvGfx *hvg, int x1, int y, int w, int itemHeight, int lineHeight)
+/* Draw a base-colored box for genotype[hapIx].  Return the new y offset. */
     {
-    rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd);
-    rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd);
-    }
+Color color = colorHapByRefAlt ? colorFromRefAlt(gt, hapIx, TRUE) :
+				 colorFromGt(gt, hapIx, ref, altAlleles, altCount, TRUE);
+hvGfxBox(hvg, x1, y, w, itemHeight+1, color);
+y += itemHeight+1;
+return y;
 }
 
 static void vcfHapClusterDraw(struct track *tg, int seqStart, int seqEnd,
 			      struct hvGfx *hvg, int xOff, int yOff, int width,
 			      MgFont *font, Color color, enum trackVisibility vis)
 /* Split samples' chromosomes (haplotypes), cluster them by center-weighted
  * alpha similarity, and draw in the order determined by clustering. */
 {
 const struct vcfFile *vcff = tg->extraUiData;
-struct hacTree *ht = clusterChroms(vcff);
-#ifdef DEBUG
-puts("<pre>");
-printHapClusterTree(stdout, ht, slCount(vcff->records));
-puts("</pre>");
-#endif//def DEBUG
-// the following 4 lines should probably be moved into clusterChroms.
-unsigned short *gtHapOrder = needMem(vcff->genotypeCount * 2 * sizeof(unsigned short));
 unsigned short gtHapEnd = 0;
-rSetGtHapOrder(ht, gtHapOrder, &gtHapEnd);
+unsigned short *gtHapOrder = clusterChroms(vcff, &gtHapEnd);
 struct dyString *tmp = dyStringNew(0);
 struct vcfRecord *rec;
 const int lineHeight = tg->lineHeight;
 const int itemHeight = tg->heightPer;
 const double scale = scaleForPixels(width);
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
     {
     dyStringClear(tmp);
     dyStringAppend(tmp, rec->alt);
     char *altAlleles[256];
     int altCount = chopCommas(tmp->string, altAlleles);
     int x1 = round((double)(rec->chromStart-winStart)*scale) + xOff;
     int x2 = round((double)(rec->chromEnd-winStart)*scale) + xOff;
     int w = x2-x1;
     if (w < 1)
@@ -520,31 +520,31 @@
     vcff = vcfTabixFileMayOpen(fileOrUrl, chromName, winStart, winEnd, vcfMaxErr);
     }
 errCatchEnd(errCatch);
 if (errCatch->gotError)
     {
     if (isNotEmpty(errCatch->message->string))
 	tg->networkErrMsg = cloneString(errCatch->message->string);
     tg->drawItems = bigDrawWarning;
     tg->totalHeight = bigWarnTotalHeight;
     }
 errCatchFree(&errCatch);
 if (vcff != NULL)
     {
     if (boringBed)
 	tg->items = vcfFileToBed4(vcff);
-    else if (doHapClusterDisplay && vcff->genotypeCount > 0 && vcff->genotypeCount < 100 &&
+    else if (doHapClusterDisplay && vcff->genotypeCount > 0 && vcff->genotypeCount < 3000 &&
 	     (tg->visibility == tvPack || tg->visibility == tvSquish))
 	vcfHapClusterOverloadMethods(tg, vcff);
     else
 	{
 	tg->items = vcfFileToPgSnp(vcff);
 	/* base coloring/display decision on count of items */
 	tg->customInt = slCount(tg->items);
 	}
     // Don't vcfFileFree here -- we are using its string pointers!
     }
 }
 
 void vcfTabixMethods(struct track *track)
 /* Methods for VCF + tabix files. */
 {