43b503cbb9e91b597a2ba97b8807096114e68935
kent
  Thu Aug 28 13:06:17 2014 -0700
Removing hacTreeExpensiveMerges and simplifying hacTreeMultiThread
diff --git src/lib/hacTree.c src/lib/hacTree.c
index 53e3b89..dda1b97 100644
--- src/lib/hacTree.c
+++ src/lib/hacTree.c
@@ -282,277 +282,223 @@
 	}
     // root now has a stable address, unlike nodes still in the pool, so set parents here:
     if (root->left != NULL)
 	root->left->parent = root;
     if (root->right != NULL)
 	root->right->parent = root;
     }
 // This shouldn't be necessary as long as initNode leaves parent pointers alone,
 // but just in case that changes:
 root->parent = NULL;
 return root;
 }
 
 /** The code from here on down is an alternative implementation that calls the merge
  ** function and for that matter the distance function much less than the function
- ** above.  */
+ ** above, and also is multithreaded.  It does seem to produce the same output
+ ** but the algorigthm is sufficiently different, at least for now, I'm keeping
+ ** both. */
 
-typedef double hacPairingFunction(struct dlList *list, struct hash *distanceHash, 
-    hacDistanceFunction *distF, void *distData, struct dlNode **retNodeA, struct dlNode **retNodeB);
-
-static double pairSerially(struct dlList *list, struct hash *distanceHash, 
-    hacDistanceFunction *distF, void *extraData, struct dlNode **retNodeA, struct dlNode **retNodeB)
-/* Loop through list returning closest two nodes */
-{
-struct dlNode *aNode;
-double closestDistance = BIGDOUBLE;
-struct dlNode *closestA = NULL, *closestB = NULL;
-for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next)
-    {
-    struct hacTree *aHt = aNode->val;
-    struct slList *a = aHt->itemOrCluster;
-    struct dlNode *bNode;
-    for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next)
-        {
-	struct hacTree *bHt = bNode->val;
-	char key[64];
-	safef(key, sizeof(key), "%p %p", aHt, bHt);
-	double *pd = hashFindVal(distanceHash, key);
-	if (pd == NULL)
-	     {
-	     lmAllocVar(distanceHash->lm, pd);
-	     *pd = distF(a, bHt->itemOrCluster, extraData);
-	     hashAdd(distanceHash, key, pd);
-	     }
-	double d = *pd;
-	if (d < closestDistance)
-	    {
-	    closestDistance = d;
-	    closestA = aNode;
-	    closestB = bNode;
-	    }
-	}
-    }
-*retNodeA = closestA;
-*retNodeB = closestB;
-return closestDistance;
-}
-
-static void lmDlAddValTail(struct lm *lm, struct dlList *list, void *val)
-/* Allocate new dlNode out of lm, initialize it with val, and add it to end of list */
-{
-struct dlNode *node;
-lmAllocVar(lm, node);
-node->val = val;
-dlAddTail(list, node);
-}
-
-struct hacTree *hacTreeVirtualPairing(struct slList *itemList, struct lm *localMem,
-				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
-				 void *extraData, hacPairingFunction *pairingF,
-				 struct hash *precalcDistanceHash)
-/* Construct hacTree using a method that will minimize the number of calls to
- * the distance and merge functions, assuming they are expensive.  Do a lmCleanup(localMem)
- * to free the returned tree. */
-{
-/* Make up a doubly-linked list in 'remaining' with all items in it */
-struct dlList remaining;
-dlListInit(&remaining);
-struct slList *item;
-int count = 0;
-struct hash *distanceHash = (precalcDistanceHash ? precalcDistanceHash : hashNew(0));
-for (item = itemList; item != NULL; item = item->next)
-    {
-    struct hacTree *ht;
-    lmAllocVar(localMem, ht);
-    ht->itemOrCluster = item;
-    lmDlAddValTail(localMem, &remaining, ht);
-    count += 1;
-    }
-
-/* Loop through finding closest and merging until only one node left on remaining. */
-int i;
-for (i=1; i<count; ++i)
-    {
-    /* Find closest pair */
-    struct dlNode *aNode, *bNode;
-    double distance = pairingF(&remaining, distanceHash, distF, extraData, &aNode, &bNode);
-
-    /* Allocated new hacTree item for them and fill it in with a merged value. */
-    struct hacTree *ht;
-    lmAllocVar(localMem, ht);
-    struct hacTree *left = ht->left = aNode->val;
-    struct hacTree *right = ht->right = bNode->val;
-    left->parent = right->parent = ht;
-    ht->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData);
-    ht->childDistance = distance;
-
-    /* Put merged item onto remaining list before first matching node in pair. */
-    struct dlNode *mergedNode;
-    lmAllocVar(localMem, mergedNode);
-    mergedNode->val = ht;
-    dlAddBefore(aNode, mergedNode);
-
-    /* Remove nodes we merged out */
-    dlRemove(aNode);
-    dlRemove(bNode);
-    }
-
-/* Clean up and go home. */
-if (distanceHash != precalcDistanceHash)
-    hashFree(&distanceHash);
-struct dlNode *lastNode = dlPopHead(&remaining);
-return lastNode->val;
-}
-
-struct hacTree *hacTreeForCostlyMerges(struct slList *itemList, struct lm *localMem,
-				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
-				 void *extraData)
-/* Construct hacTree using a method that will minimize the number of calls to
- * the distance and merge functions, assuming they are expensive.  Do a lmCleanup(localMem)
- * to free the returned tree. */
-{
-return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairSerially, NULL);
-}
-
-/** The code from here on down is an implementation that uses multiple threads **/
 
 #define distKeySize 64  
 
 static void calcDistKey(void *a, void *b, char key[distKeySize])
 /* Put key for distance in key */
 {
 safef(key, distKeySize, "%p %p", a, b);
 }
 
 void hacTreeDistanceHashAdd(struct hash *hash, void *itemA, void *itemB, double distance)
 /* Add an item to distance hash */
 {
 char key[distKeySize];
 calcDistKey(itemA, itemB, key);
 hashAdd(hash, key, CloneVar(&distance));
 }
 
+double *hacTreeDistanceHashLookup(struct hash *hash, void *itemA, void *itemB)
+/* Look up pair in distance hash.  Returns NULL if not found, otherwise pointer to
+ * distance */
+{
+char key[distKeySize];
+calcDistKey(itemA, itemB, key);
+return hashFindVal(hash, key);
+}
+
 struct hctPair
 /* A pair of hacTree nodes and the distance between them */
     {
     struct hctPair *next;  /* Next in list */
     struct hacTree *a, *b;	   /* The pair of nodes */
     double distance;	   /* Distance between pair */
     };
 
 struct distanceWorkerContext
 /* Context for a distance worker */
     {
     hacDistanceFunction *distF;	/* Function to call to calc distance */
     void *extraData;		/* Extra data for that function */
     };
 
 static void distanceWorker(void *item, void *context)
 /* fill in distance for pair */
 {
 struct hctPair *pair = item;
 struct distanceWorkerContext *dc = context;
 struct hacTree *aHt = pair->a, *bHt = pair->b;
 pair->distance = (dc->distF)((struct slList *)(aHt->itemOrCluster), 
     (struct slList *)(bHt->itemOrCluster), dc->extraData);
 }
 
-static int distThreadCount = 1;  // Number of threads to use for distance calcs.
-
 static void precacheMissingDistances(struct dlList *list, struct hash *distanceHash,
-    hacDistanceFunction *distF, void *extraData)
+    hacDistanceFunction *distF, void *extraData, int threadCount)
 /* Force computation of all distances not already in hash */
 {
 /* Make up list of all missing distances in pairList */
 struct synQueue *sq = synQueueNew();
 struct hctPair *pairList = NULL, *pair;
 struct dlNode *aNode;
 for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next)
     {
     struct hacTree *aHt = aNode->val;
     struct dlNode *bNode;
     for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next)
         {
 	struct hacTree *bHt = bNode->val;
 	char key[64];
 	calcDistKey(aHt->itemOrCluster, bHt->itemOrCluster, key);
 	double *pd = hashFindVal(distanceHash, key);
 	if (pd == NULL)
 	     {
 	     AllocVar(pair);
 	     pair->a = aHt;
 	     pair->b = bHt;
 	     slAddHead(&pairList, pair);
 	     synQueuePut(sq, pair);
 	     }
 	}
     }
 
 /* Parallelize distance calculations */
 struct distanceWorkerContext context = {.distF=distF, .extraData=extraData};
-pthreadDoList(distThreadCount, pairList, distanceWorker, &context);
+pthreadDoList(threadCount, pairList, distanceWorker, &context);
 
 for (pair = pairList; pair != NULL; pair = pair->next)
     {
     hacTreeDistanceHashAdd(distanceHash, 
 	pair->a->itemOrCluster, pair->b->itemOrCluster, pair->distance);
     }
 slFreeList(&pairList);
 }
 
 static double pairParallel(struct dlList *list, struct hash *distanceHash, 
-    hacDistanceFunction *distF, void *extraData, struct dlNode **retNodeA, struct dlNode **retNodeB)
+    hacDistanceFunction *distF, void *extraData, int threadCount,
+    struct dlNode **retNodeA, struct dlNode **retNodeB)
 /* Loop through list returning closest two nodes */
 {
-precacheMissingDistances(list, distanceHash, distF, extraData);
+precacheMissingDistances(list, distanceHash, distF, extraData, threadCount);
 struct dlNode *aNode;
 double closestDistance = BIGDOUBLE;
 struct dlNode *closestA = NULL, *closestB = NULL;
 for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next)
     {
     struct hacTree *aHt = aNode->val;
     struct dlNode *bNode;
     for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next)
         {
 	struct hacTree *bHt = bNode->val;
 	char key[64];
 	safef(key, sizeof(key), "%p %p", aHt->itemOrCluster, bHt->itemOrCluster);
 	double *pd = hashMustFindVal(distanceHash, key);
 	double d = *pd;
 	if (d < closestDistance)
 	    {
 	    closestDistance = d;
 	    closestA = aNode;
 	    closestB = bNode;
 	    }
 	}
     }
 *retNodeA = closestA;
 *retNodeB = closestB;
 return closestDistance;
 }
 
-struct hacTree *hacTreeMultiThread(int threadCount, struct slList *itemList, struct lm *localMem,
-				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
+static void lmDlAddValTail(struct lm *lm, struct dlList *list, void *val)
+/* Allocate new dlNode out of lm, initialize it with val, and add it to end of list */
+{
+struct dlNode *node;
+lmAllocVar(lm, node);
+node->val = val;
+dlAddTail(list, node);
+}
+
+struct hacTree *hacTreeMultiThread(int threadCount, struct slList *itemList, 
+		    struct lm *localMem, hacDistanceFunction *distF, hacMergeFunction *mergeF,
 		    void *extraData, struct hash *precalcDistanceHash)
 /* Construct hacTree minimizing number of merges called, and doing distance calls
  * in parallel when possible.   Do a lmCleanup(localMem) to free returned tree. 
  * The inputs are
  *	threadCount - number of threads - at least one, recommended no more than 15
  *	itemList - list of items to tree up.  Format can vary, but must start with a
  *	           pointer to next item in list.
  *	localMem - memory pool where hacTree and a few other things are allocated from
  *	distF - function that calculates distance between two items, passed items and extraData
  *	mergeF - function that creates a new item in same format as itemList from two passed
  *	         in items and the extraData.  Typically does average of two input items
  *	extraData - parameter passed through to distF and mergeF, otherwise unused, may be NULL
  *	precalcDistanceHash - a hash containing at least some of the pairwise distances
  *	            between items on itemList, set with hacTreeDistanceHashAdd. 
  *	            As a side effect this hash will be expanded to include all distances 
  *	            including those between intermediate nodes. */
 {
-distThreadCount = threadCount;
-return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairParallel,
-    precalcDistanceHash);
+/* Make up a doubly-linked list in 'remaining' with all items in it */
+struct dlList remaining;
+dlListInit(&remaining);
+struct slList *item;
+int count = 0;
+struct hash *distanceHash = (precalcDistanceHash ? precalcDistanceHash : hashNew(0));
+for (item = itemList; item != NULL; item = item->next)
+    {
+    struct hacTree *ht;
+    lmAllocVar(localMem, ht);
+    ht->itemOrCluster = item;
+    lmDlAddValTail(localMem, &remaining, ht);
+    count += 1;
+    }
+
+/* Loop through finding closest and merging until only one node left on remaining. */
+int i;
+for (i=1; i<count; ++i)
+    {
+    /* Find closest pair */
+    struct dlNode *aNode, *bNode;
+    double distance = pairParallel(&remaining, distanceHash, distF, extraData, threadCount,
+	&aNode, &bNode);
+
+    /* Allocated new hacTree item for them and fill it in with a merged value. */
+    struct hacTree *ht;
+    lmAllocVar(localMem, ht);
+    struct hacTree *left = ht->left = aNode->val;
+    struct hacTree *right = ht->right = bNode->val;
+    left->parent = right->parent = ht;
+    ht->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData);
+    ht->childDistance = distance;
+
+    /* Put merged item onto remaining list before first matching node in pair. */
+    struct dlNode *mergedNode;
+    lmAllocVar(localMem, mergedNode);
+    mergedNode->val = ht;
+    dlAddBefore(aNode, mergedNode);
+
+    /* Remove nodes we merged out */
+    dlRemove(aNode);
+    dlRemove(bNode);
+    }
+
+/* Clean up and go home. */
+if (distanceHash != precalcDistanceHash)
+    hashFree(&distanceHash);
+struct dlNode *lastNode = dlPopHead(&remaining);
+return lastNode->val;
 }