fd2a2b8b29bb837f6bab08cdac0dd05c3c7d9fdb
kent
  Tue Aug 26 11:15:58 2014 -0700
Adding possibility to include precalculated distances to hacTreeMultiThread and improving commenting.
diff --git src/lib/hacTree.c src/lib/hacTree.c
index 82eddbd..9e197b8 100644
--- src/lib/hacTree.c
+++ src/lib/hacTree.c
@@ -326,41 +326,42 @@
 *retNodeB = closestB;
 return closestDistance;
 }
 
 static void lmDlAddValHead(struct lm *lm, struct dlList *list, void *val)
 /* Allocate new dlNode out of lm, initialize it with val, and add it to end of list */
 {
 struct dlNode *node;
 lmAllocVar(lm, node);
 node->val = val;
 dlAddHead(list, node);
 }
 
 struct hacTree *hacTreeVirtualPairing(struct slList *itemList, struct lm *localMem,
 				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
-				 void *extraData, hacPairingFunction *pairingF)
+				 void *extraData, hacPairingFunction *pairingF,
+				 struct hash *precalcDistanceHash)
 /* Construct hacTree using a method that will minimize the number of calls to
  * the distance and merge functions, assuming they are expensive.  Do a lmCleanup(localMem)
  * to free the returned tree. */
 {
 /* Make up a doubly-linked list in 'remaining' with all items in it */
 struct dlList remaining;
 dlListInit(&remaining);
 struct slList *item;
 int count = 0;
-struct hash *distanceHash = hashNew(0);
+struct hash *distanceHash = (precalcDistanceHash ? precalcDistanceHash : hashNew(0));
 for (item = itemList; item != NULL; item = item->next)
     {
     struct hacTree *ht;
     lmAllocVar(localMem, ht);
     ht->itemOrCluster = item;
     lmDlAddValHead(localMem, &remaining, ht);
     count += 1;
     }
 
 /* Loop through finding closest and merging until only one node left on remaining. */
 int i;
 for (i=1; i<count; ++i)
     {
     /* Find closest pair and take them off of remaining list */
     struct dlNode *aNode, *bNode;
@@ -370,55 +371,64 @@
 
     /* Allocated new hacTree item for them and fill it in with a merged value. */
     struct hacTree *ht;
     lmAllocVar(localMem, ht);
     struct hacTree *left = ht->left = aNode->val;
     struct hacTree *right = ht->right = bNode->val;
     left->parent = right->parent = ht;
     ht->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData);
     ht->childDistance = distance;
 
     /* Put merged item onto remaining list. */
     lmDlAddValHead(localMem, &remaining, ht);
     }
 
 /* Clean up and go home. */
+if (distanceHash != precalcDistanceHash)
     hashFree(&distanceHash);
 struct dlNode *lastNode = dlPopHead(&remaining);
 return lastNode->val;
 }
 
 struct hacTree *hacTreeForCostlyMerges(struct slList *itemList, struct lm *localMem,
 				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
 				 void *extraData)
 /* Construct hacTree using a method that will minimize the number of calls to
  * the distance and merge functions, assuming they are expensive.  Do a lmCleanup(localMem)
  * to free the returned tree. */
 {
-return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairSerially);
+return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairSerially, NULL);
 }
 
 /** The code from here on down is an implementation that uses multiple threads **/
 
 #define distKeySize 64  
 
 static void calcDistKey(void *a, void *b, char key[distKeySize])
 /* Put key for distance in key */
 {
 safef(key, distKeySize, "%p %p", a, b);
 }
 
+void hacTreeDistanceHashAdd(struct hash *hash, void *itemA, void *itemB, double distance)
+/* Add an item to distance hash */
+{
+char key[distKeySize];
+calcDistKey(itemA, itemB, key);
+hashAdd(hash, key, CloneVar(&distance));
+}
+
 struct hctPair
 /* A pair of hacTree nodes and the distance between them */
     {
     struct hctPair *next;  /* Next in list */
     struct hacTree *a, *b;	   /* The pair of nodes */
     double distance;	   /* Distance between pair */
     };
 
 struct distanceWorkerContext
 /* Context for a distance worker */
     {
     hacDistanceFunction *distF;	/* Function to call to calc distance */
     void *extraData;		/* Extra data for that function */
     };
 
@@ -438,85 +448,96 @@
     hacDistanceFunction *distF, void *extraData)
 /* Force computation of all distances not already in hash */
 {
 /* Make up list of all missing distances in pairList */
 struct synQueue *sq = synQueueNew();
 struct hctPair *pairList = NULL, *pair;
 struct dlNode *aNode;
 for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next)
     {
     struct hacTree *aHt = aNode->val;
     struct dlNode *bNode;
     for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next)
         {
 	struct hacTree *bHt = bNode->val;
 	char key[64];
-	calcDistKey(aHt, bHt, key);
+	calcDistKey(aHt->itemOrCluster, bHt->itemOrCluster, key);
 	double *pd = hashFindVal(distanceHash, key);
 	if (pd == NULL)
 	     {
 	     AllocVar(pair);
 	     pair->a = aHt;
 	     pair->b = bHt;
 	     slAddHead(&pairList, pair);
 	     synQueuePut(sq, pair);
 	     }
 	}
     }
 
 /* Parallelize distance calculations */
 struct distanceWorkerContext context = {.distF=distF, .extraData=extraData};
 pthreadDoList(distThreadCount, pairList, distanceWorker, &context);
 
 for (pair = pairList; pair != NULL; pair = pair->next)
     {
-    char key[64];
-    calcDistKey(pair->a, pair->b, key);
-    double *pd = needMem(sizeof(double));
-    *pd = pair->distance;
-    hashAdd(distanceHash, key, pd);
+    hacTreeDistanceHashAdd(distanceHash, 
+	pair->a->itemOrCluster, pair->b->itemOrCluster, pair->distance);
     }
 slFreeList(&pairList);
 }
 
 static double pairParallel(struct dlList *list, struct hash *distanceHash, 
     hacDistanceFunction *distF, void *extraData, struct dlNode **retNodeA, struct dlNode **retNodeB)
 /* Loop through list returning closest two nodes */
 {
 precacheMissingDistances(list, distanceHash, distF, extraData);
 struct dlNode *aNode;
 double closestDistance = BIGDOUBLE;
 struct dlNode *closestA = NULL, *closestB = NULL;
 for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next)
     {
     struct hacTree *aHt = aNode->val;
     struct dlNode *bNode;
     for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next)
         {
 	struct hacTree *bHt = bNode->val;
 	char key[64];
-	safef(key, sizeof(key), "%p %p", aHt, bHt);
+	safef(key, sizeof(key), "%p %p", aHt->itemOrCluster, bHt->itemOrCluster);
 	double *pd = hashMustFindVal(distanceHash, key);
 	double d = *pd;
 	if (d < closestDistance)
 	    {
 	    closestDistance = d;
 	    closestA = aNode;
 	    closestB = bNode;
 	    }
 	}
     }
 *retNodeA = closestA;
 *retNodeB = closestB;
 return closestDistance;
 }
 
 struct hacTree *hacTreeMultiThread(int threadCount, struct slList *itemList, struct lm *localMem,
 				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
-				 void *extraData)
+				 void *extraData, struct hash *precalcDistanceHash)
 /* Construct hacTree minimizing number of merges called, and doing distance calls
- * in parallel when possible.   Do a lmCleanup(localMem) to free returned tree. */
+ * in parallel when possible.   Do a lmCleanup(localMem) to free returned tree. 
+ * The inputs are
+ *	threadCount - number of threads - at least one, recommended no more than 15
+ *	itemList - list of items to tree up.  Format can vary, but must start with a
+ *	           pointer to next item in list.
+ *	localMem - memory pool where hacTree and a few other things are allocated from
+ *	distF - function that calculates distance between two items, passed items and extraData
+ *	mergeF - function that creates a new item in same format as itemList from two passed
+ *	         in items and the extraData.  Typically does average of two input items
+ *	extraData - parameter passed through to distF and mergeF, otherwise unused, may be NULL
+ *	precalcDistanceHash - a hash containing at least some of the pairwise distances
+ *	            between items on itemList, set with hacTreeDistanceHashAdd. 
+ *	            As a side effect this hash will be expanded to include all distances 
+ *	            including those between intermediate nodes. */
 {
 distThreadCount = threadCount;
-return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairParallel);
+return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairParallel,
+    precalcDistanceHash);
 }