fd2a2b8b29bb837f6bab08cdac0dd05c3c7d9fdb kent Tue Aug 26 11:15:58 2014 -0700 Adding possibility to include precalculated distances to hacTreeMultiThread and improving commenting. diff --git src/lib/hacTree.c src/lib/hacTree.c index 82eddbd..9e197b8 100644 --- src/lib/hacTree.c +++ src/lib/hacTree.c @@ -326,41 +326,42 @@ *retNodeB = closestB; return closestDistance; } static void lmDlAddValHead(struct lm *lm, struct dlList *list, void *val) /* Allocate new dlNode out of lm, initialize it with val, and add it to end of list */ { struct dlNode *node; lmAllocVar(lm, node); node->val = val; dlAddHead(list, node); } struct hacTree *hacTreeVirtualPairing(struct slList *itemList, struct lm *localMem, hacDistanceFunction *distF, hacMergeFunction *mergeF, - void *extraData, hacPairingFunction *pairingF) + void *extraData, hacPairingFunction *pairingF, + struct hash *precalcDistanceHash) /* Construct hacTree using a method that will minimize the number of calls to * the distance and merge functions, assuming they are expensive. Do a lmCleanup(localMem) * to free the returned tree. */ { /* Make up a doubly-linked list in 'remaining' with all items in it */ struct dlList remaining; dlListInit(&remaining); struct slList *item; int count = 0; -struct hash *distanceHash = hashNew(0); +struct hash *distanceHash = (precalcDistanceHash ? precalcDistanceHash : hashNew(0)); for (item = itemList; item != NULL; item = item->next) { struct hacTree *ht; lmAllocVar(localMem, ht); ht->itemOrCluster = item; lmDlAddValHead(localMem, &remaining, ht); count += 1; } /* Loop through finding closest and merging until only one node left on remaining. */ int i; for (i=1; i<count; ++i) { /* Find closest pair and take them off of remaining list */ struct dlNode *aNode, *bNode; @@ -370,55 +371,64 @@ /* Allocated new hacTree item for them and fill it in with a merged value. */ struct hacTree *ht; lmAllocVar(localMem, ht); struct hacTree *left = ht->left = aNode->val; struct hacTree *right = ht->right = bNode->val; left->parent = right->parent = ht; ht->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData); ht->childDistance = distance; /* Put merged item onto remaining list. */ lmDlAddValHead(localMem, &remaining, ht); } /* Clean up and go home. */ +if (distanceHash != precalcDistanceHash) hashFree(&distanceHash); struct dlNode *lastNode = dlPopHead(&remaining); return lastNode->val; } struct hacTree *hacTreeForCostlyMerges(struct slList *itemList, struct lm *localMem, hacDistanceFunction *distF, hacMergeFunction *mergeF, void *extraData) /* Construct hacTree using a method that will minimize the number of calls to * the distance and merge functions, assuming they are expensive. Do a lmCleanup(localMem) * to free the returned tree. */ { -return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairSerially); +return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairSerially, NULL); } /** The code from here on down is an implementation that uses multiple threads **/ #define distKeySize 64 static void calcDistKey(void *a, void *b, char key[distKeySize]) /* Put key for distance in key */ { safef(key, distKeySize, "%p %p", a, b); } +void hacTreeDistanceHashAdd(struct hash *hash, void *itemA, void *itemB, double distance) +/* Add an item to distance hash */ +{ +char key[distKeySize]; +calcDistKey(itemA, itemB, key); +hashAdd(hash, key, CloneVar(&distance)); +} + struct hctPair /* A pair of hacTree nodes and the distance between them */ { struct hctPair *next; /* Next in list */ struct hacTree *a, *b; /* The pair of nodes */ double distance; /* Distance between pair */ }; struct distanceWorkerContext /* Context for a distance worker */ { hacDistanceFunction *distF; /* Function to call to calc distance */ void *extraData; /* Extra data for that function */ }; @@ -438,85 +448,96 @@ hacDistanceFunction *distF, void *extraData) /* Force computation of all distances not already in hash */ { /* Make up list of all missing distances in pairList */ struct synQueue *sq = synQueueNew(); struct hctPair *pairList = NULL, *pair; struct dlNode *aNode; for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next) { struct hacTree *aHt = aNode->val; struct dlNode *bNode; for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next) { struct hacTree *bHt = bNode->val; char key[64]; - calcDistKey(aHt, bHt, key); + calcDistKey(aHt->itemOrCluster, bHt->itemOrCluster, key); double *pd = hashFindVal(distanceHash, key); if (pd == NULL) { AllocVar(pair); pair->a = aHt; pair->b = bHt; slAddHead(&pairList, pair); synQueuePut(sq, pair); } } } /* Parallelize distance calculations */ struct distanceWorkerContext context = {.distF=distF, .extraData=extraData}; pthreadDoList(distThreadCount, pairList, distanceWorker, &context); for (pair = pairList; pair != NULL; pair = pair->next) { - char key[64]; - calcDistKey(pair->a, pair->b, key); - double *pd = needMem(sizeof(double)); - *pd = pair->distance; - hashAdd(distanceHash, key, pd); + hacTreeDistanceHashAdd(distanceHash, + pair->a->itemOrCluster, pair->b->itemOrCluster, pair->distance); } slFreeList(&pairList); } static double pairParallel(struct dlList *list, struct hash *distanceHash, hacDistanceFunction *distF, void *extraData, struct dlNode **retNodeA, struct dlNode **retNodeB) /* Loop through list returning closest two nodes */ { precacheMissingDistances(list, distanceHash, distF, extraData); struct dlNode *aNode; double closestDistance = BIGDOUBLE; struct dlNode *closestA = NULL, *closestB = NULL; for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next) { struct hacTree *aHt = aNode->val; struct dlNode *bNode; for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next) { struct hacTree *bHt = bNode->val; char key[64]; - safef(key, sizeof(key), "%p %p", aHt, bHt); + safef(key, sizeof(key), "%p %p", aHt->itemOrCluster, bHt->itemOrCluster); double *pd = hashMustFindVal(distanceHash, key); double d = *pd; if (d < closestDistance) { closestDistance = d; closestA = aNode; closestB = bNode; } } } *retNodeA = closestA; *retNodeB = closestB; return closestDistance; } struct hacTree *hacTreeMultiThread(int threadCount, struct slList *itemList, struct lm *localMem, hacDistanceFunction *distF, hacMergeFunction *mergeF, - void *extraData) + void *extraData, struct hash *precalcDistanceHash) /* Construct hacTree minimizing number of merges called, and doing distance calls - * in parallel when possible. Do a lmCleanup(localMem) to free returned tree. */ + * in parallel when possible. Do a lmCleanup(localMem) to free returned tree. + * The inputs are + * threadCount - number of threads - at least one, recommended no more than 15 + * itemList - list of items to tree up. Format can vary, but must start with a + * pointer to next item in list. + * localMem - memory pool where hacTree and a few other things are allocated from + * distF - function that calculates distance between two items, passed items and extraData + * mergeF - function that creates a new item in same format as itemList from two passed + * in items and the extraData. Typically does average of two input items + * extraData - parameter passed through to distF and mergeF, otherwise unused, may be NULL + * precalcDistanceHash - a hash containing at least some of the pairwise distances + * between items on itemList, set with hacTreeDistanceHashAdd. + * As a side effect this hash will be expanded to include all distances + * including those between intermediate nodes. */ { distThreadCount = threadCount; -return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairParallel); +return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairParallel, + precalcDistanceHash); }