43b503cbb9e91b597a2ba97b8807096114e68935 kent Thu Aug 28 13:06:17 2014 -0700 Removing hacTreeExpensiveMerges and simplifying hacTreeMultiThread diff --git src/lib/hacTree.c src/lib/hacTree.c index 53e3b89..dda1b97 100644 --- src/lib/hacTree.c +++ src/lib/hacTree.c @@ -282,277 +282,223 @@ } // root now has a stable address, unlike nodes still in the pool, so set parents here: if (root->left != NULL) root->left->parent = root; if (root->right != NULL) root->right->parent = root; } // This shouldn't be necessary as long as initNode leaves parent pointers alone, // but just in case that changes: root->parent = NULL; return root; } /** The code from here on down is an alternative implementation that calls the merge ** function and for that matter the distance function much less than the function - ** above. */ + ** above, and also is multithreaded. It does seem to produce the same output + ** but the algorigthm is sufficiently different, at least for now, I'm keeping + ** both. */ -typedef double hacPairingFunction(struct dlList *list, struct hash *distanceHash, - hacDistanceFunction *distF, void *distData, struct dlNode **retNodeA, struct dlNode **retNodeB); - -static double pairSerially(struct dlList *list, struct hash *distanceHash, - hacDistanceFunction *distF, void *extraData, struct dlNode **retNodeA, struct dlNode **retNodeB) -/* Loop through list returning closest two nodes */ -{ -struct dlNode *aNode; -double closestDistance = BIGDOUBLE; -struct dlNode *closestA = NULL, *closestB = NULL; -for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next) - { - struct hacTree *aHt = aNode->val; - struct slList *a = aHt->itemOrCluster; - struct dlNode *bNode; - for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next) - { - struct hacTree *bHt = bNode->val; - char key[64]; - safef(key, sizeof(key), "%p %p", aHt, bHt); - double *pd = hashFindVal(distanceHash, key); - if (pd == NULL) - { - lmAllocVar(distanceHash->lm, pd); - *pd = distF(a, bHt->itemOrCluster, extraData); - hashAdd(distanceHash, key, pd); - } - double d = *pd; - if (d < closestDistance) - { - closestDistance = d; - closestA = aNode; - closestB = bNode; - } - } - } -*retNodeA = closestA; -*retNodeB = closestB; -return closestDistance; -} - -static void lmDlAddValTail(struct lm *lm, struct dlList *list, void *val) -/* Allocate new dlNode out of lm, initialize it with val, and add it to end of list */ -{ -struct dlNode *node; -lmAllocVar(lm, node); -node->val = val; -dlAddTail(list, node); -} - -struct hacTree *hacTreeVirtualPairing(struct slList *itemList, struct lm *localMem, - hacDistanceFunction *distF, hacMergeFunction *mergeF, - void *extraData, hacPairingFunction *pairingF, - struct hash *precalcDistanceHash) -/* Construct hacTree using a method that will minimize the number of calls to - * the distance and merge functions, assuming they are expensive. Do a lmCleanup(localMem) - * to free the returned tree. */ -{ -/* Make up a doubly-linked list in 'remaining' with all items in it */ -struct dlList remaining; -dlListInit(&remaining); -struct slList *item; -int count = 0; -struct hash *distanceHash = (precalcDistanceHash ? precalcDistanceHash : hashNew(0)); -for (item = itemList; item != NULL; item = item->next) - { - struct hacTree *ht; - lmAllocVar(localMem, ht); - ht->itemOrCluster = item; - lmDlAddValTail(localMem, &remaining, ht); - count += 1; - } - -/* Loop through finding closest and merging until only one node left on remaining. */ -int i; -for (i=1; ileft = aNode->val; - struct hacTree *right = ht->right = bNode->val; - left->parent = right->parent = ht; - ht->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData); - ht->childDistance = distance; - - /* Put merged item onto remaining list before first matching node in pair. */ - struct dlNode *mergedNode; - lmAllocVar(localMem, mergedNode); - mergedNode->val = ht; - dlAddBefore(aNode, mergedNode); - - /* Remove nodes we merged out */ - dlRemove(aNode); - dlRemove(bNode); - } - -/* Clean up and go home. */ -if (distanceHash != precalcDistanceHash) - hashFree(&distanceHash); -struct dlNode *lastNode = dlPopHead(&remaining); -return lastNode->val; -} - -struct hacTree *hacTreeForCostlyMerges(struct slList *itemList, struct lm *localMem, - hacDistanceFunction *distF, hacMergeFunction *mergeF, - void *extraData) -/* Construct hacTree using a method that will minimize the number of calls to - * the distance and merge functions, assuming they are expensive. Do a lmCleanup(localMem) - * to free the returned tree. */ -{ -return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairSerially, NULL); -} - -/** The code from here on down is an implementation that uses multiple threads **/ #define distKeySize 64 static void calcDistKey(void *a, void *b, char key[distKeySize]) /* Put key for distance in key */ { safef(key, distKeySize, "%p %p", a, b); } void hacTreeDistanceHashAdd(struct hash *hash, void *itemA, void *itemB, double distance) /* Add an item to distance hash */ { char key[distKeySize]; calcDistKey(itemA, itemB, key); hashAdd(hash, key, CloneVar(&distance)); } +double *hacTreeDistanceHashLookup(struct hash *hash, void *itemA, void *itemB) +/* Look up pair in distance hash. Returns NULL if not found, otherwise pointer to + * distance */ +{ +char key[distKeySize]; +calcDistKey(itemA, itemB, key); +return hashFindVal(hash, key); +} + struct hctPair /* A pair of hacTree nodes and the distance between them */ { struct hctPair *next; /* Next in list */ struct hacTree *a, *b; /* The pair of nodes */ double distance; /* Distance between pair */ }; struct distanceWorkerContext /* Context for a distance worker */ { hacDistanceFunction *distF; /* Function to call to calc distance */ void *extraData; /* Extra data for that function */ }; static void distanceWorker(void *item, void *context) /* fill in distance for pair */ { struct hctPair *pair = item; struct distanceWorkerContext *dc = context; struct hacTree *aHt = pair->a, *bHt = pair->b; pair->distance = (dc->distF)((struct slList *)(aHt->itemOrCluster), (struct slList *)(bHt->itemOrCluster), dc->extraData); } -static int distThreadCount = 1; // Number of threads to use for distance calcs. - static void precacheMissingDistances(struct dlList *list, struct hash *distanceHash, - hacDistanceFunction *distF, void *extraData) + hacDistanceFunction *distF, void *extraData, int threadCount) /* Force computation of all distances not already in hash */ { /* Make up list of all missing distances in pairList */ struct synQueue *sq = synQueueNew(); struct hctPair *pairList = NULL, *pair; struct dlNode *aNode; for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next) { struct hacTree *aHt = aNode->val; struct dlNode *bNode; for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next) { struct hacTree *bHt = bNode->val; char key[64]; calcDistKey(aHt->itemOrCluster, bHt->itemOrCluster, key); double *pd = hashFindVal(distanceHash, key); if (pd == NULL) { AllocVar(pair); pair->a = aHt; pair->b = bHt; slAddHead(&pairList, pair); synQueuePut(sq, pair); } } } /* Parallelize distance calculations */ struct distanceWorkerContext context = {.distF=distF, .extraData=extraData}; -pthreadDoList(distThreadCount, pairList, distanceWorker, &context); +pthreadDoList(threadCount, pairList, distanceWorker, &context); for (pair = pairList; pair != NULL; pair = pair->next) { hacTreeDistanceHashAdd(distanceHash, pair->a->itemOrCluster, pair->b->itemOrCluster, pair->distance); } slFreeList(&pairList); } static double pairParallel(struct dlList *list, struct hash *distanceHash, - hacDistanceFunction *distF, void *extraData, struct dlNode **retNodeA, struct dlNode **retNodeB) + hacDistanceFunction *distF, void *extraData, int threadCount, + struct dlNode **retNodeA, struct dlNode **retNodeB) /* Loop through list returning closest two nodes */ { -precacheMissingDistances(list, distanceHash, distF, extraData); +precacheMissingDistances(list, distanceHash, distF, extraData, threadCount); struct dlNode *aNode; double closestDistance = BIGDOUBLE; struct dlNode *closestA = NULL, *closestB = NULL; for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next) { struct hacTree *aHt = aNode->val; struct dlNode *bNode; for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next) { struct hacTree *bHt = bNode->val; char key[64]; safef(key, sizeof(key), "%p %p", aHt->itemOrCluster, bHt->itemOrCluster); double *pd = hashMustFindVal(distanceHash, key); double d = *pd; if (d < closestDistance) { closestDistance = d; closestA = aNode; closestB = bNode; } } } *retNodeA = closestA; *retNodeB = closestB; return closestDistance; } -struct hacTree *hacTreeMultiThread(int threadCount, struct slList *itemList, struct lm *localMem, - hacDistanceFunction *distF, hacMergeFunction *mergeF, +static void lmDlAddValTail(struct lm *lm, struct dlList *list, void *val) +/* Allocate new dlNode out of lm, initialize it with val, and add it to end of list */ +{ +struct dlNode *node; +lmAllocVar(lm, node); +node->val = val; +dlAddTail(list, node); +} + +struct hacTree *hacTreeMultiThread(int threadCount, struct slList *itemList, + struct lm *localMem, hacDistanceFunction *distF, hacMergeFunction *mergeF, void *extraData, struct hash *precalcDistanceHash) /* Construct hacTree minimizing number of merges called, and doing distance calls * in parallel when possible. Do a lmCleanup(localMem) to free returned tree. * The inputs are * threadCount - number of threads - at least one, recommended no more than 15 * itemList - list of items to tree up. Format can vary, but must start with a * pointer to next item in list. * localMem - memory pool where hacTree and a few other things are allocated from * distF - function that calculates distance between two items, passed items and extraData * mergeF - function that creates a new item in same format as itemList from two passed * in items and the extraData. Typically does average of two input items * extraData - parameter passed through to distF and mergeF, otherwise unused, may be NULL * precalcDistanceHash - a hash containing at least some of the pairwise distances * between items on itemList, set with hacTreeDistanceHashAdd. * As a side effect this hash will be expanded to include all distances * including those between intermediate nodes. */ { -distThreadCount = threadCount; -return hacTreeVirtualPairing(itemList, localMem, distF, mergeF, extraData, pairParallel, - precalcDistanceHash); +/* Make up a doubly-linked list in 'remaining' with all items in it */ +struct dlList remaining; +dlListInit(&remaining); +struct slList *item; +int count = 0; +struct hash *distanceHash = (precalcDistanceHash ? precalcDistanceHash : hashNew(0)); +for (item = itemList; item != NULL; item = item->next) + { + struct hacTree *ht; + lmAllocVar(localMem, ht); + ht->itemOrCluster = item; + lmDlAddValTail(localMem, &remaining, ht); + count += 1; + } + +/* Loop through finding closest and merging until only one node left on remaining. */ +int i; +for (i=1; ileft = aNode->val; + struct hacTree *right = ht->right = bNode->val; + left->parent = right->parent = ht; + ht->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData); + ht->childDistance = distance; + + /* Put merged item onto remaining list before first matching node in pair. */ + struct dlNode *mergedNode; + lmAllocVar(localMem, mergedNode); + mergedNode->val = ht; + dlAddBefore(aNode, mergedNode); + + /* Remove nodes we merged out */ + dlRemove(aNode); + dlRemove(bNode); + } + +/* Clean up and go home. */ +if (distanceHash != precalcDistanceHash) + hashFree(&distanceHash); +struct dlNode *lastNode = dlPopHead(&remaining); +return lastNode->val; }