752bae9723eeeabb02d950157ab0f9512fd689b3
tdreszer
  Mon Jan 28 15:05:59 2013 -0800
Checking in 'elmTree' which uses neighbor joining to build a tree in local memory.  Also checking in 2 sets of changes to vcf.  First, added support for a 'reuse' local memory pool which aid traversing giant files.  Second, added routines for bitmap based analysis of variants.
diff --git src/lib/elmTree.c src/lib/elmTree.c
new file mode 100644
index 0000000..c136416
--- /dev/null
+++ src/lib/elmTree.c
@@ -0,0 +1,275 @@
+// elmTree.c/.h - Extensible local memory tree grown from an slList of objects, via
+// NEIGHBOR JOINING and a supplied compare routine.  Unlike hacTree, this is not a binary tree
+// and any node can have original content, a single parent and N children.  Superficially similar
+// to running slSort(), running elmTreeGrow() creates a tree from a single "root".  All branches
+// and leaves are represented as "tree nodes" which contain pointers to a user defined object,
+// a single parent, a first child (if the node is not a leaf) and a next sibling.  All nodes are
+// additionally joined together as an slList with the single "root" as the first member.  Thus
+// the tree can be traversed linerarly through node->next or hierarchically through recursive
+// node->firstChild,child->sibling access.
+// The tree is grown through neighbor joins and involves figuring out whether each successive 
+// content should be attached as a branch node or leaf node to the existing tree.  To facilitate
+// neighbor joining, each node has cumulative content from the single root node (whcih has NULL
+// content).  An example might be a tree of bitmaps where each node contains all the bits of its
+// parent plus additional bits.
+// Since an elmTree is build with local memory, free the tree by lmCleanup().
+
+#include "common.h"
+#include "elmTree.h"
+
+struct elmNode *treeNewNode(struct lm *lm,struct elmNode **tree,struct slList *content)
+// Returns a new node to attach to a tree.
+{
+assert((*tree != NULL && content != NULL) || (*tree == NULL && content == NULL));
+
+struct elmNode *node;
+lmAllocVar(lm,node);
+if (tree != NULL) // Keep all nodes in a simple slList
+    slAddHead(tree,node);
+node->selfAndDescendants = 1;
+node->content = content;
+return node;
+}
+// NOTE the root of the tree always has NULL content.
+#define treePlant(lm,tree) treeNewNode(lm,tree,NULL)
+
+inline void treeSproutLeaf(struct elmNode *parent,struct elmNode *leaf)
+// Adds a leaf to a node.  Note that the leaf may itself have children.
+{
+leaf->parent = parent;
+leaf->sibling = parent->firstChild;
+parent->firstChild = leaf;
+parent->selfAndDescendants += leaf->selfAndDescendants;
+}
+
+struct elmNode *treeClipLeaf(struct elmNode *leaf)
+// Clips leaf and return parent
+{
+assert(leaf->parent != NULL);
+struct elmNode *parent = leaf->parent;
+leaf->parent = NULL;
+
+// leaf may not be first child, so find leaf in children 
+struct elmNode *child = parent->firstChild; 
+struct elmNode *prevChild = NULL;
+struct elmNode *nextChild = NULL;
+for (; child != NULL; prevChild = child, child = nextChild)
+    {
+    nextChild = child->sibling;
+    if (child == leaf)
+        {
+        child->sibling = NULL;
+        if (prevChild != NULL)
+            prevChild->sibling = nextChild;
+        else
+            parent->firstChild = nextChild;
+        break;
+        }
+    }
+parent->selfAndDescendants -= leaf->selfAndDescendants;
+return parent;
+}
+
+
+inline void treeInsertBranchBefore(struct elmNode *branch,struct elmNode *node)
+{
+struct elmNode *parent = treeClipLeaf(node);
+treeSproutLeaf(parent,branch);
+treeSproutLeaf(branch,node);
+}
+
+enum elmNodeOverlap rTreeBranchFindMaxWeight(struct elmNode *branch, struct slList *element, 
+                                                    elmNodeCompareFunction *neighborCmp, 
+                                                    int *pWeight, void *extra)
+// Using the compare routine, the max weight is found an element and a candidate branch
+{
+int bestWeight = 0;
+int topNodeResult = neighborCmp(branch->content,element,&bestWeight,extra);
+if (topNodeResult != enoEqual && topNodeResult != enoSuperset // could do better
+&&  bestWeight > 0 && bestWeight >= *pWeight)           // reason to look for more
+    {
+    struct elmNode *child = branch->firstChild;
+    for ( ;child != NULL; child = child->sibling)
+        {
+        int weight = bestWeight;  // will not recurse if new weight is less than best
+        int result = rTreeBranchFindMaxWeight(child,element,neighborCmp,&weight,extra);
+        if (bestWeight < weight)
+            bestWeight = weight;
+        if (result == enoEqual
+        ||  result == enoSuperset)
+            break;  // don't return result of compare to child, but do select this best weight
+        }
+    }
+*pWeight = bestWeight;
+return topNodeResult;
+}
+
+static enum elmNodeOverlap treeChooseBranch(struct elmNode **pBranch, struct slList *element,
+                                            elmNodeCompareFunction *neighborCmp, void *extra)
+// Using the compare routine, a branch is chosen that most closely matches the element being tested
+// Simple case: one of two branches has best weight so follow that branch.  More subtle case:
+// both branches have equal weight.  In this case using recursive routine will look deeper into 
+// each branch to find the best weight before choosing a branch.
+{
+struct elmNode *branch = *pBranch;
+assert(branch->content != NULL);
+
+struct elmNode *bestBranch = branch;
+int bestWeight = 0;
+// NOTE: using recursive routine will look deeper into a branch to find the best weight
+//int bestResult = neighborCmp(bestBranch->content,element,&bestWeight,extra);
+int bestResult = rTreeBranchFindMaxWeight(bestBranch,element,neighborCmp,&bestWeight,extra);
+
+// Is there need to compare further?
+if (bestResult != enoEqual && bestResult != enoSuperset)  // Could do better
+    {
+    for (branch = branch->sibling;branch != NULL; branch = branch->sibling)
+        {
+        int weight = bestWeight;  // will not recurse if new weight is less than best
+        //int result = neighborCmp(branch->content,element,&weight,extra);
+        int result = rTreeBranchFindMaxWeight(branch,element,neighborCmp,&weight,extra);
+
+        if (result == enoEqual || result == enoSuperset  // best that we can do
+        ||  bestWeight <  weight)                         // better than last branch
+        // NOTE: preferring larger of 2 equal branches does not help.
+        //|| (bestWeight == weight                         // prefer largest branch
+        //   &&  branch->selfAndDescendants > bestBranch->selfAndDescendants))
+            {
+            bestBranch = branch;
+            bestWeight = weight;
+            bestResult = result;
+            if (bestResult == enoEqual
+            ||  bestResult == enoSuperset) // doesn't get any better
+                break;
+            }
+        }
+    }
+*pBranch   = bestBranch;
+return bestResult;
+}
+
+
+struct elmNode *elmTreeGrow(struct lm *lm,struct slList *collection, void *extra,
+                            elmNodeCompareFunction *neighborCmp,
+                            elmNodeJoiningFunction *neighborJoin)
+// Given a collection of content and a function to compare elements of the content,
+// returns a tree based on neighbor joining.  The nodes of the tree will have content
+// which is cumulative out to the leaves: the root node has null content, while any child
+// has content that is the superset of all it's parents.  Fill 'extra' with anything (like lm)
+// that you want passed into your compare and joining functions.
+{
+if (collection == NULL)
+    return NULL;
+
+// Begin tree
+struct elmNode *tree = NULL;
+struct elmNode *root = treePlant(lm,&tree);
+
+// Add first element:
+struct slList *element = collection;
+struct elmNode *newNode = treeNewNode(lm,&tree,element);
+treeSproutLeaf(root,newNode);
+element = element->next;
+
+// for each additional element, walks current tree level by level, choosing the best branch and
+// ultimately adding the element as a leaf.  Current leaves can become branches and
+// current branches may get split to add a new leaf sprout in the middle.
+struct elmNode *curNode = NULL;
+struct elmNode *newBranch = NULL;
+for ( ;element != NULL; element = element->next)
+    {
+    for (curNode = root->firstChild;  // For each element, start at just off root
+         curNode != NULL; 
+         curNode = curNode->firstChild) // walks down
+        {
+        int result = treeChooseBranch(&curNode,element,neighborCmp,extra); // chooses across
+        if (result == enoEqual)                       // Replaces current
+            {
+            // Should only get here if a previous node has the same content as this new element
+            struct slList *replacement = neighborJoin(curNode->content,element,extra);
+            curNode->content = replacement;
+            }
+        else if (result == enoExcluding)              // New leaf
+            {
+            newNode = treeNewNode(lm,&tree,element);
+            treeSproutLeaf(curNode->parent,newNode);  // shares nothing
+            }
+        else if (result == enoSubset)                 // on to children or new leaf
+            {
+            if (curNode->firstChild != NULL) 
+                continue;                             // Only case where we continue!
+            newNode = treeNewNode(lm,&tree,element);
+            treeSproutLeaf(curNode,newNode);          // shares all of curNode
+            }
+        else if (result == enoSuperset)               // New branch with old node as child
+            {
+            newNode = treeNewNode(lm,&tree,element);
+            treeInsertBranchBefore(newNode,curNode);
+            }
+        else if (result == enoMixed)                  // New branch with both nodes as children
+            {
+            struct slList *matching = neighborJoin(curNode->content,element,extra);
+            newBranch = treeNewNode(lm,&tree,matching);
+            treeInsertBranchBefore(newBranch,curNode);
+            newNode   = treeNewNode(lm,&tree,element);
+            treeSproutLeaf(newBranch,newNode);
+            }
+        break; // done, move on to next element
+        }
+    }
+slReverse(&tree);
+assert(elmNodeIsRoot(tree));
+return tree;
+}
+
+boolean rElmTreeClimb(const struct elmNode *node, struct slList *parent, void *extra,
+                      elmNodePickFunction *elmPick, struct slList **results)
+// Recursively climbs tree and examines every node using the supplied function.
+// Each call on a node iterates through its siblings, recursively calling itself on any children.
+// This function might be used to build a list of objects from each or a subset of nodes.
+// If all examinations resulted in a structure, then the list will be in REVERSE traversal order.
+// If you immediately slReverse(results) then the list will ascend to the furthest leaf before
+// moving on to sibling leaves, twigs and branches.
+// Note: if results are returned, then "parent" is filled with nearest parent's result.
+// Return FALSE from the elmPick function to stop the traversal.  Thus, a complete traversal
+// returns TRUE, but one that has been stopped (after finding one node?) returns FALSE.
+{
+const struct elmNode *sibling = elmNodeIsRoot(node) ? node->firstChild: node;
+for (;sibling!=NULL;sibling = sibling->sibling)
+    {
+    // Some nodes are subsets rather than alleles
+    struct slList *localParent = NULL;
+    struct slList *result = NULL;
+    boolean ret = elmPick(sibling->content,parent,extra,&result);
+    if (result)
+        {
+        //assert(results != NULL);
+        slAddHead(results,result);
+        localParent = result;
+        result = NULL;
+        }
+    else
+        localParent = parent;
+
+    if (!ret)
+        return FALSE; // Stop traversing
+
+    // a node points to only one child, but that child may have siblings
+    struct elmNode *child = sibling->firstChild; // additional children are siblings
+    if (child)
+        {
+        assert(child->content != NULL);
+        ret = rElmTreeClimb(child, localParent, extra, elmPick, &result);
+        if (result != NULL)
+            {
+            //assert(results != NULL);
+            *results = slCat(result,*results); // newer results go to front!
+            }
+        if (!ret)
+            return FALSE; // Stop traversing
+        }
+    }
+return TRUE; // continue traversing
+}
+
+