f4ab7c72c85bd8f5f90dc8586aa4ab4ef3cf4703
kehayden
  Wed Apr 25 15:37:53 2012 -0700
consolidated alphaChain between kent libraries, with parent
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index 1dec49e..3c39547 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -80,77 +80,116 @@
  * Note how the tree is able to compress the two chains "the black dog" and "the black cat."
  *
  * A node in the tree can have as many children as it needs to at each node.  The depth of
  * the tree is the same as the chain size, by default 3. At each node in the tree you get
  * a word, and a list of all words that are observed in the text to follow that word.
  *
  * There are special cases in the code so that the first and last words in the text get included 
  * as much as possible in the tree. 
  *
  * Once the program has built up the wordTree, it can output it in a couple of fashions. */
 
 struct wordTree
 /* A node in a tree of words.  The head of the tree is a node with word value the empty string. */
     {
     struct rbTree *following;	/* Words (as struct wordTree) that follow us; NULL if none added yet. */
+    struct wordTree *parent;    /* Parent of this node or NULL for root; set by wordTreeAddFollowing. */
     char *word;			/* The word itself including comma, period etc. */
     int useCount;		/* Number of times word used; incremented in wordTreeAddFollowing and addChainToTree. */
     int outputCount;            /* NOTE(review): purpose not evident here (old comment was a stray fragment about normalizing each tree level); printed by wordTreeDump - confirm meaning. */
     double normVal;             /* Normalization value; presumably filled in by wordTreeNormalize - confirm. */    
-    double deadEnd;             /* value to keep track of missing branches */
+    int missingFromChildren;    /* Our useCount minus sum of children's useCounts; set by wordTreeSetMissing. */
     };
 
 struct wordTree *wordTreeNew(char *word)
 /* Create and return new wordTree element whose word field is cloneString(word). */
 {
 struct wordTree *wt;
 AllocVar(wt);  /* NOTE(review): no other field is set here; presumably AllocVar zeroes the struct - confirm. */
 wt->word = cloneString(word);
 return wt;
 }
 
 int wordTreeCmpWord(void *va, void *vb)
 /* Compare two wordTree nodes by strcmp on their word fields.  Given to rbTreeNewDetailed as the compare function for following trees. */
 {
 struct wordTree *a = va, *b = vb;
 return strcmp(a->word, b->word);
 }
-
-
+int wordTreeChildrenUseCount(struct wordTree *wt)
+/* Return sum of useCounts of the direct children (items of wt->following), or 0 if wt has no following tree. */
+{
+struct rbTree *following = wt->following;
+if (following == NULL)
+    return 0;
+struct slRef *childList = rbTreeItems(following);
+struct slRef *childRef;
+int total = 0;
+for (childRef = childList; childRef != NULL; childRef = childRef->next)
+    {
+    struct wordTree *child = childRef->val;
+    total += child->useCount;
+    }
+slFreeList(&childList);  /* Release the temporary reference list obtained from rbTreeItems. */
+return total;
+}
+int wordTreeCountNotInChildren(struct wordTree *wt)
+/* Return our own useCount minus the summed useCounts of our direct children (see wordTreeChildrenUseCount). */
+{
+return wt->useCount - wordTreeChildrenUseCount(wt);
+}
+void wordTreeSetMissing(struct wordTree *wt)
+/* Recursively set missingFromChildren in wt and in every node below it, each from wordTreeCountNotInChildren. */
+{
+wt->missingFromChildren = wordTreeCountNotInChildren(wt);
+struct rbTree *following = wt->following;
+if (following != NULL)
+    {
+    struct slRef *childList = rbTreeItems(following);
+    struct slRef *childRef;
+    for (childRef = childList; childRef != NULL; childRef = childRef->next)
+        {
+        struct wordTree *child = childRef->val;
+        wordTreeSetMissing(child);  /* Recurse so the whole subtree gets filled in. */
+        }
+    slFreeList(&childList);  /* Release the temporary reference list obtained from rbTreeItems. */
+    }
+}
 
 struct wordTree *wordTreeAddFollowing(struct wordTree *wt, char *word, 
 	struct lm *lm, struct rbTreeNode **stack)
 /* Make word follow wt in tree.  If word already exists among followers return it, otherwise create a new node with wt
  * as parent and return that.  Either way the returned node's useCount is bumped.  lm and stack go to rbTreeNewDetailed. */
 {
 struct wordTree *w;   /* Points to following element if any */
 if (wt->following == NULL)
     {
     /* wt has no followers tree yet: create it.  The word cannot be in it, so there is nothing to look up. */
     wt->following = rbTreeNewDetailed(wordTreeCmpWord, lm, stack);
     w = NULL;
     }
 else
     {
     /* Find word in existing tree */
     struct wordTree key;
     key.word = word;  /* Only word is filled in; wordTreeCmpWord reads no other field. */
     w = rbTreeFind(wt->following, &key);
     }
 if (w == NULL)
     {
     w = wordTreeNew(word);
+    w->parent = wt;
     rbTreeAdd(wt->following, w);
     }
 w->useCount += 1;
 return w;
 }
 
 void addChainToTree(struct wordTree *wt, struct dlList *chain, 
 	struct lm *lm, struct rbTreeNode **stack)
 /* Add chain of words to tree. */
 {
 struct dlNode *node;
 wt->useCount += 1;
 for (node = chain->head; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
@@ -221,31 +260,31 @@
  */
 }
 void wordTreeDump(int level, struct wordTree *wt, FILE *f)
 /* Write out wordTree to file. */
 {
 static char *words[64];
 struct slRef *list, *ref;
 int i;
 assert(level < ArraySize(words));
 
 words[level] = wt->word;
 if (wt->useCount >= minUse)
     {
     if (!fullOnly || level == maxChainSize)
 	{
-	fprintf(f, "%d\t%d\t%d\t%f\t%f\t", level, wt->useCount, wt->outputCount, wt->normVal, wt->deadEnd);
+	fprintf(f, "%d\t%d\t%d\t%f\t%d\t", level, wt->useCount, wt->outputCount, wt->normVal, wt->missingFromChildren);
 	
 	for (i=1; i<=level; ++i)
             {
             spaceOut(f, level*2);
 	    fprintf(f, "%s ", words[i]);
             }
 	fprintf(f, "\n");
 	}
     }
 if (wt->following != NULL)
     {
     list = rbTreeItems(wt->following);
     for (ref = list; ref != NULL; ref = ref->next)
         wordTreeDump(level+1, ref->val, f);
     slFreeList(&list);
@@ -442,31 +481,32 @@
 	freeMem(node);
       }
     dlListFree(&chain);
     }
 
 lineFileClose(&lf);
 return wt;
 }
 
 void alphaChain(char *inFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct lm *lm = lmInit(0);
 struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize, lm);
 wordTreeNormalize(wt, 1.0);
-wordTreeDeadEnd(wt);
+wordTreeSetMissing(wt);
+
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     FILE *f = mustOpen(fileName, "w");
     wordTreeDump(0, wt, f);
     carefulClose(&f);
     }
 
 
  FILE *f = mustOpen(outFile, "w");
  int maxSize = min(wt->useCount, maxNonsenseSize);
 
  /* KEH NOTES: controls how many words we emit */
 
  wordTreeMakeNonsense(wt, maxChainSize, pickRandomWord(wt->following), maxSize, f);