7982829ea415cb7bfb760f0ec79793c6230d8d86
kent
  Thu May 3 09:25:08 2012 -0700
Further work on faux-generation side.  Getting close but not quite perfect.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index 071bccf..6d9b8b3 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -71,43 +71,41 @@
  *     black
  *        dog
  *           and
  *     dog
  *        and
  *           the
  *     and
  *        the
  *           black
  * Note how the tree is able to compress the two chains "the black dog" and "the black cat."
  *
  * A node in the tree can have as many children as it needs to at each node.  The depth of
  * the tree is the same as the chain size, by default 3. At each node in the tree you get
  * a word, and a list of all words that are observed in the text to follow that word.
  *
- * There are special cases in the code so that the first and last words in the text get included 
- * as much as possible in the tree. 
- *
  * Once the program has build up the wordTree, it can output it in a couple of fashions. */
 
 struct wordTree
 /* A node in a tree of words.  The head of the tree is a node with word value the empty string. */
     {
     struct rbTree *following;	/* Contains words (as struct wordTree) that follow us. */
     struct wordTree *parent;    /* Parent of this node or NULL for root. */
     char *word;			/* The word itself including comma, period etc. */
-    int useCount;		/* Number of times word used. */
-    int outputCount;            /* each level of tree and initialize that to a normalized version of it. */
+    int useCount;		/* Number of times word used in input. */
+    int outTarget;              /* Number of times we want to output word. */
+    int outCount;	/* Number of times word has been output. */
     double normVal;             /* value to place the normalization value */    
     };
 
 struct wordTree *wordTreeNew(char *word)
 /* Create and return new wordTree element. */
 {
 struct wordTree *wt;
 AllocVar(wt);
 wt->word = cloneString(word);
 return wt;
 }
 
 int wordTreeCmpWord(void *va, void *vb)
 /* Compare two wordTree. */
 {
@@ -164,177 +162,136 @@
 
 void addChainToTree(struct wordTree *wt, struct dlList *chain, 
 	struct lm *lm, struct rbTreeNode **stack)
 /* Add chain of words to tree. */
 {
 struct dlNode *node;
 wt->useCount += 1;
 for (node = chain->head; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     verbose(2, "  %s\n", word);
     wt = wordTreeAddFollowing(wt, word, lm, stack);
     }
 }
 
-void wordTreeNormalize(struct wordTree *wt, int outputCount, double normVal)
-/* Recursively set wt->normVal  and wt->outputCount */
+void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal)
+/* Recursively set wt->normVal and wt->outTarget so each branch gets a share proportional to its useCount. */
 {
 wt->normVal = normVal;
-wt->outputCount = outputCount;
+wt->outTarget = outTarget;
 if (wt->following != NULL)
     {
     int totalChildUses = wordTreeChildrenUseCount(wt);
     struct slRef *list = rbTreeItems(wt->following);
     struct slRef *ref;
     for (ref = list; ref !=NULL; ref = ref->next)
 	{
 	struct wordTree *child = ref->val;
 	double childRatio = (double)child->useCount / totalChildUses;
-	wordTreeNormalize(child, round(childRatio*outputCount), childRatio*normVal);
+	wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal);
 	}
     slFreeList(&list);
     }
 }
 
-void wordTreeDeadEnd(struct wordTree *wt)
-/* tally and include incomplete branches */
-{
-/* int levelNormVal = 0;
- * int levelCount = 0;
- * int sumNormVal = 0;
- * int sumCount = 0;
- * int diffNormVal = 0;
- * int diffCount=0;
- * Loop pseudocode
- * work recursively through level 1-> 3, start at root of tree
- * foreach word at level 1
- * {
- *   sumCount = 0
- *   sumNormVal = 0
- *   levelCount = wt -> outputCount
- *   levelNormVal = wt-> normVal
- *   if(wt->following == NULL)                                                                                           
- *   { 
- *   create new child recursively (level 2 and level 3/default)
- *     wt->normVal = levelNormVal
- *     wt->word = 'NaN'
- *     wt->outputCount = levelCount
- *   }
- *   else
- *   {
- *    foreach wt->following at level + 1
- *    {
- *    sumCount += wt->outputCount
- *    sumNormVal  += wt->normVal
- *    ** RECURSIVE level 2 + 1 here **
- *   }
- *   diffCount = levelCount - sumCount
- *   diffNormVal = levelNormVal - sumNormVal
- *   if(diffCount > 0)
- *   {
- *   create level 2:
- *     wt->normVal = diffNormVal
- *     wt->word = 'NaN'
- *     wt->outputVal = diffCount
- *   }
- */
-}
-
 void wordTreeDump(int level, struct wordTree *wt, FILE *f)
 /* Write out wordTree to file. */
 {
 static char *words[64];
 struct slRef *list, *ref;
 int i;
 assert(level < ArraySize(words));
 
 words[level] = wt->word;
 if (wt->useCount >= minUse)
     {
     if (!fullOnly || level == maxChainSize)
 	{
-	fprintf(f, "%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outputCount, wt->normVal);
+	fprintf(f, "%d\t%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal);
 	
 	for (i=1; i<=level; ++i)
             {
             spaceOut(f, level*2);
 	    fprintf(f, "%s ", words[i]);
             }
 	fprintf(f, "\n");
 	}
     }
 if (wt->following != NULL)
     {
     list = rbTreeItems(wt->following);
     for (ref = list; ref != NULL; ref = ref->next)
         wordTreeDump(level+1, ref->val, f);
     slFreeList(&list);
     }
 }
 
 int totalUses = 0;
 int curUses = 0;
 int useThreshold = 0;
-struct wordTree *picked;
+struct wordTree *pickedNode;
 
 int totUseZeroCount = 0;
 
 void addUse(void *v)
 /* Add up to total uses. */
 {
 struct wordTree *wt = v;
-totalUses += wt->outputCount;
+totalUses += wt->outTarget;
 }
 
 void pickIfInThreshold(void *v)
-/* See if inside threshold, and if so store it in picked. */
+/* See if inside threshold, and if so store it in pickedNode. */
+{
+if (pickedNode == NULL)
 {
 struct wordTree *wt = v;
-int top = curUses + wt->outputCount;
-if (curUses <= useThreshold && useThreshold < top)
-    picked = wt;
+    int top = curUses + wt->outTarget;
+    if (useThreshold < top)
+	pickedNode = wt;
 curUses = top;
 }
+}
 
 void pickAny(void *v)
-/* See if inside threshold, and if so store it in picked. */
+/* Force a pick: keep the first node visited if nothing has been picked yet. */
 {
-struct wordTree *wt = v;
-picked = wt;
+if (pickedNode == NULL)
+    pickedNode = v;
 }
 
 struct wordTree *pickRandom(struct rbTree *rbTree)
 /* Pick word from list randomly, but so that words more
  * commonly seen are picked more often. */
 {
-picked = NULL;
+pickedNode = NULL;
 curUses = 0;
 totalUses = 0;
 rbTreeTraverse(rbTree, addUse);
-if (totalUses != 0)
+if (totalUses > 0)
     {
     useThreshold = rand() % totalUses; 
     rbTreeTraverse(rbTree, pickIfInThreshold);
     }
-if (picked == NULL)
+if (pickedNode == NULL)
     {
     ++totUseZeroCount;
     rbTreeTraverse(rbTree, pickAny);
     }
-assert(picked != NULL);
-return picked;
+assert(pickedNode != NULL);
+return pickedNode;
 }
 
 struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
 /* Predict next word given tree and recently used word list.  If tree doesn't
  * have statistics for what comes next given the words in list, then it returns
  * NULL. */
 {
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     struct wordTree key;
     key.word = word;
     wt = rbTreeFind(wt->following, &key);
     if (wt == NULL || wt->following == NULL)
@@ -355,82 +312,86 @@
 struct dlNode *node;
 for (node = recent->head; !dlEnd(node); node = node->next)
     {
     struct wordTree *result = predictNextFromAllPredecessors(wt, node);
     if (result != NULL)
         return result;
     }
 return pickRandom(wt->following); 
 }
 
 void decrementOutputCounts(struct wordTree *wt)
 /* Decrement output count of self and parents. */
 {
 while (wt != NULL)
     {
-    wt->outputCount -= 1;
+    wt->outTarget -= 1;
+    wt->outCount += 1;
     wt = wt->parent;
     }
 }
 
 static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, 
-	int maxOutputWords, FILE *f)
+	int maxOutputWords, char *fileName)
 /* Go spew out a bunch of words according to probabilities in tree. */
 {
+FILE *f = mustOpen(fileName, "w");
 struct dlList *ll = dlListNew();
 int listSize = 0;
 int outputWords = 0;
 
 for (;;)
     {
     if (++outputWords > maxOutputWords)
         break;
     struct dlNode *node;
     struct wordTree *picked;
 
     /* Get next predicted word. */
     if (listSize == 0)
         {
 	AllocVar(node);
 	++listSize;
 	picked = firstWord;
 	}
     else if (listSize >= maxSize)
 	{
 	node = dlPopHead(ll);
 	picked = predictNext(wt, ll);
+//         decrementOutputCounts(picked);   // ugly placement?
 	}
     else
 	{
 	picked = predictNext(wt, ll);
 	AllocVar(node);
 	++listSize;
 	}
 
     if (picked == NULL)
          break;
 
 
     /* Add word from whatever level we fetched back to our chain of up to maxChainSize. */
     node->val = picked->word;
     dlAddTail(ll, node);
 
     fprintf(f, "%s\n", picked->word);
 
     decrementOutputCounts(picked);
     }
 dlListFree(&ll);
+carefulClose(&f);
 }
 
 struct wordTree *wordTreeForChainsInFile(char *fileName, int chainSize, struct lm *lm)
 /* Return a wordTree of all chains-of-words of length chainSize seen in file. 
  * Allocate the structure in local memory pool lm. */ 
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
 
 /* We'll build up the tree starting with an empty root node. */
 struct wordTree *wt = wordTreeNew("");	
 
 /* Save time/space by sharing stack between all "following" rbTrees. */
 struct rbTreeNode **stack;	
@@ -449,33 +410,30 @@
     int wordCount = 0;
 
     /* skipping the first word which is the read id */
     word = nextWord(&line);
 
     while ((word = nextWord(&line)) != NULL)
 	{
 	/* We come to this point in the code for each word in the file. 
 	 * Here we want to maintain a chain of sequential words up to
 	 * chainSize long.  We do this with a doubly-linked list structure.
 	 * For the first few words in the file we'll just build up the list,
 	 * only adding it to the tree when we finally do get to the desired
 	 * chain size.  Once past the initial section of the file we'll be
 	 * getting rid of the first link in the chain as well as adding a new
 	 * last link in the chain with each new word we see. */
-
-
-
 	if (curSize < chainSize)
 	    {
 	    dlAddValTail(chain, cloneString(word));
 	    ++curSize;
 	    if (curSize == chainSize)
 		addChainToTree(wt, chain, lm, stack);
 	    }
 	else
 	    {
 	    /* Reuse doubly-linked-list node, but give it a new value, as we move
 	     * it from head to tail of list. */
 	    node = dlPopHead(chain);
 	    freeMem(node->val);
 	    node->val = cloneString(word);
 	    dlAddTail(chain, node);
@@ -489,57 +447,57 @@
  	addChainToTree(wt, chain, lm, stack);
     while ((node = dlPopHead(chain)) != NULL)
 	{
 	if (!dlEmpty(chain))
 	    addChainToTree(wt, chain, lm, stack);
 	freeMem(node->val);
 	freeMem(node);
 	}
     dlListFree(&chain);
     }
 lineFileClose(&lf);
 
 return wt;
 }
 
+void wordTreeWrite(struct wordTree *wt, char *fileName)
+/* Write out tree to file */
+{
+FILE *f = mustOpen(fileName, "w");
+fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n");
+wordTreeDump(0, wt, f);
+carefulClose(&f);
+}
+
 void alphaChain(char *inFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct lm *lm = lmInit(0);
 struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize, lm);
 wordTreeNormalize(wt, outSize, 1.0);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
-    FILE *f = mustOpen(fileName, "w");
-    fprintf(f, "#level\tuseCount\toutputCount\tnormVal\tmonomers\n");
-    wordTreeDump(0, wt, f);
-    carefulClose(&f);
+    wordTreeWrite(wt, fileName);
     }
 
 
-FILE *f = mustOpen(outFile, "w");
-wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), outSize, f);
-carefulClose(&f);
-uglyf("totUseZeroCount = %d\n", totUseZeroCount);
+wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), outSize, outFile);
 
-    {
-    FILE *f = mustOpen("foo.chain", "w");
-    wordTreeDump(0, wt, f);
-    carefulClose(&f);
-    }
+uglyf("totUseZeroCount = %d\n", totUseZeroCount);
+wordTreeWrite(wt, "ugly.chain");
 
 lmCleanup(&lm);	// Not really needed since we're just going to exit.
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 #ifdef SOON
 srand( (unsigned)time(0) );
 #endif /* SOON */
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
 minUse = optionInt("minUse", minUse);