55a0fb30ff422a2a0688685111fb68b89db1f266
kent
  Wed May 2 11:10:54 2012 -0700
Getting rid of dummy nodes. Output looking more reasonable, but still not 100%.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index 9ef18f9..071bccf 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -1,59 +1,57 @@
 /* alphaChain - Predicts faux centromere sequences using a probablistic model. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "localmem.h"
 #include "options.h"
 #include "dlist.h"
 #include "rbTree.h"
 
 /* Global vars - all of which can be set by command line options. */
 int maxChainSize = 3;
-int maxNonsenseSize = 10000;
+int outSize = 10000;
 int minUse = 1;
 boolean lower = FALSE;
 boolean unpunc = FALSE;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs\n"
   "usage:\n"
   "   alphaChain alphaMonFile.fa significant_output.txt\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -chain=fileName - Write out word chain to file\n"
-  "   -maxNonsenseSize=N - Keep nonsense output to this many words.\n"
+  "   -outSize=N - Output this many words.\n"
   "   -fullOnly - Only output chains of size\n"
   "   -minUse=N - Set minimum use in output chain, default %d\n"
   , maxChainSize, minUse
   );
 }
 
-char *noData  = "n/a";   // Used to indicate a dummy node representing missing data
-
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"minUse", OPTION_INT},
    {"chain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
-   {"maxNonsenseSize", OPTION_INT},
+   {"outSize", OPTION_INT},
    {NULL, 0},
 };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable.  At this chain size the
  * text 
  *     this is the black dog and the black cat
  * would have the chains 
  *     this is the 
  *     is the black
  *     the black dog
  *     black dog and
  *     dog and the
@@ -152,87 +150,59 @@
 struct rbTree *following = wt->following;
 if (following == NULL)
     return 0;
 struct slRef *childList = rbTreeItems(following);
 struct slRef *childRef;
 int total = 0;
 for (childRef = childList; childRef != NULL; childRef = childRef->next)
     {
     struct wordTree *child = childRef->val;
     total += child->useCount;
     }
 slFreeList(&childList);
 return total;
 }
 
-int wordTreeCountNotInChildren(struct wordTree *wt)
-/* Count up useCounts of all children and return difference between this and our own useCount. */
-{
-return wt->useCount - wordTreeChildrenUseCount(wt);
-}
-
-void wordTreeSetMissing(struct wordTree *wt, int level, struct lm *lm, struct rbTreeNode **stack)
-/* Set missingFromChildren in self and all children. */
-{
-int missingFromChildren = wordTreeCountNotInChildren(wt);
-if (missingFromChildren > 0)
-    {
-    struct wordTree *missing = wordTreeAddFollowing(wt, noData, lm, stack);
-    missing->useCount = missingFromChildren;
-    }
-struct rbTree *following = wt->following;
-if (following != NULL && level < maxChainSize)
-    {
-    struct slRef *childList = rbTreeItems(following);
-    struct slRef *childRef;
-    for (childRef = childList; childRef != NULL; childRef = childRef->next)
-        {
-        struct wordTree *child = childRef->val;
-        wordTreeSetMissing(child, level+1, lm, stack);
-        }
-    slFreeList(&childList);
-    }
-}
-
 void addChainToTree(struct wordTree *wt, struct dlList *chain, 
 	struct lm *lm, struct rbTreeNode **stack)
 /* Add chain of words to tree. */
 {
 struct dlNode *node;
 wt->useCount += 1;
 for (node = chain->head; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     verbose(2, "  %s\n", word);
     wt = wordTreeAddFollowing(wt, word, lm, stack);
     }
 }
 
-void wordTreeNormalize(struct wordTree *wt, double normVal)
-/* Recursively set wt->normVal */
+void wordTreeNormalize(struct wordTree *wt, int outputCount, double normVal)
+/* Recursively set wt->normVal  and wt->outputCount */
 {
 wt->normVal = normVal;
-wt->outputCount = normVal * maxNonsenseSize;
+wt->outputCount = outputCount;
 if (wt->following != NULL)
     {
+    int totalChildUses = wordTreeChildrenUseCount(wt);
     struct slRef *list = rbTreeItems(wt->following);
     struct slRef *ref;
     for (ref = list; ref !=NULL; ref = ref->next)
 	{
 	struct wordTree *child = ref->val;
-	double childRatio = (double)child->useCount / wt->useCount;
-	wordTreeNormalize(child, childRatio*normVal);
+	double childRatio = (double)child->useCount / totalChildUses;
+	wordTreeNormalize(child, round(childRatio*outputCount), childRatio*normVal);
 	}
     slFreeList(&list);
     }
 }
 
 void wordTreeDeadEnd(struct wordTree *wt)
 /* tally and include incomplete branches */
 {
 /* int levelNormVal = 0;
  * int levelCount = 0;
  * int sumNormVal = 0;
  * int sumCount = 0;
  * int diffNormVal = 0;
  * int diffCount=0;
  * Loop pseudocode
@@ -295,69 +265,76 @@
     }
 if (wt->following != NULL)
     {
     list = rbTreeItems(wt->following);
     for (ref = list; ref != NULL; ref = ref->next)
         wordTreeDump(level+1, ref->val, f);
     slFreeList(&list);
     }
 }
 
 int totalUses = 0;
 int curUses = 0;
 int useThreshold = 0;
 struct wordTree *picked;
 
+int totUseZeroCount = 0;
+
 void addUse(void *v)
 /* Add up to total uses. */
 {
 struct wordTree *wt = v;
 totalUses += wt->outputCount;
 }
 
 void pickIfInThreshold(void *v)
 /* See if inside threshold, and if so store it in picked. */
 {
 struct wordTree *wt = v;
 int top = curUses + wt->outputCount;
 if (curUses <= useThreshold && useThreshold < top)
     picked = wt;
 curUses = top;
 }
 
+void pickAny(void *v)
+/* See if inside threshold, and if so store it in picked. */
+{
+struct wordTree *wt = v;
+picked = wt;
+}
+
 struct wordTree *pickRandom(struct rbTree *rbTree)
 /* Pick word from list randomly, but so that words more
  * commonly seen are picked more often. */
 {
 picked = NULL;
 curUses = 0;
 totalUses = 0;
 rbTreeTraverse(rbTree, addUse);
+if (totalUses != 0)
+    {
 useThreshold = rand() % totalUses; 
 rbTreeTraverse(rbTree, pickIfInThreshold);
-assert(picked != NULL);
-return picked;
 }
-
-void dumpWordList(struct dlNode *list)
+if (picked == NULL)
 {
-struct dlNode *node;
-for (node = list; !dlEnd(node); node = node->next)
-    {
-    char *word = node->val;
-    uglyf("%s ", word);
+    ++totUseZeroCount;
+    rbTreeTraverse(rbTree, pickAny);
     }
+assert(picked != NULL);
+return picked;
 }
 
 struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
 /* Predict next word given tree and recently used word list.  If tree doesn't
  * have statistics for what comes next given the words in list, then it returns
  * NULL. */
 {
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     struct wordTree key;
     key.word = word;
     wt = rbTreeFind(wt->following, &key);
     if (wt == NULL || wt->following == NULL)
@@ -383,102 +360,75 @@
         return result;
     }
 return pickRandom(wt->following); 
 }
 
 void decrementOutputCounts(struct wordTree *wt)
 /* Decrement output count of self and parents. */
 {
 while (wt != NULL)
     {
     wt->outputCount -= 1;
     wt = wt->parent;
     }
 }
 
-int anyForceCount = 0;
-int fullForceCount = 0;
-
-struct wordTree *forceRealWord(struct wordTree *wt, struct dlNode *list)
-/* Get a choice that is not one of the fake no-date ones by backing up to progressively
- * higher levels of markov chain.  */
-{
-++anyForceCount;
-// uglyf("forceRealWord("); dumpWordList(list); uglyf(")\n");
-struct dlNode *sublist;
-for (sublist = list->next; !dlEnd(sublist); sublist = sublist->next)    /* Skip over first one, it failed already. */
-    {
-  //   uglyf("  "); dumpWordList(sublist); uglyf("\n");
-    struct wordTree *picked = predictNextFromAllPredecessors(wt, sublist);
-    if (picked != NULL && !sameString(picked->word, noData))
-        return picked;
-    }
-++fullForceCount;
-return pickRandom(wt->following);
-}
-
 static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, 
 	int maxOutputWords, FILE *f)
 /* Go spew out a bunch of words according to probabilities in tree. */
 {
 struct dlList *ll = dlListNew();
 int listSize = 0;
 int outputWords = 0;
 
 for (;;)
     {
     if (++outputWords > maxOutputWords)
         break;
     struct dlNode *node;
-    struct wordTree *maybeWord;	// This might be what we want, or it might be a dummy node
+    struct wordTree *picked;
 
     /* Get next predicted word. */
     if (listSize == 0)
         {
 	AllocVar(node);
 	++listSize;
-	maybeWord = firstWord;
+	picked = firstWord;
 	}
     else if (listSize >= maxSize)
 	{
 	node = dlPopHead(ll);
-	maybeWord = predictNext(wt, ll);
+	picked = predictNext(wt, ll);
 	}
     else
 	{
-	maybeWord = predictNext(wt, ll);
+	picked = predictNext(wt, ll);
 	AllocVar(node);
 	++listSize;
 	}
 
-    if (maybeWord == NULL)
+    if (picked == NULL)
          break;
 
-    /* Here we deal with possibly having fetched a dummy node. */
-    struct wordTree *realWord = maybeWord;
-    if (sameString(maybeWord->word, noData))
-        {
-	realWord = forceRealWord(wt, ll->head);
-	}
 
     /* Add word from whatever level we fetched back to our chain of up to maxChainSize. */
-    node->val = realWord->word;
+    node->val = picked->word;
     dlAddTail(ll, node);
 
-    fprintf(f, "%s\n", maybeWord->word);
+    fprintf(f, "%s\n", picked->word);
 
-    decrementOutputCounts(maybeWord);
+    decrementOutputCounts(picked);
     }
 dlListFree(&ll);
 }
 
 struct wordTree *wordTreeForChainsInFile(char *fileName, int chainSize, struct lm *lm)
 /* Return a wordTree of all chains-of-words of length chainSize seen in file. 
  * Allocate the structure in local memory pool lm. */ 
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
 
 /* We'll build up the tree starting with an empty root node. */
 struct wordTree *wt = wordTreeNew("");	
 
@@ -536,66 +486,65 @@
     /* Handle last few words in line, where can't make a chain of full size.  Also handles       
     * lines that have fewer than chain size words. */
     if (curSize < chainSize)
  	addChainToTree(wt, chain, lm, stack);
     while ((node = dlPopHead(chain)) != NULL)
 	{
 	if (!dlEmpty(chain))
 	    addChainToTree(wt, chain, lm, stack);
 	freeMem(node->val);
 	freeMem(node);
 	}
     dlListFree(&chain);
     }
 lineFileClose(&lf);
 
-/* Add in additional information to help traverse tree . */
-wordTreeSetMissing(wt, 1, lm, stack);
-wordTreeNormalize(wt, 1.0);
 return wt;
 }
 
 void alphaChain(char *inFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct lm *lm = lmInit(0);
 struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize, lm);
+wordTreeNormalize(wt, outSize, 1.0);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     FILE *f = mustOpen(fileName, "w");
     fprintf(f, "#level\tuseCount\toutputCount\tnormVal\tmonomers\n");
     wordTreeDump(0, wt, f);
     carefulClose(&f);
     }
 
 
 FILE *f = mustOpen(outFile, "w");
-int maxSize = min(wt->useCount, maxNonsenseSize);
-
-/* KEH NOTES: controls how many words we emit */
-
-wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), maxSize, f);
+wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), outSize, f);
 carefulClose(&f);
+uglyf("totUseZeroCount = %d\n", totUseZeroCount);
 
-uglyf("anyForce %d, fullForce %d (%4.2f%%)\n", anyForceCount, fullForceCount, 100.0*fullForceCount/anyForceCount);
+    {
+    FILE *f = mustOpen("foo.chain", "w");
+    wordTreeDump(0, wt, f);
+    carefulClose(&f);
+    }
 
 lmCleanup(&lm);	// Not really needed since we're just going to exit.
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 #ifdef SOON
 srand( (unsigned)time(0) );
 #endif /* SOON */
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
 minUse = optionInt("minUse", minUse);
-maxNonsenseSize = optionInt("maxNonsenseSize", maxNonsenseSize);
+outSize = optionInt("outSize", outSize);
 fullOnly = optionExists("fullOnly");
 alphaChain(argv[1], argv[2]);
 return 0;
 }