b174d4ad085553d5a905b7147868a79070f5ea9d
kent
  Wed Apr 25 18:19:27 2012 -0700
Fixed a bug in the useCounts from double counting once per line of input.  Starting to implement the output, but it's still rough and buggy.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index d59c43b..9ef18f9 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -22,30 +22,31 @@
   "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs\n"
   "usage:\n"
   "   alphaChain alphaMonFile.fa significant_output.txt\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -chain=fileName - Write out word chain to file\n"
   "   -maxNonsenseSize=N - Keep nonsense output to this many words.\n"
   "   -fullOnly - Only output chains of size\n"
   "   -minUse=N - Set minimum use in output chain, default %d\n"
   , maxChainSize, minUse
   );
 }
 
 char *noData  = "n/a";   // Used to indicate a dummy node representing missing data
+
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"minUse", OPTION_INT},
    {"chain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"maxNonsenseSize", OPTION_INT},
    {NULL, 0},
 };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable.  At this chain size the
  * text 
@@ -292,140 +293,192 @@
 	fprintf(f, "\n");
 	}
     }
 if (wt->following != NULL)
     {
     list = rbTreeItems(wt->following);
     for (ref = list; ref != NULL; ref = ref->next)
         wordTreeDump(level+1, ref->val, f);
     slFreeList(&list);
     }
 }
 
 int totalUses = 0;
 int curUses = 0;
 int useThreshold = 0;
-char *pickedWord;
+struct wordTree *picked;
 
 void addUse(void *v)
 /* Add up to total uses. */
 {
 struct wordTree *wt = v;
-totalUses += wt->useCount;
+totalUses += wt->outputCount;
 }
 
 void pickIfInThreshold(void *v)
-/* See if inside threshold, and if so store it in pickedWord. */
+/* See if inside threshold, and if so store it in picked. */
 {
 struct wordTree *wt = v;
-int top = curUses + wt->useCount;
+int top = curUses + wt->outputCount;
 if (curUses <= useThreshold && useThreshold < top)
-    pickedWord = wt->word;
+    picked = wt;
 curUses = top;
 }
 
-char *pickRandomWord(struct rbTree *rbTree)
+struct wordTree *pickRandom(struct rbTree *rbTree)
 /* Pick word from list randomly, but so that words more
  * commonly seen are picked more often. */
 {
-pickedWord = NULL;
+picked = NULL;
 curUses = 0;
 totalUses = 0;
 rbTreeTraverse(rbTree, addUse);
 useThreshold = rand() % totalUses; 
 rbTreeTraverse(rbTree, pickIfInThreshold);
-assert(pickedWord != NULL);
-return pickedWord;
+assert(picked != NULL);
+return picked;
+}
+
+void dumpWordList(struct dlNode *list)
+{
+struct dlNode *node;
+for (node = list; !dlEnd(node); node = node->next)
+    {
+    char *word = node->val;
+    uglyf("%s ", word);
+    }
 }
 
-char *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
+struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
 /* Predict next word given tree and recently used word list.  If tree doesn't
  * have statistics for what comes next given the words in list, then it returns
  * NULL. */
 {
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     struct wordTree key;
     key.word = word;
     wt = rbTreeFind(wt->following, &key);
     if (wt == NULL || wt->following == NULL)
         break;
     }
-char *result = NULL;
+struct wordTree *result = NULL;
 if (wt != NULL && wt->following != NULL)
-    result = pickRandomWord(wt->following);
+    result = pickRandom(wt->following);
 return result;
 }
 
-char *predictNext(struct wordTree *wt, struct dlList *recent)
+struct wordTree *predictNext(struct wordTree *wt, struct dlList *recent)
 /* Predict next word given tree and recently used word list.  Will use all words in
  * recent list if can,  but if there is not data in tree, will back off, and use
  * progressively less previous words until ultimately it just picks a random
  * word. */
 {
 struct dlNode *node;
 for (node = recent->head; !dlEnd(node); node = node->next)
     {
-    char *result = predictNextFromAllPredecessors(wt, node);
+    struct wordTree *result = predictNextFromAllPredecessors(wt, node);
     if (result != NULL)
         return result;
     }
-return pickRandomWord(wt->following); 
+return pickRandom(wt->following); 
+}
+
+void decrementOutputCounts(struct wordTree *wt)
+/* Decrement output count of self and parents. */
+{
+while (wt != NULL)
+    {
+    wt->outputCount -= 1;
+    wt = wt->parent;
+    }
 }
 
-static void wordTreeMakeNonsense(struct wordTree *wt, int maxSize, char *firstWord, 
+int anyForceCount = 0;
+int fullForceCount = 0;
+
+struct wordTree *forceRealWord(struct wordTree *wt, struct dlNode *list)
+/* Get a choice that is not one of the fake no-date ones by backing up to progressively
+ * higher levels of markov chain.  */
+{
+++anyForceCount;
+// uglyf("forceRealWord("); dumpWordList(list); uglyf(")\n");
+struct dlNode *sublist;
+for (sublist = list->next; !dlEnd(sublist); sublist = sublist->next)    /* Skip over first one, it failed already. */
+    {
+  //   uglyf("  "); dumpWordList(sublist); uglyf("\n");
+    struct wordTree *picked = predictNextFromAllPredecessors(wt, sublist);
+    if (picked != NULL && !sameString(picked->word, noData))
+        return picked;
+    }
+++fullForceCount;
+return pickRandom(wt->following);
+}
+
+static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, 
 	int maxOutputWords, FILE *f)
 /* Go spew out a bunch of words according to probabilities in tree. */
 {
 struct dlList *ll = dlListNew();
 int listSize = 0;
 int outputWords = 0;
 
 for (;;)
     {
     if (++outputWords > maxOutputWords)
         break;
     struct dlNode *node;
-    char *word;
+    struct wordTree *maybeWord;	// This might be what we want, or it might be a dummy node
 
     /* Get next predicted word. */
     if (listSize == 0)
         {
 	AllocVar(node);
 	++listSize;
-	word = firstWord;
+	maybeWord = firstWord;
 	}
     else if (listSize >= maxSize)
 	{
 	node = dlPopHead(ll);
-	word = predictNext(wt, ll);
+	maybeWord = predictNext(wt, ll);
 	}
     else
 	{
-	word = predictNext(wt, ll);
+	maybeWord = predictNext(wt, ll);
 	AllocVar(node);
 	++listSize;
 	}
-    node->val = word;
-    dlAddTail(ll, node);
 
-    if (word == NULL)
+    if (maybeWord == NULL)
          break;
 
-    fprintf(f, "%s\n", word);
+    /* Here we deal with possibly having fetched a dummy node. */
+    struct wordTree *realWord = maybeWord;
+    if (sameString(maybeWord->word, noData))
+        {
+	realWord = forceRealWord(wt, ll->head);
+	}
+
+    /* Add word from whatever level we fetched back to our chain of up to maxChainSize. */
+    node->val = realWord->word;
+    dlAddTail(ll, node);
+
+    fprintf(f, "%s\n", maybeWord->word);
+
+    decrementOutputCounts(maybeWord);
     }
 dlListFree(&ll);
 }
 
 struct wordTree *wordTreeForChainsInFile(char *fileName, int chainSize, struct lm *lm)
 /* Return a wordTree of all chains-of-words of length chainSize seen in file. 
  * Allocate the structure in local memory pool lm. */ 
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
 
 /* We'll build up the tree starting with an empty root node. */
 struct wordTree *wt = wordTreeNew("");	
 
@@ -510,36 +563,39 @@
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     FILE *f = mustOpen(fileName, "w");
     fprintf(f, "#level\tuseCount\toutputCount\tnormVal\tmonomers\n");
     wordTreeDump(0, wt, f);
     carefulClose(&f);
     }
 
 
  FILE *f = mustOpen(outFile, "w");
  int maxSize = min(wt->useCount, maxNonsenseSize);
 
  /* KEH NOTES: controls how many words we emit */
 
- wordTreeMakeNonsense(wt, maxChainSize, pickRandomWord(wt->following), maxSize, f);
+wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), maxSize, f);
  carefulClose(&f);
     
+uglyf("anyForce %d, fullForce %d (%4.2f%%)\n", anyForceCount, fullForceCount, 100.0*fullForceCount/anyForceCount);
 
 lmCleanup(&lm);	// Not really needed since we're just going to exit.
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
+#ifdef SOON
 srand( (unsigned)time(0) );
+#endif /* SOON */
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
 minUse = optionInt("minUse", minUse);
 maxNonsenseSize = optionInt("maxNonsenseSize", maxNonsenseSize);
 fullOnly = optionExists("fullOnly");
 alphaChain(argv[1], argv[2]);
 return 0;
 }