src/kehayden/alphaChain/alphaChain.c 1568a9ea025f3b3def190c2b3b93b6bbcee6e0ab

1568a9ea025f3b3def190c2b3b93b6bbcee6e0ab
kent
  Wed Sep 5 19:13:39 2012 -0700
Refactoring - working towards making predictNext more general.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index f76263d..92b9706 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -65,30 +65,31 @@
     };
 
 struct wordType
 /* A collection of words of the same type - or monomers of same type. */
     {
     struct wordType *next;   /* Next wordType */
     struct wordInfoRef *list;	    /* List of all words of that type */
     };
 
 struct wordStore
 /* Stores info on all words */
     {
     struct wordInfo *infoList;   /* List of words, fairly arbitrary order. */
     struct hash *infoHash;	     /* Hash of wordInfo, keyed by word. */
     struct wordTree *markovChains;   /* Tree of words that follow other words. */
+    int maxChainSize;		/* Maximum depth of tree. */
     struct wordType *typeList;	/* List of all types. */
     struct hash *typeHash;	/* Hash with wordType values, keyed by all words. */
     };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable.  At this chain size the
  * text 
  *     this is the black dog and the black cat
  * would have the chains 
  *     this is the 
  *     is the black
  *     the black dog
  *     black dog and
@@ -255,53 +256,53 @@
 {
 struct dyString *dy = dyStringNew(0);
 dyStringAppendC(dy, '{');
 struct dlNode *node;
 for (node = head; !dlEnd(node); node = node->next)
     {
     if (node != head)
        dyStringAppendC(dy, ' ');
     struct wordInfo *info = node->val;
     dyStringAppend(dy, info->word);
     }
 dyStringAppendC(dy, '}');
 return dyStringCannibalize(&dy);
 }
 
-void wordTreeDump(int level, struct wordTree *wt, FILE *f)
+void wordTreeDump(int level, struct wordTree *wt, int maxChainSize, FILE *f)
 /* Write out wordTree to file. */
 {
 static char *words[64];
 int i;
 assert(level < ArraySize(words));
 
 words[level] = wt->info->word;
 if (!fullOnly || level == maxChainSize)
     {
     fprintf(f, "%d\t%d\t%d\t%d\t%f\t", 
 	    level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal);
     
     for (i=1; i<=level; ++i)
 	{
 	spaceOut(f, level*2);
 	fprintf(f, "%s ", words[i]);
 	}
     fprintf(f, "\n");
     }
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
-    wordTreeDump(level+1, child, f);
+    wordTreeDump(level+1, child, maxChainSize, f);
 }
 
 struct wordTree *pickRandomOnOutTarget(struct wordTree *list)
 /* Pick word from list randomly, but so that words with higher outTargets
  * are picked more often. */
 {
 struct wordTree *picked = NULL;
 
 /* Debug output. */
     {
     verbose(2, "   pickRandomOnOutTarget(");
     struct wordTree *wt;
     for (wt = list; wt != NULL; wt = wt->next)
         {
 	if (wt != list)
@@ -396,103 +397,124 @@
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     struct wordInfo *info = node->val;
     wt = wordTreeFindInList(wt->children, info);
     verbose(2, "   wordTreeFindInList(%s) = %p %s\n", info->word, wt, wordTreeString(wt));
     if (wt == NULL || wt->children == NULL)
         break;
     }
 struct wordTree *result = NULL;
 if (wt != NULL && wt->children != NULL)
     result = pickRandom(wt->children);
 return result;
 }
 
-struct wordTree *predictNext(struct wordTree *wt, struct dlNode *recent)
+struct wordTree *predictNextFromRecent(struct wordTree *wt, struct dlNode *recent)
 /* Predict next word given tree and recently used word list.  Will use all words in
  * recent list if can,  but if there is not data in tree, will back off, and use
  * progressively less previous words until ultimately it just picks a random
  * word. */
 {
 struct dlNode *node;
 
 for (node = recent; !dlEnd(node); node = node->next)
     {
     struct wordTree *result = predictNextFromAllPredecessors(wt, node);
     if (result != NULL)
         return result;
     }
 struct wordTree *topLevel = pickRandom(wt->children);
 verbose(2, "in predictNext(%s, %s) ", wordTreeString(wt), dlListFragWords(recent));
 verbose(2, "last resort pick of %s\n", topLevel->info->word);
 return topLevel;
 }
 
+struct dlNode *nodesFromTail(struct dlList *list, int count)
+/* Return count nodes from tail of list.  If list is shorter than count, it returns the
+ * whole list. */
+{
+int i;
+struct dlNode *node;
+for (i=0, node = list->tail; i<count; ++i)
+    {
+    if (dlStart(node))
+        return list->head;
+    node = node->prev;
+    }
+return node;
+}
+
+struct wordTree *predictNext(struct wordStore *store, struct dlList *past)
+/* Given input data store and what is known from the past, predict the next word. */
+{
+struct dlNode *recent = nodesFromTail(past, store->maxChainSize);
+return predictNextFromRecent(store->markovChains, recent);
+}
+
 void decrementOutputCounts(struct wordTree *wt)
 /* Decrement output count of self and parents. */
 {
 while (wt != NULL)
     {
     /* Decrement target count, but don't let it fall below sum of counts of all children. 
      * This can happen with incomplete data if we don't prevent it.  This
      * same code also prevents us from having negative outTarget. */
     int outTarget = wt->outTarget - 1;
     int kidSum = wordTreeSumOutTargets(wt->children);
     if (outTarget < kidSum)
         outTarget = kidSum;
     wt->outTarget = outTarget;
 
     /* Always bump outCount for debugging. */
     wt->outCount += 1;
     wt = wt->parent;
     }
 }
 
 static struct dlList *wordTreeGenerateList(struct wordStore *store, int maxSize, 
     struct wordTree *firstWord, int maxOutputWords)
 /* Make a list of words based on probabilities in tree. */
 {
-struct wordTree *wt = store->markovChains;
 struct dlList *ll = dlListNew();
 int chainSize = 0;
 int outputWords = 0;
 struct dlNode *chainStartNode = NULL;
 
 for (;;)
     {
     if (++outputWords > maxOutputWords)
         break;
     struct dlNode *node;
     struct wordTree *picked;
 
     /* Get next predicted word. */
     AllocVar(node);
     if (chainSize == 0)
         {
 	chainStartNode = node;
 	++chainSize;
 	picked = firstWord;
 	}
     else if (chainSize >= maxSize)
 	{
 	chainStartNode = chainStartNode->next;
-	picked = predictNext(wt, chainStartNode);
+	picked = predictNext(store, ll);
 	}
     else
 	{
-	picked = predictNext(wt, chainStartNode);
+	picked = predictNext(store, ll);
 	++chainSize;
 	}
 
     if (picked == NULL)
          break;
 
 
     /* Add word from whatever level we fetched back to our output chain. */
     node->val = picked->info;
     dlAddTail(ll, node);
 
     decrementOutputCounts(picked);
     }
 verbose(2, "totUseZeroCount = %d\n", totUseZeroCount);
 return ll;
@@ -513,64 +535,65 @@
     }
 carefulClose(&f);
 dlListFree(&ll);
 }
 
 
 static void wordTreeSort(struct wordTree *wt)
 /* Sort all children lists in tree. */
 {
 slSort(&wt->children, wordTreeCmpWord);
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     wordTreeSort(child);
 }
 
-struct wordStore *wordStoreNew()
+struct wordStore *wordStoreNew(int maxChainSize)
 /* Allocate and initialize a new word store. */
 {
 struct wordStore *store;
 AllocVar(store);
 store->infoHash = hashNew(0);
+store->maxChainSize = maxChainSize;
 return store;
 }
 
 struct wordInfo *wordStoreAdd(struct wordStore *store, char *word)
 /* Add word to store,  incrementing it's useCount if it's already there, otherwise
  * making up a new record for it. */
 {
 struct wordInfo *info = hashFindVal(store->infoHash, word);
 if (info == NULL)
     {
     AllocVar(info);
     hashAddSaveName(store->infoHash, word, info, &info->word);
     slAddHead(&store->infoList, info);
     }
 info->useCount += 1;
 return info;
 }
 
 struct wordStore *wordStoreForChainsInFile(char *fileName, int chainSize)
 /* Return a wordStore containing all words, and also all chains-of-words of length 
  * chainSize seen in file.  */
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
 
 /* We'll build up the tree starting with an empty root node. */
-struct wordStore *store = wordStoreNew();
+struct wordStore *store = wordStoreNew(chainSize);
 struct wordTree *wt = store->markovChains = wordTreeNew(wordStoreAdd(store, ""));	
 
 /* Loop through each line of file, treating it as a separate read. There's 
  * special cases at the beginning and end of line, and for short lines.  In the
  * main case we'll be maintaining a chain (doubly linked list) of maxChainSize words, 
  * popping off one word from the start, and adding one word to the end for each
  * new word we encounter. This list is added to the tree each iteration. */
 while (lineFileNext(lf, &line, NULL))
     {
     /* We'll keep a chain of three or so words in a doubly linked list. */
     struct dlNode *node;
     struct dlList *chain = dlListNew();
     int curSize = 0;
     int wordCount = 0;
 
@@ -609,36 +632,36 @@
  	addChainToTree(wt, chain);
     while ((node = dlPopHead(chain)) != NULL)
 	{
 	if (!dlEmpty(chain))
 	    addChainToTree(wt, chain);
 	freeMem(node);
 	}
     dlListFree(&chain);
     }
 lineFileClose(&lf);
 
 wordTreeSort(wt);  // Make output of chain file prettier
 return store;
 }
 
-void wordTreeWrite(struct wordTree *wt, char *fileName)
+void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName)
 /* Write out tree to file */
 {
 FILE *f = mustOpen(fileName, "w");
 fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n");
-wordTreeDump(0, wt, f);
+wordTreeDump(0, wt, maxChainSize, f);
 carefulClose(&f);
 }
 
 void wordStoreLoadMonomerOrder(struct wordStore *store, char *readsFile, char *fileName)
 /* Read in a file with one line for each monomer type, containing a word for each
  * monomer variant.  Requires all variants already be in store.  The readsFile is passed
  * just for nicer error reporting. */
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
 
 /* Set up variables we'll put results in in store. */
 store->typeHash = hashNew(0);
 store->typeList = NULL;
@@ -664,39 +687,39 @@
 verbose(2, "Added %d types containing %d words from %s\n", 
     slCount(store->typeList), store->typeHash->elCount, fileName);
 }
 
 void alphaChain(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct wordStore *store = wordStoreForChainsInFile(readsFile, maxChainSize);
 struct wordTree *wt = store->markovChains;
 wordStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
 wordTreeNormalize(wt, outSize, 1.0);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
-    wordTreeWrite(wt, fileName);
+    wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
-wordTreeGenerateFile(store, maxChainSize, pickRandom(wt->children), outSize, outFile);
+wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
-    wordTreeWrite(wt, fileName);
+    wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
 outSize = optionInt("outSize", outSize);
 fullOnly = optionExists("fullOnly");
 int seed = optionInt("seed", (int)time(0));
 srand(seed);
 alphaChain(argv[1], argv[2], argv[3]);