src/kehayden/alphaChain/alphaChain.c d1deca879029b44519864c9178ec35db15c816d4

d1deca879029b44519864c9178ec35db15c816d4
kent
  Thu Sep 6 15:30:59 2012 -0700
Making program look at the monomer order file to help it's predictions in case where it has nothing it can use in the markov chains.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index 92b9706..3414909 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -376,94 +376,182 @@
  * commonly seen are picked more often. */
 {
 struct wordTree *picked = pickRandomOnOutTarget(list);
 
 /* If did not find anything, that's ok. It can happen on legitimate input due to unevenness
  * of read coverage.  In this case we pick a random number based on original counts
  * rather than normalized/counted down counts. */
 if (picked == NULL)
     {
     picked = pickRandomOnUseCounts(list);
     ++totUseZeroCount;
     }
 return picked;
 }
 
+struct wordInfo *pickRandomFromType(struct wordType *type)
+/* Pick a random word, weighted by outTarget, from all available of given type */
+{
+/* Figure out total on list */
+int total = 0;
+struct wordInfoRef *ref;
+for (ref = type->list; ref != NULL; ref = ref->next)
+    total += ref->val->outTarget;
+
+/* Loop through list returning selection corresponding to random threshold. */
+int threshold = rand() % total; 
+int binStart = 0;
+for (ref = type->list; ref != NULL; ref = ref->next)
+    {
+    struct wordInfo *info = ref->val;
+    int size = info->outTarget;
+    int binEnd = binStart + size;
+    if (threshold < binEnd)
+	return info;
+    binStart = binEnd;
+    }
+
+verbose(2, "Fell off end in pickRandomFromType\n");
+return type->list->val;	    // Fall back position (necessary?) just first in list
+}
+
 struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
 /* Predict next word given tree and recently used word list.  If tree doesn't
  * have statistics for what comes next given the words in list, then it returns
  * NULL. */
 {
 verbose(2, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list));
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     struct wordInfo *info = node->val;
     wt = wordTreeFindInList(wt->children, info);
     verbose(2, "   wordTreeFindInList(%s) = %p %s\n", info->word, wt, wordTreeString(wt));
     if (wt == NULL || wt->children == NULL)
         break;
     }
 struct wordTree *result = NULL;
 if (wt != NULL && wt->children != NULL)
     result = pickRandom(wt->children);
 return result;
 }
 
-struct wordTree *predictNextFromRecent(struct wordTree *wt, struct dlNode *recent)
+struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent)
 /* Predict next word given tree and recently used word list.  Will use all words in
  * recent list if can,  but if there is not data in tree, will back off, and use
  * progressively less previous words until ultimately it just picks a random
  * word. */
 {
 struct dlNode *node;
 
 for (node = recent; !dlEnd(node); node = node->next)
     {
     struct wordTree *result = predictNextFromAllPredecessors(wt, node);
     if (result != NULL)
         return result;
     }
-struct wordTree *topLevel = pickRandom(wt->children);
-verbose(2, "in predictNext(%s, %s) ", wordTreeString(wt), dlListFragWords(recent));
-verbose(2, "last resort pick of %s\n", topLevel->info->word);
-return topLevel;
+return NULL;
 }
 
 struct dlNode *nodesFromTail(struct dlList *list, int count)
 /* Return count nodes from tail of list.  If list is shorter than count, it returns the
  * whole list. */
 {
 int i;
 struct dlNode *node;
 for (i=0, node = list->tail; i<count; ++i)
     {
     if (dlStart(node))
         return list->head;
     node = node->prev;
     }
 return node;
 }
 
+struct wordType *advanceTypeWithWrap(struct wordType *typeList, struct wordType *startType,
+    int amountToAdvance)
+/* Skip forward amountToAdvance in typeList from startType.  If get to end of list start
+ * again at the beginning */
+{
+int i;
+struct wordType *type = startType;
+for (i=0; i<amountToAdvance; ++i)
+    {
+    type = type->next;
+    if (type == NULL)
+	type = typeList;
+    }
+return type;
+}
+
+struct wordTree *predictFromPreviousTypes(struct wordStore *store, struct dlList *past)
+/* Predict next based on general pattern of monomer order. */
+{
+int maxToLookBack = 3;
+    { /* Debugging block */
+    struct dlNode *node = nodesFromTail(past, maxToLookBack);
+    verbose(3, "predictFromPreviousTypes(");
+    for (; !dlEnd(node); node = node->next)
+        {
+	struct wordInfo *info = node->val;
+	verbose(3, "%s, ", info->word);
+	}
+    verbose(3, ")\n");
+    }
+int pastDepth;
+struct dlNode *node;
+for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth)
+    {
+    if (dlStart(node))
+	{
+	verbose(2, "predictFromPreviousTypes with empty past\n");
+	return NULL;
+	}
+    struct wordInfo *info = node->val;
+    struct wordType *prevType = hashFindVal(store->typeHash, info->word);
+    if (prevType != NULL)
+        {
+	struct wordType *curType = advanceTypeWithWrap(store->typeList, prevType, pastDepth);
+	struct wordInfo *curInfo = pickRandomFromType(curType);
+	struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo);
+	verbose(3, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", 
+	    pastDepth, curInfo->word, curTree, curTree->info->word);
+	return curTree;
+	}
+    node = node->prev;
+    }
+verbose(2, "predictFromPreviousTypes past all unknown types\n");
+return NULL;
+}
+
+
 struct wordTree *predictNext(struct wordStore *store, struct dlList *past)
 /* Given input data store and what is known from the past, predict the next word. */
 {
 struct dlNode *recent = nodesFromTail(past, store->maxChainSize);
-return predictNextFromRecent(store->markovChains, recent);
+struct wordTree *pick =  predictFromWordTree(store->markovChains, recent);
+if (pick == NULL)
+    pick = predictFromPreviousTypes(store, past);
+if (pick == NULL)
+    {
+    pick = pickRandom(store->markovChains->children);
+    verbose(2, "in predictNext() last resort pick of %s\n", pick->info->word);
+    }
+return pick;
 }
 
-void decrementOutputCounts(struct wordTree *wt)
+void decrementOutputCountsInTree(struct wordTree *wt)
 /* Decrement output count of self and parents. */
 {
 while (wt != NULL)
     {
     /* Decrement target count, but don't let it fall below sum of counts of all children. 
      * This can happen with incomplete data if we don't prevent it.  This
      * same code also prevents us from having negative outTarget. */
     int outTarget = wt->outTarget - 1;
     int kidSum = wordTreeSumOutTargets(wt->children);
     if (outTarget < kidSum)
         outTarget = kidSum;
     wt->outTarget = outTarget;
 
     /* Always bump outCount for debugging. */
     wt->outCount += 1;
@@ -499,34 +587,37 @@
 	{
 	chainStartNode = chainStartNode->next;
 	picked = predictNext(store, ll);
 	}
     else
 	{
 	picked = predictNext(store, ll);
 	++chainSize;
 	}
 
     if (picked == NULL)
          break;
 
 
     /* Add word from whatever level we fetched back to our output chain. */
-    node->val = picked->info;
+    struct wordInfo *info = picked->info;
+    node->val = info;
     dlAddTail(ll, node);
 
-    decrementOutputCounts(picked);
+    decrementOutputCountsInTree(picked);
+    info->outTarget -= 1;
+    info->outCount += 1;
     }
 verbose(2, "totUseZeroCount = %d\n", totUseZeroCount);
 return ll;
 }
 
 static void wordTreeGenerateFile(struct wordStore *store, int maxSize, struct wordTree *firstWord, 
 	int maxOutputWords, char *fileName)
 /* Create file containing words base on tree probabilities.  The wordTreeGenerateList does
  * most of work. */
 {
 struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
 FILE *f = mustOpen(fileName, "w");
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {
@@ -671,42 +762,60 @@
     struct wordType *type;
     AllocVar(type);
     slAddHead(&store->typeList, type);
     while ((word = nextWord(&line)) != NULL)
         {
 	struct wordInfo *info = hashFindVal(store->infoHash, word);
 	if (info == NULL)
 	    errAbort("%s is in %s but not %s", word, lf->fileName, readsFile);
 	struct wordInfoRef *ref;
 	AllocVar(ref);
 	ref->val = info;
 	slAddHead(&type->list, ref);
 	hashAddUnique(store->typeHash, word, type);
 	}
     }
+slReverse(&store->typeList);
 lineFileClose(&lf);
 verbose(2, "Added %d types containing %d words from %s\n", 
     slCount(store->typeList), store->typeHash->elCount, fileName);
 }
 
+void wordInfoListNormalise(struct wordInfo *list, int totalCount, int outputSize)
+/* Set outTarget field in all of list to be normalized to outputSize */
+{
+struct wordInfo *info;
+double scale = outputSize/totalCount;
+for (info = list; info != NULL; info = info->next)
+    info->outTarget = round(scale * info->useCount);
+}
+
+void wordStoreNormalize(struct wordStore *store, int outputSize)
+/* Set up output counts on each word to make sure it gets it's share of output size */
+{
+struct wordTree *wt = store->markovChains;
+wordTreeNormalize(wt, outputSize, 1.0);
+wordInfoListNormalise(store->infoList, wt->useCount, outputSize);
+}
+
 void alphaChain(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct wordStore *store = wordStoreForChainsInFile(readsFile, maxChainSize);
 struct wordTree *wt = store->markovChains;
 wordStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
-wordTreeNormalize(wt, outSize, 1.0);
+wordStoreNormalize(store, outSize);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
 wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 }