src/kehayden/alphaChain/alphaChain.c 5696f8e630ba445a0b16255d7a7a76298158a3d1

5696f8e630ba445a0b16255d7a7a76298158a3d1
kent
  Tue Jun 12 20:15:22 2012 -0700
Adding a bunch of debugging code.  Preventing higher level counts on output from follang below the children's counts.  Thinking about (but not yet implementing) change so ends of input lines are not put in a 2-mers and 1-mers when lines are 3 or more long.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index c9e946e..ff18a1c 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -1,21 +1,22 @@
 /* alphaChain - Predicts faux centromere sequences using a probablistic model. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
+#include "dystring.h"
 #include "dlist.h"
 
 /* Global vars - all of which can be set by command line options. */
 int maxChainSize = 3;
 int outSize = 10000;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs\n"
   "usage:\n"
   "   alphaChain alphaMonFile.fa significant_output.txt\n"
@@ -174,30 +175,71 @@
 
 void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal)
 /* Recursively set wt->normVal  and wt->outTarget so each branch gets its share */
 {
 wt->normVal = normVal;
 wt->outTarget = outTarget;
 int childrenTotalUses = wordTreeSumUseCounts(wt->children);
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     {
     double childRatio = (double)child->useCount / childrenTotalUses;
     wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal);
     }
 }
 
+char *wordTreeString(struct wordTree *wt)
+/* Return something like '(a b c)' where c would be the value at wt itself, and
+ * a and b would be gotten by following parents. */
+{
+struct slName *list = NULL, *el;
+for (;wt != NULL; wt = wt->parent)
+    {
+    if (!isEmpty(wt->word))   // Avoid blank great grandparent
+	slNameAddHead(&list, wt->word);
+    }
+
+struct dyString *dy = dyStringNew(0);
+dyStringAppendC(dy, '(');
+for (el = list; el != NULL; el = el->next)
+   {
+   dyStringPrintf(dy, "%s", el->name);
+   if (el->next != NULL)
+       dyStringAppendC(dy, ' ');
+   }
+dyStringAppendC(dy, ')');
+slFreeList(&list);
+return dyStringCannibalize(&dy);
+}
+
+char *dlListFragWords(struct dlNode *head)
+/* Return string containing all words in list pointed to by head. */
+{
+struct dyString *dy = dyStringNew(0);
+dyStringAppendC(dy, '{');
+struct dlNode *node;
+for (node = head; !dlEnd(node); node = node->next)
+    {
+    if (node != head)
+       dyStringAppendC(dy, ' ');
+    char *word = node->val;
+    dyStringAppend(dy, word);
+    }
+dyStringAppendC(dy, '}');
+return dyStringCannibalize(&dy);
+}
+
 void wordTreeDump(int level, struct wordTree *wt, FILE *f)
 /* Write out wordTree to file. */
 {
 static char *words[64];
 int i;
 assert(level < ArraySize(words));
 
 words[level] = wt->word;
 if (!fullOnly || level == maxChainSize)
     {
     fprintf(f, "%d\t%d\t%d\t%d\t%f\t", 
 	    level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal);
     
     for (i=1; i<=level; ++i)
 	{
@@ -205,46 +247,62 @@
 	fprintf(f, "%s ", words[i]);
 	}
     fprintf(f, "\n");
     }
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     wordTreeDump(level+1, child, f);
 }
 
 struct wordTree *pickRandomOnOutTarget(struct wordTree *list)
 /* Pick word from list randomly, but so that words with higher outTargets
  * are picked more often. */
 {
 struct wordTree *picked = NULL;
 
+/* Debug output. */
+    {
+    uglyf("   pickRandomOnOutTarget(");
+    struct wordTree *wt;
+    for (wt = list; wt != NULL; wt = wt->next)
+        {
+	if (wt != list)
+	    uglyf(" ");
+	uglyf("%s:%d", wt->word, wt->outTarget);
+	}
+    uglyf(") total %d\n", wordTreeSumOutTargets(list));
+    }
+
 /* Figure out total number of outputs left, and a random number between 0 and that total. */
 int total = wordTreeSumOutTargets(list);
 if (total > 0)
     {
     int threshold = rand() % total; 
+    uglyf("      threshold %d\n", threshold);
 
     /* Loop through list returning selection corresponding to random threshold. */
     int binStart = 0;
     struct wordTree *wt;
     for (wt = list; wt != NULL; wt = wt->next)
 	{
 	int size = wt->outTarget;
 	int binEnd = binStart + size;
+	uglyf("      %s size %d, binEnd %d\n", wt->word, size, binEnd);
 	if (threshold < binEnd)
 	    {
 	    picked = wt;
+	    uglyf("      picked %s\n", wt->word);
 	    break;
 	    }
 	binStart = binEnd;
 	}
     }
 return picked;
 }
 
 struct wordTree *pickRandomOnUseCounts(struct wordTree *list)
 /* Pick word from list randomly, but so that words with higher useCounts
  * are picked more often.  Much like above routine, but a little simple
  * since we know useCounts are non-zero. */
 {
 struct wordTree *picked = NULL;
 
@@ -283,68 +341,78 @@
  * of read coverage.  In this case we pick a random number based on original counts
  * rather than normalized/counted down counts. */
 if (picked == NULL)
     {
     picked = pickRandomOnUseCounts(list);
     ++totUseZeroCount;
     }
 return picked;
 }
 
 struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
 /* Predict next word given tree and recently used word list.  If tree doesn't
  * have statistics for what comes next given the words in list, then it returns
  * NULL. */
 {
+uglyf(" predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list));
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     wt = wordTreeFindInList(wt->children, word);
+    uglyf("   wordTreeFindInList(%s) = %p %s\n", word, wt, wordTreeString(wt));
     if (wt == NULL || wt->children == NULL)
         break;
     }
 struct wordTree *result = NULL;
 if (wt != NULL && wt->children != NULL)
     result = pickRandom(wt->children);
 return result;
 }
 
 struct wordTree *predictNext(struct wordTree *wt, struct dlList *recent)
 /* Predict next word given tree and recently used word list.  Will use all words in
  * recent list if can,  but if there is not data in tree, will back off, and use
  * progressively less previous words until ultimately it just picks a random
  * word. */
 {
 struct dlNode *node;
+
 for (node = recent->head; !dlEnd(node); node = node->next)
     {
     struct wordTree *result = predictNextFromAllPredecessors(wt, node);
     if (result != NULL)
         return result;
     }
-return pickRandom(wt->children); 
+struct wordTree *topLevel = pickRandom(wt->children);
+verbose(2, "in predictNext(%s, %s) ", wordTreeString(wt), dlListFragWords(recent->head));
+verbose(2, "last resort pick of %s\n", topLevel->word);
+return topLevel;
 }
 
 void decrementOutputCounts(struct wordTree *wt)
 /* Decrement output count of self and parents. */
 {
 while (wt != NULL)
     {
-    /* Decrement target count, but don't let it go below zero. */
+    /* Decrement target count, but don't let it fall below sum of counts of all children. 
+     * This can happen with incomplete data if we don't prevent it.  This
+     * same code also prevents us from having negative outTarget. */
     int outTarget = wt->outTarget - 1;
-    if (outTarget >= 0)
+    int kidSum = wordTreeSumOutTargets(wt->children);
+    if (outTarget < kidSum)
+        outTarget = kidSum;
         wt->outTarget = outTarget;
     
     /* Always bump outCount for debugging. */
     wt->outCount += 1;
     wt = wt->parent;
     }
 }
 
 static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, 
 	int maxOutputWords, char *fileName)
 /* Go spew out a bunch of words according to probabilities in tree. */
 {
 FILE *f = mustOpen(fileName, "w");
 struct dlList *ll = dlListNew();
 int listSize = 0;
@@ -356,31 +424,30 @@
         break;
     struct dlNode *node;
     struct wordTree *picked;
 
     /* Get next predicted word. */
     if (listSize == 0)
         {
 	AllocVar(node);
 	++listSize;
 	picked = firstWord;
 	}
     else if (listSize >= maxSize)
 	{
 	node = dlPopHead(ll);
 	picked = predictNext(wt, ll);
-//         decrementOutputCounts(picked);   // ugly placement?
 	}
     else
 	{
 	picked = predictNext(wt, ll);
 	AllocVar(node);
 	++listSize;
 	}
 
     if (picked == NULL)
          break;
 
 
     /* Add word from whatever level we fetched back to our chain of up to maxChainSize. */
     node->val = picked->word;
     dlAddTail(ll, node);