de78c1c906a7f58028f633049ab08f43ea96d198
kent
  Wed Apr 25 17:12:13 2012 -0700
Adding in dummy nodes to help deal with missing data from reads not being infinite.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index 1e68c98..d59c43b 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -21,30 +21,31 @@
 errAbort(
   "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs\n"
   "usage:\n"
   "   alphaChain alphaMonFile.fa significant_output.txt\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -chain=fileName - Write out word chain to file\n"
   "   -maxNonsenseSize=N - Keep nonsense output to this many words.\n"
   "   -fullOnly - Only output chains of size\n"
   "   -minUse=N - Set minimum use in output chain, default %d\n"
   , maxChainSize, minUse
   );
 }
 
+char *noData  = "n/a";   // Used to indicate a dummy node representing missing data
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"minUse", OPTION_INT},
    {"chain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"maxNonsenseSize", OPTION_INT},
    {NULL, 0},
 };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable.  At this chain size the
  * text 
@@ -85,117 +86,124 @@
  *
  * There are special cases in the code so that the first and last words in the text get included 
  * as much as possible in the tree. 
  *
  * Once the program has build up the wordTree, it can output it in a couple of fashions. */
 
 struct wordTree
 /* A node in a tree of words.  The head of the tree is a node with word value the empty string. */
     {
     struct rbTree *following;	/* Contains words (as struct wordTree) that follow us. */
     struct wordTree *parent;    /* Parent of this node or NULL for root. */
     char *word;			/* The word itself including comma, period etc. */
     int useCount;		/* Number of times word used. */
     int outputCount;            /* each level of tree and initialize that to a normalized version of it. */
     double normVal;             /* value to place the normalization value */    
-    int missingFromChildren;    /* Uses not in children. */
     };
 
 struct wordTree *wordTreeNew(char *word)
 /* Create and return new wordTree element. */
 {
 struct wordTree *wt;
 AllocVar(wt);
 wt->word = cloneString(word);
 return wt;
 }
 
 int wordTreeCmpWord(void *va, void *vb)
 /* Compare two wordTree. */
 {
 struct wordTree *a = va, *b = vb;
 return strcmp(a->word, b->word);
 }
+
+struct wordTree *wordTreeAddFollowing(struct wordTree *wt, char *word, 
+	struct lm *lm, struct rbTreeNode **stack)
+/* Make word follow wt in tree.  If word already exists among followers
+ * return it and bump use count.  Otherwise create new one. */
+{
+struct wordTree *w;   /* Points to following element if any */
+if (wt->following == NULL)
+    {
+    /* Allocate new if you've never seen it before. */
+    wt->following = rbTreeNewDetailed(wordTreeCmpWord, lm, stack);
+    w = NULL;
+    }
+else
+    {
+    /* Find word in existing tree */
+    struct wordTree key;
+    key.word = word;
+    w = rbTreeFind(wt->following, &key);
+    }
+if (w == NULL)
+    {
+    w = wordTreeNew(word);
+    w->parent = wt;
+    rbTreeAdd(wt->following, w);
+    }
+w->useCount += 1;
+return w;
+}
+
 int wordTreeChildrenUseCount(struct wordTree *wt)
 /* Return sum of useCounts of all children */
 {
 struct rbTree *following = wt->following;
 if (following == NULL)
     return 0;
 struct slRef *childList = rbTreeItems(following);
 struct slRef *childRef;
 int total = 0;
 for (childRef = childList; childRef != NULL; childRef = childRef->next)
     {
     struct wordTree *child = childRef->val;
     total += child->useCount;
     }
 slFreeList(&childList);
 return total;
 }
+
 int wordTreeCountNotInChildren(struct wordTree *wt)
 /* Count up useCounts of all children and return difference between this and our own useCount. */
 {
 return wt->useCount - wordTreeChildrenUseCount(wt);
 }
-void wordTreeSetMissing(struct wordTree *wt)
+
+void wordTreeSetMissing(struct wordTree *wt, int level, struct lm *lm, struct rbTreeNode **stack)
 /* Set missingFromChildren in self and all children. */
 {
-wt->missingFromChildren = wordTreeCountNotInChildren(wt);
+int missingFromChildren = wordTreeCountNotInChildren(wt);
+if (missingFromChildren > 0)
+    {
+    struct wordTree *missing = wordTreeAddFollowing(wt, noData, lm, stack);
+    missing->useCount = missingFromChildren;
+    }
 struct rbTree *following = wt->following;
-if (following != NULL)
+if (following != NULL && level < maxChainSize)
     {
     struct slRef *childList = rbTreeItems(following);
     struct slRef *childRef;
     for (childRef = childList; childRef != NULL; childRef = childRef->next)
         {
         struct wordTree *child = childRef->val;
-        wordTreeSetMissing(child);
+        wordTreeSetMissing(child, level+1, lm, stack);
         }
     slFreeList(&childList);
     }
 }
 
-struct wordTree *wordTreeAddFollowing(struct wordTree *wt, char *word, 
-	struct lm *lm, struct rbTreeNode **stack)
-/* Make word follow wt in tree.  If word already exists among followers
- * return it and bump use count.  Otherwise create new one. */
-{
-struct wordTree *w;   /* Points to following element if any */
-if (wt->following == NULL)
-    {
-    /* Allocate new if you've never seen it before. */
-    wt->following = rbTreeNewDetailed(wordTreeCmpWord, lm, stack);
-    w = NULL;
-    }
-else
-    {
-    /* Find word in existing tree */
-    struct wordTree key;
-    key.word = word;
-    w = rbTreeFind(wt->following, &key);
-    }
-if (w == NULL)
-    {
-    w = wordTreeNew(word);
-    w->parent = wt;
-    rbTreeAdd(wt->following, w);
-    }
-w->useCount += 1;
-return w;
-}
-
 void addChainToTree(struct wordTree *wt, struct dlList *chain, 
 	struct lm *lm, struct rbTreeNode **stack)
 /* Add chain of words to tree. */
 {
 struct dlNode *node;
 wt->useCount += 1;
 for (node = chain->head; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     verbose(2, "  %s\n", word);
     wt = wordTreeAddFollowing(wt, word, lm, stack);
     }
 }
 
 void wordTreeNormalize(struct wordTree *wt, double normVal)
@@ -204,30 +212,31 @@
 wt->normVal = normVal;
 wt->outputCount = normVal * maxNonsenseSize;
 if (wt->following != NULL)
     {
     struct slRef *list = rbTreeItems(wt->following);
     struct slRef *ref;
     for (ref = list; ref !=NULL; ref = ref->next)
 	{
 	struct wordTree *child = ref->val;
 	double childRatio = (double)child->useCount / wt->useCount;
 	wordTreeNormalize(child, childRatio*normVal);
 	}
     slFreeList(&list);
     }
 }
+
 void wordTreeDeadEnd(struct wordTree *wt)
 /* tally and include incomplete branches */
 {
 /* int levelNormVal = 0;
  * int levelCount = 0;
  * int sumNormVal = 0;
  * int sumCount = 0;
  * int diffNormVal = 0;
  * int diffCount=0;
  * Loop pseudocode
  * work recursively through level 1-> 3, start at root of tree
  * foreach word at level 1
  * {
  *   sumCount = 0
  *   sumNormVal = 0
@@ -247,44 +256,45 @@
  *    sumCount += wt->outputCount
  *    sumNormVal  += wt->normVal
  *    ** RECURSIVE level 2 + 1 here **
  *   }
  *   diffCount = levelCount - sumCount
  *   diffNormVal = levelNormVal - sumNormVal
  *   if(diffCount > 0)
  *   {
  *   create level 2:
  *     wt->normVal = diffNormVal
  *     wt->word = 'NaN'
  *     wt->outputVal = diffCount
  *   }
  */
 }
+
 void wordTreeDump(int level, struct wordTree *wt, FILE *f)
 /* Write out wordTree to file. */
 {
 static char *words[64];
 struct slRef *list, *ref;
 int i;
 assert(level < ArraySize(words));
 
 words[level] = wt->word;
 if (wt->useCount >= minUse)
     {
     if (!fullOnly || level == maxChainSize)
 	{
-	fprintf(f, "%d\t%d\t%d\t%f\t%d\t", level, wt->useCount, wt->outputCount, wt->normVal, wt->missingFromChildren);
+	fprintf(f, "%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outputCount, wt->normVal);
 	
 	for (i=1; i<=level; ++i)
             {
             spaceOut(f, level*2);
 	    fprintf(f, "%s ", words[i]);
             }
 	fprintf(f, "\n");
 	}
     }
 if (wt->following != NULL)
     {
     list = rbTreeItems(wt->following);
     for (ref = list; ref != NULL; ref = ref->next)
         wordTreeDump(level+1, ref->val, f);
     slFreeList(&list);
@@ -464,53 +474,56 @@
 	     * it from head to tail of list. */
 	    node = dlPopHead(chain);
 	    freeMem(node->val);
 	    node->val = cloneString(word);
 	    dlAddTail(chain, node);
 	    addChainToTree(wt, chain, lm, stack);
 	    }
 	++wordCount;
 	}
     /* Handle last few words in line, where can't make a chain of full size.  Also handles       
      * lines that have fewer than chain size words. */
     if (curSize < chainSize)
       addChainToTree(wt, chain, lm, stack);
     while ((node = dlPopHead(chain)) != NULL)
       {
+	if (!dlEmpty(chain))
 	addChainToTree(wt, chain, lm, stack);
 	freeMem(node->val);
 	freeMem(node);
       }
     dlListFree(&chain);
     }
-
 lineFileClose(&lf);
+
+/* Add in additional information to help traverse tree . */
+wordTreeSetMissing(wt, 1, lm, stack);
+wordTreeNormalize(wt, 1.0);
 return wt;
 }
 
 void alphaChain(char *inFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct lm *lm = lmInit(0);
 struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize, lm);
-wordTreeNormalize(wt, 1.0);
-wordTreeSetMissing(wt);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     FILE *f = mustOpen(fileName, "w");
+    fprintf(f, "#level\tuseCount\toutputCount\tnormVal\tmonomers\n");
     wordTreeDump(0, wt, f);
     carefulClose(&f);
     }
 
 
  FILE *f = mustOpen(outFile, "w");
  int maxSize = min(wt->useCount, maxNonsenseSize);
 
  /* KEH NOTES: controls how many words we emit */
 
  wordTreeMakeNonsense(wt, maxChainSize, pickRandomWord(wt->following), maxSize, f);
  carefulClose(&f);
     
 
 lmCleanup(&lm);	// Not really needed since we're just going to exit.