618f7cd74f08e3b8314729f9dca945627cb4d4e3
kent
  Thu May 3 11:16:25 2012 -0700
When we use up our target out count, revert to original input counts when making random choices.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index 8864b3d..c9e946e 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -1,53 +1,56 @@
 /* alphaChain - Predicts faux centromere sequences using a probablistic model. */
+
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dlist.h"
 
 /* Global vars - all of which can be set by command line options. */
 int maxChainSize = 3;
 int outSize = 10000;
-int minUse = 1;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs\n"
   "usage:\n"
   "   alphaChain alphaMonFile.fa significant_output.txt\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
+  "   -fullOnly - Only output chains of size above\n"
   "   -chain=fileName - Write out word chain to file\n"
+  "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
   "   -outSize=N - Output this many words.\n"
-  "   -fullOnly - Only output chains of size\n"
-  "   -minUse=N - Set minimum use in output chain, default %d\n"
-  , maxChainSize, minUse
+  "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
+  "             it will generate different results each time it's run.\n"
+  , maxChainSize
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
-   {"minUse", OPTION_INT},
    {"chain", OPTION_STRING},
+   {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"outSize", OPTION_INT},
+   {"seed", OPTION_INT},
    {NULL, 0},
 };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable.  At this chain size the
  * text 
  *     this is the black dog and the black cat
  * would have the chains 
  *     this is the 
  *     is the black
  *     the black dog
  *     black dog and
  *     dog and the
@@ -96,31 +99,31 @@
 
 struct wordTree *wordTreeNew(char *word)
 /* Create and return new wordTree element. */
 {
 struct wordTree *wt;
 AllocVar(wt);
 wt->word = cloneString(word);
 return wt;
 }
 
 int wordTreeCmpWord(const void *va, const void *vb)
 /* Compare two wordTree for slSort. */
 {
 const struct wordTree *a = *((struct wordTree **)va);
 const struct wordTree *b = *((struct wordTree **)vb);
-return strcmp(a->word, b->word);
+return cmpStringsWithEmbeddedNumbers(a->word, b->word);
 }
 
 struct wordTree *wordTreeFindInList(struct wordTree *list, char *word)
 /* Return wordTree element in list that has given word, or NULL if none. */
 {
 struct wordTree *wt;
 for (wt = list; wt != NULL; wt = wt->next)
     if (sameString(wt->word, word))
         break;
 return wt;
 }
 
 struct wordTree *wordTreeAddFollowing(struct wordTree *wt, char *word)
 /* Make word follow wt in tree.  If word already exists among followers
  * return it and bump use count.  Otherwise create new one. */
@@ -134,181 +137,217 @@
     }
 child->useCount += 1;
 return child;
 }
 
 int wordTreeSumUseCounts(struct wordTree *list)
 /* Sum up useCounts in list */
 {
 int total = 0;
 struct wordTree *wt;
 for (wt = list; wt != NULL; wt = wt->next)
     total += wt->useCount;
 return total;
 }
 
-int wordTreeChildrenUseCount(struct wordTree *wt)
-/* Return sum of useCounts of all children */
-{
-return wordTreeSumUseCounts(wt->children);
-}
-
 int wordTreeSumOutTargets(struct wordTree *list)
 /* Sum up useCounts in list */
 {
 int total = 0;
 struct wordTree *wt;
 for (wt = list; wt != NULL; wt = wt->next)
     total += wt->outTarget;
 return total;
 }
 
 void addChainToTree(struct wordTree *wt, struct dlList *chain)
 /* Add chain of words to tree. */
 {
 struct dlNode *node;
 wt->useCount += 1;
 for (node = chain->head; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     verbose(2, "  %s\n", word);
     wt = wordTreeAddFollowing(wt, word);
     }
 }
 
 void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal)
 /* Recursively set wt->normVal  and wt->outTarget so each branch gets its share */
 {
 wt->normVal = normVal;
 wt->outTarget = outTarget;
-int totalChildUses = wordTreeChildrenUseCount(wt);
+int childrenTotalUses = wordTreeSumUseCounts(wt->children);
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     {
-    double childRatio = (double)child->useCount / totalChildUses;
+    double childRatio = (double)child->useCount / childrenTotalUses;
     wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal);
     }
 }
 
 void wordTreeDump(int level, struct wordTree *wt, FILE *f)
 /* Write out wordTree to file. */
 {
 static char *words[64];
 int i;
 assert(level < ArraySize(words));
 
 words[level] = wt->word;
-if (wt->useCount >= minUse)
-    {
     if (!fullOnly || level == maxChainSize)
 	{
-	fprintf(f, "%d\t%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal);
+    fprintf(f, "%d\t%d\t%d\t%d\t%f\t", 
+	    level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal);
 	
 	for (i=1; i<=level; ++i)
             {
             spaceOut(f, level*2);
 	    fprintf(f, "%s ", words[i]);
             }
 	fprintf(f, "\n");
 	}
-    }
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     wordTreeDump(level+1, child, f);
 }
 
-int totUseZeroCount;  // debugging aid
-
 struct wordTree *pickRandomOnOutTarget(struct wordTree *list)
-/* Pick word from list randomly, but so that words more
- * commonly seen are picked more often. */
+/* Pick word from list randomly, but so that words with higher outTargets
+ * are picked more often. */
 {
 struct wordTree *picked = NULL;
 
 /* Figure out total number of outputs left, and a random number between 0 and that total. */
 int total = wordTreeSumOutTargets(list);
 if (total > 0)
     {
     int threshold = rand() % total; 
 
     /* Loop through list returning selection corresponding to random threshold. */
     int binStart = 0;
     struct wordTree *wt;
     for (wt = list; wt != NULL; wt = wt->next)
 	{
 	int size = wt->outTarget;
 	int binEnd = binStart + size;
 	if (threshold < binEnd)
 	    {
 	    picked = wt;
 	    break;
 	    }
 	binStart = binEnd;
 	}
     }
+return picked;
+}
+
+struct wordTree *pickRandomOnUseCounts(struct wordTree *list)
+/* Pick word from list randomly, but so that words with higher useCounts
+ * are picked more often.  Much like above routine, but a little simple
+ * since we know useCounts are non-zero. */
+{
+struct wordTree *picked = NULL;
+
+/* Figure out total number and a random number between 0 and that total. */
+int total = wordTreeSumUseCounts(list);
+assert(total > 0);
+int threshold = rand() % total; 
+
+/* Loop through list returning selection corresponding to random threshold. */
+int binStart = 0;
+struct wordTree *wt;
+for (wt = list; wt != NULL; wt = wt->next)
+    {
+    int size = wt->useCount;
+    int binEnd = binStart + size;
+    if (threshold < binEnd)
+	{
+	picked = wt;
+	break;
+	}
+    binStart = binEnd;
+    }
+assert(picked != NULL);
+return picked;
+}
+
+int totUseZeroCount = 0;
+
+struct wordTree *pickRandom(struct wordTree *list)
+/* Pick word from list randomly, but so that words more
+ * commonly seen are picked more often. */
+{
+struct wordTree *picked = pickRandomOnOutTarget(list);
 
 /* If did not find anything, that's ok. It can happen on legitimate input due to unevenness
- * of read coverage.  In this case we just return an arbitrary element. */
+ * of read coverage.  In this case we pick a random number based on original counts
+ * rather than normalized/counted down counts. */
 if (picked == NULL)
     {
-    picked = list;
-    totUseZeroCount += 1;
+    picked = pickRandomOnUseCounts(list);
+    ++totUseZeroCount;
     }
 return picked;
 }
 
 struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
 /* Predict next word given tree and recently used word list.  If tree doesn't
  * have statistics for what comes next given the words in list, then it returns
  * NULL. */
 {
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     char *word = node->val;
     wt = wordTreeFindInList(wt->children, word);
     if (wt == NULL || wt->children == NULL)
         break;
     }
 struct wordTree *result = NULL;
 if (wt != NULL && wt->children != NULL)
-    result = pickRandomOnOutTarget(wt->children);
+    result = pickRandom(wt->children);
 return result;
 }
 
 struct wordTree *predictNext(struct wordTree *wt, struct dlList *recent)
 /* Predict next word given tree and recently used word list.  Will use all words in
  * recent list if can,  but if there is not data in tree, will back off, and use
  * progressively less previous words until ultimately it just picks a random
  * word. */
 {
 struct dlNode *node;
 for (node = recent->head; !dlEnd(node); node = node->next)
     {
     struct wordTree *result = predictNextFromAllPredecessors(wt, node);
     if (result != NULL)
         return result;
     }
-return pickRandomOnOutTarget(wt->children); 
+return pickRandom(wt->children); 
 }
 
 void decrementOutputCounts(struct wordTree *wt)
 /* Decrement output count of self and parents. */
 {
 while (wt != NULL)
     {
-    wt->outTarget -= 1;
+    /* Decrement target count, but don't let it go below zero. */
+    int outTarget = wt->outTarget - 1;
+    if (outTarget >= 0)
+        wt->outTarget = outTarget;
+    
+    /* Always bump outCount for debugging. */
     wt->outCount += 1;
     wt = wt->parent;
     }
 }
 
 static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, 
 	int maxOutputWords, char *fileName)
 /* Go spew out a bunch of words according to probabilities in tree. */
 {
 FILE *f = mustOpen(fileName, "w");
 struct dlList *ll = dlListNew();
 int listSize = 0;
 int outputWords = 0;
 
 for (;;)
@@ -419,65 +458,65 @@
     /* Handle last few words in line, where can't make a chain of full size.  Also handles       
     * lines that have fewer than chain size words. */
     if (curSize < chainSize)
  	addChainToTree(wt, chain);
     while ((node = dlPopHead(chain)) != NULL)
 	{
 	if (!dlEmpty(chain))
 	    addChainToTree(wt, chain);
 	freeMem(node->val);
 	freeMem(node);
 	}
     dlListFree(&chain);
     }
 lineFileClose(&lf);
 
-wordTreeSort(wt);   // ugly debug
+wordTreeSort(wt);  // Make output of chain file prettier
+verbose(2, "totUseZeroCount = %d\n", totUseZeroCount);
 
 return wt;
 }
 
 void wordTreeWrite(struct wordTree *wt, char *fileName)
 /* Write out tree to file */
 {
 FILE *f = mustOpen(fileName, "w");
 fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n");
 wordTreeDump(0, wt, f);
 carefulClose(&f);
 }
 
 void alphaChain(char *inFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize);
 wordTreeNormalize(wt, outSize, 1.0);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, fileName);
     }
 
+wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->children), outSize, outFile);
 
-wordTreeGenerateFaux(wt, maxChainSize, pickRandomOnOutTarget(wt->children), outSize, outFile);
-
-uglyf("totUseZeroCount = %d\n", totUseZeroCount);
-wordTreeWrite(wt, "ugly.chain");
+if (optionExists("afterChain"))
+    {
+    char *fileName = optionVal("afterChain", NULL);
+    wordTreeWrite(wt, fileName);
+    }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
-#ifdef SOON
-/* Seed random number generator with time, so it doesn't always generate same sequence of #s */
-srand( (unsigned)time(0) );
-#endif /* SOON */
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
-minUse = optionInt("minUse", minUse);
 outSize = optionInt("outSize", outSize);
 fullOnly = optionExists("fullOnly");
+int seed = optionInt("seed", (int)time(0));
+srand(seed);
 alphaChain(argv[1], argv[2]);
 return 0;
 }