7d9c05653a7bb79b9bb3797b0a6753e3ed53748d
kent
  Wed Sep 12 10:58:53 2012 -0700
Adding pseudoCount.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 4f76dcd..46fdb2b 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -1,27 +1,28 @@
 /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
 
 /* Global vars - all of which can be set by command line options. */
+int pseudoCount = 1;
 int maxChainSize = 3;
 int outSize = 10000;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs\n"
   "usage:\n"
   "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
   "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n"
   "line, with the first word in the line being the read id and subsequent words the monomers\n"
   "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
@@ -36,30 +37,31 @@
   "   -outSize=N - Output this many words.\n"
   "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
   "             it will generate different results each time it's run.\n"
   , maxChainSize
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"chain", OPTION_STRING},
    {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"outSize", OPTION_INT},
    {"seed", OPTION_INT},
+   {"pseudoCount", OPTION_INT},
    {NULL, 0},
 };
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
@@ -223,33 +225,62 @@
 }
 
 void addChainToTree(struct wordTree *wt, struct dlList *chain)
 /* Add chain of words to tree. */
 {
 struct dlNode *node;
 wt->useCount += 1;
 for (node = chain->head; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     verbose(2, "  adding %s\n", monomer->word);
     wt = wordTreeAddFollowing(wt, monomer);
     }
 }
 
+int wordTreeAddPseudoCount(struct wordTree *wt, int pseudo)
+/* Add pseudo to all leaves of tree and propagate counts up to parents. */
+{
+if (wt->children == NULL)
+    {
+    wt->useCount += pseudo;
+    return wt->useCount;
+    }
+else
+    {
+    struct wordTree *child;
+    int oldChildTotal = 0;
+    for (child = wt->children; child != NULL; child = child->next)
+	oldChildTotal += child->useCount;
+    int oldDiff = wt->useCount - oldChildTotal;
+    if (oldDiff < 0) oldDiff = 0;  // Necessary in rare cases, not sure why.
+
+    int total = 0;
+    for (child = wt->children; child != NULL; child = child->next)
+	total += wordTreeAddPseudoCount(child, pseudo);
+    wt->useCount = total + oldDiff + pseudo;
+    return total;
+    }
+}
+
 void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal)
 /* Recursively set wt->normVal  and wt->outTarget so each branch gets its share */
 {
+if (pseudoCount > 0)
+    wordTreeAddPseudoCount(wt, pseudoCount);
+#ifdef SOON
+#endif /* SOON */
 wt->normVal = normVal;
 wt->outTarget = outTarget;
 int childrenTotalUses = wordTreeSumUseCounts(wt->children);
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     {
     double childRatio = (double)child->useCount / childrenTotalUses;
     wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal);
     }
 }
 
 char *wordTreeString(struct wordTree *wt)
 /* Return something like '(a b c)' where c would be the value at wt itself, and
  * a and b would be gotten by following parents. */
 {
@@ -535,31 +566,31 @@
 verbose(2, "predictFromPreviousTypes past all unknown types\n");
 return NULL;
 }
 
 
 struct wordTree *predictNext(struct alphaStore *store, struct dlList *past)
 /* Given input data store and what is known from the past, predict the next word. */
 {
 struct dlNode *recent = nodesFromTail(past, store->maxChainSize);
 struct wordTree *pick =  predictFromWordTree(store->markovChains, recent);
 if (pick == NULL)
     pick = predictFromPreviousTypes(store, past);
 if (pick == NULL)
     {
     pick = pickRandom(store->markovChains->children);
-    verbose(2, "in predictNext() last resort pick of %s\n", pick->monomer->word);
+    warn("in predictNext() last resort pick of %s", pick->monomer->word);
     }
 return pick;
 }
 
 void decrementOutputCountsInTree(struct wordTree *wt)
 /* Decrement output count of self and parents. */
 {
 while (wt != NULL)
     {
     /* Decrement target count, but don't let it fall below sum of counts of all children. 
      * This can happen with incomplete data if we don't prevent it.  This
      * same code also prevents us from having negative outTarget. */
     int outTarget = wt->outTarget - 1;
     int kidSum = wordTreeSumOutTargets(wt->children);
     if (outTarget < kidSum)
@@ -1209,20 +1240,21 @@
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
 outSize = optionInt("outSize", outSize);
 fullOnly = optionExists("fullOnly");
+pseudoCount = optionInt("pseudoCount", pseudoCount);
 int seed = optionInt("seed", (int)time(0));
 srand(seed);
 alphaAsm(argv[1], argv[2], argv[3]);
 return 0;
 }