753cbedeb38830d6d15779ebded2d75c0640a521
kent
  Wed Jun 19 13:17:02 2013 -0700
Moving alphaAsm to linearSat, a better name.
diff --git src/kehayden/linearSat/alphaAsm.c src/kehayden/linearSat/alphaAsm.c
new file mode 100644
index 0000000..9ce4ab6
--- /dev/null
+++ src/kehayden/linearSat/alphaAsm.c
@@ -0,0 +1,1832 @@
+/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+ * been parsed into various repeat monomer variants.  Cycles of these variants tend to
+ * form higher order repeats. */
+
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "dystring.h"
+#include "dlist.h"
+#include "obscure.h"
+#include "errabort.h"
+
+/* Global vars - all of which can be set by command line options. */
+int pseudoCount = 1;
+int maxChainSize = 3;
+int initialOutSize = 10000;
+int maxOutSize;
+int maxToMiss = 0;
+boolean fullOnly = FALSE;
+boolean betweens = FALSE;
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
+  "of HuRef satellite graphs based on Sanger style reads.\n"
+  "usage:\n"
+  "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
+  "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n"
+  "line, with the first word in the line being the read id and subsequent words the monomers\n"
+  "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
+  "each variant. The lines are in the same order the monomers appear, with the last line\n"
+  "being followed in order by the first since they repeat.   The output.txt contains one line\n"
+  "for each monomer in the output, just containing the monomer symbol.\n"
+  "options:\n"
+  "   -size=N - Set max chain size, default %d\n"
+  "   -fullOnly - Only output chains of size above\n"
+  "   -chain=fileName - Write out word chain to file\n"
+  "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
+  "   -initialOutSize=N - Output this many words. Default %d\n"
+  "   -maxOutSize=N - Expand output up to this many words if not all monomers output initially\n"
+  "   -maxToMiss=N - How many monomers may be missing from output. Default %d \n"
+  "   -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n"
+  "                observed.  Default %d\n"
+  "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
+  "             it will generate different results each time it's run.\n"
+  "   -betweens - Add <m> lines in output that indicate level of markov model used join\n"
+  "   -unsubbed=fileName - Write output before substitution of missing monomers to file\n"
+  , maxChainSize, initialOutSize, maxToMiss, pseudoCount
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {"size", OPTION_INT},
+   {"chain", OPTION_STRING},
+   {"afterChain", OPTION_STRING},
+   {"fullOnly", OPTION_BOOLEAN},
+   {"initialOutSize", OPTION_INT},
+   {"maxOutSize", OPTION_INT},
+   {"maxToMiss", OPTION_INT},
+   {"pseudoCount", OPTION_INT},
+   {"seed", OPTION_INT},
+   {"unsubbed", OPTION_STRING},
+   {"betweens", OPTION_BOOLEAN},
+   {NULL, 0},
+};
+
+/* Some structures to keep track of words (which correspond to alpha satellite monomers)
+ * seen in input. */
+
+struct monomer
+/* Basic information on a monomer including how many times it is seen in input and output
+ * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
+    {
+    struct monomer *next;	/* Next in list of all words. */
+    char *word;			/* The word used to represent monomer.  Not allocated here. */
+    int useCount;		/* Number of times used. */
+    int outTarget;		/* Number of times want to output word. */
+    int outCount;		/* Number of times have output word so far. */
+    struct monomerType *type;	/* The type of the monomer. */
+    struct slRef *readList;	/* List of references to reads this is in. */
+    int subbedOutCount;		/* Output count after substitution. */
+    };
+
+struct monomerRef
+/* A reference to a monomer. */
+    {
+    struct monomerRef *next;	/* Next in list */
+    struct monomer *val;	/* The word referred to. */
+    };
+
+struct monomerType
+/* A collection of words of the same type - or monomers of same type. */
+    {
+    struct monomerType *next;   /* Next monomerType */
+    char *name;			/* Short name of type */
+    struct monomerRef *list;	    /* List of all words of that type */
+    };
+
+struct alphaRead 
+/* A read that's been parsed into alpha variants. */
+    {
+    struct alphaRead *next;	/* Next in list */
+    char *name;			/* Id of read, not allocated here. */
+    struct monomerRef *list;	/* List of alpha subunits */
+    };
+
+struct alphaStore
+/* Stores info on all monomers */
+    {
+    struct monomer *monomerList;   /* List of words, fairly arbitrary order. */
+    struct hash *monomerHash;       /* Hash of monomer, keyed by word. */
+    struct alphaRead *readList;	 /* List of all reads. */
+    struct hash *readHash;	 /* Hash of all reads keyed by read ID. */
+    struct wordTree *markovChains;   /* Tree of words that follow other words. */
+    struct wordTree *markovChainsNoOrphans; /* Markov tree before orphans are added. */
+    int maxChainSize;		/* Maximum depth of tree. */
+    struct monomerType *typeList;	/* List of all types. */
+    struct hash *typeHash;	/* Hash with monomerType values, keyed by all words. */
+    int typeCount;		/* Count of types. */
+    struct monomerType **typeArray;  /* Array as opposed to list of types */
+    };
+
+/* The wordTree structure below is the central data structure for this program.  It is
+ * used to build up a tree that contains all observed N-word-long sequences observed in
+ * the text, where N corresponds to the "size" command line option which defaults to 3,
+ * an option that in turn is stored in the maxChainSize variable.  At this chain size the
+ * text 
+ *     this is the black dog and the black cat
+ * would have the chains 
+ *     this is the 
+ *     is the black
+ *     the black dog
+ *     black dog and
+ *     dog and the
+ *     and the black
+ *     the black cat
+ * and turn into the tree
+ *     this
+ *        is
+ *           the
+ *     is
+ *        the
+ *           black
+ *     the
+ *        black
+ *           dog
+ *           cat
+ *     black
+ *        dog
+ *           and
+ *     dog
+ *        and
+ *           the
+ *     and
+ *        the
+ *           black
+ * Note how the tree is able to compress the two chains "the black dog" and "the black cat."
+ *
+ * A node in the tree can have as many children as it needs to at each node.  The depth of
+ * the tree is the same as the chain size, by default 3. At each node in the tree you get
+ * a word, and a list of all words that are observed in the text to follow that word.
+ *
+ * Once the program has built up the wordTree, it can output it in a couple of fashions. */
+
+struct wordTree
+/* A node in a tree of words.  The head of the tree is a node with word value the empty string. */
+    {
+    struct wordTree *next;	/* Next sibling */
+    struct wordTree *children;	/* Children in list. */
+    struct wordTree *parent;    /* Parent of this node or NULL for root. */
+    struct monomer *monomer;	/* The info on the word itself. */
+    int useCount;		/* Number of times word used in input with given predecessors. */
+    int outTarget;              /* Number of times want to output word with given predecessors. */
+    int outCount;		/* Number of times actually output with given predecessors. */
+    double normVal;             /* value to place the normalization value */    
+    };
+
+struct wordTree *wordTreeNew(struct monomer *monomer)
+/* Create and return new wordTree element. */
+{
+struct wordTree *wt;
+AllocVar(wt);
+wt->monomer = monomer;
+return wt;
+}
+
+int wordTreeCmpWord(const void *va, const void *vb)
+/* Compare two wordTree for slSort. */
+{
+const struct wordTree *a = *((struct wordTree **)va);
+const struct wordTree *b = *((struct wordTree **)vb);
+return cmpStringsWithEmbeddedNumbers(a->monomer->word, b->monomer->word);
+}
+
+static void wordTreeSort(struct wordTree *wt)
+/* Sort all children lists in tree. */
+{
+slSort(&wt->children, wordTreeCmpWord);
+struct wordTree *child;
+for (child = wt->children; child != NULL; child = child->next)
+    wordTreeSort(child);
+}
+
+struct wordTree *wordTreeFindInList(struct wordTree *list, struct monomer *monomer)
+/* Return wordTree element in list that has given word, or NULL if none. */
+{
+struct wordTree *wt;
+for (wt = list; wt != NULL; wt = wt->next)
+    if (wt->monomer == monomer)
+        break;
+return wt;
+}
+
+struct wordTree *wordTreeAddFollowing(struct wordTree *wt, struct monomer *monomer)
+/* Make word follow wt in tree.  If word already exists among followers
+ * return it and bump use count.  Otherwise create new one. */
+{
+struct wordTree *child = wordTreeFindInList(wt->children, monomer);
+if (child == NULL)
+    {
+    child = wordTreeNew(monomer);
+    child->parent = wt;
+    slAddHead(&wt->children, child);
+    }
+child->useCount += 1;
+return child;
+}
+
+int wordTreeSumUseCounts(struct wordTree *list)
+/* Sum up useCounts in list */
+{
+int total = 0;
+struct wordTree *wt;
+for (wt = list; wt != NULL; wt = wt->next)
+    total += wt->useCount;
+return total;
+}
+
+int wordTreeSumOutTargets(struct wordTree *list)
+/* Sum up outTargets in list */
+{
+int total = 0;
+struct wordTree *wt;
+for (wt = list; wt != NULL; wt = wt->next)
+    total += wt->outTarget;
+return total;
+}
+
+void addChainToTree(struct wordTree *wt, struct dlList *chain)
+/* Add chain of words to tree. */
+{
+struct dlNode *node;
+wt->useCount += 1;
+for (node = chain->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    verbose(4, "  adding %s\n", monomer->word);
+    wt = wordTreeAddFollowing(wt, monomer);
+    }
+}
+
+int useCountInTree(struct wordTree *wt, struct dlNode *nodeList, int chainSize)
+/* Return number of times chainSize successive nodes from nodeList are found
+ * in wt,  0 if not found. */
+{
+int i;
+struct wordTree *subTree = wt;
+struct dlNode *node = nodeList;
+for (i=0; i<chainSize; ++i)
+    {
+    struct monomer *monomer = node->val;
+    subTree = wordTreeFindInList(subTree->children, monomer);
+    if (subTree == NULL)
+        return FALSE;
+    node = node->next;
+    }
+return subTree->useCount;
+}
+
+
+void findLongestSupportingMarkovChain(struct wordTree *wt, struct dlNode *node,
+    int *retChainSize, int *retReadCount)
+/* See if chain of words is in the tree. */
+{
+struct dlNode *start = node;
+int chainSize = 1;
+int readCount = 0;
+for (;;)
+    {
+    int useCount = useCountInTree(wt, start, chainSize);
+    if (useCount == 0)
+        break;
+    readCount = useCount;
+    chainSize += 1;
+    start = start->prev;
+    if (dlStart(start))
+        break;
+    }
+*retChainSize = chainSize;
+*retReadCount = readCount;
+}
+
+static void writeMonomerListAndBetweens(struct alphaStore *store, 
+    char *fileName, struct dlList *ll)
+/* Write out monomer list to file. */
+{
+FILE *f = mustOpen(fileName, "w");
+struct dlNode *node;
+struct wordTree *origTree = store->markovChainsNoOrphans;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    if (betweens)
+	{
+	int chainSize = 0, readCount = 0;
+	findLongestSupportingMarkovChain(origTree, node, 
+	    &chainSize, &readCount);
+	/* The -2 is for 1 extra for the empty tree root, and 1 extra to get
+	 * from chain-size to markov model terminology. */
+	char between[24];
+	safef(between, sizeof(between), "<%d:%d:%d>", chainSize-2, 
+		readCount, useCountInTree(origTree, node, 1)); 
+	fprintf(f, "%-11s\t", between);
+	}
+    fprintf(f, "%s\n", monomer->word);
+    }
+carefulClose(&f);
+}
+
+
+int wordTreeAddPseudoCount(struct wordTree *wt, int pseudo)
+/* Add pseudo to all leaves of tree and propagate counts up to parents. */
+{
+if (wt->children == NULL)
+    {
+    wt->useCount += pseudo;
+    return wt->useCount;
+    }
+else
+    {
+    struct wordTree *child;
+    int oldChildTotal = 0;
+    for (child = wt->children; child != NULL; child = child->next)
+	oldChildTotal += child->useCount;
+    int oldDiff = wt->useCount - oldChildTotal;
+    if (oldDiff < 0) oldDiff = 0;  // Necessary in rare cases, not sure why.
+
+    int total = 0;
+    for (child = wt->children; child != NULL; child = child->next)
+	total += wordTreeAddPseudoCount(child, pseudo);
+    wt->useCount = total + oldDiff + pseudo;
+    return total;
+    }
+}
+
+void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal)
+/* Recursively set wt->normVal  and wt->outTarget so each branch gets its share */
+{
+if (pseudoCount > 0)
+    wordTreeAddPseudoCount(wt, pseudoCount);
+wt->normVal = normVal;
+wt->outTarget = outTarget;
+wt->outCount = 0;
+int childrenTotalUses = wordTreeSumUseCounts(wt->children);
+struct wordTree *child;
+for (child = wt->children; child != NULL; child = child->next)
+    {
+    double childRatio = (double)child->useCount / childrenTotalUses;
+    wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal);
+    }
+}
+
+char *wordTreeString(struct wordTree *wt)
+/* Return something like '(a b c)' where c would be the value at wt itself, and
+ * a and b would be gotten by following parents. */
+{
+struct slName *list = NULL, *el;
+for (;wt != NULL; wt = wt->parent)
+    {
+    char *word = wt->monomer->word;
+    if (!isEmpty(word))   // Avoid blank great grandparent at root of tree.
+	slNameAddHead(&list, word);
+    }
+
+struct dyString *dy = dyStringNew(0);
+dyStringAppendC(dy, '(');
+for (el = list; el != NULL; el = el->next)
+   {
+   dyStringPrintf(dy, "%s", el->name);
+   if (el->next != NULL)
+       dyStringAppendC(dy, ' ');
+   }
+dyStringAppendC(dy, ')');
+slFreeList(&list);
+return dyStringCannibalize(&dy);
+}
+
+char *dlListFragWords(struct dlNode *head)
+/* Return string containing all words in list pointed to by head. */
+{
+struct dyString *dy = dyStringNew(0);
+dyStringAppendC(dy, '{');
+struct dlNode *node;
+for (node = head; !dlEnd(node); node = node->next)
+    {
+    if (node != head)
+       dyStringAppendC(dy, ' ');
+    struct monomer *monomer = node->val;
+    dyStringAppend(dy, monomer->word);
+    }
+dyStringAppendC(dy, '}');
+return dyStringCannibalize(&dy);
+}
+
+void wordTreeDump(int level, struct wordTree *wt, int maxChainSize, FILE *f)
+/* Write out wordTree to file. */
+{
+static char *words[64];
+int i;
+assert(level < ArraySize(words));
+
+words[level] = wt->monomer->word;
+if (!fullOnly || level == maxChainSize)
+    {
+    fprintf(f, "%d\t%d\t%d\t%d\t%f\t", 
+	    level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal);
+    
+    for (i=1; i<=level; ++i)
+	{
+	spaceOut(f, level*2);
+	fprintf(f, "%s ", words[i]);
+	}
+    fprintf(f, "\n");
+    }
+struct wordTree *child;
+for (child = wt->children; child != NULL; child = child->next)
+    wordTreeDump(level+1, child, maxChainSize, f);
+}
+
+struct wordTree *pickWeightedRandomFromList(struct wordTree *list)
+/* Pick word from list randomly, but so that words with higher outTargets
+ * are picked more often. */
+{
+struct wordTree *picked = NULL;
+
+/* Debug output. */
+    {
+    verbose(4, "   pickWeightedRandomFromList(");
+    struct wordTree *wt;
+    for (wt = list; wt != NULL; wt = wt->next)
+        {
+	if (wt != list)
+	    verbose(4, " ");
+	verbose(4, "%s:%d", wt->monomer->word, wt->outTarget);
+	}
+    verbose(4, ") total %d\n", wordTreeSumOutTargets(list));
+    }
+
+/* Figure out total number of outputs left, and a random number between 0 and that total. */
+int total = wordTreeSumOutTargets(list);
+if (total > 0)
+    {
+    int threshold = rand() % total; 
+    verbose(4, "      threshold %d\n", threshold);
+
+    /* Loop through list returning selection corresponding to random threshold. */
+    int binStart = 0;
+    struct wordTree *wt;
+    for (wt = list; wt != NULL; wt = wt->next)
+	{
+	int size = wt->outTarget;
+	int binEnd = binStart + size;
+	verbose(4, "      %s size %d, binEnd %d\n", wt->monomer->word, size, binEnd);
+	if (threshold < binEnd)
+	    {
+	    picked = wt;
+	    verbose(4, "      picked %s\n", wt->monomer->word);
+	    break;
+	    }
+	binStart = binEnd;
+	}
+    }
+return picked;
+}
+
+int totUseZeroCount = 0;
+
+struct monomer *pickRandomFromType(struct monomerType *type)
+/* Pick a random word, weighted by outTarget, from all available of given type */
+{
+/* Figure out total on list */
+int total = 0;
+struct monomerRef *ref;
+for (ref = type->list; ref != NULL; ref = ref->next)
+    total += ref->val->outTarget;
+verbose(3, "pickRandomFromType %s total of outTarget=%d\n", type->name, total);
+
+/* Loop through list returning selection corresponding to random threshold. */
+if (total > 0)
+    {
+    int threshold = rand() % total; 
+    int binStart = 0;
+    for (ref = type->list; ref != NULL; ref = ref->next)
+	{
+	struct monomer *monomer = ref->val;
+	int size = monomer->outTarget;
+	int binEnd = binStart + size;
+	if (threshold < binEnd)
+	    return monomer;
+	binStart = binEnd;
+	}
+    }
+
+verbose(3, "Backup plan for %s in pickRandomFromType\n", type->name);
+
+/* Try again based on use counts. */
+total = 0;
+for (ref = type->list; ref != NULL; ref = ref->next)
+    total += ref->val->useCount;
+if (total > 0)
+    {
+    int threshold = rand() % total; 
+    int binStart = 0;
+    for (ref = type->list; ref != NULL; ref = ref->next)
+	{
+	struct monomer *monomer = ref->val;
+	int size = monomer->useCount;
+	int binEnd = binStart + size;
+	if (threshold < binEnd)
+	    return monomer;
+	binStart = binEnd;
+	}
+    }
+
+errAbort("Fell off end in pickRandomFromType on %s", type->name);
+return NULL;
+}
+
+struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
+/* Predict next word given tree and recently used word list.  If tree doesn't
+ * have statistics for what comes next given the words in list, then it returns
+ * NULL. */
+{
+if (verboseLevel() >= 4)
+    verbose(4, "  predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list));
+struct dlNode *node;
+for (node = list; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    wt = wordTreeFindInList(wt->children, monomer);
+    if (verboseLevel() >= 4)
+	verbose(4, "   wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt));
+    if (wt == NULL || wt->children == NULL)
+        break;
+    }
+struct wordTree *result = NULL;
+if (wt != NULL && wt->children != NULL)
+    result = pickWeightedRandomFromList(wt->children);
+return result;
+}
+
+struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent)
+/* Predict next word given tree and recently used word list.  Will use all words in
+ * recent list if can,  but if there is not data in tree, will back off, and use
+ * progressively less previous words. */
+{
+struct dlNode *node;
+if (verboseLevel() >= 4)
+    verbose(4, " predictNextFromWordTree(%s)\n", wordTreeString(wt));
+
+for (node = recent; !dlEnd(node); node = node->next)
+    {
+    struct wordTree *result = predictNextFromAllPredecessors(wt, node);
+    if (result != NULL)
+        return result;
+    }
+return NULL;
+}
+
+struct dlNode *nodesFromTail(struct dlList *list, int count)
+/* Return count nodes from tail of list.  If list is shorter than count, it returns the
+ * whole list. */
+{
+int i;
+struct dlNode *node;
+for (i=0, node = list->tail; i<count; ++i)
+    {
+    if (dlStart(node))
+        return list->head;
+    node = node->prev;
+    }
+return node;
+}
+
+struct monomerType *typeAfter(struct alphaStore *store, struct monomerType *oldType,
+    int amount)
+/* Skip forward amount in typeList from oldType.  If get to end of list start
+ * again at the beginning */
+{
+int i;
+struct monomerType *type = oldType;
+for (i=0; i<amount; ++i)
+    {
+    type = type->next;
+    if (type == NULL)
+	type = store->typeList;
+    }
+return type;
+}
+
+struct monomerType *typeBefore(struct alphaStore *store, struct monomerType *oldType, int amount)
+/* Pick type that comes amount before given type. */
+{
+int typeIx = ptArrayIx(oldType, store->typeArray, store->typeCount);
+typeIx -= amount;
+while (typeIx < 0)
+    typeIx += store->typeCount;
+return store->typeArray[typeIx];
+}
+
+struct wordTree *predictFromPreviousTypes(struct alphaStore *store, struct dlList *past)
+/* Predict next based on general pattern of monomer order. */
+{
+/* This routine is now a bit more complex than necessary but it still works.  These
+ * days it only needs to look back 1 because of the read-based type fill-in. */
+int maxToLookBack = 3;
+    { /* Debugging block */
+    struct dlNode *node = nodesFromTail(past, maxToLookBack);
+    verbose(4, "predictFromPreviousTypes(");
+    for (; !dlEnd(node); node = node->next)
+        {
+	struct monomer *monomer = node->val;
+	verbose(4, "%s, ", monomer->word);
+	}
+    verbose(4, ")\n");
+    }
+int pastDepth;
+struct dlNode *node;
+for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth)
+    {
+    if (dlStart(node))
+	{
+	verbose(3, "predictFromPreviousTypes with empty past\n");
+	return NULL;
+	}
+    struct monomer *monomer = node->val;
+    struct monomerType *prevType = hashFindVal(store->typeHash, monomer->word);
+    if (prevType != NULL)
+        {
+	struct monomerType *curType = typeAfter(store, prevType, pastDepth);
+	struct monomer *curInfo = pickRandomFromType(curType);
+	struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo);
+	verbose(4, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", 
+	    pastDepth, curInfo->word, curTree, curTree->monomer->word);
+	return curTree;
+	}
+    node = node->prev;
+    }
+verbose(3, "predictFromPreviousTypes past all unknown types\n");
+return NULL;
+}
+
+
+struct wordTree *predictNext(struct alphaStore *store, struct dlList *past)
+/* Given input data store and what is known from the past, predict the next word. */
+{
+struct dlNode *recent = nodesFromTail(past, store->maxChainSize);
+struct wordTree *pick =  predictFromWordTree(store->markovChains, recent);
+if (pick == NULL)
+    pick = predictFromPreviousTypes(store, past);
+if (pick == NULL)
+    {
+    pick = pickWeightedRandomFromList(store->markovChains->children);
+    verbose(3, "in predictNext() last resort pick of %s\n", pick->monomer->word);
+    }
+return pick;
+}
+
+void decrementOutputCountsInTree(struct wordTree *wt)
+/* Decrement output count of self and parents. */
+{
+while (wt != NULL)
+    {
+    /* Decrement target count, but don't let it fall below sum of counts of all children. 
+     * This can happen with incomplete data if we don't prevent it.  This
+     * same code also prevents us from having negative outTarget. */
+    int outTarget = wt->outTarget - 1;
+    int kidSum = wordTreeSumOutTargets(wt->children);
+    if (outTarget < kidSum)
+        outTarget = kidSum;
+    wt->outTarget = outTarget;
+
+    /* Always bump outCount to help us debug (only real use of outCount). */
+    wt->outCount += 1;
+    wt = wt->parent;
+    }
+}
+
+static struct dlList *wordTreeGenerateList(struct alphaStore *store, int maxSize, 
+    struct wordTree *firstWord, int maxOutputWords)
+/* Make a list of words based on probabilities in tree. */
+{
+struct dlList *ll = dlListNew();
+int chainSize = 0;
+int outputWords = 0;
+struct dlNode *chainStartNode = NULL;
+
+for (;;)
+    {
+    if (++outputWords > maxOutputWords)
+        break;
+    struct dlNode *node;
+    struct wordTree *picked;
+
+    /* Get next predicted word. */
+    AllocVar(node);
+    if (chainSize == 0)
+        {
+	chainStartNode = node;
+	++chainSize;
+	picked = firstWord;
+	}
+    else if (chainSize >= maxSize)
+	{
+	chainStartNode = chainStartNode->next;
+	picked = predictNext(store, ll);
+	}
+    else
+	{
+	picked = predictNext(store, ll);
+	++chainSize;
+	}
+
+    if (picked == NULL)
+         break;
+
+    /* Add word from whatever level we fetched back to our output chain. */
+    struct monomer *monomer = picked->monomer;
+    node->val = monomer;
+    dlAddTail(ll, node);
+
+    decrementOutputCountsInTree(picked);
+    monomer->outTarget -= 1;
+    monomer->outCount += 1;
+    }
+verbose(3, "totUseZeroCount = %d\n", totUseZeroCount);
+return ll;
+}
+
+
+struct slRef *listUnusedMonomers(struct alphaStore *store, struct dlList *ll)
+/* Return list of references to monomers that are in store but not on ll. */
+{
+struct slRef *refList = NULL, *ref;
+struct hash *usedHash = hashNew(0);
+struct dlNode *node;
+/* Make hash of used monomers. */
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    hashAdd(usedHash, monomer->word, monomer);
+    }
+
+/* Stream through all monomers looking for ones not used and adding to list. */
+struct monomer *monomer;
+for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
+    {
+    if (isEmpty(monomer->word))
+        continue;
+    if (!hashLookup(usedHash, monomer->word))
+	{
+	AllocVar(ref);
+	ref->val = monomer;
+	slAddHead(&refList, ref);
+	}
+    }
+hashFree(&usedHash);
+slReverse(&refList);
+return refList;
+}
+
+
+int monomerRefIx(struct monomerRef *list, struct monomer *monomer)
+/* Return index of monomer in list. */
+{
+int i;
+struct monomerRef *ref;
+for (i=0, ref = list; ref != NULL; ref = ref->next, ++i)
+    {
+    if (ref->val == monomer)
+        return i;
+    }
+errAbort("Monomer %s not on list\n", monomer->word);
+return -1;
+}
+
+struct monomerRef *findNeighborhoodFromReads(struct monomer *center)
+/* Find if possible one monomer to either side of center */
+{
+struct slRef *readRef;
+struct monomerRef *before = NULL, *after = NULL;
+
+/* Loop through reads hoping to find a case where center is flanked by two monomers in
+ * same read.   As a fallback, keep track of a monomer before and a monomer after in
+ * any read. */
+for (readRef = center->readList; readRef != NULL; readRef = readRef->next)
+    {
+    struct alphaRead *read = readRef->val;
+    int readSize = slCount(read->list);
+    int centerIx = monomerRefIx(read->list, center);
+    if (readSize >= 3 && centerIx > 0 && centerIx < readSize-1)
+	 {
+	 before = slElementFromIx(read->list, centerIx-1);
+	 after = slElementFromIx(read->list, centerIx+1);
+	 break;
+	 }
+    else if (readSize >= 2)
+         {
+	 if (centerIx == 0)
+	     after = slElementFromIx(read->list, centerIx+1);
+	 else
+	     before = slElementFromIx(read->list, centerIx-1);
+	 }
+    }
+
+/* Make up list from end to start. */
+struct monomerRef *retList = NULL, *monoRef;
+if (after)
+    {
+    AllocVar(monoRef);
+    monoRef->val = after->val;
+    slAddHead(&retList, monoRef);
+    }
+AllocVar(monoRef);
+monoRef->val = center;
+slAddHead(&retList, monoRef);
+if (before)
+    {
+    AllocVar(monoRef);
+    monoRef->val = before->val;
+    slAddHead(&retList, monoRef);
+    }
+return retList;
+}
+
+struct dlNode *matchExceptCenter(struct dlNode *node, struct monomerRef *neighborhood, struct monomer *center)
+/* Return center node if node matches neighborhood except for center. */
+{
+struct dlNode *centerNode = NULL;
+struct monomerRef *neighbor;
+struct dlNode *n;
+for (n = node, neighbor=neighborhood; ; n = n->next, neighbor = neighbor->next)
+    {
+    if (neighbor == NULL)
+        break;
+    if (dlEnd(n))
+        return NULL;
+    if (neighbor->val == center)
+        centerNode = n;
+    else if (n->val != neighbor->val)
+        return NULL;
+    }
+return centerNode;
+}
+
+void printMonomerRefList(struct monomerRef *refList, FILE *f)
+/* Print out a line to file with the list of monomers. */
+{
+struct monomerRef *ref;
+for (ref = refList; ref != NULL; ref = ref->next)
+    fprintf(f, "%s ", ref->val->word);
+fprintf(f, "\n");
+}
+
+struct slRef *refsToPossibleCenters(struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
+/* Return a list of dlNodes where neighborhood, but not center matches. */
+{
+struct slRef *list = NULL;
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct dlNode *centerNode = matchExceptCenter(node, neighborhood, center);
+    if (centerNode != NULL)
+	refAdd(&list, centerNode);
+    }
+return list;
+}
+
+void mostCommonMonomerWord(struct slRef *refList, char **retWord, int *retCount)
+/* Given refs to dlNodes containing monomers, find word associated with most common monomer. */
+{
+/* Make up a hash that contains counts of all monomers. */
+struct hash *countHash = hashNew(0);
+struct slRef *ref;
+for (ref = refList; ref != NULL; ref = ref->next)
+    {
+    struct dlNode *node = ref->val;
+    struct monomer *monomer = node->val;
+    hashIncInt(countHash, monomer->word);
+    }
+
+/* Iterate through hash finding max value. */
+char *maxWord = NULL;
+int maxCount = 0;
+struct hashEl *hel;
+struct hashCookie cookie = hashFirst(countHash);
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    int count = ptToInt(hel->val);
+    if (count > maxCount)
+        {
+	maxCount = count;
+	maxWord = hel->name;
+	}
+    }
+*retWord = maxWord;
+*retCount = maxCount;
+hashFree(&countHash);
+}
+
+boolean subCommonCenter(struct alphaStore *store,
+    struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
+/* Scan list for places where have all items in neighborhood (except for center) matching. 
+ * Substitute in center at one of these places chosen at random and return TRUE if possible. */
+{
+struct slRef *centerRefList = refsToPossibleCenters(center, neighborhood, ll);
+verbose(4, "sub %s in neighborhood: ", center->word);
+if (verboseLevel() >= 4)
+    printMonomerRefList(neighborhood, stderr);
+verbose(4, "Got %d possible centers\n", slCount(centerRefList));
+
+if (centerRefList == NULL)
+    return FALSE;
+int commonCount = 0;
+char *commonWord = NULL;
+mostCommonMonomerWord(centerRefList, &commonWord, &commonCount);
+struct monomer *commonMonomer = hashFindVal(store->monomerHash, commonWord);
+verbose(4, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount);
+if (commonMonomer->subbedOutCount < 2)
+    {
+    verbose(3, "Want to substitute %s for %s, but %s only occurs %d time.\n", 
+	center->word, commonWord, commonWord, commonMonomer->subbedOutCount);
+    return FALSE;
+    }
+
+/* Select a random one of the most commonly occurring possible centers. */
+int targetIx = rand() % commonCount;
+struct slRef *ref;
+int currentIx = 0;
+for (ref = centerRefList; ref != NULL; ref = ref->next)
+    {
+    struct dlNode *node = ref->val;
+    struct monomer *monomer = node->val;
+    if (sameString(monomer->word, commonWord))
+         {
+	 if (currentIx == targetIx)
+	     {
+	     verbose(3, "Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood));
+	     struct monomer *oldCenter = node->val;
+	     if (oldCenter->type != center->type)
+		 verbose(3, "Type mismatch subbig %s vs %s\n", oldCenter->word, center->word);
+	     node->val = center;
+	     oldCenter->subbedOutCount -= 1;
+	     center->subbedOutCount += 1;
+	     return TRUE;
+	     }
+	 ++currentIx;
+	 }
+    }
+internalErr();	// Should not get here.
+return FALSE;	
+}
+
boolean subCenterInNeighborhood(struct alphaStore *store, 
    struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
/* Scan ll for cases where neighborhood around center matches.  Replace one of these 
 * cases with center.  Tries the full neighborhood first; for 3-element neighborhoods
 * falls back to each 2-element sub-neighborhood.  Returns TRUE on success. */
{
int size = slCount(neighborhood);
assert(size == 3 || size == 2);
/* First try substituting using the whole neighborhood as context. */
if (subCommonCenter(store, center, neighborhood, ll))
   return TRUE;
if (size == 3)
    {
    /* Fall back to the last two elements of the neighborhood... */
    if (subCommonCenter(store, center, neighborhood->next, ll))
       return TRUE;
    /* ...and then the first two, by temporarily truncating the list.
     * The severed link is restored before returning. */
    struct monomerRef *third = neighborhood->next->next;
    neighborhood->next->next = NULL;
    boolean ok = subCommonCenter(store, center, neighborhood, ll);
    neighborhood->next->next = third;
    return ok;
    }
else
    return FALSE;
}
+
+struct monomer *mostCommonInType(struct monomerType *type)
+/* Return most common monomer of given type */
+{
+struct monomerRef *ref;
+int commonCount = 0;
+struct monomer *common = NULL;
+for (ref = type->list; ref != NULL; ref = ref->next)
+    {
+    struct monomer *monomer = ref->val;
+    if (monomer->subbedOutCount > commonCount)
+        {
+	commonCount = monomer->subbedOutCount;
+	common = monomer;
+	}
+    }
+return common;
+}
+
+boolean subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, 
+    struct dlList *ll)
+/* Substitute unused for first occurence of most common monomer of same type. */
+{
+struct monomer *common = mostCommonInType(unused->type);
+if (common == NULL)
+    {
+    verbose(3, "No monomers of type %s used at all!\n", unused->type->name);
+    return FALSE;
+    }
+if (common->subbedOutCount < 2)
+    {
+    verbose(3, "Trying to sub in %s, but there's no monomers of type %s that are used more than once.\n", 
+	unused->word, unused->type->name);
+    return FALSE;
+    }
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    if (monomer == common)
+        {
+	verbose(3, "Subbing %s for %s of type %s\n", unused->word, monomer->word, unused->type->name);
+	node->val = unused;
+	unused->subbedOutCount += 1;
+	monomer->subbedOutCount -= 1;
+	break;
+	}
+    }
+return TRUE;
+}
+
+void setInitialSubbedOutCount(struct alphaStore *store, struct dlList *ll)
+/* Set subbedOutCount based on how many times monomers occur in list. */
+{
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    monomer->subbedOutCount += 1;
+    }
+#ifdef PARANOID
+/* As a check see if subbedOutCount agrees with outCount. */
+int mismatchCount = 0;
+struct monomer *monomer;
+for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
+    {
+    uglyf("%s %d %d\n", monomer->word, monomer->outCount, monomer->subbedOutCount);
+    if (monomer->outCount != monomer->subbedOutCount)
+        ++mismatchCount;
+    }
+uglyf("mismatch count = %d\n", mismatchCount);
+#endif /* PARANOID */
+}
+
+void subInMissing(struct alphaStore *store, struct dlList *ll)
+/* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */
+{
+setInitialSubbedOutCount(store, ll);
+struct slRef *unusedList = listUnusedMonomers(store, ll);
+verbose(3, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList));
+struct slRef *unusedRef;
+for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next)
+    {
+    struct monomer *unused = unusedRef->val;
+    struct monomerRef *neighborhood = findNeighborhoodFromReads(unused);
+    if (!subCenterInNeighborhood(store, unused, neighborhood, ll))
+	{
+	verbose(3, "Couldn't substitute in %s with context, falling back to type logic\n", 
+	    unused->word);
+        subIntoFirstMostCommonOfType(store, unused, ll);
+	}
+    slFreeList(&neighborhood);
+    }
+}
+
+static void writeMonomerList(char *fileName, struct dlList *ll)
+/* Write out monomer list to file. */
+{
+FILE *f = mustOpen(fileName, "w");
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    fprintf(f, "%s\n", monomer->word);
+    }
+carefulClose(&f);
+}
+
+static struct dlList *wordTreeMakeList(struct alphaStore *store, int maxSize, 
+    struct wordTree *firstWord, int maxOutputWords)
+/* Create word list base on tree probabilities, and then substituting in unused
+ * monomers if possible. */
+{
+struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
+verbose(2, "Generated initial output list\n");
+char *unsubbedFile = optionVal("unsubbed", NULL);
+if (unsubbedFile)
+    writeMonomerList(unsubbedFile, ll);
+subInMissing(store, ll);
+verbose(2, "Substituted in missing rare monomers\n");
+return ll;
+}
+
+
+struct alphaStore *alphaStoreNew(int maxChainSize)
+/* Allocate and initialize a new word store. */
+{
+struct alphaStore *store;
+AllocVar(store);
+store->maxChainSize = maxChainSize;
+store->monomerHash = hashNew(0);
+store->typeHash = hashNew(0);
+store->readHash = hashNew(0);
+return store;
+}
+
+struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word)
+/* Add word to store,  incrementing it's useCount if it's already there, otherwise
+ * making up a new record for it. */
+{
+struct monomer *monomer = hashFindVal(store->monomerHash, word);
+if (monomer == NULL)
+    {
+    AllocVar(monomer);
+    hashAddSaveName(store->monomerHash, word, monomer, &monomer->word);
+    slAddHead(&store->monomerList, monomer);
+    }
+monomer->useCount += 1;
+monomer->outTarget = monomer->useCount;	    /* Set default value, may be renormalized. */
+return monomer;
+}
+
+void connectReadsToMonomers(struct alphaStore *store)
+/* Add each read monomer appears in to monomer's readList. */
+{
+struct alphaRead *read;
+for (read = store->readList; read != NULL; read = read->next)
+    {
+    struct monomerRef *ref;
+    for (ref = read->list; ref != NULL; ref = ref->next)
+        {
+	struct monomer *monomer = ref->val;
+	refAdd(&monomer->readList, read);
+	}
+    }
+}
+
void alphaReadListFromFile(char *fileName, struct alphaStore *store)
/* Read in file and turn it into a list of alphaReads.  Each non-blank line holds
 * one read: the first word is the read ID, remaining words are the monomers of the
 * read in order.  Monomers are interned in the store (bumping useCount) as a side
 * effect.  Aborts on a read with no monomers, a repeated read ID, or an empty file. */
{
/* Stuff for processing file a line at a time. */
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line, *word;

/* Loop through each line of file making up a read for it. */
struct alphaRead *readList = NULL;
while (lineFileNextReal(lf, &line))
    {
    /* Parse out first word of line into name, and rest into list. */
    char *name = nextWord(&line);
    struct monomerRef *list = NULL;
    while ((word = nextWord(&line)) != NULL)
        {
	/* Intern monomer in store, then prepend a reference to it. */
	struct monomer *monomer = alphaStoreAddMonomer(store, word);
	struct monomerRef *ref;
	AllocVar(ref);
	ref->val = monomer;
	slAddHead(&list, ref);
	}
    slReverse(&list);	/* Undo head-insertion to restore read order. */
    if (list == NULL)
	errAbort("Line %d of %s has no alpha monomers.", lf->lineIx, lf->fileName);

    /* Create data structure and add read to list and hash */
    if (hashLookup(store->readHash, name))
        errAbort("Read ID %s is repeated line %d of %s", name, lf->lineIx, lf->fileName);
    struct alphaRead *read;
    AllocVar(read);
    read->list = list;
    hashAddSaveName(store->readHash, name, read, &read->name);
    slAddHead(&readList, read);
    }
slReverse(&readList);	/* Keep reads in file order. */
store->readList = readList;
if (store->readList == NULL)
    errAbort("%s contains no reads", lf->fileName);
lineFileClose(&lf);
connectReadsToMonomers(store);	/* Back-link each monomer to the reads containing it. */
}
+
struct alphaOrphan
/* Information about an orphan - something without great joining information.
 * Orphans are monomers seen only at the very start or very end of reads, so no
 * read shows what precedes (or follows) them. */
    {
    struct alphaOrphan *next;	/* Next in list */
    struct monomer *monomer;	/* Pointer to orphan */
    boolean paired;		/* True if has been paired with somebody. */
    };
+
+struct alphaOrphan *findOrphanStarts(struct alphaRead *readList, struct alphaStore *store)
+/* Return list of monomers that are found only at start of reads. */
+{
+struct alphaOrphan *orphanList = NULL;
+
+/* Build up hash of things seen later in reads. */
+struct alphaRead *read;
+struct hash *later = hashNew(0);    /* Hash of monomers found later. */
+for (read = readList; read != NULL; read = read->next)
+    {
+    struct monomerRef *ref;
+    for (ref = read->list->next; ref != NULL; ref = ref->next)
+        {
+	hashAdd(later, ref->val->word, NULL);
+	}
+    }
+
+/* Pass through again collecting orphans. */
+struct hash *orphanHash = hashNew(0);
+for (read = readList; read != NULL; read = read->next)
+    {
+    struct monomer *monomer = read->list->val;
+    char *word = monomer->word;
+    if (!hashLookup(later, word))
+        {
+	struct alphaOrphan *orphan = hashFindVal(orphanHash, word);
+	if (orphan == NULL)
+	    {
+	    AllocVar(orphan);
+	    orphan->monomer = monomer;
+	    slAddHead(&orphanList, orphan);
+	    hashAdd(orphanHash, word, orphan);
+	    verbose(3, "Adding orphan start %s\n", word);
+	    }
+	}
+    }
+hashFree(&later);
+hashFree(&orphanHash);
+slReverse(&orphanList);
+return orphanList;
+}
+
+struct alphaOrphan *findOrphanEnds(struct alphaRead *readList, struct alphaStore *store)
+/* Return list of monomers that are found only at end of reads. */
+{
+struct alphaOrphan *orphanList = NULL;
+
+/* Build up hash of things seen sooner in reads. */
+struct alphaRead *read;
+struct hash *sooner = hashNew(0);    /* Hash of monomers found sooner. */
+for (read = readList; read != NULL; read = read->next)
+    {
+    struct monomerRef *ref;
+    /* Loop over all but last. */
+    for (ref = read->list; ref->next != NULL; ref = ref->next)
+        {
+	hashAdd(sooner, ref->val->word, NULL);
+	}
+    }
+
+/* Pass through again collecting orphans. */
+struct hash *orphanHash = hashNew(0);
+for (read = readList; read != NULL; read = read->next)
+    {
+    struct monomerRef *lastRef = slLastEl(read->list);
+    struct monomer *monomer = lastRef->val;
+    if (!hashLookup(sooner, monomer->word))
+        {
+	struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word);
+	if (orphan == NULL)
+	    {
+	    AllocVar(orphan);
+	    orphan->monomer = monomer;
+	    slAddHead(&orphanList, orphan);
+	    hashAdd(orphanHash, monomer->word, orphan);
+	    verbose(3, "Adding orphan end %s\n", monomer->word);
+	    }
+	}
+    }
+hashFree(&sooner);
+slReverse(&orphanList);
+hashFree(&orphanHash);
+return orphanList;
+}
+
+struct alphaRead *addReadOfTwo(struct alphaStore *store, struct monomer *a, struct monomer *b)
+/* Make up a fake read that goes from a to b and add it to store. */
+{
+/* Create fake name and start fake read with it. */
+static int fakeId = 0;
+char name[32];
+safef(name, sizeof(name), "fake%d", ++fakeId);
+
+/* Allocate fake read. */
+struct alphaRead *read;
+AllocVar(read);
+
+/* Add a and b to the read. */
+struct monomerRef *aRef, *bRef;
+AllocVar(aRef);
+aRef->val = a;
+AllocVar(bRef);
+bRef->val = b;
+aRef->next = bRef;
+read->list = aRef;
+
+/* Add read to the store. */
+slAddHead(&store->readList, read);
+hashAddSaveName(store->readHash, name, read, &read->name);
+
+/* Add read to the monomers. */
+refAdd(&a->readList, read);
+refAdd(&b->readList, read);
+
+a->useCount += 1;
+b->useCount += 1;
+return read;
+}
+
void integrateOrphans(struct alphaStore *store)
/* Make up fake reads that integrate orphans (monomers only found at beginning or end
 * of a read) better.  Works in three phases: pair orphan ends with orphan starts
 * whose types are adjacent in the repeat cycle, then fabricate a partner for any
 * remaining orphan start, and finally for any remaining orphan end.  Orphans with
 * no type information are left alone. */
{
struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store);
struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store);
verbose(3, "orphanStarts has %d items, orphanEnds %d\n", 
    slCount(orphanStarts), slCount(orphanEnds));

/* First look for the excellent situation where you can pair up orphans with each 
 * other.  For Y at least this happens half the time. */
struct alphaOrphan *start, *end;
for (start = orphanStarts; start != NULL; start = start->next)
    {
    struct monomerType *startType = start->monomer->type;
    if (startType == NULL)
        continue;	/* Can't pair without type information. */
    for (end = orphanEnds; end != NULL && !start->paired; end = end->next)
        {
	struct monomerType *endType = end->monomer->type;
	if (endType == NULL)
	    continue;
	if (!end->paired)
	    {
	    /* Pair when the type one step after the end's type matches the start's type. */
	    struct monomerType *nextType = typeAfter(store, endType, 1);
	    if (nextType == startType)
	        {
		end->paired = TRUE;
		start->paired = TRUE;
		addReadOfTwo(store, end->monomer, start->monomer);
		verbose(3, "Pairing %s with %s\n", end->monomer->word, start->monomer->word);
		}
	    }
	}
    }

/* Ok, now have to just manufacture other side of orphan starts out of thin air. */
for (start = orphanStarts; start != NULL; start = start->next)
    {
    if (start->paired)
        continue;
    struct monomer *startMono = start->monomer;
    struct monomerType *startType = startMono->type;
    if (startType == NULL)
        continue;

    /* NOTE(review): typeBefore is assumed never to return NULL here, since
     * newType->name is dereferenced below - confirm against its definition. */
    struct monomerType *newType = typeBefore(store, startType, 1);
    verbose(3, "Trying to find end of type %s\n", newType->name);
    struct monomer *newMono = pickRandomFromType(newType);
    addReadOfTwo(store, newMono, startMono);
    verbose(3, "Pairing new %s with start %s\n", newMono->word, startMono->word);
    }

/* Ok, now have to just manufacture other side of orphan ends out of thin air. */
for (end = orphanEnds; end != NULL; end = end->next)
    {
    if (end->paired)
        continue;
    struct monomer *endMono = end->monomer;
    struct monomerType *endType = endMono->type;
    if (endType == NULL)
        continue;

    /* NOTE(review): same non-NULL assumption as above, but for typeAfter. */
    struct monomerType *newType = typeAfter(store, endType, 1);
    verbose(3, "Trying to find start of type %s\n", newType->name);
    struct monomer *newMono = pickRandomFromType(newType);
    addReadOfTwo(store, endMono, newMono);
    verbose(3, "Pairing end %s with new %s\n", endMono->word, newMono->word);
    }
}
+
struct wordTree *makeMarkovChains(struct alphaStore *store)
/* Return wordTree containing all words, and also all chains-of-words of length 
 * chainSize seen in file.  The root corresponds to the empty word; each path from
 * the root is an observed chain, with use counts accumulated at each node. */
{
/* We'll build up the tree starting with an empty root node. */
struct wordTree *wt = wordTreeNew(alphaStoreAddMonomer(store, ""));	
int chainSize = store->maxChainSize;

/* Loop through each read. There's special cases at the beginning and end of read, and for 
 * short reads.  In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize 
 * words, * popping off one word from the start, and adding one word to the end for each
 * new word we encounter. This list is added to the tree each iteration. */
struct alphaRead *read;
for (read = store->readList; read != NULL; read = read->next)
    {
    /* We'll keep a chain of three or so words in a doubly linked list. */
    struct dlNode *node;
    struct dlList *chain = dlListNew();
    int curSize = 0;
    int wordCount = 0;

    struct monomerRef *ref;
    for (ref = read->list; ref != NULL; ref = ref->next)
        {
	struct monomer *monomer = ref->val;
	 /* For the first few words in the file after ID, we'll just build up the chain,
	 * only adding it to the tree when we finally do get to the desired
	 * chain size.  Once past the initial section of the file we'll be
	 * getting rid of the first link in the chain as well as adding a new
	 * last link in the chain with each new word we see. */
	if (curSize < chainSize)
	    {
	    dlAddValTail(chain, monomer);
	    ++curSize;
	    if (curSize == chainSize)
		addChainToTree(wt, chain);
	    }
	else
	    {
	    /* Reuse doubly-linked-list node, but give it a new value, as we move
	     * it from head to tail of list. */
	    node = dlPopHead(chain);
	    node->val = monomer;
	    dlAddTail(chain, node);
	    addChainToTree(wt, chain);
	    }
	++wordCount;
	}
    /* Handle last few words in line, where can't make a chain of full size.  Also handles       
    * lines that have fewer than chain size words. */
    if (curSize < chainSize)
 	addChainToTree(wt, chain);
    /* Add each successively shorter suffix of the final chain too, freeing the
     * popped nodes as we go. */
    while ((node = dlPopHead(chain)) != NULL)
	{
	if (!dlEmpty(chain))
	    addChainToTree(wt, chain);
	freeMem(node);
	}
    dlListFree(&chain);
    }
wordTreeSort(wt);  // Make output of chain file prettier
return wt;
}
+
void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName)
/* Write out tree to file */
{
FILE *out = mustOpen(fileName, "w");
/* Header naming the tab-separated columns, then the recursive dump. */
fprintf(out, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n");
wordTreeDump(0, wt, maxChainSize, out);
carefulClose(&out);
}
+
+void alphaStoreLoadMonomerOrder(struct alphaStore *store, char *readsFile, char *fileName)
+/* Read in a file with one line for each monomer type, where first word is type name,
+ * and rest of words are all the actually variants of that type.
+ * Requires all variants already be in store.  The readsFile is passed
+ * just for nicer error reporting. */
+{
+/* Stuff for processing file a line at a time. */
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *line, *word;
+struct hash *uniq = hashNew(0);
+
+/* Set up variables we'll put results in in store. */
+store->typeList = NULL;
+
+while (lineFileNextReal(lf, &line))
+    {
+    char *name = nextWord(&line);
+    if (hashLookup(uniq, name) != NULL)
+        errAbort("Type name '%s' repeated line %d of %s\n", name, lf->lineIx, lf->fileName);
+    struct monomerType *type;
+    AllocVar(type);
+    type->name = cloneString(name);
+    slAddHead(&store->typeList, type);
+    while ((word = nextWord(&line)) != NULL)
+        {
+	struct monomer *monomer = hashFindVal(store->monomerHash, word);
+	if (monomer == NULL)
+	    errAbort("%s is in %s but not %s", word, lf->fileName, readsFile);
+	struct monomerRef *ref;
+	AllocVar(ref);
+	ref->val = monomer;
+	slAddHead(&type->list, ref);
+	hashAddUnique(store->typeHash, word, type);
+	}
+    if (type->list == NULL)
+        errAbort("Short line %d of %s.  Format should be:\ntype list-of-monomers-of-type\n",
+	    lf->lineIx, lf->fileName);
+    }
+slReverse(&store->typeList);
+if (store->typeList == NULL)
+    errAbort("%s is empty", lf->fileName);
+lineFileClose(&lf);
+hashFree(&uniq);
+verbose(3, "Added %d types containing %d words from %s\n", 
+    slCount(store->typeList), store->typeHash->elCount, fileName);
+
+/* Create type array */
+store->typeCount = slCount(store->typeList);
+struct monomerType **types = AllocArray(store->typeArray, store->typeCount);
+struct monomerType *type;
+int i;
+for (i=0, type = store->typeList; i<store->typeCount; ++i, type = type->next)
+    types[i] = type;
+}
+
+void crossCheckMonomerOrderAndReads(struct alphaStore *store, char *orderedPrefix, char *readFile, char *typeFile)
+/* Make sure all monomer that begin with ordered prefix are present in monomer order file. */
+{
+/* Make hash of all monomers that have type info. */
+struct hash *orderHash = hashNew(0);
+struct monomerType *type;
+for (type = store->typeList; type != NULL; type = type->next)
+     {
+     struct monomerRef *ref;
+     for (ref = type->list; ref != NULL; ref = ref->next)
+         hashAdd(orderHash, ref->val->word, ref->val);
+     }
+
+/* Go through all monomers and make sure ones with correct prefix are in list. */
+struct monomer *mon;
+for (mon = store->monomerList; mon != NULL; mon = mon->next)
+    {
+    char *word = mon->word;
+    if (startsWith(orderedPrefix, word) && !hashLookup(orderHash, word))
+        {
+	errAbort("%s is in %s but not %s", word, readFile, typeFile);
+	}
+    }
+hashFree(&orderHash);
+}
+
+
+void monomerListNormalise(struct monomer *list, int totalCount, int outputSize)
+/* Set outTarget field in all of list to be normalized to outputSize */
+{
+struct monomer *monomer;
+double scale = (double)outputSize/totalCount;
+for (monomer = list; monomer != NULL; monomer = monomer->next)
+    {
+    monomer->outCount = 0;
+    monomer->outTarget = round(scale * monomer->useCount);
+    }
+}
+
+void alphaStoreNormalize(struct alphaStore *store, int outputSize)
+/* Set up output counts on each word to make sure it gets it's share of output size */
+{
+struct wordTree *wt = store->markovChains;
+wordTreeNormalize(wt, outputSize, 1.0);
+monomerListNormalise(store->monomerList, wt->useCount, outputSize);
+}
+
+struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store)
+/* Look through read list trying to find best supported type for this monomer. */
+{
+/* The algorithm here could be better.  It picks the closest nearby monomer that is typed
+ * to fill in from.  When have information from both before and after the program arbitrarily 
+ * uses before preferentially.  When have information that is equally close from multiple reads,
+ * it just uses info from first read.  Fortunately the data, at least on Y, shows a lot of
+ * consistency, so it's probably not actually worth a more sophisticated algorithm that would
+ * instead take the most popular choice rather than the first one. */
+verbose(3, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList));
+struct monomerType *bestType = NULL;
+int bestPos = 0;
+boolean bestIsAfter = FALSE;
+struct slRef *readRef;
+for (readRef = monomer->readList; readRef != NULL; readRef = readRef->next)
+    {
+    struct alphaRead *read = readRef->val;
+    struct monomerRef *ref;
+	{
+	verbose(3, "  read %s (%d els):", read->name, slCount(read->list));
+	for (ref = read->list; ref != NULL; ref = ref->next)
+	    {
+	    struct monomer *m = ref->val;
+	    verbose(3, " %s", m->word);
+	    }
+	verbose(3, "\n");
+	}
+    struct monomerType *beforeType = NULL;
+    int beforePos = 0;
+
+    /* Look before. */
+    for (ref = read->list; ref != NULL; ref = ref->next)
+        {
+	struct monomer *m = ref->val;
+	if (m == monomer)
+	     break;
+	if (m->type != NULL)
+	    {
+	    beforeType = m->type;
+	    beforePos = 0;
+	    }
+	++beforePos;
+	}
+
+    /* Look after. */
+    struct monomerType *afterType = NULL;
+    int afterPos = 0;
+    assert(ref != NULL && ref->val == monomer);   
+    for (ref = ref->next; ref != NULL; ref = ref->next)
+        {
+	struct monomer *m = ref->val;
+	++afterPos;
+	if (m->type != NULL)
+	    {
+	    afterType = m->type;
+	    break;
+	    }
+	}
+
+    if (beforeType != NULL)
+        {
+	if (bestType == NULL || beforePos < bestPos)
+	    {
+	    bestType = beforeType;
+	    bestPos = beforePos;
+	    bestIsAfter = FALSE;
+	    }
+	}
+
+    if (afterType != NULL)
+        {
+	if (bestType == NULL || afterPos < bestPos)
+	    {
+	    bestType = afterType;
+	    bestPos = afterPos;
+	    bestIsAfter = TRUE;
+	    }
+	}
+    }
+
+/* Now have found a type that is at a known position relative to ourselves.  From this
+ * infer our own type. */
+struct monomerType *chosenType = NULL;
+if (bestType != NULL)
+    {
+    if (bestIsAfter)
+	chosenType = typeBefore(store, bestType, bestPos);
+    else
+	chosenType = typeAfter(store, bestType, bestPos);
+    }
+verbose(3, "chosenType for %s is %s\n", monomer->word, 
+    (bestType == NULL ? "(null)" : chosenType->name));
+return chosenType;
+}
+
+void fillInTypes(struct alphaStore *store)
+/* We have types for most but not all monomers. Try and fill out types for most of
+ * the rest by looking for position in reads they are in that do contain known types */
+{
+/* Do first pass on ones defined - just have to look them up in hash. */
+struct monomer *monomer;
+for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
+    monomer->type = hashFindVal(store->typeHash, monomer->word);
+
+/* Do second pass on ones that are not yet defined */
+for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
+    {
+    if (monomer->type == NULL)
+	{
+	monomer->type = fillInTypeFromReads(monomer, store);
+	}
+    }
+}
+
+struct dlList *alphaAsmOneSize(struct alphaStore *store, int outSize)
+/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+ * been parsed into various repeat monomer variants.  Cycles of these variants tend to
+ * form higher order repeats. Returns list of outSize monomers. */
+{
+struct wordTree *wt = store->markovChains;
+alphaStoreNormalize(store, outSize);
+verbose(2, "Normalized Markov Chains\n");
+
+if (optionExists("chain"))
+    {
+    char *fileName = optionVal("chain", NULL);
+    wordTreeWrite(wt, store->maxChainSize, fileName);
+    }
+
+struct dlList *ll = wordTreeMakeList(store, store->maxChainSize, 
+    pickWeightedRandomFromList(wt->children), outSize);
+
+if (optionExists("afterChain"))
+    {
+    char *fileName = optionVal("afterChain", NULL);
+    wordTreeWrite(wt, store->maxChainSize, fileName);
+    }
+
+return ll;
+}
+
+int countMissingMonomers(struct dlList *ll, struct monomer *monomerList)
+/* Set the outCounts in monomer list to match how often they occur in ll.
+ * Return number of monomers that do not occur at all in ll. */
+{
+/* Zero out output counts. */
+struct monomer *monomer;
+for (monomer = monomerList; monomer != NULL; monomer = monomer->next)
+    monomer->outCount = 0;
+
+/* Increase output count each time a monomer is used. */
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    monomer = node->val;
+    monomer->outCount += 1;
+    }
+
+/* Count up unused. */
+int missing = 0;
+for (monomer = monomerList; monomer != NULL; monomer = monomer->next)
+    if (monomer->outCount == 0 && !sameString(monomer->word, ""))
+        missing += 1;
+
+return missing;
+}
+
+void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
+/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+ * been parsed into various repeat monomer variants.  Cycles of these variants tend to
+ * form higher order repeats. */
+{
+/* This routine reads in the input,  and then calls a routine that produces the
+ * output for a given size.  If not all monomers are included in the output, then it
+ * will try to find the minimum output size needed to include all monomers. */
+
+/* Read in inputs, and put in "store" */
+struct alphaStore *store = alphaStoreNew(maxChainSize);
+alphaReadListFromFile(readsFile, store);
+alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
+verbose(2, "Loaded input reads and monomer order\n");
+
+if (betweens)
+    {
+    store->markovChainsNoOrphans = makeMarkovChains(store);
+    verbose(2, "Made initial markov chains\n");
+    }
+
+/* Do some cross checking and filling out a bit for missing data. */
+crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile);
+fillInTypes(store);
+integrateOrphans(store);
+verbose(2, "Filled in missing types and integrated orphan ends\n");
+
+/* Make the main markov chains out of the reads. */
+store->markovChains = makeMarkovChains(store);
+verbose(2, "Made Markov Chains\n");
+
+/* Loop gradually increasing size of output we make until get to maxOutSize or until
+ * we generate output that includes all input monomers. */
+struct dlList *ll;
+int outSize = initialOutSize;
+while (outSize <= maxOutSize)
+    {
+    ll = alphaAsmOneSize(store, outSize);
+    assert(outSize == dlCount(ll));
+    int missing = countMissingMonomers(ll, store->monomerList);
+    if (missing <= maxToMiss)
+        break;
+    if (outSize == maxOutSize)
+	{
+	errAbort("Could not include all monomers. Missing %d.\n"
+	         "consider increasing maxOutSize (now %d) or increasing maxToMiss (now %d)",
+	         missing, maxOutSize, maxToMiss);
+        break;
+	}
+    dlListFree(&ll);
+    outSize *= 1.1;	/* Grow by 10% */
+    if (outSize > maxOutSize)
+        outSize = maxOutSize;
+    verbose(1, "%d missing. Trying again with outSize=%d (initialOutSize %d, maxOutSize %d)\n",
+	missing, outSize, initialOutSize, maxOutSize);
+    }
+writeMonomerListAndBetweens(store, outFile, ll);
+verbose(2, "Wrote primary output\n");
+dlListFree(&ll);
+}
+
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 4)
+    usage();
+maxChainSize = optionInt("size", maxChainSize);
+initialOutSize = optionInt("initialOutSize", initialOutSize);
+maxOutSize = optionInt("maxOutSize", initialOutSize);  // Defaults to same as initialOutSize
+if (maxOutSize < initialOutSize)
+    errAbort("maxOutSize (%d) needs to be bigger than initialOutSize (%d)\n", 
+	maxOutSize, initialOutSize);
+maxToMiss = optionInt("maxToMiss", maxToMiss);
+fullOnly = optionExists("fullOnly");
+pseudoCount = optionInt("pseudoCount", pseudoCount);
+betweens = optionExists("betweens");
+int seed = optionInt("seed", 0);
+srand(seed);
+alphaAsm(argv[1], argv[2], argv[3]);
+return 0;
+}