bd11b61b73ac03b8f3a1787bea84b36cd5332457 kent Wed Jun 19 13:18:03 2013 -0700 Moving alphaAsm to linearSat, a better name.
diff --git src/kehayden/linearSat/alphaAsm.c src/kehayden/linearSat/alphaAsm.c
deleted file mode 100644
index 9ce4ab6..0000000
--- src/kehayden/linearSat/alphaAsm.c
+++ /dev/null
@@ -1,1832 +0,0 @@
-/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
- * been parsed into various repeat monomer variants. Cycles of these variants tend to
- * form higher order repeats. */
-
-#include "common.h"
-#include "linefile.h"
-#include "hash.h"
-#include "options.h"
-#include "dystring.h"
-#include "dlist.h"
-#include "obscure.h"
-#include "errabort.h"
-
-/* Global vars - all of which can be set by command line options. */
-int pseudoCount = 1;
-int maxChainSize = 3;
-int initialOutSize = 10000;
-int maxOutSize;
-int maxToMiss = 0;
-boolean fullOnly = FALSE;
-boolean betweens = FALSE;
-
-void usage()
-/* Explain usage and exit. */
-{
-errAbort(
-  "alphaAsm - create a linear projection of alpha satellite arrays using the probabilistic model\n"
-  "of HuRef satellite graphs based on Sanger style reads.\n"
-  "usage:\n"
-  "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
-  "Where alphaMonFile is a list of reads with alpha chain monomers parsed out, one read per\n"
-  "line, with the first word in the line being the read id and subsequent words the monomers\n"
-  "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
-  "each variant. The lines are in the same order the monomers appear, with the last line\n"
-  "being followed in order by the first since they repeat. The output.txt contains one line\n"
-  "for each monomer in the output, just containing the monomer symbol.\n"
-  "options:\n"
-  "   -size=N - Set max chain size, default %d\n"
-  "   -fullOnly - Only write chains of the full -size to the chain file\n"
-  "   -chain=fileName - Write out word chain to file\n"
-  "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
-  "   -initialOutSize=N - Output this many words. Default %d\n"
-  "   -maxOutSize=N - Expand output up to this many words if not all monomers output initially\n"
-  "   -maxToMiss=N - How many monomers may be missing from output. Default %d\n"
-  "   -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n"
-  "                    observed. Default %d\n"
-  "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
-  "             it will generate different results each time it's run.\n"
-  "   -betweens - Add <m> lines in output that indicate level of markov model used to join\n"
-  "   -unsubbed=fileName - Write output before substitution of missing monomers to file\n"
-  , maxChainSize, initialOutSize, maxToMiss, pseudoCount
-  );
-}
-
-/* Command line validation table. */
-static struct optionSpec options[] = {
-   {"size", OPTION_INT},
-   {"chain", OPTION_STRING},
-   {"afterChain", OPTION_STRING},
-   {"fullOnly", OPTION_BOOLEAN},
-   {"initialOutSize", OPTION_INT},
-   {"maxOutSize", OPTION_INT},
-   {"maxToMiss", OPTION_INT},
-   {"pseudoCount", OPTION_INT},
-   {"seed", OPTION_INT},
-   {"unsubbed", OPTION_STRING},
-   {"betweens", OPTION_BOOLEAN},
-   {NULL, 0},
-};
-
-/* Some structures to keep track of words (which correspond to alpha satellite monomers)
- * seen in input. */
-
-struct monomer
-/* Basic information on a monomer including how many times it is seen in input and output
- * streams. Unlike the wordTree, this is flat, and does not include predecessors.
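- * The useCount comes from the input reads; outTarget is that count rescaled to the
- * requested output size, while outCount and subbedOutCount track how often the monomer
- * actually lands in the generated output before and after rare-monomer substitution.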
*/ - { - struct monomer *next; /* Next in list of all words. */ - char *word; /* The word used to represent monomer. Not allocated here. */ - int useCount; /* Number of times used. */ - int outTarget; /* Number of times want to output word. */ - int outCount; /* Number of times have output word so far. */ - struct monomerType *type; /* The type of the monomer. */ - struct slRef *readList; /* List of references to reads this is in. */ - int subbedOutCount; /* Output count after substitution. */ - }; - -struct monomerRef -/* A reference to a monomer. */ - { - struct monomerRef *next; /* Next in list */ - struct monomer *val; /* The word referred to. */ - }; - -struct monomerType -/* A collection of words of the same type - or monomers of same type. */ - { - struct monomerType *next; /* Next monomerType */ - char *name; /* Short name of type */ - struct monomerRef *list; /* List of all words of that type */ - }; - -struct alphaRead -/* A read that's been parsed into alpha variants. */ - { - struct alphaRead *next; /* Next in list */ - char *name; /* Id of read, not allocated here. */ - struct monomerRef *list; /* List of alpha subunits */ - }; - -struct alphaStore -/* Stores info on all monomers */ - { - struct monomer *monomerList; /* List of words, fairly arbitrary order. */ - struct hash *monomerHash; /* Hash of monomer, keyed by word. */ - struct alphaRead *readList; /* List of all reads. */ - struct hash *readHash; /* Hash of all reads keyed by read ID. */ - struct wordTree *markovChains; /* Tree of words that follow other words. */ - struct wordTree *markovChainsNoOrphans; /* Markov tree before orphans are added. */ - int maxChainSize; /* Maximum depth of tree. */ - struct monomerType *typeList; /* List of all types. */ - struct hash *typeHash; /* Hash with monomerType values, keyed by all words. */ - int typeCount; /* Count of types. */ - struct monomerType **typeArray; /* Array as opposed to list of types */ - }; - -/* The wordTree structure below is the central data structure for this program. It is - * used to build up a tree that contains all observed N-word-long sequences observed in - * the text, where N corresponds to the "size" command line option which defaults to 3, - * an option that in turn is stored in the maxChainSize variable. At this chain size the - * text - * this is the black dog and the black cat - * would have the chains - * this is the - * is the black - * the black dog - * black dog and - * dog and the - * and the black - * the black cat - * and turn into the tree - * this - * is - * the - * is - * the - * black - * the - * black - * dog - * cat - * black - * dog - * and - * dog - * and - * the - * and - * the - * black - * Note how the tree is able to compress the two chains "the black dog" and "the black cat." - * - * A node in the tree can have as many children as it needs to at each node. The depth of - * the tree is the same as the chain size, by default 3. At each node in the tree you get - * a word, and a list of all words that are observed in the text to follow that word. - * - * Once the program has build up the wordTree, it can output it in a couple of fashions. */ - -struct wordTree -/* A node in a tree of words. The head of the tree is a node with word value the empty string. */ - { - struct wordTree *next; /* Next sibling */ - struct wordTree *children; /* Children in list. */ - struct wordTree *parent; /* Parent of this node or NULL for root. */ - struct monomer *monomer; /* The info on the word itself. 
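-    * This points into the store's flat monomer list; the tree itself never allocates monomers.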
*/ - int useCount; /* Number of times word used in input with given predecessors. */ - int outTarget; /* Number of times want to output word with given predecessors. */ - int outCount; /* Number of times actually output with given predecessors. */ - double normVal; /* value to place the normalization value */ - }; - -struct wordTree *wordTreeNew(struct monomer *monomer) -/* Create and return new wordTree element. */ -{ -struct wordTree *wt; -AllocVar(wt); -wt->monomer = monomer; -return wt; -} - -int wordTreeCmpWord(const void *va, const void *vb) -/* Compare two wordTree for slSort. */ -{ -const struct wordTree *a = *((struct wordTree **)va); -const struct wordTree *b = *((struct wordTree **)vb); -return cmpStringsWithEmbeddedNumbers(a->monomer->word, b->monomer->word); -} - -static void wordTreeSort(struct wordTree *wt) -/* Sort all children lists in tree. */ -{ -slSort(&wt->children, wordTreeCmpWord); -struct wordTree *child; -for (child = wt->children; child != NULL; child = child->next) - wordTreeSort(child); -} - -struct wordTree *wordTreeFindInList(struct wordTree *list, struct monomer *monomer) -/* Return wordTree element in list that has given word, or NULL if none. */ -{ -struct wordTree *wt; -for (wt = list; wt != NULL; wt = wt->next) - if (wt->monomer == monomer) - break; -return wt; -} - -struct wordTree *wordTreeAddFollowing(struct wordTree *wt, struct monomer *monomer) -/* Make word follow wt in tree. If word already exists among followers - * return it and bump use count. Otherwise create new one. */ -{ -struct wordTree *child = wordTreeFindInList(wt->children, monomer); -if (child == NULL) - { - child = wordTreeNew(monomer); - child->parent = wt; - slAddHead(&wt->children, child); - } -child->useCount += 1; -return child; -} - -int wordTreeSumUseCounts(struct wordTree *list) -/* Sum up useCounts in list */ -{ -int total = 0; -struct wordTree *wt; -for (wt = list; wt != NULL; wt = wt->next) - total += wt->useCount; -return total; -} - -int wordTreeSumOutTargets(struct wordTree *list) -/* Sum up useCounts in list */ -{ -int total = 0; -struct wordTree *wt; -for (wt = list; wt != NULL; wt = wt->next) - total += wt->outTarget; -return total; -} - -void addChainToTree(struct wordTree *wt, struct dlList *chain) -/* Add chain of words to tree. */ -{ -struct dlNode *node; -wt->useCount += 1; -for (node = chain->head; !dlEnd(node); node = node->next) - { - struct monomer *monomer = node->val; - verbose(4, " adding %s\n", monomer->word); - wt = wordTreeAddFollowing(wt, monomer); - } -} - -int useCountInTree(struct wordTree *wt, struct dlNode *nodeList, int chainSize) -/* Return number of times chainSize successive nodes from nodeList are found - * in wt, 0 if not found. */ -{ -int i; -struct wordTree *subTree = wt; -struct dlNode *node = nodeList; -for (i=0; i<chainSize; ++i) - { - struct monomer *monomer = node->val; - subTree = wordTreeFindInList(subTree->children, monomer); - if (subTree == NULL) - return FALSE; - node = node->next; - } -return subTree->useCount; -} - - -void findLongestSupportingMarkovChain(struct wordTree *wt, struct dlNode *node, - int *retChainSize, int *retReadCount) -/* See if chain of words is in tree tree. 
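- * Walks backwards from node, growing the chain of monomers ending at node for as long as
- * the chain is still found in the tree. *retChainSize gets one more than the longest
- * supported chain, and *retReadCount the number of reads supporting that longest chain.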
*/ -{ -struct dlNode *start = node; -int chainSize = 1; -int readCount = 0; -for (;;) - { - int useCount = useCountInTree(wt, start, chainSize); - if (useCount == 0) - break; - readCount = useCount; - chainSize += 1; - start = start->prev; - if (dlStart(start)) - break; - } -*retChainSize = chainSize; -*retReadCount = readCount; -} - -static void writeMonomerListAndBetweens(struct alphaStore *store, - char *fileName, struct dlList *ll) -/* Write out monomer list to file. */ -{ -FILE *f = mustOpen(fileName, "w"); -struct dlNode *node; -struct wordTree *origTree = store->markovChainsNoOrphans; -for (node = ll->head; !dlEnd(node); node = node->next) - { - struct monomer *monomer = node->val; - if (betweens) - { - int chainSize = 0, readCount = 0; - findLongestSupportingMarkovChain(origTree, node, - &chainSize, &readCount); - /* The -2 is for 1 extra for the empty tree root, and 1 extra to get - * from chain-size to markov model terminology. */ - char between[24]; - safef(between, sizeof(between), "<%d:%d:%d>", chainSize-2, - readCount, useCountInTree(origTree, node, 1)); - fprintf(f, "%-11s\t", between); - } - fprintf(f, "%s\n", monomer->word); - } -carefulClose(&f); -} - - -int wordTreeAddPseudoCount(struct wordTree *wt, int pseudo) -/* Add pseudo to all leaves of tree and propagate counts up to parents. */ -{ -if (wt->children == NULL) - { - wt->useCount += pseudo; - return wt->useCount; - } -else - { - struct wordTree *child; - int oldChildTotal = 0; - for (child = wt->children; child != NULL; child = child->next) - oldChildTotal += child->useCount; - int oldDiff = wt->useCount - oldChildTotal; - if (oldDiff < 0) oldDiff = 0; // Necessary in rare cases, not sure why. - - int total = 0; - for (child = wt->children; child != NULL; child = child->next) - total += wordTreeAddPseudoCount(child, pseudo); - wt->useCount = total + oldDiff + pseudo; - return total; - } -} - -void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal) -/* Recursively set wt->normVal and wt->outTarget so each branch gets its share */ -{ -if (pseudoCount > 0) - wordTreeAddPseudoCount(wt, pseudoCount); -wt->normVal = normVal; -wt->outTarget = outTarget; -wt->outCount = 0; -int childrenTotalUses = wordTreeSumUseCounts(wt->children); -struct wordTree *child; -for (child = wt->children; child != NULL; child = child->next) - { - double childRatio = (double)child->useCount / childrenTotalUses; - wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal); - } -} - -char *wordTreeString(struct wordTree *wt) -/* Return something like '(a b c)' where c would be the value at wt itself, and - * a and b would be gotten by following parents. */ -{ -struct slName *list = NULL, *el; -for (;wt != NULL; wt = wt->parent) - { - char *word = wt->monomer->word; - if (!isEmpty(word)) // Avoid blank great grandparent at root of tree. - slNameAddHead(&list, word); - } - -struct dyString *dy = dyStringNew(0); -dyStringAppendC(dy, '('); -for (el = list; el != NULL; el = el->next) - { - dyStringPrintf(dy, "%s", el->name); - if (el->next != NULL) - dyStringAppendC(dy, ' '); - } -dyStringAppendC(dy, ')'); -slFreeList(&list); -return dyStringCannibalize(&dy); -} - -char *dlListFragWords(struct dlNode *head) -/* Return string containing all words in list pointed to by head. 
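- * The result looks like "{m1 m2 m3}" (hypothetical monomer names) and is only used for
- * verbose debugging output.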
*/ -{ -struct dyString *dy = dyStringNew(0); -dyStringAppendC(dy, '{'); -struct dlNode *node; -for (node = head; !dlEnd(node); node = node->next) - { - if (node != head) - dyStringAppendC(dy, ' '); - struct monomer *monomer = node->val; - dyStringAppend(dy, monomer->word); - } -dyStringAppendC(dy, '}'); -return dyStringCannibalize(&dy); -} - -void wordTreeDump(int level, struct wordTree *wt, int maxChainSize, FILE *f) -/* Write out wordTree to file. */ -{ -static char *words[64]; -int i; -assert(level < ArraySize(words)); - -words[level] = wt->monomer->word; -if (!fullOnly || level == maxChainSize) - { - fprintf(f, "%d\t%d\t%d\t%d\t%f\t", - level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal); - - for (i=1; i<=level; ++i) - { - spaceOut(f, level*2); - fprintf(f, "%s ", words[i]); - } - fprintf(f, "\n"); - } -struct wordTree *child; -for (child = wt->children; child != NULL; child = child->next) - wordTreeDump(level+1, child, maxChainSize, f); -} - -struct wordTree *pickWeightedRandomFromList(struct wordTree *list) -/* Pick word from list randomly, but so that words with higher outTargets - * are picked more often. */ -{ -struct wordTree *picked = NULL; - -/* Debug output. */ - { - verbose(4, " pickWeightedRandomFromList("); - struct wordTree *wt; - for (wt = list; wt != NULL; wt = wt->next) - { - if (wt != list) - verbose(4, " "); - verbose(4, "%s:%d", wt->monomer->word, wt->outTarget); - } - verbose(4, ") total %d\n", wordTreeSumOutTargets(list)); - } - -/* Figure out total number of outputs left, and a random number between 0 and that total. */ -int total = wordTreeSumOutTargets(list); -if (total > 0) - { - int threshold = rand() % total; - verbose(4, " threshold %d\n", threshold); - - /* Loop through list returning selection corresponding to random threshold. */ - int binStart = 0; - struct wordTree *wt; - for (wt = list; wt != NULL; wt = wt->next) - { - int size = wt->outTarget; - int binEnd = binStart + size; - verbose(4, " %s size %d, binEnd %d\n", wt->monomer->word, size, binEnd); - if (threshold < binEnd) - { - picked = wt; - verbose(4, " picked %s\n", wt->monomer->word); - break; - } - binStart = binEnd; - } - } -return picked; -} - -int totUseZeroCount = 0; - -struct monomer *pickRandomFromType(struct monomerType *type) -/* Pick a random word, weighted by outTarget, from all available of given type */ -{ -/* Figure out total on list */ -int total = 0; -struct monomerRef *ref; -for (ref = type->list; ref != NULL; ref = ref->next) - total += ref->val->outTarget; -verbose(3, "pickRandomFromType %s total of outTarget=%d\n", type->name, total); - -/* Loop through list returning selection corresponding to random threshold. */ -if (total > 0) - { - int threshold = rand() % total; - int binStart = 0; - for (ref = type->list; ref != NULL; ref = ref->next) - { - struct monomer *monomer = ref->val; - int size = monomer->outTarget; - int binEnd = binStart + size; - if (threshold < binEnd) - return monomer; - binStart = binEnd; - } - } - -verbose(3, "Backup plan for %s in pickRandomFromType\n", type->name); - -/* Try again based on use counts. 
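- * This fallback matters late in output generation, when every monomer of the type may
- * already have had its outTarget decremented to zero.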
*/ -total = 0; -for (ref = type->list; ref != NULL; ref = ref->next) - total += ref->val->useCount; -if (total > 0) - { - int threshold = rand() % total; - int binStart = 0; - for (ref = type->list; ref != NULL; ref = ref->next) - { - struct monomer *monomer = ref->val; - int size = monomer->useCount; - int binEnd = binStart + size; - if (threshold < binEnd) - return monomer; - binStart = binEnd; - } - } - -errAbort("Fell off end in pickRandomFromType on %s", type->name); -return NULL; -} - -struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) -/* Predict next word given tree and recently used word list. If tree doesn't - * have statistics for what comes next given the words in list, then it returns - * NULL. */ -{ -if (verboseLevel() >= 4) - verbose(4, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list)); -struct dlNode *node; -for (node = list; !dlEnd(node); node = node->next) - { - struct monomer *monomer = node->val; - wt = wordTreeFindInList(wt->children, monomer); - if (verboseLevel() >= 4) - verbose(4, " wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt)); - if (wt == NULL || wt->children == NULL) - break; - } -struct wordTree *result = NULL; -if (wt != NULL && wt->children != NULL) - result = pickWeightedRandomFromList(wt->children); -return result; -} - -struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent) -/* Predict next word given tree and recently used word list. Will use all words in - * recent list if can, but if there is not data in tree, will back off, and use - * progressively less previous words. */ -{ -struct dlNode *node; -if (verboseLevel() >= 4) - verbose(4, " predictNextFromWordTree(%s)\n", wordTreeString(wt)); - -for (node = recent; !dlEnd(node); node = node->next) - { - struct wordTree *result = predictNextFromAllPredecessors(wt, node); - if (result != NULL) - return result; - } -return NULL; -} - -struct dlNode *nodesFromTail(struct dlList *list, int count) -/* Return count nodes from tail of list. If list is shorter than count, it returns the - * whole list. */ -{ -int i; -struct dlNode *node; -for (i=0, node = list->tail; i<count; ++i) - { - if (dlStart(node)) - return list->head; - node = node->prev; - } -return node; -} - -struct monomerType *typeAfter(struct alphaStore *store, struct monomerType *oldType, - int amount) -/* Skip forward amount in typeList from oldType. If get to end of list start - * again at the beginning */ -{ -int i; -struct monomerType *type = oldType; -for (i=0; i<amount; ++i) - { - type = type->next; - if (type == NULL) - type = store->typeList; - } -return type; -} - -struct monomerType *typeBefore(struct alphaStore *store, struct monomerType *oldType, int amount) -/* Pick type that comes amount before given type. */ -{ -int typeIx = ptArrayIx(oldType, store->typeArray, store->typeCount); -typeIx -= amount; -while (typeIx < 0) - typeIx += store->typeCount; -return store->typeArray[typeIx]; -} - -struct wordTree *predictFromPreviousTypes(struct alphaStore *store, struct dlList *past) -/* Predict next based on general pattern of monomer order. */ -{ -/* This routine is now a bit more complex than necessary but it still works. It - * these days only needs to look back 1 because of the read-based type fill-in. 
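- * It scans back from the end of the output for the nearest monomer with a known type,
- * advances that type by the distance scanned, and then picks a random monomer of the
- * resulting type weighted by outTarget.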
*/ -int maxToLookBack = 3; - { /* Debugging block */ - struct dlNode *node = nodesFromTail(past, maxToLookBack); - verbose(4, "predictFromPreviousTypes("); - for (; !dlEnd(node); node = node->next) - { - struct monomer *monomer = node->val; - verbose(4, "%s, ", monomer->word); - } - verbose(4, ")\n"); - } -int pastDepth; -struct dlNode *node; -for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth) - { - if (dlStart(node)) - { - verbose(3, "predictFromPreviousTypes with empty past\n"); - return NULL; - } - struct monomer *monomer = node->val; - struct monomerType *prevType = hashFindVal(store->typeHash, monomer->word); - if (prevType != NULL) - { - struct monomerType *curType = typeAfter(store, prevType, pastDepth); - struct monomer *curInfo = pickRandomFromType(curType); - struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo); - verbose(4, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", - pastDepth, curInfo->word, curTree, curTree->monomer->word); - return curTree; - } - node = node->prev; - } -verbose(3, "predictFromPreviousTypes past all unknown types\n"); -return NULL; -} - - -struct wordTree *predictNext(struct alphaStore *store, struct dlList *past) -/* Given input data store and what is known from the past, predict the next word. */ -{ -struct dlNode *recent = nodesFromTail(past, store->maxChainSize); -struct wordTree *pick = predictFromWordTree(store->markovChains, recent); -if (pick == NULL) - pick = predictFromPreviousTypes(store, past); -if (pick == NULL) - { - pick = pickWeightedRandomFromList(store->markovChains->children); - verbose(3, "in predictNext() last resort pick of %s\n", pick->monomer->word); - } -return pick; -} - -void decrementOutputCountsInTree(struct wordTree *wt) -/* Decrement output count of self and parents. */ -{ -while (wt != NULL) - { - /* Decrement target count, but don't let it fall below sum of counts of all children. - * This can happen with incomplete data if we don't prevent it. This - * same code also prevents us from having negative outTarget. */ - int outTarget = wt->outTarget - 1; - int kidSum = wordTreeSumOutTargets(wt->children); - if (outTarget < kidSum) - outTarget = kidSum; - wt->outTarget = outTarget; - - /* Always bump outCount to help us debug (only real use of outCount). */ - wt->outCount += 1; - wt = wt->parent; - } -} - -static struct dlList *wordTreeGenerateList(struct alphaStore *store, int maxSize, - struct wordTree *firstWord, int maxOutputWords) -/* Make a list of words based on probabilities in tree. */ -{ -struct dlList *ll = dlListNew(); -int chainSize = 0; -int outputWords = 0; -struct dlNode *chainStartNode = NULL; - -for (;;) - { - if (++outputWords > maxOutputWords) - break; - struct dlNode *node; - struct wordTree *picked; - - /* Get next predicted word. */ - AllocVar(node); - if (chainSize == 0) - { - chainStartNode = node; - ++chainSize; - picked = firstWord; - } - else if (chainSize >= maxSize) - { - chainStartNode = chainStartNode->next; - picked = predictNext(store, ll); - } - else - { - picked = predictNext(store, ll); - ++chainSize; - } - - if (picked == NULL) - break; - - /* Add word from whatever level we fetched back to our output chain. 
*/ - struct monomer *monomer = picked->monomer; - node->val = monomer; - dlAddTail(ll, node); - - decrementOutputCountsInTree(picked); - monomer->outTarget -= 1; - monomer->outCount += 1; - } -verbose(3, "totUseZeroCount = %d\n", totUseZeroCount); -return ll; -} - - -struct slRef *listUnusedMonomers(struct alphaStore *store, struct dlList *ll) -/* Return list of references to monomers that are in store but not on ll. */ -{ -struct slRef *refList = NULL, *ref; -struct hash *usedHash = hashNew(0); -struct dlNode *node; -/* Make hash of used monomers. */ -for (node = ll->head; !dlEnd(node); node = node->next) - { - struct monomer *monomer = node->val; - hashAdd(usedHash, monomer->word, monomer); - } - -/* Stream through all monomers looking for ones not used and adding to list. */ -struct monomer *monomer; -for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) - { - if (isEmpty(monomer->word)) - continue; - if (!hashLookup(usedHash, monomer->word)) - { - AllocVar(ref); - ref->val = monomer; - slAddHead(&refList, ref); - } - } -hashFree(&usedHash); -slReverse(&refList); -return refList; -} - - -int monomerRefIx(struct monomerRef *list, struct monomer *monomer) -/* Return index of monomer in list. */ -{ -int i; -struct monomerRef *ref; -for (i=0, ref = list; ref != NULL; ref = ref->next, ++i) - { - if (ref->val == monomer) - return i; - } -errAbort("Monomer %s not on list\n", monomer->word); -return -1; -} - -struct monomerRef *findNeighborhoodFromReads(struct monomer *center) -/* Find if possible one monomer to either side of center */ -{ -struct slRef *readRef; -struct monomerRef *before = NULL, *after = NULL; - -/* Loop through reads hoping to find a case where center is flanked by two monomers in - * same read. As a fallback, keep track of a monomer before and a monomer after in - * any read. */ -for (readRef = center->readList; readRef != NULL; readRef = readRef->next) - { - struct alphaRead *read = readRef->val; - int readSize = slCount(read->list); - int centerIx = monomerRefIx(read->list, center); - if (readSize >= 3 && centerIx > 0 && centerIx < readSize-1) - { - before = slElementFromIx(read->list, centerIx-1); - after = slElementFromIx(read->list, centerIx+1); - break; - } - else if (readSize >= 2) - { - if (centerIx == 0) - after = slElementFromIx(read->list, centerIx+1); - else - before = slElementFromIx(read->list, centerIx-1); - } - } - -/* Make up list from end to start. */ -struct monomerRef *retList = NULL, *monoRef; -if (after) - { - AllocVar(monoRef); - monoRef->val = after->val; - slAddHead(&retList, monoRef); - } -AllocVar(monoRef); -monoRef->val = center; -slAddHead(&retList, monoRef); -if (before) - { - AllocVar(monoRef); - monoRef->val = before->val; - slAddHead(&retList, monoRef); - } -return retList; -} - -struct dlNode *matchExceptCenter(struct dlNode *node, struct monomerRef *neighborhood, struct monomer *center) -/* Return center node if node matches neighborhood except for center. */ -{ -struct dlNode *centerNode = NULL; -struct monomerRef *neighbor; -struct dlNode *n; -for (n = node, neighbor=neighborhood; ; n = n->next, neighbor = neighbor->next) - { - if (neighbor == NULL) - break; - if (dlEnd(n)) - return NULL; - if (neighbor->val == center) - centerNode = n; - else if (n->val != neighbor->val) - return NULL; - } -return centerNode; -} - -void printMonomerRefList(struct monomerRef *refList, FILE *f) -/* Print out a line to file with the list of monomers. 
*/ -{ -struct monomerRef *ref; -for (ref = refList; ref != NULL; ref = ref->next) - fprintf(f, "%s ", ref->val->word); -fprintf(f, "\n"); -} - -struct slRef *refsToPossibleCenters(struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) -/* Return a list of dlNodes where neighborhood, but not center matches. */ -{ -struct slRef *list = NULL; -struct dlNode *node; -for (node = ll->head; !dlEnd(node); node = node->next) - { - struct dlNode *centerNode = matchExceptCenter(node, neighborhood, center); - if (centerNode != NULL) - refAdd(&list, centerNode); - } -return list; -} - -void mostCommonMonomerWord(struct slRef *refList, char **retWord, int *retCount) -/* Given refs to dlNodes containing monomers, find word associated with most common monomer. */ -{ -/* Make up a hash that contains counts of all monomers. */ -struct hash *countHash = hashNew(0); -struct slRef *ref; -for (ref = refList; ref != NULL; ref = ref->next) - { - struct dlNode *node = ref->val; - struct monomer *monomer = node->val; - hashIncInt(countHash, monomer->word); - } - -/* Iterate through hash finding max value. */ -char *maxWord = NULL; -int maxCount = 0; -struct hashEl *hel; -struct hashCookie cookie = hashFirst(countHash); -while ((hel = hashNext(&cookie)) != NULL) - { - int count = ptToInt(hel->val); - if (count > maxCount) - { - maxCount = count; - maxWord = hel->name; - } - } -*retWord = maxWord; -*retCount = maxCount; -hashFree(&countHash); -} - -boolean subCommonCenter(struct alphaStore *store, - struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) -/* Scan list for places where have all items in neighborhood (except for center) matching. - * Substitute in center at one of these places chosen at random and return TRUE if possible. */ -{ -struct slRef *centerRefList = refsToPossibleCenters(center, neighborhood, ll); -verbose(4, "sub %s in neighborhood: ", center->word); -if (verboseLevel() >= 4) - printMonomerRefList(neighborhood, stderr); -verbose(4, "Got %d possible centers\n", slCount(centerRefList)); - -if (centerRefList == NULL) - return FALSE; -int commonCount = 0; -char *commonWord = NULL; -mostCommonMonomerWord(centerRefList, &commonWord, &commonCount); -struct monomer *commonMonomer = hashFindVal(store->monomerHash, commonWord); -verbose(4, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount); -if (commonMonomer->subbedOutCount < 2) - { - verbose(3, "Want to substitute %s for %s, but %s only occurs %d time.\n", - center->word, commonWord, commonWord, commonMonomer->subbedOutCount); - return FALSE; - } - -/* Select a random one of the most commonly occuring possible centers. */ -int targetIx = rand() % commonCount; -struct slRef *ref; -int currentIx = 0; -for (ref = centerRefList; ref != NULL; ref = ref->next) - { - struct dlNode *node = ref->val; - struct monomer *monomer = node->val; - if (sameString(monomer->word, commonWord)) - { - if (currentIx == targetIx) - { - verbose(3, "Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood)); - struct monomer *oldCenter = node->val; - if (oldCenter->type != center->type) - verbose(3, "Type mismatch subbig %s vs %s\n", oldCenter->word, center->word); - node->val = center; - oldCenter->subbedOutCount -= 1; - center->subbedOutCount += 1; - return TRUE; - } - ++currentIx; - } - } -internalErr(); // Should not get here. 
-return FALSE; -} - -boolean subCenterInNeighborhood(struct alphaStore *store, - struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) -/* Scan ll for cases where neighborhood around center matches. Replace one of these - * cases with center. */ -{ -int size = slCount(neighborhood); -assert(size == 3 || size == 2); -if (subCommonCenter(store, center, neighborhood, ll)) - return TRUE; -if (size == 3) - { - if (subCommonCenter(store, center, neighborhood->next, ll)) - return TRUE; - struct monomerRef *third = neighborhood->next->next; - neighborhood->next->next = NULL; - boolean ok = subCommonCenter(store, center, neighborhood, ll); - neighborhood->next->next = third; - return ok; - } -else - return FALSE; -} - -struct monomer *mostCommonInType(struct monomerType *type) -/* Return most common monomer of given type */ -{ -struct monomerRef *ref; -int commonCount = 0; -struct monomer *common = NULL; -for (ref = type->list; ref != NULL; ref = ref->next) - { - struct monomer *monomer = ref->val; - if (monomer->subbedOutCount > commonCount) - { - commonCount = monomer->subbedOutCount; - common = monomer; - } - } -return common; -} - -boolean subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, - struct dlList *ll) -/* Substitute unused for first occurence of most common monomer of same type. */ -{ -struct monomer *common = mostCommonInType(unused->type); -if (common == NULL) - { - verbose(3, "No monomers of type %s used at all!\n", unused->type->name); - return FALSE; - } -if (common->subbedOutCount < 2) - { - verbose(3, "Trying to sub in %s, but there's no monomers of type %s that are used more than once.\n", - unused->word, unused->type->name); - return FALSE; - } -struct dlNode *node; -for (node = ll->head; !dlEnd(node); node = node->next) - { - struct monomer *monomer = node->val; - if (monomer == common) - { - verbose(3, "Subbing %s for %s of type %s\n", unused->word, monomer->word, unused->type->name); - node->val = unused; - unused->subbedOutCount += 1; - monomer->subbedOutCount -= 1; - break; - } - } -return TRUE; -} - -void setInitialSubbedOutCount(struct alphaStore *store, struct dlList *ll) -/* Set subbedOutCount based on how many times monomers occur in list. */ -{ -struct dlNode *node; -for (node = ll->head; !dlEnd(node); node = node->next) - { - struct monomer *monomer = node->val; - monomer->subbedOutCount += 1; - } -#ifdef PARANOID -/* As a check see if subbedOutCount agrees with outCount. */ -int mismatchCount = 0; -struct monomer *monomer; -for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) - { - uglyf("%s %d %d\n", monomer->word, monomer->outCount, monomer->subbedOutCount); - if (monomer->outCount != monomer->subbedOutCount) - ++mismatchCount; - } -uglyf("mismatch count = %d\n", mismatchCount); -#endif /* PARANOID */ -} - -void subInMissing(struct alphaStore *store, struct dlList *ll) -/* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. 
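- * For each unused monomer it first tries to swap it in where the flanking monomers seen
- * in its reads already occur in the output; failing that, it replaces the first occurrence
- * of the most common already-output monomer of the same type.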
*/ -{ -setInitialSubbedOutCount(store, ll); -struct slRef *unusedList = listUnusedMonomers(store, ll); -verbose(3, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList)); -struct slRef *unusedRef; -for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next) - { - struct monomer *unused = unusedRef->val; - struct monomerRef *neighborhood = findNeighborhoodFromReads(unused); - if (!subCenterInNeighborhood(store, unused, neighborhood, ll)) - { - verbose(3, "Couldn't substitute in %s with context, falling back to type logic\n", - unused->word); - subIntoFirstMostCommonOfType(store, unused, ll); - } - slFreeList(&neighborhood); - } -} - -static void writeMonomerList(char *fileName, struct dlList *ll) -/* Write out monomer list to file. */ -{ -FILE *f = mustOpen(fileName, "w"); -struct dlNode *node; -for (node = ll->head; !dlEnd(node); node = node->next) - { - struct monomer *monomer = node->val; - fprintf(f, "%s\n", monomer->word); - } -carefulClose(&f); -} - -static struct dlList *wordTreeMakeList(struct alphaStore *store, int maxSize, - struct wordTree *firstWord, int maxOutputWords) -/* Create word list base on tree probabilities, and then substituting in unused - * monomers if possible. */ -{ -struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords); -verbose(2, "Generated initial output list\n"); -char *unsubbedFile = optionVal("unsubbed", NULL); -if (unsubbedFile) - writeMonomerList(unsubbedFile, ll); -subInMissing(store, ll); -verbose(2, "Substituted in missing rare monomers\n"); -return ll; -} - - -struct alphaStore *alphaStoreNew(int maxChainSize) -/* Allocate and initialize a new word store. */ -{ -struct alphaStore *store; -AllocVar(store); -store->maxChainSize = maxChainSize; -store->monomerHash = hashNew(0); -store->typeHash = hashNew(0); -store->readHash = hashNew(0); -return store; -} - -struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word) -/* Add word to store, incrementing it's useCount if it's already there, otherwise - * making up a new record for it. */ -{ -struct monomer *monomer = hashFindVal(store->monomerHash, word); -if (monomer == NULL) - { - AllocVar(monomer); - hashAddSaveName(store->monomerHash, word, monomer, &monomer->word); - slAddHead(&store->monomerList, monomer); - } -monomer->useCount += 1; -monomer->outTarget = monomer->useCount; /* Set default value, may be renormalized. */ -return monomer; -} - -void connectReadsToMonomers(struct alphaStore *store) -/* Add each read monomer appears in to monomer's readList. */ -{ -struct alphaRead *read; -for (read = store->readList; read != NULL; read = read->next) - { - struct monomerRef *ref; - for (ref = read->list; ref != NULL; ref = ref->next) - { - struct monomer *monomer = ref->val; - refAdd(&monomer->readList, read); - } - } -} - -void alphaReadListFromFile(char *fileName, struct alphaStore *store) -/* Read in file and turn it into a list of alphaReads. */ -{ -/* Stuff for processing file a line at a time. */ -struct lineFile *lf = lineFileOpen(fileName, TRUE); -char *line, *word; - -/* Loop through each line of file making up a read for it. */ -struct alphaRead *readList = NULL; -while (lineFileNextReal(lf, &line)) - { - /* Parse out first word of line into name, and rest into list. 
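-     * A line such as "read1 m1a m2b m3c" (hypothetical ids) yields the read name "read1"
-     * and a three-element monomer list.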
*/ - char *name = nextWord(&line); - struct monomerRef *list = NULL; - while ((word = nextWord(&line)) != NULL) - { - struct monomer *monomer = alphaStoreAddMonomer(store, word); - struct monomerRef *ref; - AllocVar(ref); - ref->val = monomer; - slAddHead(&list, ref); - } - slReverse(&list); - if (list == NULL) - errAbort("Line %d of %s has no alpha monomers.", lf->lineIx, lf->fileName); - - /* Create data structure and add read to list and hash */ - if (hashLookup(store->readHash, name)) - errAbort("Read ID %s is repeated line %d of %s", name, lf->lineIx, lf->fileName); - struct alphaRead *read; - AllocVar(read); - read->list = list; - hashAddSaveName(store->readHash, name, read, &read->name); - slAddHead(&readList, read); - } -slReverse(&readList); -store->readList = readList; -if (store->readList == NULL) - errAbort("%s contains no reads", lf->fileName); -lineFileClose(&lf); -connectReadsToMonomers(store); -} - -struct alphaOrphan -/* Information about an orphan - something without great joining information. */ - { - struct alphaOrphan *next; /* Next in list */ - struct monomer *monomer; /* Pointer to orphan */ - boolean paired; /* True if has been paired with somebody. */ - }; - -struct alphaOrphan *findOrphanStarts(struct alphaRead *readList, struct alphaStore *store) -/* Return list of monomers that are found only at start of reads. */ -{ -struct alphaOrphan *orphanList = NULL; - -/* Build up hash of things seen later in reads. */ -struct alphaRead *read; -struct hash *later = hashNew(0); /* Hash of monomers found later. */ -for (read = readList; read != NULL; read = read->next) - { - struct monomerRef *ref; - for (ref = read->list->next; ref != NULL; ref = ref->next) - { - hashAdd(later, ref->val->word, NULL); - } - } - -/* Pass through again collecting orphans. */ -struct hash *orphanHash = hashNew(0); -for (read = readList; read != NULL; read = read->next) - { - struct monomer *monomer = read->list->val; - char *word = monomer->word; - if (!hashLookup(later, word)) - { - struct alphaOrphan *orphan = hashFindVal(orphanHash, word); - if (orphan == NULL) - { - AllocVar(orphan); - orphan->monomer = monomer; - slAddHead(&orphanList, orphan); - hashAdd(orphanHash, word, orphan); - verbose(3, "Adding orphan start %s\n", word); - } - } - } -hashFree(&later); -hashFree(&orphanHash); -slReverse(&orphanList); -return orphanList; -} - -struct alphaOrphan *findOrphanEnds(struct alphaRead *readList, struct alphaStore *store) -/* Return list of monomers that are found only at end of reads. */ -{ -struct alphaOrphan *orphanList = NULL; - -/* Build up hash of things seen sooner in reads. */ -struct alphaRead *read; -struct hash *sooner = hashNew(0); /* Hash of monomers found sooner. */ -for (read = readList; read != NULL; read = read->next) - { - struct monomerRef *ref; - /* Loop over all but last. */ - for (ref = read->list; ref->next != NULL; ref = ref->next) - { - hashAdd(sooner, ref->val->word, NULL); - } - } - -/* Pass through again collecting orphans. 
*/ -struct hash *orphanHash = hashNew(0); -for (read = readList; read != NULL; read = read->next) - { - struct monomerRef *lastRef = slLastEl(read->list); - struct monomer *monomer = lastRef->val; - if (!hashLookup(sooner, monomer->word)) - { - struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word); - if (orphan == NULL) - { - AllocVar(orphan); - orphan->monomer = monomer; - slAddHead(&orphanList, orphan); - hashAdd(orphanHash, monomer->word, orphan); - verbose(3, "Adding orphan end %s\n", monomer->word); - } - } - } -hashFree(&sooner); -slReverse(&orphanList); -hashFree(&orphanHash); -return orphanList; -} - -struct alphaRead *addReadOfTwo(struct alphaStore *store, struct monomer *a, struct monomer *b) -/* Make up a fake read that goes from a to b and add it to store. */ -{ -/* Create fake name and start fake read with it. */ -static int fakeId = 0; -char name[32]; -safef(name, sizeof(name), "fake%d", ++fakeId); - -/* Allocate fake read. */ -struct alphaRead *read; -AllocVar(read); - -/* Add a and b to the read. */ -struct monomerRef *aRef, *bRef; -AllocVar(aRef); -aRef->val = a; -AllocVar(bRef); -bRef->val = b; -aRef->next = bRef; -read->list = aRef; - -/* Add read to the store. */ -slAddHead(&store->readList, read); -hashAddSaveName(store->readHash, name, read, &read->name); - -/* Add read to the monomers. */ -refAdd(&a->readList, read); -refAdd(&b->readList, read); - -a->useCount += 1; -b->useCount += 1; -return read; -} - -void integrateOrphans(struct alphaStore *store) -/* Make up fake reads that integrate orphans (monomers only found at beginning or end - * of a read) better. */ -{ -struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store); -struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store); -verbose(3, "orphanStarts has %d items, orphanEnds %d\n", - slCount(orphanStarts), slCount(orphanEnds)); - -/* First look for the excellent situation where you can pair up orphans with each - * other. For Y at least this happens half the time. */ -struct alphaOrphan *start, *end; -for (start = orphanStarts; start != NULL; start = start->next) - { - struct monomerType *startType = start->monomer->type; - if (startType == NULL) - continue; - for (end = orphanEnds; end != NULL && !start->paired; end = end->next) - { - struct monomerType *endType = end->monomer->type; - if (endType == NULL) - continue; - if (!end->paired) - { - struct monomerType *nextType = typeAfter(store, endType, 1); - if (nextType == startType) - { - end->paired = TRUE; - start->paired = TRUE; - addReadOfTwo(store, end->monomer, start->monomer); - verbose(3, "Pairing %s with %s\n", end->monomer->word, start->monomer->word); - } - } - } - } - -/* Ok, now have to just manufacture other side of orphan starts out of thin air. */ -for (start = orphanStarts; start != NULL; start = start->next) - { - if (start->paired) - continue; - struct monomer *startMono = start->monomer; - struct monomerType *startType = startMono->type; - if (startType == NULL) - continue; - - struct monomerType *newType = typeBefore(store, startType, 1); - verbose(3, "Trying to find end of type %s\n", newType->name); - struct monomer *newMono = pickRandomFromType(newType); - addReadOfTwo(store, newMono, startMono); - verbose(3, "Pairing new %s with start %s\n", newMono->word, startMono->word); - } - -/* Ok, now have to just manufacture other side of orphan ends out of thin air. 
*/ -for (end = orphanEnds; end != NULL; end = end->next) - { - if (end->paired) - continue; - struct monomer *endMono = end->monomer; - struct monomerType *endType = endMono->type; - if (endType == NULL) - continue; - - struct monomerType *newType = typeAfter(store, endType, 1); - verbose(3, "Trying to find start of type %s\n", newType->name); - struct monomer *newMono = pickRandomFromType(newType); - addReadOfTwo(store, endMono, newMono); - verbose(3, "Pairing end %s with new %s\n", endMono->word, newMono->word); - } -} - -struct wordTree *makeMarkovChains(struct alphaStore *store) -/* Return wordTree containing all words, and also all chains-of-words of length - * chainSize seen in file. */ -{ -/* We'll build up the tree starting with an empty root node. */ -struct wordTree *wt = wordTreeNew(alphaStoreAddMonomer(store, "")); -int chainSize = store->maxChainSize; - -/* Loop through each read. There's special cases at the beginning and end of read, and for - * short reads. In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize - * words, * popping off one word from the start, and adding one word to the end for each - * new word we encounter. This list is added to the tree each iteration. */ -struct alphaRead *read; -for (read = store->readList; read != NULL; read = read->next) - { - /* We'll keep a chain of three or so words in a doubly linked list. */ - struct dlNode *node; - struct dlList *chain = dlListNew(); - int curSize = 0; - int wordCount = 0; - - struct monomerRef *ref; - for (ref = read->list; ref != NULL; ref = ref->next) - { - struct monomer *monomer = ref->val; - /* For the first few words in the file after ID, we'll just build up the chain, - * only adding it to the tree when we finally do get to the desired - * chain size. Once past the initial section of the file we'll be - * getting rid of the first link in the chain as well as adding a new - * last link in the chain with each new word we see. */ - if (curSize < chainSize) - { - dlAddValTail(chain, monomer); - ++curSize; - if (curSize == chainSize) - addChainToTree(wt, chain); - } - else - { - /* Reuse doubly-linked-list node, but give it a new value, as we move - * it from head to tail of list. */ - node = dlPopHead(chain); - node->val = monomer; - dlAddTail(chain, node); - addChainToTree(wt, chain); - } - ++wordCount; - } - /* Handle last few words in line, where can't make a chain of full size. Also handles - * lines that have fewer than chain size words. */ - if (curSize < chainSize) - addChainToTree(wt, chain); - while ((node = dlPopHead(chain)) != NULL) - { - if (!dlEmpty(chain)) - addChainToTree(wt, chain); - freeMem(node); - } - dlListFree(&chain); - } -wordTreeSort(wt); // Make output of chain file prettier -return wt; -} - -void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName) -/* Write out tree to file */ -{ -FILE *f = mustOpen(fileName, "w"); -fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n"); -wordTreeDump(0, wt, maxChainSize, f); -carefulClose(&f); -} - -void alphaStoreLoadMonomerOrder(struct alphaStore *store, char *readsFile, char *fileName) -/* Read in a file with one line for each monomer type, where first word is type name, - * and rest of words are all the actually variants of that type. - * Requires all variants already be in store. The readsFile is passed - * just for nicer error reporting. */ -{ -/* Stuff for processing file a line at a time. 
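- * Each line of the order file looks like "m1 m1a m1b m1c" (hypothetical names): a type
- * name followed by its variant monomers.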
*/ -struct lineFile *lf = lineFileOpen(fileName, TRUE); -char *line, *word; -struct hash *uniq = hashNew(0); - -/* Set up variables we'll put results in in store. */ -store->typeList = NULL; - -while (lineFileNextReal(lf, &line)) - { - char *name = nextWord(&line); - if (hashLookup(uniq, name) != NULL) - errAbort("Type name '%s' repeated line %d of %s\n", name, lf->lineIx, lf->fileName); - struct monomerType *type; - AllocVar(type); - type->name = cloneString(name); - slAddHead(&store->typeList, type); - while ((word = nextWord(&line)) != NULL) - { - struct monomer *monomer = hashFindVal(store->monomerHash, word); - if (monomer == NULL) - errAbort("%s is in %s but not %s", word, lf->fileName, readsFile); - struct monomerRef *ref; - AllocVar(ref); - ref->val = monomer; - slAddHead(&type->list, ref); - hashAddUnique(store->typeHash, word, type); - } - if (type->list == NULL) - errAbort("Short line %d of %s. Format should be:\ntype list-of-monomers-of-type\n", - lf->lineIx, lf->fileName); - } -slReverse(&store->typeList); -if (store->typeList == NULL) - errAbort("%s is empty", lf->fileName); -lineFileClose(&lf); -hashFree(&uniq); -verbose(3, "Added %d types containing %d words from %s\n", - slCount(store->typeList), store->typeHash->elCount, fileName); - -/* Create type array */ -store->typeCount = slCount(store->typeList); -struct monomerType **types = AllocArray(store->typeArray, store->typeCount); -struct monomerType *type; -int i; -for (i=0, type = store->typeList; i<store->typeCount; ++i, type = type->next) - types[i] = type; -} - -void crossCheckMonomerOrderAndReads(struct alphaStore *store, char *orderedPrefix, char *readFile, char *typeFile) -/* Make sure all monomer that begin with ordered prefix are present in monomer order file. */ -{ -/* Make hash of all monomers that have type info. */ -struct hash *orderHash = hashNew(0); -struct monomerType *type; -for (type = store->typeList; type != NULL; type = type->next) - { - struct monomerRef *ref; - for (ref = type->list; ref != NULL; ref = ref->next) - hashAdd(orderHash, ref->val->word, ref->val); - } - -/* Go through all monomers and make sure ones with correct prefix are in list. */ -struct monomer *mon; -for (mon = store->monomerList; mon != NULL; mon = mon->next) - { - char *word = mon->word; - if (startsWith(orderedPrefix, word) && !hashLookup(orderHash, word)) - { - errAbort("%s is in %s but not %s", word, readFile, typeFile); - } - } -hashFree(&orderHash); -} - - -void monomerListNormalise(struct monomer *list, int totalCount, int outputSize) -/* Set outTarget field in all of list to be normalized to outputSize */ -{ -struct monomer *monomer; -double scale = (double)outputSize/totalCount; -for (monomer = list; monomer != NULL; monomer = monomer->next) - { - monomer->outCount = 0; - monomer->outTarget = round(scale * monomer->useCount); - } -} - -void alphaStoreNormalize(struct alphaStore *store, int outputSize) -/* Set up output counts on each word to make sure it gets it's share of output size */ -{ -struct wordTree *wt = store->markovChains; -wordTreeNormalize(wt, outputSize, 1.0); -monomerListNormalise(store->monomerList, wt->useCount, outputSize); -} - -struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store) -/* Look through read list trying to find best supported type for this monomer. */ -{ -/* The algorithm here could be better. It picks the closest nearby monomer that is typed - * to fill in from. 
When have information from both before and after the program arbitrarily - * uses before preferentially. When have information that is equally close from multiple reads, - * it just uses info from first read. Fortunately the data, at least on Y, shows a lot of - * consistency, so it's probably not actually worth a more sophisticated algorithm that would - * instead take the most popular choice rather than the first one. */ -verbose(3, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList)); -struct monomerType *bestType = NULL; -int bestPos = 0; -boolean bestIsAfter = FALSE; -struct slRef *readRef; -for (readRef = monomer->readList; readRef != NULL; readRef = readRef->next) - { - struct alphaRead *read = readRef->val; - struct monomerRef *ref; - { - verbose(3, " read %s (%d els):", read->name, slCount(read->list)); - for (ref = read->list; ref != NULL; ref = ref->next) - { - struct monomer *m = ref->val; - verbose(3, " %s", m->word); - } - verbose(3, "\n"); - } - struct monomerType *beforeType = NULL; - int beforePos = 0; - - /* Look before. */ - for (ref = read->list; ref != NULL; ref = ref->next) - { - struct monomer *m = ref->val; - if (m == monomer) - break; - if (m->type != NULL) - { - beforeType = m->type; - beforePos = 0; - } - ++beforePos; - } - - /* Look after. */ - struct monomerType *afterType = NULL; - int afterPos = 0; - assert(ref != NULL && ref->val == monomer); - for (ref = ref->next; ref != NULL; ref = ref->next) - { - struct monomer *m = ref->val; - ++afterPos; - if (m->type != NULL) - { - afterType = m->type; - break; - } - } - - if (beforeType != NULL) - { - if (bestType == NULL || beforePos < bestPos) - { - bestType = beforeType; - bestPos = beforePos; - bestIsAfter = FALSE; - } - } - - if (afterType != NULL) - { - if (bestType == NULL || afterPos < bestPos) - { - bestType = afterType; - bestPos = afterPos; - bestIsAfter = TRUE; - } - } - } - -/* Now have found a type that is at a known position relative to ourselves. From this - * infer our own type. */ -struct monomerType *chosenType = NULL; -if (bestType != NULL) - { - if (bestIsAfter) - chosenType = typeBefore(store, bestType, bestPos); - else - chosenType = typeAfter(store, bestType, bestPos); - } -verbose(3, "chosenType for %s is %s\n", monomer->word, - (bestType == NULL ? "(null)" : chosenType->name)); -return chosenType; -} - -void fillInTypes(struct alphaStore *store) -/* We have types for most but not all monomers. Try and fill out types for most of - * the rest by looking for position in reads they are in that do contain known types */ -{ -/* Do first pass on ones defined - just have to look them up in hash. */ -struct monomer *monomer; -for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) - monomer->type = hashFindVal(store->typeHash, monomer->word); - -/* Do second pass on ones that are not yet defined */ -for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) - { - if (monomer->type == NULL) - { - monomer->type = fillInTypeFromReads(monomer, store); - } - } -} - -struct dlList *alphaAsmOneSize(struct alphaStore *store, int outSize) -/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have - * been parsed into various repeat monomer variants. Cycles of these variants tend to - * form higher order repeats. Returns list of outSize monomers. 
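- * Normalizes the store to outSize, optionally dumps the chain tree (-chain), generates the
- * output list starting from a weighted-random first monomer, substitutes in any unused
- * monomers, and optionally dumps the tree again (-afterChain).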
*/ -{ -struct wordTree *wt = store->markovChains; -alphaStoreNormalize(store, outSize); -verbose(2, "Normalized Markov Chains\n"); - -if (optionExists("chain")) - { - char *fileName = optionVal("chain", NULL); - wordTreeWrite(wt, store->maxChainSize, fileName); - } - -struct dlList *ll = wordTreeMakeList(store, store->maxChainSize, - pickWeightedRandomFromList(wt->children), outSize); - -if (optionExists("afterChain")) - { - char *fileName = optionVal("afterChain", NULL); - wordTreeWrite(wt, store->maxChainSize, fileName); - } - -return ll; -} - -int countMissingMonomers(struct dlList *ll, struct monomer *monomerList) -/* Set the outCounts in monomer list to match how often they occur in ll. - * Return number of monomers that do not occur at all in ll. */ -{ -/* Zero out output counts. */ -struct monomer *monomer; -for (monomer = monomerList; monomer != NULL; monomer = monomer->next) - monomer->outCount = 0; - -/* Increase output count each time a monomer is used. */ -struct dlNode *node; -for (node = ll->head; !dlEnd(node); node = node->next) - { - monomer = node->val; - monomer->outCount += 1; - } - -/* Count up unused. */ -int missing = 0; -for (monomer = monomerList; monomer != NULL; monomer = monomer->next) - if (monomer->outCount == 0 && !sameString(monomer->word, "")) - missing += 1; - -return missing; -} - -void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) -/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have - * been parsed into various repeat monomer variants. Cycles of these variants tend to - * form higher order repeats. */ -{ -/* This routine reads in the input, and then calls a routine that produces the - * output for a given size. If not all monomers are included in the output, then it - * will try to find the minimum output size needed to include all monomers. */ - -/* Read in inputs, and put in "store" */ -struct alphaStore *store = alphaStoreNew(maxChainSize); -alphaReadListFromFile(readsFile, store); -alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); -verbose(2, "Loaded input reads and monomer order\n"); - -if (betweens) - { - store->markovChainsNoOrphans = makeMarkovChains(store); - verbose(2, "Made initial markov chains\n"); - } - -/* Do some cross checking and filling out a bit for missing data. */ -crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile); -fillInTypes(store); -integrateOrphans(store); -verbose(2, "Filled in missing types and integrated orphan ends\n"); - -/* Make the main markov chains out of the reads. */ -store->markovChains = makeMarkovChains(store); -verbose(2, "Made Markov Chains\n"); - -/* Loop gradually increasing size of output we make until get to maxOutSize or until - * we generate output that includes all input monomers. */ -struct dlList *ll; -int outSize = initialOutSize; -while (outSize <= maxOutSize) - { - ll = alphaAsmOneSize(store, outSize); - assert(outSize == dlCount(ll)); - int missing = countMissingMonomers(ll, store->monomerList); - if (missing <= maxToMiss) - break; - if (outSize == maxOutSize) - { - errAbort("Could not include all monomers. Missing %d.\n" - "consider increasing maxOutSize (now %d) or increasing maxToMiss (now %d)", - missing, maxOutSize, maxToMiss); - break; - } - dlListFree(&ll); - outSize *= 1.1; /* Grow by 10% */ - if (outSize > maxOutSize) - outSize = maxOutSize; - verbose(1, "%d missing. 
Trying again with outSize=%d (initialOutSize %d, maxOutSize %d)\n",
-	missing, outSize, initialOutSize, maxOutSize);
-    }
-writeMonomerListAndBetweens(store, outFile, ll);
-verbose(2, "Wrote primary output\n");
-dlListFree(&ll);
-}
-
-
-int main(int argc, char *argv[])
-/* Process command line. */
-{
-optionInit(&argc, argv, options);
-if (argc != 4)
-    usage();
-maxChainSize = optionInt("size", maxChainSize);
-initialOutSize = optionInt("initialOutSize", initialOutSize);
-maxOutSize = optionInt("maxOutSize", initialOutSize);	// Defaults to same as initialOutSize
-if (maxOutSize < initialOutSize)
-    errAbort("maxOutSize (%d) needs to be at least as big as initialOutSize (%d)\n",
-	maxOutSize, initialOutSize);
-maxToMiss = optionInt("maxToMiss", maxToMiss);
-fullOnly = optionExists("fullOnly");
-pseudoCount = optionInt("pseudoCount", pseudoCount);
-betweens = optionExists("betweens");
-int seed = optionInt("seed", 0);
-srand(seed);
-alphaAsm(argv[1], argv[2], argv[3]);
-return 0;
-}