3ddccd174261d84b45708e2818103cfff0cf3a30 kent Tue Sep 11 12:06:58 2012 -0700 Renaming and otherwise refactoring alphaChain program into alphaAsm. Making it hold reads in a list for multiple passes. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c new file mode 100644 index 0000000..7a39680 --- /dev/null +++ src/kehayden/alphaAsm/alphaAsm.c @@ -0,0 +1,887 @@ +/* alphaAsm - Predicts faux centromere sequences using a probablistic model. */ + +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "dystring.h" +#include "dlist.h" + +/* Global vars - all of which can be set by command line options. */ +int maxChainSize = 3; +int outSize = 10000; +boolean fullOnly = FALSE; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n" + "of HuRef satellite graphs\n" + "usage:\n" + " alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n" + "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n" + "line, with the first word in the line being the read id and subsequent words the monomers\n" + "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n" + "each variant. The lines are in the same order the monomers appear, with the last line\n" + "being followed in order by the first since they repeat. The output.txt contains one line\n" + "for each monomer in the output, just containing the monomer symbol.\n" + "options:\n" + " -size=N - Set max chain size, default %d\n" + " -fullOnly - Only output chains of size above\n" + " -chain=fileName - Write out word chain to file\n" + " -afterChain=fileName - Write out word chain after faux generation to file for debugging\n" + " -outSize=N - Output this many words.\n" + " -seed=N - Initialize random number with this seed for consistent results, otherwise\n" + " it will generate different results each time it's run.\n" + , maxChainSize + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"size", OPTION_INT}, + {"chain", OPTION_STRING}, + {"afterChain", OPTION_STRING}, + {"fullOnly", OPTION_BOOLEAN}, + {"outSize", OPTION_INT}, + {"seed", OPTION_INT}, + {NULL, 0}, +}; + +/* Some structures to keep track of words (which correspond to alpha satellight monomers) + * seen in input. */ + +struct monomer +/* Basic information on a monomer including how many times it is seen in input and output + * streams. Unlike the wordTree, this is flat, and does not include predecessors. */ + { + struct monomer *next; /* Next in list of all words. */ + char *word; /* The word itself. Not allocated here. */ + int useCount; /* Number of times used. */ + int outTarget; /* Number of times want to output word. */ + int outCount; /* Number of times have output word so far. */ + }; + +struct monomerRef +/* A reference to a word. */ + { + struct monomerRef *next; /* Next in list */ + struct monomer *val; /* The word referred to. */ + }; + +struct monomerType +/* A collection of words of the same type - or monomers of same type. */ + { + struct monomerType *next; /* Next monomerType */ + struct monomerRef *list; /* List of all words of that type */ + }; + +struct alphaRead +/* A read that's been parsed into alpha variants. */ + { + struct alphaRead *next; /* Next in list */ + char *name; /* Id of read */ + struct monomerRef *list; /* List of alpha subunits */ + }; + +struct alphaStore +/* Stores info on all words */ + { + struct monomer *monomerList; /* List of words, fairly arbitrary order. */ + struct hash *monomerHash; /* Hash of monomer, keyed by word. */ + struct alphaRead *readList; /* List of all reads. */ + struct hash *readHash; /* Hash of all reads keyed by read ID. */ + struct wordTree *markovChains; /* Tree of words that follow other words. */ + int maxChainSize; /* Maximum depth of tree. */ + struct monomerType *typeList; /* List of all types. */ + struct hash *typeHash; /* Hash with monomerType values, keyed by all words. */ + }; + +/* The wordTree structure below is the central data structure for this program. It is + * used to build up a tree that contains all observed N-word-long sequences observed in + * the text, where N corresponds to the "size" command line option which defaults to 3, + * an option that in turn is stored in the maxChainSize variable. At this chain size the + * text + * this is the black dog and the black cat + * would have the chains + * this is the + * is the black + * the black dog + * black dog and + * dog and the + * and the black + * the black cat + * and turn into the tree + * this + * is + * the + * is + * the + * black + * the + * black + * dog + * cat + * black + * dog + * and + * dog + * and + * the + * and + * the + * black + * Note how the tree is able to compress the two chains "the black dog" and "the black cat." + * + * A node in the tree can have as many children as it needs to at each node. The depth of + * the tree is the same as the chain size, by default 3. At each node in the tree you get + * a word, and a list of all words that are observed in the text to follow that word. + * + * Once the program has build up the wordTree, it can output it in a couple of fashions. */ + +struct wordTree +/* A node in a tree of words. The head of the tree is a node with word value the empty string. */ + { + struct wordTree *next; /* Next sibling */ + struct wordTree *children; /* Children in list. */ + struct wordTree *parent; /* Parent of this node or NULL for root. */ + struct monomer *monomer; /* The info on the word itself. */ + int useCount; /* Number of times word used in input with given predecessors. */ + int outTarget; /* Number of times want to output word with given predecessors. */ + int outCount; /* Number of times actually output with given predecessors. */ + double normVal; /* value to place the normalization value */ + }; + +struct wordTree *wordTreeNew(struct monomer *monomer) +/* Create and return new wordTree element. */ +{ +struct wordTree *wt; +AllocVar(wt); +wt->monomer = monomer; +return wt; +} + +int wordTreeCmpWord(const void *va, const void *vb) +/* Compare two wordTree for slSort. */ +{ +const struct wordTree *a = *((struct wordTree **)va); +const struct wordTree *b = *((struct wordTree **)vb); +return cmpStringsWithEmbeddedNumbers(a->monomer->word, b->monomer->word); +} + +struct wordTree *wordTreeFindInList(struct wordTree *list, struct monomer *monomer) +/* Return wordTree element in list that has given word, or NULL if none. */ +{ +struct wordTree *wt; +for (wt = list; wt != NULL; wt = wt->next) + if (wt->monomer == monomer) + break; +return wt; +} + +struct wordTree *wordTreeAddFollowing(struct wordTree *wt, struct monomer *monomer) +/* Make word follow wt in tree. If word already exists among followers + * return it and bump use count. Otherwise create new one. */ +{ +struct wordTree *child = wordTreeFindInList(wt->children, monomer); +if (child == NULL) + { + child = wordTreeNew(monomer); + child->parent = wt; + slAddHead(&wt->children, child); + } +child->useCount += 1; +return child; +} + +int wordTreeSumUseCounts(struct wordTree *list) +/* Sum up useCounts in list */ +{ +int total = 0; +struct wordTree *wt; +for (wt = list; wt != NULL; wt = wt->next) + total += wt->useCount; +return total; +} + +int wordTreeSumOutTargets(struct wordTree *list) +/* Sum up useCounts in list */ +{ +int total = 0; +struct wordTree *wt; +for (wt = list; wt != NULL; wt = wt->next) + total += wt->outTarget; +return total; +} + +void addChainToTree(struct wordTree *wt, struct dlList *chain) +/* Add chain of words to tree. */ +{ +struct dlNode *node; +wt->useCount += 1; +for (node = chain->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + verbose(2, " adding %s\n", monomer->word); + wt = wordTreeAddFollowing(wt, monomer); + } +} + +void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal) +/* Recursively set wt->normVal and wt->outTarget so each branch gets its share */ +{ +wt->normVal = normVal; +wt->outTarget = outTarget; +int childrenTotalUses = wordTreeSumUseCounts(wt->children); +struct wordTree *child; +for (child = wt->children; child != NULL; child = child->next) + { + double childRatio = (double)child->useCount / childrenTotalUses; + wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal); + } +} + +char *wordTreeString(struct wordTree *wt) +/* Return something like '(a b c)' where c would be the value at wt itself, and + * a and b would be gotten by following parents. */ +{ +struct slName *list = NULL, *el; +for (;wt != NULL; wt = wt->parent) + { + char *word = wt->monomer->word; + if (!isEmpty(word)) // Avoid blank great grandparent + slNameAddHead(&list, word); + } + +struct dyString *dy = dyStringNew(0); +dyStringAppendC(dy, '('); +for (el = list; el != NULL; el = el->next) + { + dyStringPrintf(dy, "%s", el->name); + if (el->next != NULL) + dyStringAppendC(dy, ' '); + } +dyStringAppendC(dy, ')'); +slFreeList(&list); +return dyStringCannibalize(&dy); +} + +char *dlListFragWords(struct dlNode *head) +/* Return string containing all words in list pointed to by head. */ +{ +struct dyString *dy = dyStringNew(0); +dyStringAppendC(dy, '{'); +struct dlNode *node; +for (node = head; !dlEnd(node); node = node->next) + { + if (node != head) + dyStringAppendC(dy, ' '); + struct monomer *monomer = node->val; + dyStringAppend(dy, monomer->word); + } +dyStringAppendC(dy, '}'); +return dyStringCannibalize(&dy); +} + +void wordTreeDump(int level, struct wordTree *wt, int maxChainSize, FILE *f) +/* Write out wordTree to file. */ +{ +static char *words[64]; +int i; +assert(level < ArraySize(words)); + +words[level] = wt->monomer->word; +if (!fullOnly || level == maxChainSize) + { + fprintf(f, "%d\t%d\t%d\t%d\t%f\t", + level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal); + + for (i=1; i<=level; ++i) + { + spaceOut(f, level*2); + fprintf(f, "%s ", words[i]); + } + fprintf(f, "\n"); + } +struct wordTree *child; +for (child = wt->children; child != NULL; child = child->next) + wordTreeDump(level+1, child, maxChainSize, f); +} + +struct wordTree *pickRandomOnOutTarget(struct wordTree *list) +/* Pick word from list randomly, but so that words with higher outTargets + * are picked more often. */ +{ +struct wordTree *picked = NULL; + +/* Debug output. */ + { + verbose(2, " pickRandomOnOutTarget("); + struct wordTree *wt; + for (wt = list; wt != NULL; wt = wt->next) + { + if (wt != list) + verbose(2, " "); + verbose(2, "%s:%d", wt->monomer->word, wt->outTarget); + } + verbose(2, ") total %d\n", wordTreeSumOutTargets(list)); + } + +/* Figure out total number of outputs left, and a random number between 0 and that total. */ +int total = wordTreeSumOutTargets(list); +if (total > 0) + { + int threshold = rand() % total; + verbose(2, " threshold %d\n", threshold); + + /* Loop through list returning selection corresponding to random threshold. */ + int binStart = 0; + struct wordTree *wt; + for (wt = list; wt != NULL; wt = wt->next) + { + int size = wt->outTarget; + int binEnd = binStart + size; + verbose(2, " %s size %d, binEnd %d\n", wt->monomer->word, size, binEnd); + if (threshold < binEnd) + { + picked = wt; + verbose(2, " picked %s\n", wt->monomer->word); + break; + } + binStart = binEnd; + } + } +return picked; +} + +struct wordTree *pickRandomOnUseCounts(struct wordTree *list) +/* Pick word from list randomly, but so that words with higher useCounts + * are picked more often. Much like above routine, but a little simple + * since we know useCounts are non-zero. */ +{ +struct wordTree *picked = NULL; + +/* Figure out total number and a random number between 0 and that total. */ +int total = wordTreeSumUseCounts(list); +assert(total > 0); +int threshold = rand() % total; + +/* Loop through list returning selection corresponding to random threshold. */ +int binStart = 0; +struct wordTree *wt; +for (wt = list; wt != NULL; wt = wt->next) + { + int size = wt->useCount; + int binEnd = binStart + size; + if (threshold < binEnd) + { + picked = wt; + break; + } + binStart = binEnd; + } +assert(picked != NULL); +return picked; +} + +int totUseZeroCount = 0; + +struct wordTree *pickRandom(struct wordTree *list) +/* Pick word from list randomly, but so that words more + * commonly seen are picked more often. */ +{ +struct wordTree *picked = pickRandomOnOutTarget(list); + +/* If did not find anything, that's ok. It can happen on legitimate input due to unevenness + * of read coverage. In this case we pick a random number based on original counts + * rather than normalized/counted down counts. */ +if (picked == NULL) + { + picked = pickRandomOnUseCounts(list); + ++totUseZeroCount; + } +return picked; +} + +struct monomer *pickRandomFromType(struct monomerType *type) +/* Pick a random word, weighted by outTarget, from all available of given type */ +{ +/* Figure out total on list */ +int total = 0; +struct monomerRef *ref; +for (ref = type->list; ref != NULL; ref = ref->next) + total += ref->val->outTarget; + +/* Loop through list returning selection corresponding to random threshold. */ +int threshold = rand() % total; +int binStart = 0; +for (ref = type->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + int size = monomer->outTarget; + int binEnd = binStart + size; + if (threshold < binEnd) + return monomer; + binStart = binEnd; + } + +verbose(2, "Fell off end in pickRandomFromType\n"); +return type->list->val; // Fall back position (necessary?) just first in list +} + +struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) +/* Predict next word given tree and recently used word list. If tree doesn't + * have statistics for what comes next given the words in list, then it returns + * NULL. */ +{ +verbose(2, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list)); +struct dlNode *node; +for (node = list; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + wt = wordTreeFindInList(wt->children, monomer); + verbose(2, " wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt)); + if (wt == NULL || wt->children == NULL) + break; + } +struct wordTree *result = NULL; +if (wt != NULL && wt->children != NULL) + result = pickRandom(wt->children); +return result; +} + +struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent) +/* Predict next word given tree and recently used word list. Will use all words in + * recent list if can, but if there is not data in tree, will back off, and use + * progressively less previous words until ultimately it just picks a random + * word. */ +{ +struct dlNode *node; + +for (node = recent; !dlEnd(node); node = node->next) + { + struct wordTree *result = predictNextFromAllPredecessors(wt, node); + if (result != NULL) + return result; + } +return NULL; +} + +struct dlNode *nodesFromTail(struct dlList *list, int count) +/* Return count nodes from tail of list. If list is shorter than count, it returns the + * whole list. */ +{ +int i; +struct dlNode *node; +for (i=0, node = list->tail; ihead; + node = node->prev; + } +return node; +} + +struct monomerType *advanceTypeWithWrap(struct monomerType *typeList, struct monomerType *startType, + int amountToAdvance) +/* Skip forward amountToAdvance in typeList from startType. If get to end of list start + * again at the beginning */ +{ +int i; +struct monomerType *type = startType; +for (i=0; inext; + if (type == NULL) + type = typeList; + } +return type; +} + +struct wordTree *predictFromPreviousTypes(struct alphaStore *store, struct dlList *past) +/* Predict next based on general pattern of monomer order. */ +{ +int maxToLookBack = 3; + { /* Debugging block */ + struct dlNode *node = nodesFromTail(past, maxToLookBack); + verbose(3, "predictFromPreviousTypes("); + for (; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + verbose(3, "%s, ", monomer->word); + } + verbose(3, ")\n"); + } +int pastDepth; +struct dlNode *node; +for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth) + { + if (dlStart(node)) + { + verbose(2, "predictFromPreviousTypes with empty past\n"); + return NULL; + } + struct monomer *monomer = node->val; + struct monomerType *prevType = hashFindVal(store->typeHash, monomer->word); + if (prevType != NULL) + { + struct monomerType *curType = advanceTypeWithWrap(store->typeList, prevType, pastDepth); + struct monomer *curInfo = pickRandomFromType(curType); + struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo); + verbose(3, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", + pastDepth, curInfo->word, curTree, curTree->monomer->word); + return curTree; + } + node = node->prev; + } +verbose(2, "predictFromPreviousTypes past all unknown types\n"); +return NULL; +} + + +struct wordTree *predictNext(struct alphaStore *store, struct dlList *past) +/* Given input data store and what is known from the past, predict the next word. */ +{ +struct dlNode *recent = nodesFromTail(past, store->maxChainSize); +struct wordTree *pick = predictFromWordTree(store->markovChains, recent); +if (pick == NULL) + pick = predictFromPreviousTypes(store, past); +if (pick == NULL) + { + pick = pickRandom(store->markovChains->children); + verbose(2, "in predictNext() last resort pick of %s\n", pick->monomer->word); + } +return pick; +} + +void decrementOutputCountsInTree(struct wordTree *wt) +/* Decrement output count of self and parents. */ +{ +while (wt != NULL) + { + /* Decrement target count, but don't let it fall below sum of counts of all children. + * This can happen with incomplete data if we don't prevent it. This + * same code also prevents us from having negative outTarget. */ + int outTarget = wt->outTarget - 1; + int kidSum = wordTreeSumOutTargets(wt->children); + if (outTarget < kidSum) + outTarget = kidSum; + wt->outTarget = outTarget; + + /* Always bump outCount for debugging. */ + wt->outCount += 1; + wt = wt->parent; + } +} + +static struct dlList *wordTreeGenerateList(struct alphaStore *store, int maxSize, + struct wordTree *firstWord, int maxOutputWords) +/* Make a list of words based on probabilities in tree. */ +{ +struct dlList *ll = dlListNew(); +int chainSize = 0; +int outputWords = 0; +struct dlNode *chainStartNode = NULL; + +for (;;) + { + if (++outputWords > maxOutputWords) + break; + struct dlNode *node; + struct wordTree *picked; + + /* Get next predicted word. */ + AllocVar(node); + if (chainSize == 0) + { + chainStartNode = node; + ++chainSize; + picked = firstWord; + } + else if (chainSize >= maxSize) + { + chainStartNode = chainStartNode->next; + picked = predictNext(store, ll); + } + else + { + picked = predictNext(store, ll); + ++chainSize; + } + + if (picked == NULL) + break; + + + /* Add word from whatever level we fetched back to our output chain. */ + struct monomer *monomer = picked->monomer; + node->val = monomer; + dlAddTail(ll, node); + + decrementOutputCountsInTree(picked); + monomer->outTarget -= 1; + monomer->outCount += 1; + } +verbose(2, "totUseZeroCount = %d\n", totUseZeroCount); +return ll; +} + +static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, + int maxOutputWords, char *fileName) +/* Create file containing words base on tree probabilities. The wordTreeGenerateList does + * most of work. */ +{ +struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords); +FILE *f = mustOpen(fileName, "w"); +struct dlNode *node; +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + fprintf(f, "%s\n", monomer->word); + } +carefulClose(&f); +dlListFree(&ll); +} + + +static void wordTreeSort(struct wordTree *wt) +/* Sort all children lists in tree. */ +{ +slSort(&wt->children, wordTreeCmpWord); +struct wordTree *child; +for (child = wt->children; child != NULL; child = child->next) + wordTreeSort(child); +} + +struct alphaStore *alphaStoreNew(int maxChainSize) +/* Allocate and initialize a new word store. */ +{ +struct alphaStore *store; +AllocVar(store); +store->maxChainSize = maxChainSize; +store->monomerHash = hashNew(0); +store->typeHash = hashNew(0); +store->readHash = hashNew(0); +return store; +} + +struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word) +/* Add word to store, incrementing it's useCount if it's already there, otherwise + * making up a new record for it. */ +{ +struct monomer *monomer = hashFindVal(store->monomerHash, word); +if (monomer == NULL) + { + AllocVar(monomer); + hashAddSaveName(store->monomerHash, word, monomer, &monomer->word); + slAddHead(&store->monomerList, monomer); + } +monomer->useCount += 1; +return monomer; +} + +void alphaReadListFromFile(char *fileName, struct alphaStore *store) +/* Read in file and turn it into a list of alphaReads. */ +{ +/* Stuff for processing file a line at a time. */ +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *line, *word; + +/* Loop through each line of file making up a read for it. */ +struct alphaRead *readList = NULL; +while (lineFileNextReal(lf, &line)) + { + /* Parse out first word of line into name, and rest into list. */ + char *name = nextWord(&line); + struct monomerRef *list = NULL; + while ((word = nextWord(&line)) != NULL) + { + struct monomer *monomer = alphaStoreAddMonomer(store, word); + struct monomerRef *ref; + AllocVar(ref); + ref->val = monomer; + slAddHead(&list, ref); + } + slReverse(&list); + if (list == NULL) + errAbort("Line %d of %s has no alpha monomers.", lf->lineIx, lf->fileName); + + /* Create data structure and add read to list and hash */ + if (hashLookup(store->readHash, name)) + errAbort("Read ID %s is repeated line %d of %s", name, lf->lineIx, lf->fileName); + struct alphaRead *read; + AllocVar(read); + read->list = list; + hashAddSaveName(store->readHash, name, read, &read->name); + slAddHead(&readList, read); + } +slReverse(&readList); +store->readList = readList; +lineFileClose(&lf); +} + +void makeMarkovChains(struct alphaStore *store) +/* Return a alphaStore containing all words, and also all chains-of-words of length + * chainSize seen in file. */ +{ +/* We'll build up the tree starting with an empty root node. */ +struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, "")); +int chainSize = store->maxChainSize; + +/* Loop through each read. There's special cases at the beginning and end of read, and for + * short reads. In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize + * words, * popping off one word from the start, and adding one word to the end for each + * new word we encounter. This list is added to the tree each iteration. */ +struct alphaRead *read; +for (read = store->readList; read != NULL; read = read->next) + { + /* We'll keep a chain of three or so words in a doubly linked list. */ + struct dlNode *node; + struct dlList *chain = dlListNew(); + int curSize = 0; + int wordCount = 0; + + struct monomerRef *ref; + for (ref = read->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + /* For the first few words in the file after ID, we'll just build up the chain, + * only adding it to the tree when we finally do get to the desired + * chain size. Once past the initial section of the file we'll be + * getting rid of the first link in the chain as well as adding a new + * last link in the chain with each new word we see. */ + if (curSize < chainSize) + { + dlAddValTail(chain, monomer); + ++curSize; + if (curSize == chainSize) + addChainToTree(wt, chain); + } + else + { + /* Reuse doubly-linked-list node, but give it a new value, as we move + * it from head to tail of list. */ + node = dlPopHead(chain); + node->val = monomer; + dlAddTail(chain, node); + addChainToTree(wt, chain); + } + ++wordCount; + } + /* Handle last few words in line, where can't make a chain of full size. Also handles + * lines that have fewer than chain size words. */ + if (curSize < chainSize) + addChainToTree(wt, chain); + while ((node = dlPopHead(chain)) != NULL) + { + if (!dlEmpty(chain)) + addChainToTree(wt, chain); + freeMem(node); + } + dlListFree(&chain); + } +wordTreeSort(wt); // Make output of chain file prettier +} + +void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName) +/* Write out tree to file */ +{ +FILE *f = mustOpen(fileName, "w"); +fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n"); +wordTreeDump(0, wt, maxChainSize, f); +carefulClose(&f); +} + +void alphaStoreLoadMonomerOrder(struct alphaStore *store, char *readsFile, char *fileName) +/* Read in a file with one line for each monomer type, containing a word for each + * monomer variant. Requires all variants already be in store. The readsFile is passed + * just for nicer error reporting. */ +{ +/* Stuff for processing file a line at a time. */ +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *line, *word; + +/* Set up variables we'll put results in in store. */ +store->typeList = NULL; + +while (lineFileNextReal(lf, &line)) + { + struct monomerType *type; + AllocVar(type); + slAddHead(&store->typeList, type); + while ((word = nextWord(&line)) != NULL) + { + struct monomer *monomer = hashFindVal(store->monomerHash, word); + if (monomer == NULL) + errAbort("%s is in %s but not %s", word, lf->fileName, readsFile); + struct monomerRef *ref; + AllocVar(ref); + ref->val = monomer; + slAddHead(&type->list, ref); + hashAddUnique(store->typeHash, word, type); + } + } +slReverse(&store->typeList); +lineFileClose(&lf); +verbose(2, "Added %d types containing %d words from %s\n", + slCount(store->typeList), store->typeHash->elCount, fileName); +} + +void monomerListNormalise(struct monomer *list, int totalCount, int outputSize) +/* Set outTarget field in all of list to be normalized to outputSize */ +{ +struct monomer *monomer; +double scale = outputSize/totalCount; +for (monomer = list; monomer != NULL; monomer = monomer->next) + monomer->outTarget = round(scale * monomer->useCount); +} + +void alphaStoreNormalize(struct alphaStore *store, int outputSize) +/* Set up output counts on each word to make sure it gets it's share of output size */ +{ +struct wordTree *wt = store->markovChains; +wordTreeNormalize(wt, outputSize, 1.0); +monomerListNormalise(store->monomerList, wt->useCount, outputSize); +} + +void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) +/* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */ +{ +struct alphaStore *store = alphaStoreNew(maxChainSize); +alphaReadListFromFile(readsFile, store); +makeMarkovChains(store); +// struct alphaStore *store = alphaStoreForChainsInFile(readsFile, maxChainSize); +struct wordTree *wt = store->markovChains; +alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); +alphaStoreNormalize(store, outSize); + +if (optionExists("chain")) + { + char *fileName = optionVal("chain", NULL); + wordTreeWrite(wt, store->maxChainSize, fileName); + } + +wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile); + +if (optionExists("afterChain")) + { + char *fileName = optionVal("afterChain", NULL); + wordTreeWrite(wt, store->maxChainSize, fileName); + } +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +maxChainSize = optionInt("size", maxChainSize); +outSize = optionInt("outSize", outSize); +fullOnly = optionExists("fullOnly"); +int seed = optionInt("seed", (int)time(0)); +srand(seed); +alphaAsm(argv[1], argv[2], argv[3]); +return 0; +}