70495d3b08b1316f2eb2f0ca7032730b1333e6bb kent Wed Sep 5 11:16:51 2012 -0700 Some refactoring. Making it so saves entire generated chain in memory so that can go back further than just last 3. diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index 45bc5c0..f76263d 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -396,122 +396,137 @@ struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { struct wordInfo *info = node->val; wt = wordTreeFindInList(wt->children, info); verbose(2, " wordTreeFindInList(%s) = %p %s\n", info->word, wt, wordTreeString(wt)); if (wt == NULL || wt->children == NULL) break; } struct wordTree *result = NULL; if (wt != NULL && wt->children != NULL) result = pickRandom(wt->children); return result; } -struct wordTree *predictNext(struct wordTree *wt, struct dlList *recent) +struct wordTree *predictNext(struct wordTree *wt, struct dlNode *recent) /* Predict next word given tree and recently used word list. Will use all words in * recent list if can, but if there is not data in tree, will back off, and use * progressively less previous words until ultimately it just picks a random * word. */ { struct dlNode *node; -for (node = recent->head; !dlEnd(node); node = node->next) +for (node = recent; !dlEnd(node); node = node->next) { struct wordTree *result = predictNextFromAllPredecessors(wt, node); if (result != NULL) return result; } struct wordTree *topLevel = pickRandom(wt->children); -verbose(2, "in predictNext(%s, %s) ", wordTreeString(wt), dlListFragWords(recent->head)); +verbose(2, "in predictNext(%s, %s) ", wordTreeString(wt), dlListFragWords(recent)); verbose(2, "last resort pick of %s\n", topLevel->info->word); return topLevel; } void decrementOutputCounts(struct wordTree *wt) /* Decrement output count of self and parents. */ { while (wt != NULL) { /* Decrement target count, but don't let it fall below sum of counts of all children. * This can happen with incomplete data if we don't prevent it. This * same code also prevents us from having negative outTarget. */ int outTarget = wt->outTarget - 1; int kidSum = wordTreeSumOutTargets(wt->children); if (outTarget < kidSum) outTarget = kidSum; wt->outTarget = outTarget; /* Always bump outCount for debugging. */ wt->outCount += 1; wt = wt->parent; } } -static void wordTreeGenerateFaux(struct wordStore *store, int maxSize, struct wordTree *firstWord, - int maxOutputWords, char *fileName) -/* Go spew out a bunch of words according to probabilities in tree. */ +static struct dlList *wordTreeGenerateList(struct wordStore *store, int maxSize, + struct wordTree *firstWord, int maxOutputWords) +/* Make a list of words based on probabilities in tree. */ { struct wordTree *wt = store->markovChains; -FILE *f = mustOpen(fileName, "w"); struct dlList *ll = dlListNew(); -int listSize = 0; +int chainSize = 0; int outputWords = 0; +struct dlNode *chainStartNode = NULL; for (;;) { if (++outputWords > maxOutputWords) break; struct dlNode *node; struct wordTree *picked; /* Get next predicted word. */ - if (listSize == 0) - { AllocVar(node); - ++listSize; + if (chainSize == 0) + { + chainStartNode = node; + ++chainSize; picked = firstWord; } - else if (listSize >= maxSize) + else if (chainSize >= maxSize) { - node = dlPopHead(ll); - picked = predictNext(wt, ll); + chainStartNode = chainStartNode->next; + picked = predictNext(wt, chainStartNode); } else { - picked = predictNext(wt, ll); - AllocVar(node); - ++listSize; + picked = predictNext(wt, chainStartNode); + ++chainSize; } if (picked == NULL) break; - /* Add word from whatever level we fetched back to our chain of up to maxChainSize. */ + /* Add word from whatever level we fetched back to our output chain. */ node->val = picked->info; dlAddTail(ll, node); - fprintf(f, "%s\n", picked->info->word); - decrementOutputCounts(picked); } -dlListFree(&ll); -carefulClose(&f); verbose(2, "totUseZeroCount = %d\n", totUseZeroCount); +return ll; +} + +static void wordTreeGenerateFile(struct wordStore *store, int maxSize, struct wordTree *firstWord, + int maxOutputWords, char *fileName) +/* Create file containing words base on tree probabilities. The wordTreeGenerateList does + * most of work. */ +{ +struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords); +FILE *f = mustOpen(fileName, "w"); +struct dlNode *node; +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct wordInfo *info = node->val; + fprintf(f, "%s\n", info->word); } +carefulClose(&f); +dlListFree(&ll); +} + static void wordTreeSort(struct wordTree *wt) /* Sort all children lists in tree. */ { slSort(&wt->children, wordTreeCmpWord); struct wordTree *child; for (child = wt->children; child != NULL; child = child->next) wordTreeSort(child); } struct wordStore *wordStoreNew() /* Allocate and initialize a new word store. */ { struct wordStore *store; AllocVar(store); @@ -652,31 +667,31 @@ void alphaChain(char *readsFile, char *monomerOrderFile, char *outFile) /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */ { struct wordStore *store = wordStoreForChainsInFile(readsFile, maxChainSize); struct wordTree *wt = store->markovChains; wordStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); wordTreeNormalize(wt, outSize, 1.0); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, fileName); } -wordTreeGenerateFaux(store, maxChainSize, pickRandom(wt->children), outSize, outFile); +wordTreeGenerateFile(store, maxChainSize, pickRandom(wt->children), outSize, outFile); if (optionExists("afterChain")) { char *fileName = optionVal("afterChain", NULL); wordTreeWrite(wt, fileName); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); maxChainSize = optionInt("size", maxChainSize);