1568a9ea025f3b3def190c2b3b93b6bbcee6e0ab kent Wed Sep 5 19:13:39 2012 -0700 Refactoring - working towards making predictNext more general. diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index f76263d..92b9706 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -65,30 +65,31 @@ }; struct wordType /* A collection of words of the same type - or monomers of same type. */ { struct wordType *next; /* Next wordType */ struct wordInfoRef *list; /* List of all words of that type */ }; struct wordStore /* Stores info on all words */ { struct wordInfo *infoList; /* List of words, fairly arbitrary order. */ struct hash *infoHash; /* Hash of wordInfo, keyed by word. */ struct wordTree *markovChains; /* Tree of words that follow other words. */ + int maxChainSize; /* Maximum depth of tree. */ struct wordType *typeList; /* List of all types. */ struct hash *typeHash; /* Hash with wordType values, keyed by all words. */ }; /* The wordTree structure below is the central data structure for this program. It is * used to build up a tree that contains all observed N-word-long sequences observed in * the text, where N corresponds to the "size" command line option which defaults to 3, * an option that in turn is stored in the maxChainSize variable. At this chain size the * text * this is the black dog and the black cat * would have the chains * this is the * is the black * the black dog * black dog and @@ -255,53 +256,53 @@ { struct dyString *dy = dyStringNew(0); dyStringAppendC(dy, '{'); struct dlNode *node; for (node = head; !dlEnd(node); node = node->next) { if (node != head) dyStringAppendC(dy, ' '); struct wordInfo *info = node->val; dyStringAppend(dy, info->word); } dyStringAppendC(dy, '}'); return dyStringCannibalize(&dy); } -void wordTreeDump(int level, struct wordTree *wt, FILE *f) +void wordTreeDump(int level, struct wordTree *wt, int maxChainSize, FILE *f) /* Write out wordTree to file. */ { static char *words[64]; int i; assert(level < ArraySize(words)); words[level] = wt->info->word; if (!fullOnly || level == maxChainSize) { fprintf(f, "%d\t%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal); for (i=1; i<=level; ++i) { spaceOut(f, level*2); fprintf(f, "%s ", words[i]); } fprintf(f, "\n"); } struct wordTree *child; for (child = wt->children; child != NULL; child = child->next) - wordTreeDump(level+1, child, f); + wordTreeDump(level+1, child, maxChainSize, f); } struct wordTree *pickRandomOnOutTarget(struct wordTree *list) /* Pick word from list randomly, but so that words with higher outTargets * are picked more often. */ { struct wordTree *picked = NULL; /* Debug output. */ { verbose(2, " pickRandomOnOutTarget("); struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) { if (wt != list) @@ -396,103 +397,124 @@ struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { struct wordInfo *info = node->val; wt = wordTreeFindInList(wt->children, info); verbose(2, " wordTreeFindInList(%s) = %p %s\n", info->word, wt, wordTreeString(wt)); if (wt == NULL || wt->children == NULL) break; } struct wordTree *result = NULL; if (wt != NULL && wt->children != NULL) result = pickRandom(wt->children); return result; } -struct wordTree *predictNext(struct wordTree *wt, struct dlNode *recent) +struct wordTree *predictNextFromRecent(struct wordTree *wt, struct dlNode *recent) /* Predict next word given tree and recently used word list. Will use all words in * recent list if can, but if there is not data in tree, will back off, and use * progressively less previous words until ultimately it just picks a random * word. */ { struct dlNode *node; for (node = recent; !dlEnd(node); node = node->next) { struct wordTree *result = predictNextFromAllPredecessors(wt, node); if (result != NULL) return result; } struct wordTree *topLevel = pickRandom(wt->children); verbose(2, "in predictNext(%s, %s) ", wordTreeString(wt), dlListFragWords(recent)); verbose(2, "last resort pick of %s\n", topLevel->info->word); return topLevel; } +struct dlNode *nodesFromTail(struct dlList *list, int count) +/* Return count nodes from tail of list. If list is shorter than count, it returns the + * whole list. */ +{ +int i; +struct dlNode *node; +for (i=0, node = list->tail; i<count; ++i) + { + if (dlStart(node)) + return list->head; + node = node->prev; + } +return node; +} + +struct wordTree *predictNext(struct wordStore *store, struct dlList *past) +/* Given input data store and what is known from the past, predict the next word. */ +{ +struct dlNode *recent = nodesFromTail(past, store->maxChainSize); +return predictNextFromRecent(store->markovChains, recent); +} + void decrementOutputCounts(struct wordTree *wt) /* Decrement output count of self and parents. */ { while (wt != NULL) { /* Decrement target count, but don't let it fall below sum of counts of all children. * This can happen with incomplete data if we don't prevent it. This * same code also prevents us from having negative outTarget. */ int outTarget = wt->outTarget - 1; int kidSum = wordTreeSumOutTargets(wt->children); if (outTarget < kidSum) outTarget = kidSum; wt->outTarget = outTarget; /* Always bump outCount for debugging. */ wt->outCount += 1; wt = wt->parent; } } static struct dlList *wordTreeGenerateList(struct wordStore *store, int maxSize, struct wordTree *firstWord, int maxOutputWords) /* Make a list of words based on probabilities in tree. */ { -struct wordTree *wt = store->markovChains; struct dlList *ll = dlListNew(); int chainSize = 0; int outputWords = 0; struct dlNode *chainStartNode = NULL; for (;;) { if (++outputWords > maxOutputWords) break; struct dlNode *node; struct wordTree *picked; /* Get next predicted word. */ AllocVar(node); if (chainSize == 0) { chainStartNode = node; ++chainSize; picked = firstWord; } else if (chainSize >= maxSize) { chainStartNode = chainStartNode->next; - picked = predictNext(wt, chainStartNode); + picked = predictNext(store, ll); } else { - picked = predictNext(wt, chainStartNode); + picked = predictNext(store, ll); ++chainSize; } if (picked == NULL) break; /* Add word from whatever level we fetched back to our output chain. */ node->val = picked->info; dlAddTail(ll, node); decrementOutputCounts(picked); } verbose(2, "totUseZeroCount = %d\n", totUseZeroCount); return ll; @@ -513,64 +535,65 @@ } carefulClose(&f); dlListFree(&ll); } static void wordTreeSort(struct wordTree *wt) /* Sort all children lists in tree. */ { slSort(&wt->children, wordTreeCmpWord); struct wordTree *child; for (child = wt->children; child != NULL; child = child->next) wordTreeSort(child); } -struct wordStore *wordStoreNew() +struct wordStore *wordStoreNew(int maxChainSize) /* Allocate and initialize a new word store. */ { struct wordStore *store; AllocVar(store); store->infoHash = hashNew(0); +store->maxChainSize = maxChainSize; return store; } struct wordInfo *wordStoreAdd(struct wordStore *store, char *word) /* Add word to store, incrementing it's useCount if it's already there, otherwise * making up a new record for it. */ { struct wordInfo *info = hashFindVal(store->infoHash, word); if (info == NULL) { AllocVar(info); hashAddSaveName(store->infoHash, word, info, &info->word); slAddHead(&store->infoList, info); } info->useCount += 1; return info; } struct wordStore *wordStoreForChainsInFile(char *fileName, int chainSize) /* Return a wordStore containing all words, and also all chains-of-words of length * chainSize seen in file. */ { /* Stuff for processing file a line at a time. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *word; /* We'll build up the tree starting with an empty root node. */ -struct wordStore *store = wordStoreNew(); +struct wordStore *store = wordStoreNew(chainSize); struct wordTree *wt = store->markovChains = wordTreeNew(wordStoreAdd(store, "")); /* Loop through each line of file, treating it as a separate read. There's * special cases at the beginning and end of line, and for short lines. In the * main case we'll be maintaining a chain (doubly linked list) of maxChainSize words, * popping off one word from the start, and adding one word to the end for each * new word we encounter. This list is added to the tree each iteration. */ while (lineFileNext(lf, &line, NULL)) { /* We'll keep a chain of three or so words in a doubly linked list. */ struct dlNode *node; struct dlList *chain = dlListNew(); int curSize = 0; int wordCount = 0; @@ -609,36 +632,36 @@ addChainToTree(wt, chain); while ((node = dlPopHead(chain)) != NULL) { if (!dlEmpty(chain)) addChainToTree(wt, chain); freeMem(node); } dlListFree(&chain); } lineFileClose(&lf); wordTreeSort(wt); // Make output of chain file prettier return store; } -void wordTreeWrite(struct wordTree *wt, char *fileName) +void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName) /* Write out tree to file */ { FILE *f = mustOpen(fileName, "w"); fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n"); -wordTreeDump(0, wt, f); +wordTreeDump(0, wt, maxChainSize, f); carefulClose(&f); } void wordStoreLoadMonomerOrder(struct wordStore *store, char *readsFile, char *fileName) /* Read in a file with one line for each monomer type, containing a word for each * monomer variant. Requires all variants already be in store. The readsFile is passed * just for nicer error reporting. */ { /* Stuff for processing file a line at a time. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *word; /* Set up variables we'll put results in in store. */ store->typeHash = hashNew(0); store->typeList = NULL; @@ -664,39 +687,39 @@ verbose(2, "Added %d types containing %d words from %s\n", slCount(store->typeList), store->typeHash->elCount, fileName); } void alphaChain(char *readsFile, char *monomerOrderFile, char *outFile) /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */ { struct wordStore *store = wordStoreForChainsInFile(readsFile, maxChainSize); struct wordTree *wt = store->markovChains; wordStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); wordTreeNormalize(wt, outSize, 1.0); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); - wordTreeWrite(wt, fileName); + wordTreeWrite(wt, store->maxChainSize, fileName); } -wordTreeGenerateFile(store, maxChainSize, pickRandom(wt->children), outSize, outFile); +wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile); if (optionExists("afterChain")) { char *fileName = optionVal("afterChain", NULL); - wordTreeWrite(wt, fileName); + wordTreeWrite(wt, store->maxChainSize, fileName); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); maxChainSize = optionInt("size", maxChainSize); outSize = optionInt("outSize", outSize); fullOnly = optionExists("fullOnly"); int seed = optionInt("seed", (int)time(0)); srand(seed); alphaChain(argv[1], argv[2], argv[3]);