f4ab7c72c85bd8f5f90dc8586aa4ab4ef3cf4703 kehayden Wed Apr 25 15:37:53 2012 -0700 consolidated alphaChain between kent libraries, with parent diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index 1dec49e..3c39547 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -80,77 +80,116 @@ * Note how the tree is able to compress the two chains "the black dog" and "the black cat." * * A node in the tree can have as many children as it needs to at each node. The depth of * the tree is the same as the chain size, by default 3. At each node in the tree you get * a word, and a list of all words that are observed in the text to follow that word. * * There are special cases in the code so that the first and last words in the text get included * as much as possible in the tree. * * Once the program has build up the wordTree, it can output it in a couple of fashions. */ struct wordTree /* A node in a tree of words. The head of the tree is a node with word value the empty string. */ { struct rbTree *following; /* Contains words (as struct wordTree) that follow us. */ + struct wordTree *parent; /* Parent of this node or NULL for root. */ char *word; /* The word itself including comma, period etc. */ int useCount; /* Number of times word used. */ int outputCount; /* each level of tree and initialize that to a normalized version of it. */ double normVal; /* value to place the normalization value */ - double deadEnd; /* value to keep track of missing branches */ + int missingFromChildren; /* Uses not in children. */ }; struct wordTree *wordTreeNew(char *word) /* Create and return new wordTree element. */ { struct wordTree *wt; AllocVar(wt); wt->word = cloneString(word); return wt; } int wordTreeCmpWord(void *va, void *vb) /* Compare two wordTree. */ { struct wordTree *a = va, *b = vb; return strcmp(a->word, b->word); } - - +int wordTreeChildrenUseCount(struct wordTree *wt) +/* Return sum of useCounts of all children */ +{ +struct rbTree *following = wt->following; +if (following == NULL) + return 0; +struct slRef *childList = rbTreeItems(following); +struct slRef *childRef; +int total = 0; +for (childRef = childList; childRef != NULL; childRef = childRef->next) + { + struct wordTree *child = childRef->val; + total += child->useCount; + } +slFreeList(&childList); +return total; +} +int wordTreeCountNotInChildren(struct wordTree *wt) +/* Count up useCounts of all children and return difference between this and our own useCount. */ +{ +return wt->useCount - wordTreeChildrenUseCount(wt); +} +void wordTreeSetMissing(struct wordTree *wt) +/* Set missingFromChildren in self and all children. */ +{ +wt->missingFromChildren = wordTreeCountNotInChildren(wt); +struct rbTree *following = wt->following; +if (following != NULL) + { + struct slRef *childList = rbTreeItems(following); + struct slRef *childRef; + for (childRef = childList; childRef != NULL; childRef = childRef->next) + { + struct wordTree *child = childRef->val; + wordTreeSetMissing(child); + } + slFreeList(&childList); + } +} struct wordTree *wordTreeAddFollowing(struct wordTree *wt, char *word, struct lm *lm, struct rbTreeNode **stack) /* Make word follow wt in tree. If word already exists among followers * return it and bump use count. Otherwise create new one. */ { struct wordTree *w; /* Points to following element if any */ if (wt->following == NULL) { /* Allocate new if you've never seen it before. */ wt->following = rbTreeNewDetailed(wordTreeCmpWord, lm, stack); w = NULL; } else { /* Find word in existing tree */ struct wordTree key; key.word = word; w = rbTreeFind(wt->following, &key); } if (w == NULL) { w = wordTreeNew(word); + w->parent = wt; rbTreeAdd(wt->following, w); } w->useCount += 1; return w; } void addChainToTree(struct wordTree *wt, struct dlList *chain, struct lm *lm, struct rbTreeNode **stack) /* Add chain of words to tree. */ { struct dlNode *node; wt->useCount += 1; for (node = chain->head; !dlEnd(node); node = node->next) { char *word = node->val; @@ -221,31 +260,31 @@ */ } void wordTreeDump(int level, struct wordTree *wt, FILE *f) /* Write out wordTree to file. */ { static char *words[64]; struct slRef *list, *ref; int i; assert(level < ArraySize(words)); words[level] = wt->word; if (wt->useCount >= minUse) { if (!fullOnly || level == maxChainSize) { - fprintf(f, "%d\t%d\t%d\t%f\t%f\t", level, wt->useCount, wt->outputCount, wt->normVal, wt->deadEnd); + fprintf(f, "%d\t%d\t%d\t%f\t%d\t", level, wt->useCount, wt->outputCount, wt->normVal, wt->missingFromChildren); for (i=1; i<=level; ++i) { spaceOut(f, level*2); fprintf(f, "%s ", words[i]); } fprintf(f, "\n"); } } if (wt->following != NULL) { list = rbTreeItems(wt->following); for (ref = list; ref != NULL; ref = ref->next) wordTreeDump(level+1, ref->val, f); slFreeList(&list); @@ -442,31 +481,32 @@ freeMem(node); } dlListFree(&chain); } lineFileClose(&lf); return wt; } void alphaChain(char *inFile, char *outFile) /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */ { struct lm *lm = lmInit(0); struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize, lm); wordTreeNormalize(wt, 1.0); -wordTreeDeadEnd(wt); +wordTreeSetMissing(wt); + if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); FILE *f = mustOpen(fileName, "w"); wordTreeDump(0, wt, f); carefulClose(&f); } FILE *f = mustOpen(outFile, "w"); int maxSize = min(wt->useCount, maxNonsenseSize); /* KEH NOTES: controls how many words we emit */ wordTreeMakeNonsense(wt, maxChainSize, pickRandomWord(wt->following), maxSize, f);