7982829ea415cb7bfb760f0ec79793c6230d8d86 kent Thu May 3 09:25:08 2012 -0700 Further work on faux-generation side. Getting close but not quite perfect. diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index 071bccf..6d9b8b3 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -71,43 +71,41 @@ * black * dog * and * dog * and * the * and * the * black * Note how the tree is able to compress the two chains "the black dog" and "the black cat." * * A node in the tree can have as many children as it needs to at each node. The depth of * the tree is the same as the chain size, by default 3. At each node in the tree you get * a word, and a list of all words that are observed in the text to follow that word. * - * There are special cases in the code so that the first and last words in the text get included - * as much as possible in the tree. - * * Once the program has build up the wordTree, it can output it in a couple of fashions. */ struct wordTree /* A node in a tree of words. The head of the tree is a node with word value the empty string. */ { struct rbTree *following; /* Contains words (as struct wordTree) that follow us. */ struct wordTree *parent; /* Parent of this node or NULL for root. */ char *word; /* The word itself including comma, period etc. */ - int useCount; /* Number of times word used. */ - int outputCount; /* each level of tree and initialize that to a normalized version of it. */ + int useCount; /* Number of times word used in input. */ + int outTarget; /* Number of times want to output word. */ + int outCount; /* Number of times output. */ double normVal; /* value to place the normalization value */ }; struct wordTree *wordTreeNew(char *word) /* Create and return new wordTree element. */ { struct wordTree *wt; AllocVar(wt); wt->word = cloneString(word); return wt; } int wordTreeCmpWord(void *va, void *vb) /* Compare two wordTree. */ { @@ -164,177 +162,136 @@ void addChainToTree(struct wordTree *wt, struct dlList *chain, struct lm *lm, struct rbTreeNode **stack) /* Add chain of words to tree. */ { struct dlNode *node; wt->useCount += 1; for (node = chain->head; !dlEnd(node); node = node->next) { char *word = node->val; verbose(2, " %s\n", word); wt = wordTreeAddFollowing(wt, word, lm, stack); } } -void wordTreeNormalize(struct wordTree *wt, int outputCount, double normVal) -/* Recursively set wt->normVal and wt->outputCount */ +void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal) +/* Recursively set wt->normVal and wt->outTarget so each branch gets its share */ { wt->normVal = normVal; -wt->outputCount = outputCount; +wt->outTarget = outTarget; if (wt->following != NULL) { int totalChildUses = wordTreeChildrenUseCount(wt); struct slRef *list = rbTreeItems(wt->following); struct slRef *ref; for (ref = list; ref !=NULL; ref = ref->next) { struct wordTree *child = ref->val; double childRatio = (double)child->useCount / totalChildUses; - wordTreeNormalize(child, round(childRatio*outputCount), childRatio*normVal); + wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal); } slFreeList(&list); } } -void wordTreeDeadEnd(struct wordTree *wt) -/* tally and include incomplete branches */ -{ -/* int levelNormVal = 0; - * int levelCount = 0; - * int sumNormVal = 0; - * int sumCount = 0; - * int diffNormVal = 0; - * int diffCount=0; - * Loop pseudocode - * work recursively through level 1-> 3, start at root of tree - * foreach word at level 1 - * { - * sumCount = 0 - * sumNormVal = 0 - * levelCount = wt -> outputCount - * levelNormVal = wt-> normVal - * if(wt->following == NULL) - * { - * create new child recursively (level 2 and level 3/default) - * wt->normVal = levelNormVal - * wt->word = 'NaN' - * wt->outputCount = levelCount - * } - * else - * { - * foreach wt->following at level + 1 - * { - * sumCount += wt->outputCount - * sumNormVal += wt->normVal - * ** RECURSIVE level 2 + 1 here ** - * } - * diffCount = levelCount - sumCount - * diffNormVal = levelNormVal - sumNormVal - * if(diffCount > 0) - * { - * create level 2: - * wt->normVal = diffNormVal - * wt->word = 'NaN' - * wt->outputVal = diffCount - * } - */ -} - void wordTreeDump(int level, struct wordTree *wt, FILE *f) /* Write out wordTree to file. */ { static char *words[64]; struct slRef *list, *ref; int i; assert(level < ArraySize(words)); words[level] = wt->word; if (wt->useCount >= minUse) { if (!fullOnly || level == maxChainSize) { - fprintf(f, "%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outputCount, wt->normVal); + fprintf(f, "%d\t%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal); for (i=1; i<=level; ++i) { spaceOut(f, level*2); fprintf(f, "%s ", words[i]); } fprintf(f, "\n"); } } if (wt->following != NULL) { list = rbTreeItems(wt->following); for (ref = list; ref != NULL; ref = ref->next) wordTreeDump(level+1, ref->val, f); slFreeList(&list); } } int totalUses = 0; int curUses = 0; int useThreshold = 0; -struct wordTree *picked; +struct wordTree *pickedNode; int totUseZeroCount = 0; void addUse(void *v) /* Add up to total uses. */ { struct wordTree *wt = v; -totalUses += wt->outputCount; +totalUses += wt->outTarget; } void pickIfInThreshold(void *v) -/* See if inside threshold, and if so store it in picked. */ +/* See if inside threshold, and if so store it in pickedNode. */ +{ +if (pickedNode == NULL) { struct wordTree *wt = v; -int top = curUses + wt->outputCount; -if (curUses <= useThreshold && useThreshold < top) - picked = wt; + int top = curUses + wt->outTarget; + if (useThreshold < top) + pickedNode = wt; curUses = top; } +} void pickAny(void *v) -/* See if inside threshold, and if so store it in picked. */ +/* Force it to pick something - first thing as it turns out. */ { -struct wordTree *wt = v; -picked = wt; +if (pickedNode == NULL) + pickedNode = v; } struct wordTree *pickRandom(struct rbTree *rbTree) /* Pick word from list randomly, but so that words more * commonly seen are picked more often. */ { -picked = NULL; +pickedNode = NULL; curUses = 0; totalUses = 0; rbTreeTraverse(rbTree, addUse); -if (totalUses != 0) +if (totalUses > 0) { useThreshold = rand() % totalUses; rbTreeTraverse(rbTree, pickIfInThreshold); } -if (picked == NULL) +if (pickedNode == NULL) { ++totUseZeroCount; rbTreeTraverse(rbTree, pickAny); } -assert(picked != NULL); -return picked; +assert(pickedNode != NULL); +return pickedNode; } struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) /* Predict next word given tree and recently used word list. If tree doesn't * have statistics for what comes next given the words in list, then it returns * NULL. */ { struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { char *word = node->val; struct wordTree key; key.word = word; wt = rbTreeFind(wt->following, &key); if (wt == NULL || wt->following == NULL) @@ -355,82 +312,86 @@ struct dlNode *node; for (node = recent->head; !dlEnd(node); node = node->next) { struct wordTree *result = predictNextFromAllPredecessors(wt, node); if (result != NULL) return result; } return pickRandom(wt->following); } void decrementOutputCounts(struct wordTree *wt) /* Decrement output count of self and parents. */ { while (wt != NULL) { - wt->outputCount -= 1; + wt->outTarget -= 1; + wt->outCount += 1; wt = wt->parent; } } static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, - int maxOutputWords, FILE *f) + int maxOutputWords, char *fileName) /* Go spew out a bunch of words according to probabilities in tree. */ { +FILE *f = mustOpen(fileName, "w"); struct dlList *ll = dlListNew(); int listSize = 0; int outputWords = 0; for (;;) { if (++outputWords > maxOutputWords) break; struct dlNode *node; struct wordTree *picked; /* Get next predicted word. */ if (listSize == 0) { AllocVar(node); ++listSize; picked = firstWord; } else if (listSize >= maxSize) { node = dlPopHead(ll); picked = predictNext(wt, ll); +// decrementOutputCounts(picked); // ugly placement? } else { picked = predictNext(wt, ll); AllocVar(node); ++listSize; } if (picked == NULL) break; /* Add word from whatever level we fetched back to our chain of up to maxChainSize. */ node->val = picked->word; dlAddTail(ll, node); fprintf(f, "%s\n", picked->word); decrementOutputCounts(picked); } dlListFree(&ll); +carefulClose(&f); } struct wordTree *wordTreeForChainsInFile(char *fileName, int chainSize, struct lm *lm) /* Return a wordTree of all chains-of-words of length chainSize seen in file. * Allocate the structure in local memory pool lm. */ { /* Stuff for processing file a line at a time. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *word; /* We'll build up the tree starting with an empty root node. */ struct wordTree *wt = wordTreeNew(""); /* Save time/space by sharing stack between all "following" rbTrees. */ struct rbTreeNode **stack; @@ -449,33 +410,30 @@ int wordCount = 0; /* skipping the first word which is the read id */ word = nextWord(&line); while ((word = nextWord(&line)) != NULL) { /* We come to this point in the code for each word in the file. * Here we want to maintain a chain of sequential words up to * chainSize long. We do this with a doubly-linked list structure. * For the first few words in the file we'll just build up the list, * only adding it to the tree when we finally do get to the desired * chain size. Once past the initial section of the file we'll be * getting rid of the first link in the chain as well as adding a new * last link in the chain with each new word we see. */ - - - if (curSize < chainSize) { dlAddValTail(chain, cloneString(word)); ++curSize; if (curSize == chainSize) addChainToTree(wt, chain, lm, stack); } else { /* Reuse doubly-linked-list node, but give it a new value, as we move * it from head to tail of list. */ node = dlPopHead(chain); freeMem(node->val); node->val = cloneString(word); dlAddTail(chain, node); @@ -489,57 +447,57 @@ addChainToTree(wt, chain, lm, stack); while ((node = dlPopHead(chain)) != NULL) { if (!dlEmpty(chain)) addChainToTree(wt, chain, lm, stack); freeMem(node->val); freeMem(node); } dlListFree(&chain); } lineFileClose(&lf); return wt; } +void wordTreeWrite(struct wordTree *wt, char *fileName) +/* Write out tree to file */ +{ +FILE *f = mustOpen(fileName, "w"); +fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n"); +wordTreeDump(0, wt, f); +carefulClose(&f); +} + void alphaChain(char *inFile, char *outFile) /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */ { struct lm *lm = lmInit(0); struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize, lm); wordTreeNormalize(wt, outSize, 1.0); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); - FILE *f = mustOpen(fileName, "w"); - fprintf(f, "#level\tuseCount\toutputCount\tnormVal\tmonomers\n"); - wordTreeDump(0, wt, f); - carefulClose(&f); + wordTreeWrite(wt, fileName); } -FILE *f = mustOpen(outFile, "w"); -wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), outSize, f); -carefulClose(&f); -uglyf("totUseZeroCount = %d\n", totUseZeroCount); +wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), outSize, outFile); - { - FILE *f = mustOpen("foo.chain", "w"); - wordTreeDump(0, wt, f); - carefulClose(&f); - } +uglyf("totUseZeroCount = %d\n", totUseZeroCount); +wordTreeWrite(wt, "ugly.chain"); lmCleanup(&lm); // Not really needed since we're just going to exit. } int main(int argc, char *argv[]) /* Process command line. */ { #ifdef SOON srand( (unsigned)time(0) ); #endif /* SOON */ optionInit(&argc, argv, options); if (argc != 3) usage(); maxChainSize = optionInt("size", maxChainSize); minUse = optionInt("minUse", minUse);