b174d4ad085553d5a905b7147868a79070f5ea9d kent Wed Apr 25 18:19:27 2012 -0700 Fixed a bug in the useCounts from double counting once per line of input. Starting to implement the output, but it's still rough and buggy. diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index d59c43b..9ef18f9 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -22,30 +22,31 @@ "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n" "of HuRef satellite graphs\n" "usage:\n" " alphaChain alphaMonFile.fa significant_output.txt\n" "options:\n" " -size=N - Set max chain size, default %d\n" " -chain=fileName - Write out word chain to file\n" " -maxNonsenseSize=N - Keep nonsense output to this many words.\n" " -fullOnly - Only output chains of size\n" " -minUse=N - Set minimum use in output chain, default %d\n" , maxChainSize, minUse ); } char *noData = "n/a"; // Used to indicate a dummy node representing missing data + /* Command line validation table. */ static struct optionSpec options[] = { {"size", OPTION_INT}, {"minUse", OPTION_INT}, {"chain", OPTION_STRING}, {"fullOnly", OPTION_BOOLEAN}, {"maxNonsenseSize", OPTION_INT}, {NULL, 0}, }; /* The wordTree structure below is the central data structure for this program. It is * used to build up a tree that contains all observed N-word-long sequences observed in * the text, where N corresponds to the "size" command line option which defaults to 3, * an option that in turn is stored in the maxChainSize variable. At this chain size the * text @@ -292,140 +293,192 @@ fprintf(f, "\n"); } } if (wt->following != NULL) { list = rbTreeItems(wt->following); for (ref = list; ref != NULL; ref = ref->next) wordTreeDump(level+1, ref->val, f); slFreeList(&list); } } int totalUses = 0; int curUses = 0; int useThreshold = 0; -char *pickedWord; +struct wordTree *picked; void addUse(void *v) /* Add up to total uses. */ { struct wordTree *wt = v; -totalUses += wt->useCount; +totalUses += wt->outputCount; } void pickIfInThreshold(void *v) -/* See if inside threshold, and if so store it in pickedWord. */ +/* See if inside threshold, and if so store it in picked. */ { struct wordTree *wt = v; -int top = curUses + wt->useCount; +int top = curUses + wt->outputCount; if (curUses <= useThreshold && useThreshold < top) - pickedWord = wt->word; + picked = wt; curUses = top; } -char *pickRandomWord(struct rbTree *rbTree) +struct wordTree *pickRandom(struct rbTree *rbTree) /* Pick word from list randomly, but so that words more * commonly seen are picked more often. */ { -pickedWord = NULL; +picked = NULL; curUses = 0; totalUses = 0; rbTreeTraverse(rbTree, addUse); useThreshold = rand() % totalUses; rbTreeTraverse(rbTree, pickIfInThreshold); -assert(pickedWord != NULL); -return pickedWord; +assert(picked != NULL); +return picked; +} + +void dumpWordList(struct dlNode *list) +{ +struct dlNode *node; +for (node = list; !dlEnd(node); node = node->next) + { + char *word = node->val; + uglyf("%s ", word); + } } -char *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) +struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) /* Predict next word given tree and recently used word list. If tree doesn't * have statistics for what comes next given the words in list, then it returns * NULL. */ { struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { char *word = node->val; struct wordTree key; key.word = word; wt = rbTreeFind(wt->following, &key); if (wt == NULL || wt->following == NULL) break; } -char *result = NULL; +struct wordTree *result = NULL; if (wt != NULL && wt->following != NULL) - result = pickRandomWord(wt->following); + result = pickRandom(wt->following); return result; } -char *predictNext(struct wordTree *wt, struct dlList *recent) +struct wordTree *predictNext(struct wordTree *wt, struct dlList *recent) /* Predict next word given tree and recently used word list. Will use all words in * recent list if can, but if there is not data in tree, will back off, and use * progressively less previous words until ultimately it just picks a random * word. */ { struct dlNode *node; for (node = recent->head; !dlEnd(node); node = node->next) { - char *result = predictNextFromAllPredecessors(wt, node); + struct wordTree *result = predictNextFromAllPredecessors(wt, node); if (result != NULL) return result; } -return pickRandomWord(wt->following); +return pickRandom(wt->following); +} + +void decrementOutputCounts(struct wordTree *wt) +/* Decrement output count of self and parents. */ +{ +while (wt != NULL) + { + wt->outputCount -= 1; + wt = wt->parent; + } } -static void wordTreeMakeNonsense(struct wordTree *wt, int maxSize, char *firstWord, +int anyForceCount = 0; +int fullForceCount = 0; + +struct wordTree *forceRealWord(struct wordTree *wt, struct dlNode *list) +/* Get a choice that is not one of the fake no-date ones by backing up to progressively + * higher levels of markov chain. */ +{ +++anyForceCount; +// uglyf("forceRealWord("); dumpWordList(list); uglyf(")\n"); +struct dlNode *sublist; +for (sublist = list->next; !dlEnd(sublist); sublist = sublist->next) /* Skip over first one, it failed already. */ + { + // uglyf(" "); dumpWordList(sublist); uglyf("\n"); + struct wordTree *picked = predictNextFromAllPredecessors(wt, sublist); + if (picked != NULL && !sameString(picked->word, noData)) + return picked; + } +++fullForceCount; +return pickRandom(wt->following); +} + +static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, int maxOutputWords, FILE *f) /* Go spew out a bunch of words according to probabilities in tree. */ { struct dlList *ll = dlListNew(); int listSize = 0; int outputWords = 0; for (;;) { if (++outputWords > maxOutputWords) break; struct dlNode *node; - char *word; + struct wordTree *maybeWord; // This might be what we want, or it might be a dummy node /* Get next predicted word. */ if (listSize == 0) { AllocVar(node); ++listSize; - word = firstWord; + maybeWord = firstWord; } else if (listSize >= maxSize) { node = dlPopHead(ll); - word = predictNext(wt, ll); + maybeWord = predictNext(wt, ll); } else { - word = predictNext(wt, ll); + maybeWord = predictNext(wt, ll); AllocVar(node); ++listSize; } - node->val = word; - dlAddTail(ll, node); - if (word == NULL) + if (maybeWord == NULL) break; - fprintf(f, "%s\n", word); + /* Here we deal with possibly having fetched a dummy node. */ + struct wordTree *realWord = maybeWord; + if (sameString(maybeWord->word, noData)) + { + realWord = forceRealWord(wt, ll->head); + } + + /* Add word from whatever level we fetched back to our chain of up to maxChainSize. */ + node->val = realWord->word; + dlAddTail(ll, node); + + fprintf(f, "%s\n", maybeWord->word); + + decrementOutputCounts(maybeWord); } dlListFree(&ll); } struct wordTree *wordTreeForChainsInFile(char *fileName, int chainSize, struct lm *lm) /* Return a wordTree of all chains-of-words of length chainSize seen in file. * Allocate the structure in local memory pool lm. */ { /* Stuff for processing file a line at a time. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *word; /* We'll build up the tree starting with an empty root node. */ struct wordTree *wt = wordTreeNew(""); @@ -510,36 +563,39 @@ if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); FILE *f = mustOpen(fileName, "w"); fprintf(f, "#level\tuseCount\toutputCount\tnormVal\tmonomers\n"); wordTreeDump(0, wt, f); carefulClose(&f); } FILE *f = mustOpen(outFile, "w"); int maxSize = min(wt->useCount, maxNonsenseSize); /* KEH NOTES: controls how many words we emit */ - wordTreeMakeNonsense(wt, maxChainSize, pickRandomWord(wt->following), maxSize, f); +wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), maxSize, f); carefulClose(&f); +uglyf("anyForce %d, fullForce %d (%4.2f%%)\n", anyForceCount, fullForceCount, 100.0*fullForceCount/anyForceCount); lmCleanup(&lm); // Not really needed since we're just going to exit. } int main(int argc, char *argv[]) /* Process command line. */ { +#ifdef SOON srand( (unsigned)time(0) ); +#endif /* SOON */ optionInit(&argc, argv, options); if (argc != 3) usage(); maxChainSize = optionInt("size", maxChainSize); minUse = optionInt("minUse", minUse); maxNonsenseSize = optionInt("maxNonsenseSize", maxNonsenseSize); fullOnly = optionExists("fullOnly"); alphaChain(argv[1], argv[2]); return 0; }