55a0fb30ff422a2a0688685111fb68b89db1f266 kent Wed May 2 11:10:54 2012 -0700 Getting rid of dummy nodes. Output looking more reasonable, but still not 100%. diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index 9ef18f9..071bccf 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -1,59 +1,57 @@ /* alphaChain - Predicts faux centromere sequences using a probablistic model. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "localmem.h" #include "options.h" #include "dlist.h" #include "rbTree.h" /* Global vars - all of which can be set by command line options. */ int maxChainSize = 3; -int maxNonsenseSize = 10000; +int outSize = 10000; int minUse = 1; boolean lower = FALSE; boolean unpunc = FALSE; boolean fullOnly = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n" "of HuRef satellite graphs\n" "usage:\n" " alphaChain alphaMonFile.fa significant_output.txt\n" "options:\n" " -size=N - Set max chain size, default %d\n" " -chain=fileName - Write out word chain to file\n" - " -maxNonsenseSize=N - Keep nonsense output to this many words.\n" + " -outSize=N - Output this many words.\n" " -fullOnly - Only output chains of size\n" " -minUse=N - Set minimum use in output chain, default %d\n" , maxChainSize, minUse ); } -char *noData = "n/a"; // Used to indicate a dummy node representing missing data - /* Command line validation table. */ static struct optionSpec options[] = { {"size", OPTION_INT}, {"minUse", OPTION_INT}, {"chain", OPTION_STRING}, {"fullOnly", OPTION_BOOLEAN}, - {"maxNonsenseSize", OPTION_INT}, + {"outSize", OPTION_INT}, {NULL, 0}, }; /* The wordTree structure below is the central data structure for this program. It is * used to build up a tree that contains all observed N-word-long sequences observed in * the text, where N corresponds to the "size" command line option which defaults to 3, * an option that in turn is stored in the maxChainSize variable. At this chain size the * text * this is the black dog and the black cat * would have the chains * this is the * is the black * the black dog * black dog and * dog and the @@ -152,87 +150,59 @@ struct rbTree *following = wt->following; if (following == NULL) return 0; struct slRef *childList = rbTreeItems(following); struct slRef *childRef; int total = 0; for (childRef = childList; childRef != NULL; childRef = childRef->next) { struct wordTree *child = childRef->val; total += child->useCount; } slFreeList(&childList); return total; } -int wordTreeCountNotInChildren(struct wordTree *wt) -/* Count up useCounts of all children and return difference between this and our own useCount. */ -{ -return wt->useCount - wordTreeChildrenUseCount(wt); -} - -void wordTreeSetMissing(struct wordTree *wt, int level, struct lm *lm, struct rbTreeNode **stack) -/* Set missingFromChildren in self and all children. */ -{ -int missingFromChildren = wordTreeCountNotInChildren(wt); -if (missingFromChildren > 0) - { - struct wordTree *missing = wordTreeAddFollowing(wt, noData, lm, stack); - missing->useCount = missingFromChildren; - } -struct rbTree *following = wt->following; -if (following != NULL && level < maxChainSize) - { - struct slRef *childList = rbTreeItems(following); - struct slRef *childRef; - for (childRef = childList; childRef != NULL; childRef = childRef->next) - { - struct wordTree *child = childRef->val; - wordTreeSetMissing(child, level+1, lm, stack); - } - slFreeList(&childList); - } -} - void addChainToTree(struct wordTree *wt, struct dlList *chain, struct lm *lm, struct rbTreeNode **stack) /* Add chain of words to tree. */ { struct dlNode *node; wt->useCount += 1; for (node = chain->head; !dlEnd(node); node = node->next) { char *word = node->val; verbose(2, " %s\n", word); wt = wordTreeAddFollowing(wt, word, lm, stack); } } -void wordTreeNormalize(struct wordTree *wt, double normVal) -/* Recursively set wt->normVal */ +void wordTreeNormalize(struct wordTree *wt, int outputCount, double normVal) +/* Recursively set wt->normVal and wt->outputCount */ { wt->normVal = normVal; -wt->outputCount = normVal * maxNonsenseSize; +wt->outputCount = outputCount; if (wt->following != NULL) { + int totalChildUses = wordTreeChildrenUseCount(wt); struct slRef *list = rbTreeItems(wt->following); struct slRef *ref; for (ref = list; ref !=NULL; ref = ref->next) { struct wordTree *child = ref->val; - double childRatio = (double)child->useCount / wt->useCount; - wordTreeNormalize(child, childRatio*normVal); + double childRatio = (double)child->useCount / totalChildUses; + wordTreeNormalize(child, round(childRatio*outputCount), childRatio*normVal); } slFreeList(&list); } } void wordTreeDeadEnd(struct wordTree *wt) /* tally and include incomplete branches */ { /* int levelNormVal = 0; * int levelCount = 0; * int sumNormVal = 0; * int sumCount = 0; * int diffNormVal = 0; * int diffCount=0; * Loop pseudocode @@ -295,69 +265,76 @@ } if (wt->following != NULL) { list = rbTreeItems(wt->following); for (ref = list; ref != NULL; ref = ref->next) wordTreeDump(level+1, ref->val, f); slFreeList(&list); } } int totalUses = 0; int curUses = 0; int useThreshold = 0; struct wordTree *picked; +int totUseZeroCount = 0; + void addUse(void *v) /* Add up to total uses. */ { struct wordTree *wt = v; totalUses += wt->outputCount; } void pickIfInThreshold(void *v) /* See if inside threshold, and if so store it in picked. */ { struct wordTree *wt = v; int top = curUses + wt->outputCount; if (curUses <= useThreshold && useThreshold < top) picked = wt; curUses = top; } +void pickAny(void *v) +/* See if inside threshold, and if so store it in picked. */ +{ +struct wordTree *wt = v; +picked = wt; +} + struct wordTree *pickRandom(struct rbTree *rbTree) /* Pick word from list randomly, but so that words more * commonly seen are picked more often. */ { picked = NULL; curUses = 0; totalUses = 0; rbTreeTraverse(rbTree, addUse); +if (totalUses != 0) + { useThreshold = rand() % totalUses; rbTreeTraverse(rbTree, pickIfInThreshold); -assert(picked != NULL); -return picked; } - -void dumpWordList(struct dlNode *list) +if (picked == NULL) { -struct dlNode *node; -for (node = list; !dlEnd(node); node = node->next) - { - char *word = node->val; - uglyf("%s ", word); + ++totUseZeroCount; + rbTreeTraverse(rbTree, pickAny); } +assert(picked != NULL); +return picked; } struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) /* Predict next word given tree and recently used word list. If tree doesn't * have statistics for what comes next given the words in list, then it returns * NULL. */ { struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { char *word = node->val; struct wordTree key; key.word = word; wt = rbTreeFind(wt->following, &key); if (wt == NULL || wt->following == NULL) @@ -383,102 +360,75 @@ return result; } return pickRandom(wt->following); } void decrementOutputCounts(struct wordTree *wt) /* Decrement output count of self and parents. */ { while (wt != NULL) { wt->outputCount -= 1; wt = wt->parent; } } -int anyForceCount = 0; -int fullForceCount = 0; - -struct wordTree *forceRealWord(struct wordTree *wt, struct dlNode *list) -/* Get a choice that is not one of the fake no-date ones by backing up to progressively - * higher levels of markov chain. */ -{ -++anyForceCount; -// uglyf("forceRealWord("); dumpWordList(list); uglyf(")\n"); -struct dlNode *sublist; -for (sublist = list->next; !dlEnd(sublist); sublist = sublist->next) /* Skip over first one, it failed already. */ - { - // uglyf(" "); dumpWordList(sublist); uglyf("\n"); - struct wordTree *picked = predictNextFromAllPredecessors(wt, sublist); - if (picked != NULL && !sameString(picked->word, noData)) - return picked; - } -++fullForceCount; -return pickRandom(wt->following); -} - static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, int maxOutputWords, FILE *f) /* Go spew out a bunch of words according to probabilities in tree. */ { struct dlList *ll = dlListNew(); int listSize = 0; int outputWords = 0; for (;;) { if (++outputWords > maxOutputWords) break; struct dlNode *node; - struct wordTree *maybeWord; // This might be what we want, or it might be a dummy node + struct wordTree *picked; /* Get next predicted word. */ if (listSize == 0) { AllocVar(node); ++listSize; - maybeWord = firstWord; + picked = firstWord; } else if (listSize >= maxSize) { node = dlPopHead(ll); - maybeWord = predictNext(wt, ll); + picked = predictNext(wt, ll); } else { - maybeWord = predictNext(wt, ll); + picked = predictNext(wt, ll); AllocVar(node); ++listSize; } - if (maybeWord == NULL) + if (picked == NULL) break; - /* Here we deal with possibly having fetched a dummy node. */ - struct wordTree *realWord = maybeWord; - if (sameString(maybeWord->word, noData)) - { - realWord = forceRealWord(wt, ll->head); - } /* Add word from whatever level we fetched back to our chain of up to maxChainSize. */ - node->val = realWord->word; + node->val = picked->word; dlAddTail(ll, node); - fprintf(f, "%s\n", maybeWord->word); + fprintf(f, "%s\n", picked->word); - decrementOutputCounts(maybeWord); + decrementOutputCounts(picked); } dlListFree(&ll); } struct wordTree *wordTreeForChainsInFile(char *fileName, int chainSize, struct lm *lm) /* Return a wordTree of all chains-of-words of length chainSize seen in file. * Allocate the structure in local memory pool lm. */ { /* Stuff for processing file a line at a time. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *word; /* We'll build up the tree starting with an empty root node. */ struct wordTree *wt = wordTreeNew(""); @@ -536,66 +486,65 @@ /* Handle last few words in line, where can't make a chain of full size. Also handles * lines that have fewer than chain size words. */ if (curSize < chainSize) addChainToTree(wt, chain, lm, stack); while ((node = dlPopHead(chain)) != NULL) { if (!dlEmpty(chain)) addChainToTree(wt, chain, lm, stack); freeMem(node->val); freeMem(node); } dlListFree(&chain); } lineFileClose(&lf); -/* Add in additional information to help traverse tree . */ -wordTreeSetMissing(wt, 1, lm, stack); -wordTreeNormalize(wt, 1.0); return wt; } void alphaChain(char *inFile, char *outFile) /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */ { struct lm *lm = lmInit(0); struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize, lm); +wordTreeNormalize(wt, outSize, 1.0); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); FILE *f = mustOpen(fileName, "w"); fprintf(f, "#level\tuseCount\toutputCount\tnormVal\tmonomers\n"); wordTreeDump(0, wt, f); carefulClose(&f); } FILE *f = mustOpen(outFile, "w"); -int maxSize = min(wt->useCount, maxNonsenseSize); - -/* KEH NOTES: controls how many words we emit */ - -wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), maxSize, f); +wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->following), outSize, f); carefulClose(&f); +uglyf("totUseZeroCount = %d\n", totUseZeroCount); -uglyf("anyForce %d, fullForce %d (%4.2f%%)\n", anyForceCount, fullForceCount, 100.0*fullForceCount/anyForceCount); + { + FILE *f = mustOpen("foo.chain", "w"); + wordTreeDump(0, wt, f); + carefulClose(&f); + } lmCleanup(&lm); // Not really needed since we're just going to exit. } int main(int argc, char *argv[]) /* Process command line. */ { #ifdef SOON srand( (unsigned)time(0) ); #endif /* SOON */ optionInit(&argc, argv, options); if (argc != 3) usage(); maxChainSize = optionInt("size", maxChainSize); minUse = optionInt("minUse", minUse); -maxNonsenseSize = optionInt("maxNonsenseSize", maxNonsenseSize); +outSize = optionInt("outSize", outSize); fullOnly = optionExists("fullOnly"); alphaChain(argv[1], argv[2]); return 0; }