de78c1c906a7f58028f633049ab08f43ea96d198 kent Wed Apr 25 17:12:13 2012 -0700 Adding in dummy nodes to help deal with missing data from reads not being infinite. diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index 1e68c98..d59c43b 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -21,30 +21,31 @@ errAbort( "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n" "of HuRef satellite graphs\n" "usage:\n" " alphaChain alphaMonFile.fa significant_output.txt\n" "options:\n" " -size=N - Set max chain size, default %d\n" " -chain=fileName - Write out word chain to file\n" " -maxNonsenseSize=N - Keep nonsense output to this many words.\n" " -fullOnly - Only output chains of size\n" " -minUse=N - Set minimum use in output chain, default %d\n" , maxChainSize, minUse ); } +char *noData = "n/a"; // Used to indicate a dummy node representing missing data /* Command line validation table. */ static struct optionSpec options[] = { {"size", OPTION_INT}, {"minUse", OPTION_INT}, {"chain", OPTION_STRING}, {"fullOnly", OPTION_BOOLEAN}, {"maxNonsenseSize", OPTION_INT}, {NULL, 0}, }; /* The wordTree structure below is the central data structure for this program. It is * used to build up a tree that contains all observed N-word-long sequences observed in * the text, where N corresponds to the "size" command line option which defaults to 3, * an option that in turn is stored in the maxChainSize variable. At this chain size the * text @@ -85,117 +86,124 @@ * * There are special cases in the code so that the first and last words in the text get included * as much as possible in the tree. * * Once the program has build up the wordTree, it can output it in a couple of fashions. */ struct wordTree /* A node in a tree of words. The head of the tree is a node with word value the empty string. */ { struct rbTree *following; /* Contains words (as struct wordTree) that follow us. */ struct wordTree *parent; /* Parent of this node or NULL for root. */ char *word; /* The word itself including comma, period etc. */ int useCount; /* Number of times word used. */ int outputCount; /* each level of tree and initialize that to a normalized version of it. */ double normVal; /* value to place the normalization value */ - int missingFromChildren; /* Uses not in children. */ }; struct wordTree *wordTreeNew(char *word) /* Create and return new wordTree element. */ { struct wordTree *wt; AllocVar(wt); wt->word = cloneString(word); return wt; } int wordTreeCmpWord(void *va, void *vb) /* Compare two wordTree. */ { struct wordTree *a = va, *b = vb; return strcmp(a->word, b->word); } + +struct wordTree *wordTreeAddFollowing(struct wordTree *wt, char *word, + struct lm *lm, struct rbTreeNode **stack) +/* Make word follow wt in tree. If word already exists among followers + * return it and bump use count. Otherwise create new one. */ +{ +struct wordTree *w; /* Points to following element if any */ +if (wt->following == NULL) + { + /* Allocate new if you've never seen it before. */ + wt->following = rbTreeNewDetailed(wordTreeCmpWord, lm, stack); + w = NULL; + } +else + { + /* Find word in existing tree */ + struct wordTree key; + key.word = word; + w = rbTreeFind(wt->following, &key); + } +if (w == NULL) + { + w = wordTreeNew(word); + w->parent = wt; + rbTreeAdd(wt->following, w); + } +w->useCount += 1; +return w; +} + int wordTreeChildrenUseCount(struct wordTree *wt) /* Return sum of useCounts of all children */ { struct rbTree *following = wt->following; if (following == NULL) return 0; struct slRef *childList = rbTreeItems(following); struct slRef *childRef; int total = 0; for (childRef = childList; childRef != NULL; childRef = childRef->next) { struct wordTree *child = childRef->val; total += child->useCount; } slFreeList(&childList); return total; } + int wordTreeCountNotInChildren(struct wordTree *wt) /* Count up useCounts of all children and return difference between this and our own useCount. */ { return wt->useCount - wordTreeChildrenUseCount(wt); } -void wordTreeSetMissing(struct wordTree *wt) + +void wordTreeSetMissing(struct wordTree *wt, int level, struct lm *lm, struct rbTreeNode **stack) /* Set missingFromChildren in self and all children. */ { -wt->missingFromChildren = wordTreeCountNotInChildren(wt); +int missingFromChildren = wordTreeCountNotInChildren(wt); +if (missingFromChildren > 0) + { + struct wordTree *missing = wordTreeAddFollowing(wt, noData, lm, stack); + missing->useCount = missingFromChildren; + } struct rbTree *following = wt->following; -if (following != NULL) +if (following != NULL && level < maxChainSize) { struct slRef *childList = rbTreeItems(following); struct slRef *childRef; for (childRef = childList; childRef != NULL; childRef = childRef->next) { struct wordTree *child = childRef->val; - wordTreeSetMissing(child); + wordTreeSetMissing(child, level+1, lm, stack); } slFreeList(&childList); } } -struct wordTree *wordTreeAddFollowing(struct wordTree *wt, char *word, - struct lm *lm, struct rbTreeNode **stack) -/* Make word follow wt in tree. If word already exists among followers - * return it and bump use count. Otherwise create new one. */ -{ -struct wordTree *w; /* Points to following element if any */ -if (wt->following == NULL) - { - /* Allocate new if you've never seen it before. */ - wt->following = rbTreeNewDetailed(wordTreeCmpWord, lm, stack); - w = NULL; - } -else - { - /* Find word in existing tree */ - struct wordTree key; - key.word = word; - w = rbTreeFind(wt->following, &key); - } -if (w == NULL) - { - w = wordTreeNew(word); - w->parent = wt; - rbTreeAdd(wt->following, w); - } -w->useCount += 1; -return w; -} - void addChainToTree(struct wordTree *wt, struct dlList *chain, struct lm *lm, struct rbTreeNode **stack) /* Add chain of words to tree. */ { struct dlNode *node; wt->useCount += 1; for (node = chain->head; !dlEnd(node); node = node->next) { char *word = node->val; verbose(2, " %s\n", word); wt = wordTreeAddFollowing(wt, word, lm, stack); } } void wordTreeNormalize(struct wordTree *wt, double normVal) @@ -204,30 +212,31 @@ wt->normVal = normVal; wt->outputCount = normVal * maxNonsenseSize; if (wt->following != NULL) { struct slRef *list = rbTreeItems(wt->following); struct slRef *ref; for (ref = list; ref !=NULL; ref = ref->next) { struct wordTree *child = ref->val; double childRatio = (double)child->useCount / wt->useCount; wordTreeNormalize(child, childRatio*normVal); } slFreeList(&list); } } + void wordTreeDeadEnd(struct wordTree *wt) /* tally and include incomplete branches */ { /* int levelNormVal = 0; * int levelCount = 0; * int sumNormVal = 0; * int sumCount = 0; * int diffNormVal = 0; * int diffCount=0; * Loop pseudocode * work recursively through level 1-> 3, start at root of tree * foreach word at level 1 * { * sumCount = 0 * sumNormVal = 0 @@ -247,44 +256,45 @@ * sumCount += wt->outputCount * sumNormVal += wt->normVal * ** RECURSIVE level 2 + 1 here ** * } * diffCount = levelCount - sumCount * diffNormVal = levelNormVal - sumNormVal * if(diffCount > 0) * { * create level 2: * wt->normVal = diffNormVal * wt->word = 'NaN' * wt->outputVal = diffCount * } */ } + void wordTreeDump(int level, struct wordTree *wt, FILE *f) /* Write out wordTree to file. */ { static char *words[64]; struct slRef *list, *ref; int i; assert(level < ArraySize(words)); words[level] = wt->word; if (wt->useCount >= minUse) { if (!fullOnly || level == maxChainSize) { - fprintf(f, "%d\t%d\t%d\t%f\t%d\t", level, wt->useCount, wt->outputCount, wt->normVal, wt->missingFromChildren); + fprintf(f, "%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outputCount, wt->normVal); for (i=1; i<=level; ++i) { spaceOut(f, level*2); fprintf(f, "%s ", words[i]); } fprintf(f, "\n"); } } if (wt->following != NULL) { list = rbTreeItems(wt->following); for (ref = list; ref != NULL; ref = ref->next) wordTreeDump(level+1, ref->val, f); slFreeList(&list); @@ -464,53 +474,56 @@ * it from head to tail of list. */ node = dlPopHead(chain); freeMem(node->val); node->val = cloneString(word); dlAddTail(chain, node); addChainToTree(wt, chain, lm, stack); } ++wordCount; } /* Handle last few words in line, where can't make a chain of full size. Also handles * lines that have fewer than chain size words. */ if (curSize < chainSize) addChainToTree(wt, chain, lm, stack); while ((node = dlPopHead(chain)) != NULL) { + if (!dlEmpty(chain)) addChainToTree(wt, chain, lm, stack); freeMem(node->val); freeMem(node); } dlListFree(&chain); } - lineFileClose(&lf); + +/* Add in additional information to help traverse tree . */ +wordTreeSetMissing(wt, 1, lm, stack); +wordTreeNormalize(wt, 1.0); return wt; } void alphaChain(char *inFile, char *outFile) /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */ { struct lm *lm = lmInit(0); struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize, lm); -wordTreeNormalize(wt, 1.0); -wordTreeSetMissing(wt); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); FILE *f = mustOpen(fileName, "w"); + fprintf(f, "#level\tuseCount\toutputCount\tnormVal\tmonomers\n"); wordTreeDump(0, wt, f); carefulClose(&f); } FILE *f = mustOpen(outFile, "w"); int maxSize = min(wt->useCount, maxNonsenseSize); /* KEH NOTES: controls how many words we emit */ wordTreeMakeNonsense(wt, maxChainSize, pickRandomWord(wt->following), maxSize, f); carefulClose(&f); lmCleanup(&lm); // Not really needed since we're just going to exit.