618f7cd74f08e3b8314729f9dca945627cb4d4e3 kent Thu May 3 11:16:25 2012 -0700 When we use up our target out count, revert to original input counts when making random choices. diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index 8864b3d..c9e946e 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -1,53 +1,56 @@ /* alphaChain - Predicts faux centromere sequences using a probablistic model. */ + #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dlist.h" /* Global vars - all of which can be set by command line options. */ int maxChainSize = 3; int outSize = 10000; -int minUse = 1; boolean fullOnly = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "alphaChain - create a linear projection of alpha satellite arrays using the probablistic model\n" "of HuRef satellite graphs\n" "usage:\n" " alphaChain alphaMonFile.fa significant_output.txt\n" "options:\n" " -size=N - Set max chain size, default %d\n" + " -fullOnly - Only output chains of size above\n" " -chain=fileName - Write out word chain to file\n" + " -afterChain=fileName - Write out word chain after faux generation to file for debugging\n" " -outSize=N - Output this many words.\n" - " -fullOnly - Only output chains of size\n" - " -minUse=N - Set minimum use in output chain, default %d\n" - , maxChainSize, minUse + " -seed=N - Initialize random number with this seed for consistent results, otherwise\n" + " it will generate different results each time it's run.\n" + , maxChainSize ); } /* Command line validation table. */ static struct optionSpec options[] = { {"size", OPTION_INT}, - {"minUse", OPTION_INT}, {"chain", OPTION_STRING}, + {"afterChain", OPTION_STRING}, {"fullOnly", OPTION_BOOLEAN}, {"outSize", OPTION_INT}, + {"seed", OPTION_INT}, {NULL, 0}, }; /* The wordTree structure below is the central data structure for this program. It is * used to build up a tree that contains all observed N-word-long sequences observed in * the text, where N corresponds to the "size" command line option which defaults to 3, * an option that in turn is stored in the maxChainSize variable. At this chain size the * text * this is the black dog and the black cat * would have the chains * this is the * is the black * the black dog * black dog and * dog and the @@ -96,31 +99,31 @@ struct wordTree *wordTreeNew(char *word) /* Create and return new wordTree element. */ { struct wordTree *wt; AllocVar(wt); wt->word = cloneString(word); return wt; } int wordTreeCmpWord(const void *va, const void *vb) /* Compare two wordTree for slSort. */ { const struct wordTree *a = *((struct wordTree **)va); const struct wordTree *b = *((struct wordTree **)vb); -return strcmp(a->word, b->word); +return cmpStringsWithEmbeddedNumbers(a->word, b->word); } struct wordTree *wordTreeFindInList(struct wordTree *list, char *word) /* Return wordTree element in list that has given word, or NULL if none. */ { struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) if (sameString(wt->word, word)) break; return wt; } struct wordTree *wordTreeAddFollowing(struct wordTree *wt, char *word) /* Make word follow wt in tree. If word already exists among followers * return it and bump use count. Otherwise create new one. */ @@ -134,181 +137,217 @@ } child->useCount += 1; return child; } int wordTreeSumUseCounts(struct wordTree *list) /* Sum up useCounts in list */ { int total = 0; struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) total += wt->useCount; return total; } -int wordTreeChildrenUseCount(struct wordTree *wt) -/* Return sum of useCounts of all children */ -{ -return wordTreeSumUseCounts(wt->children); -} - int wordTreeSumOutTargets(struct wordTree *list) /* Sum up useCounts in list */ { int total = 0; struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) total += wt->outTarget; return total; } void addChainToTree(struct wordTree *wt, struct dlList *chain) /* Add chain of words to tree. */ { struct dlNode *node; wt->useCount += 1; for (node = chain->head; !dlEnd(node); node = node->next) { char *word = node->val; verbose(2, " %s\n", word); wt = wordTreeAddFollowing(wt, word); } } void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal) /* Recursively set wt->normVal and wt->outTarget so each branch gets its share */ { wt->normVal = normVal; wt->outTarget = outTarget; -int totalChildUses = wordTreeChildrenUseCount(wt); +int childrenTotalUses = wordTreeSumUseCounts(wt->children); struct wordTree *child; for (child = wt->children; child != NULL; child = child->next) { - double childRatio = (double)child->useCount / totalChildUses; + double childRatio = (double)child->useCount / childrenTotalUses; wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal); } } void wordTreeDump(int level, struct wordTree *wt, FILE *f) /* Write out wordTree to file. */ { static char *words[64]; int i; assert(level < ArraySize(words)); words[level] = wt->word; -if (wt->useCount >= minUse) - { if (!fullOnly || level == maxChainSize) { - fprintf(f, "%d\t%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal); + fprintf(f, "%d\t%d\t%d\t%d\t%f\t", + level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal); for (i=1; i<=level; ++i) { spaceOut(f, level*2); fprintf(f, "%s ", words[i]); } fprintf(f, "\n"); } - } struct wordTree *child; for (child = wt->children; child != NULL; child = child->next) wordTreeDump(level+1, child, f); } -int totUseZeroCount; // debugging aid - struct wordTree *pickRandomOnOutTarget(struct wordTree *list) -/* Pick word from list randomly, but so that words more - * commonly seen are picked more often. */ +/* Pick word from list randomly, but so that words with higher outTargets + * are picked more often. */ { struct wordTree *picked = NULL; /* Figure out total number of outputs left, and a random number between 0 and that total. */ int total = wordTreeSumOutTargets(list); if (total > 0) { int threshold = rand() % total; /* Loop through list returning selection corresponding to random threshold. */ int binStart = 0; struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) { int size = wt->outTarget; int binEnd = binStart + size; if (threshold < binEnd) { picked = wt; break; } binStart = binEnd; } } +return picked; +} + +struct wordTree *pickRandomOnUseCounts(struct wordTree *list) +/* Pick word from list randomly, but so that words with higher useCounts + * are picked more often. Much like above routine, but a little simple + * since we know useCounts are non-zero. */ +{ +struct wordTree *picked = NULL; + +/* Figure out total number and a random number between 0 and that total. */ +int total = wordTreeSumUseCounts(list); +assert(total > 0); +int threshold = rand() % total; + +/* Loop through list returning selection corresponding to random threshold. */ +int binStart = 0; +struct wordTree *wt; +for (wt = list; wt != NULL; wt = wt->next) + { + int size = wt->useCount; + int binEnd = binStart + size; + if (threshold < binEnd) + { + picked = wt; + break; + } + binStart = binEnd; + } +assert(picked != NULL); +return picked; +} + +int totUseZeroCount = 0; + +struct wordTree *pickRandom(struct wordTree *list) +/* Pick word from list randomly, but so that words more + * commonly seen are picked more often. */ +{ +struct wordTree *picked = pickRandomOnOutTarget(list); /* If did not find anything, that's ok. It can happen on legitimate input due to unevenness - * of read coverage. In this case we just return an arbitrary element. */ + * of read coverage. In this case we pick a random number based on original counts + * rather than normalized/counted down counts. */ if (picked == NULL) { - picked = list; - totUseZeroCount += 1; + picked = pickRandomOnUseCounts(list); + ++totUseZeroCount; } return picked; } struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) /* Predict next word given tree and recently used word list. If tree doesn't * have statistics for what comes next given the words in list, then it returns * NULL. */ { struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { char *word = node->val; wt = wordTreeFindInList(wt->children, word); if (wt == NULL || wt->children == NULL) break; } struct wordTree *result = NULL; if (wt != NULL && wt->children != NULL) - result = pickRandomOnOutTarget(wt->children); + result = pickRandom(wt->children); return result; } struct wordTree *predictNext(struct wordTree *wt, struct dlList *recent) /* Predict next word given tree and recently used word list. Will use all words in * recent list if can, but if there is not data in tree, will back off, and use * progressively less previous words until ultimately it just picks a random * word. */ { struct dlNode *node; for (node = recent->head; !dlEnd(node); node = node->next) { struct wordTree *result = predictNextFromAllPredecessors(wt, node); if (result != NULL) return result; } -return pickRandomOnOutTarget(wt->children); +return pickRandom(wt->children); } void decrementOutputCounts(struct wordTree *wt) /* Decrement output count of self and parents. */ { while (wt != NULL) { - wt->outTarget -= 1; + /* Decrement target count, but don't let it go below zero. */ + int outTarget = wt->outTarget - 1; + if (outTarget >= 0) + wt->outTarget = outTarget; + + /* Always bump outCount for debugging. */ wt->outCount += 1; wt = wt->parent; } } static void wordTreeGenerateFaux(struct wordTree *wt, int maxSize, struct wordTree *firstWord, int maxOutputWords, char *fileName) /* Go spew out a bunch of words according to probabilities in tree. */ { FILE *f = mustOpen(fileName, "w"); struct dlList *ll = dlListNew(); int listSize = 0; int outputWords = 0; for (;;) @@ -419,65 +458,65 @@ /* Handle last few words in line, where can't make a chain of full size. Also handles * lines that have fewer than chain size words. */ if (curSize < chainSize) addChainToTree(wt, chain); while ((node = dlPopHead(chain)) != NULL) { if (!dlEmpty(chain)) addChainToTree(wt, chain); freeMem(node->val); freeMem(node); } dlListFree(&chain); } lineFileClose(&lf); -wordTreeSort(wt); // ugly debug +wordTreeSort(wt); // Make output of chain file prettier +verbose(2, "totUseZeroCount = %d\n", totUseZeroCount); return wt; } void wordTreeWrite(struct wordTree *wt, char *fileName) /* Write out tree to file */ { FILE *f = mustOpen(fileName, "w"); fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n"); wordTreeDump(0, wt, f); carefulClose(&f); } void alphaChain(char *inFile, char *outFile) /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */ { struct wordTree *wt = wordTreeForChainsInFile(inFile, maxChainSize); wordTreeNormalize(wt, outSize, 1.0); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, fileName); } +wordTreeGenerateFaux(wt, maxChainSize, pickRandom(wt->children), outSize, outFile); -wordTreeGenerateFaux(wt, maxChainSize, pickRandomOnOutTarget(wt->children), outSize, outFile); - -uglyf("totUseZeroCount = %d\n", totUseZeroCount); -wordTreeWrite(wt, "ugly.chain"); +if (optionExists("afterChain")) + { + char *fileName = optionVal("afterChain", NULL); + wordTreeWrite(wt, fileName); + } } int main(int argc, char *argv[]) /* Process command line. */ { -#ifdef SOON -/* Seed random number generator with time, so it doesn't always generate same sequence of #s */ -srand( (unsigned)time(0) ); -#endif /* SOON */ optionInit(&argc, argv, options); if (argc != 3) usage(); maxChainSize = optionInt("size", maxChainSize); -minUse = optionInt("minUse", minUse); outSize = optionInt("outSize", outSize); fullOnly = optionExists("fullOnly"); +int seed = optionInt("seed", (int)time(0)); +srand(seed); alphaChain(argv[1], argv[2]); return 0; }