d1deca879029b44519864c9178ec35db15c816d4 kent Thu Sep 6 15:30:59 2012 -0700 Making program look at the monomer order file to help it's predictions in case where it has nothing it can use in the markov chains. diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c index 92b9706..3414909 100644 --- src/kehayden/alphaChain/alphaChain.c +++ src/kehayden/alphaChain/alphaChain.c @@ -376,94 +376,182 @@ * commonly seen are picked more often. */ { struct wordTree *picked = pickRandomOnOutTarget(list); /* If did not find anything, that's ok. It can happen on legitimate input due to unevenness * of read coverage. In this case we pick a random number based on original counts * rather than normalized/counted down counts. */ if (picked == NULL) { picked = pickRandomOnUseCounts(list); ++totUseZeroCount; } return picked; } +struct wordInfo *pickRandomFromType(struct wordType *type) +/* Pick a random word, weighted by outTarget, from all available of given type */ +{ +/* Figure out total on list */ +int total = 0; +struct wordInfoRef *ref; +for (ref = type->list; ref != NULL; ref = ref->next) + total += ref->val->outTarget; + +/* Loop through list returning selection corresponding to random threshold. */ +int threshold = rand() % total; +int binStart = 0; +for (ref = type->list; ref != NULL; ref = ref->next) + { + struct wordInfo *info = ref->val; + int size = info->outTarget; + int binEnd = binStart + size; + if (threshold < binEnd) + return info; + binStart = binEnd; + } + +verbose(2, "Fell off end in pickRandomFromType\n"); +return type->list->val; // Fall back position (necessary?) just first in list +} + struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) /* Predict next word given tree and recently used word list. If tree doesn't * have statistics for what comes next given the words in list, then it returns * NULL. */ { verbose(2, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list)); struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { struct wordInfo *info = node->val; wt = wordTreeFindInList(wt->children, info); verbose(2, " wordTreeFindInList(%s) = %p %s\n", info->word, wt, wordTreeString(wt)); if (wt == NULL || wt->children == NULL) break; } struct wordTree *result = NULL; if (wt != NULL && wt->children != NULL) result = pickRandom(wt->children); return result; } -struct wordTree *predictNextFromRecent(struct wordTree *wt, struct dlNode *recent) +struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent) /* Predict next word given tree and recently used word list. Will use all words in * recent list if can, but if there is not data in tree, will back off, and use * progressively less previous words until ultimately it just picks a random * word. */ { struct dlNode *node; for (node = recent; !dlEnd(node); node = node->next) { struct wordTree *result = predictNextFromAllPredecessors(wt, node); if (result != NULL) return result; } -struct wordTree *topLevel = pickRandom(wt->children); -verbose(2, "in predictNext(%s, %s) ", wordTreeString(wt), dlListFragWords(recent)); -verbose(2, "last resort pick of %s\n", topLevel->info->word); -return topLevel; +return NULL; } struct dlNode *nodesFromTail(struct dlList *list, int count) /* Return count nodes from tail of list. If list is shorter than count, it returns the * whole list. */ { int i; struct dlNode *node; for (i=0, node = list->tail; i<count; ++i) { if (dlStart(node)) return list->head; node = node->prev; } return node; } +struct wordType *advanceTypeWithWrap(struct wordType *typeList, struct wordType *startType, + int amountToAdvance) +/* Skip forward amountToAdvance in typeList from startType. If get to end of list start + * again at the beginning */ +{ +int i; +struct wordType *type = startType; +for (i=0; i<amountToAdvance; ++i) + { + type = type->next; + if (type == NULL) + type = typeList; + } +return type; +} + +struct wordTree *predictFromPreviousTypes(struct wordStore *store, struct dlList *past) +/* Predict next based on general pattern of monomer order. */ +{ +int maxToLookBack = 3; + { /* Debugging block */ + struct dlNode *node = nodesFromTail(past, maxToLookBack); + verbose(3, "predictFromPreviousTypes("); + for (; !dlEnd(node); node = node->next) + { + struct wordInfo *info = node->val; + verbose(3, "%s, ", info->word); + } + verbose(3, ")\n"); + } +int pastDepth; +struct dlNode *node; +for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth) + { + if (dlStart(node)) + { + verbose(2, "predictFromPreviousTypes with empty past\n"); + return NULL; + } + struct wordInfo *info = node->val; + struct wordType *prevType = hashFindVal(store->typeHash, info->word); + if (prevType != NULL) + { + struct wordType *curType = advanceTypeWithWrap(store->typeList, prevType, pastDepth); + struct wordInfo *curInfo = pickRandomFromType(curType); + struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo); + verbose(3, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", + pastDepth, curInfo->word, curTree, curTree->info->word); + return curTree; + } + node = node->prev; + } +verbose(2, "predictFromPreviousTypes past all unknown types\n"); +return NULL; +} + + struct wordTree *predictNext(struct wordStore *store, struct dlList *past) /* Given input data store and what is known from the past, predict the next word. */ { struct dlNode *recent = nodesFromTail(past, store->maxChainSize); -return predictNextFromRecent(store->markovChains, recent); +struct wordTree *pick = predictFromWordTree(store->markovChains, recent); +if (pick == NULL) + pick = predictFromPreviousTypes(store, past); +if (pick == NULL) + { + pick = pickRandom(store->markovChains->children); + verbose(2, "in predictNext() last resort pick of %s\n", pick->info->word); + } +return pick; } -void decrementOutputCounts(struct wordTree *wt) +void decrementOutputCountsInTree(struct wordTree *wt) /* Decrement output count of self and parents. */ { while (wt != NULL) { /* Decrement target count, but don't let it fall below sum of counts of all children. * This can happen with incomplete data if we don't prevent it. This * same code also prevents us from having negative outTarget. */ int outTarget = wt->outTarget - 1; int kidSum = wordTreeSumOutTargets(wt->children); if (outTarget < kidSum) outTarget = kidSum; wt->outTarget = outTarget; /* Always bump outCount for debugging. */ wt->outCount += 1; @@ -499,34 +587,37 @@ { chainStartNode = chainStartNode->next; picked = predictNext(store, ll); } else { picked = predictNext(store, ll); ++chainSize; } if (picked == NULL) break; /* Add word from whatever level we fetched back to our output chain. */ - node->val = picked->info; + struct wordInfo *info = picked->info; + node->val = info; dlAddTail(ll, node); - decrementOutputCounts(picked); + decrementOutputCountsInTree(picked); + info->outTarget -= 1; + info->outCount += 1; } verbose(2, "totUseZeroCount = %d\n", totUseZeroCount); return ll; } static void wordTreeGenerateFile(struct wordStore *store, int maxSize, struct wordTree *firstWord, int maxOutputWords, char *fileName) /* Create file containing words base on tree probabilities. The wordTreeGenerateList does * most of work. */ { struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords); FILE *f = mustOpen(fileName, "w"); struct dlNode *node; for (node = ll->head; !dlEnd(node); node = node->next) { @@ -671,42 +762,60 @@ struct wordType *type; AllocVar(type); slAddHead(&store->typeList, type); while ((word = nextWord(&line)) != NULL) { struct wordInfo *info = hashFindVal(store->infoHash, word); if (info == NULL) errAbort("%s is in %s but not %s", word, lf->fileName, readsFile); struct wordInfoRef *ref; AllocVar(ref); ref->val = info; slAddHead(&type->list, ref); hashAddUnique(store->typeHash, word, type); } } +slReverse(&store->typeList); lineFileClose(&lf); verbose(2, "Added %d types containing %d words from %s\n", slCount(store->typeList), store->typeHash->elCount, fileName); } +void wordInfoListNormalise(struct wordInfo *list, int totalCount, int outputSize) +/* Set outTarget field in all of list to be normalized to outputSize */ +{ +struct wordInfo *info; +double scale = outputSize/totalCount; +for (info = list; info != NULL; info = info->next) + info->outTarget = round(scale * info->useCount); +} + +void wordStoreNormalize(struct wordStore *store, int outputSize) +/* Set up output counts on each word to make sure it gets it's share of output size */ +{ +struct wordTree *wt = store->markovChains; +wordTreeNormalize(wt, outputSize, 1.0); +wordInfoListNormalise(store->infoList, wt->useCount, outputSize); +} + void alphaChain(char *readsFile, char *monomerOrderFile, char *outFile) /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */ { struct wordStore *store = wordStoreForChainsInFile(readsFile, maxChainSize); struct wordTree *wt = store->markovChains; wordStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); -wordTreeNormalize(wt, outSize, 1.0); +wordStoreNormalize(store, outSize); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); } wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile); if (optionExists("afterChain")) { char *fileName = optionVal("afterChain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); } }