001f4847f07fad411ebc491290d337185cf0a1c3 kent Wed Sep 12 08:21:33 2012 -0700 Making it use lower order model once higher order model has reached outTarget. Improving top level comment. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c index 0b801f4..4f76dcd 100644 --- src/kehayden/alphaAsm/alphaAsm.c +++ src/kehayden/alphaAsm/alphaAsm.c @@ -1,16 +1,18 @@ -/* alphaAsm - Predicts faux centromere sequences using a probablistic model. */ +/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have + * been parsed into various repeat monomer variants. Cycles of these variants tend to + * form higher order repeats. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "dlist.h" /* Global vars - all of which can be set by command line options. */ int maxChainSize = 3; int outSize = 10000; boolean fullOnly = FALSE; void usage() /* Explain usage and exit. */ @@ -302,39 +304,39 @@ fprintf(f, "%d\t%d\t%d\t%d\t%f\t", level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal); for (i=1; i<=level; ++i) { spaceOut(f, level*2); fprintf(f, "%s ", words[i]); } fprintf(f, "\n"); } struct wordTree *child; for (child = wt->children; child != NULL; child = child->next) wordTreeDump(level+1, child, maxChainSize, f); } -struct wordTree *pickRandomOnOutTarget(struct wordTree *list) +struct wordTree *pickRandom(struct wordTree *list) /* Pick word from list randomly, but so that words with higher outTargets * are picked more often. */ { struct wordTree *picked = NULL; /* Debug output. */ { - verbose(2, " pickRandomOnOutTarget("); + verbose(2, " pickRandom("); struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) { if (wt != list) verbose(2, " "); verbose(2, "%s:%d", wt->monomer->word, wt->outTarget); } verbose(2, ") total %d\n", wordTreeSumOutTargets(list)); } /* Figure out total number of outputs left, and a random number between 0 and that total. */ int total = wordTreeSumOutTargets(list); if (total > 0) { int threshold = rand() % total; @@ -348,102 +350,79 @@ int size = wt->outTarget; int binEnd = binStart + size; verbose(2, " %s size %d, binEnd %d\n", wt->monomer->word, size, binEnd); if (threshold < binEnd) { picked = wt; verbose(2, " picked %s\n", wt->monomer->word); break; } binStart = binEnd; } } return picked; } -struct wordTree *pickRandomOnUseCounts(struct wordTree *list) -/* Pick word from list randomly, but so that words with higher useCounts - * are picked more often. Much like above routine, but a little simple - * since we know useCounts are non-zero. */ -{ -struct wordTree *picked = NULL; - -/* Figure out total number and a random number between 0 and that total. */ -int total = wordTreeSumUseCounts(list); -assert(total > 0); -int threshold = rand() % total; - -/* Loop through list returning selection corresponding to random threshold. */ -int binStart = 0; -struct wordTree *wt; -for (wt = list; wt != NULL; wt = wt->next) - { - int size = wt->useCount; - int binEnd = binStart + size; - if (threshold < binEnd) - { - picked = wt; - break; - } - binStart = binEnd; - } -assert(picked != NULL); -return picked; -} - int totUseZeroCount = 0; -struct wordTree *pickRandom(struct wordTree *list) -/* Pick word from list randomly, but so that words more - * commonly seen are picked more often. */ -{ -struct wordTree *picked = pickRandomOnOutTarget(list); - -/* If did not find anything, that's ok. It can happen on legitimate input due to unevenness - * of read coverage. In this case we pick a random number based on original counts - * rather than normalized/counted down counts. */ -if (picked == NULL) - { - picked = pickRandomOnUseCounts(list); - ++totUseZeroCount; - } -return picked; -} - struct monomer *pickRandomFromType(struct monomerType *type) /* Pick a random word, weighted by outTarget, from all available of given type */ { /* Figure out total on list */ int total = 0; struct monomerRef *ref; for (ref = type->list; ref != NULL; ref = ref->next) total += ref->val->outTarget; /* Loop through list returning selection corresponding to random threshold. */ +if (total > 0) + { int threshold = rand() % total; int binStart = 0; for (ref = type->list; ref != NULL; ref = ref->next) { struct monomer *monomer = ref->val; int size = monomer->outTarget; int binEnd = binStart + size; if (threshold < binEnd) return monomer; binStart = binEnd; } + } + +verbose(2, "Backup plan for %s in pickRandomFromType\n", type->name); -verbose(2, "Fell off end in pickRandomFromType\n"); +/* Try again based on use counts. */ +total = 0; +for (ref = type->list; ref != NULL; ref = ref->next) + total += ref->val->useCount; +if (total > 0) + { + int threshold = rand() % total; + int binStart = 0; + for (ref = type->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + int size = monomer->useCount; + int binEnd = binStart + size; + if (threshold < binEnd) + return monomer; + binStart = binEnd; + } + } + +warn("Fell off end in pickRandomFromType on %s", type->name); return type->list->val; // Fall back position (necessary?) just first in list } struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) /* Predict next word given tree and recently used word list. If tree doesn't * have statistics for what comes next given the words in list, then it returns * NULL. */ { verbose(2, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list)); struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; wt = wordTreeFindInList(wt->children, monomer); verbose(2, " wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt)); @@ -451,30 +430,31 @@ break; } struct wordTree *result = NULL; if (wt != NULL && wt->children != NULL) result = pickRandom(wt->children); return result; } struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent) /* Predict next word given tree and recently used word list. Will use all words in * recent list if can, but if there is not data in tree, will back off, and use * progressively less previous words until ultimately it just picks a random * word. */ { struct dlNode *node; +verbose(2, " predictNextFromWordTree(%s)\n", wordTreeString(wt)); for (node = recent; !dlEnd(node); node = node->next) { struct wordTree *result = predictNextFromAllPredecessors(wt, node); if (result != NULL) return result; } return NULL; } struct dlNode *nodesFromTail(struct dlList *list, int count) /* Return count nodes from tail of list. If list is shorter than count, it returns the * whole list. */ { int i; @@ -1082,35 +1062,35 @@ monomer->outTarget = round(scale * monomer->useCount); } void alphaStoreNormalize(struct alphaStore *store, int outputSize) /* Set up output counts on each word to make sure it gets it's share of output size */ { struct wordTree *wt = store->markovChains; wordTreeNormalize(wt, outputSize, 1.0); monomerListNormalise(store->monomerList, wt->useCount, outputSize); } struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store) /* Look through read list trying to find best supported type for this monomer. */ { /* The algorithm here could be better. It picks the closest nearby monomer that is typed - * to fill in from. When have information from both before and after uses arbitrarily - * before preferentially. When have information that is equally close from multiple reads, - * just uses info from first read. Fortunately the data, at least on Y, shows a lot of - * consistency, so probably not actually worth a more sophisticated algorithm that could - * take the most popular choice if data were inconsistent. */ + * to fill in from. When have information from both before and after the program arbitrarily + * uses before preferentially. When have information that is equally close from multiple reads, + * it just uses info from first read. Fortunately the data, at least on Y, shows a lot of + * consistency, so it's probably not actually worth a more sophisticated algorithm that would + * instead take the most popular choice rather than the first one. */ verbose(2, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList)); struct monomerType *bestType = NULL; int bestPos = 0; boolean bestIsAfter = FALSE; struct slRef *readRef; for (readRef = monomer->readList; readRef != NULL; readRef = readRef->next) { struct alphaRead *read = readRef->val; struct monomerRef *ref; { verbose(2, " read %s (%d els):", read->name, slCount(read->list)); for (ref = read->list; ref != NULL; ref = ref->next) { struct monomer *m = ref->val; verbose(2, " %s", m->word); @@ -1158,64 +1138,67 @@ bestIsAfter = FALSE; } } if (afterType != NULL) { if (bestType == NULL || afterPos < bestPos) { bestType = afterType; bestPos = afterPos; bestIsAfter = TRUE; } } } +/* Now have found a type that is at a known position relative to ourselves. From this + * infer our own type. */ struct monomerType *chosenType = NULL; if (bestType != NULL) { if (bestIsAfter) chosenType = typeBefore(store, bestType, bestPos); else chosenType = typeAfter(store, bestType, bestPos); } - return chosenType; } void fillInTypes(struct alphaStore *store) /* We have types for most but not all monomers. Try and fill out types for most of * the rest by looking for position in reads they are in that do contain known types */ { /* Do first pass on ones defined - just have to look them up in hash. */ struct monomer *monomer; for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) monomer->type = hashFindVal(store->typeHash, monomer->word); /* Do second pass on ones that are not yet defined */ for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) { if (monomer->type == NULL) { struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store); verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a")); } } } void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) -/* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */ +/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have + * been parsed into various repeat monomer variants. Cycles of these variants tend to + * form higher order repeats. */ { struct alphaStore *store = alphaStoreNew(maxChainSize); alphaReadListFromFile(readsFile, store); alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); fillInTypes(store); integrateOrphans(store); makeMarkovChains(store); struct wordTree *wt = store->markovChains; alphaStoreNormalize(store, outSize); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); }