bd11b61b73ac03b8f3a1787bea84b36cd5332457 kent Wed Jun 19 13:18:03 2013 -0700 Moving alphaAsm to linearSat, a bettern name. diff --git src/kehayden/linearSat/linearSat.c src/kehayden/linearSat/linearSat.c new file mode 100644 index 0000000..9ce4ab6 --- /dev/null +++ src/kehayden/linearSat/linearSat.c @@ -0,0 +1,1832 @@ +/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have + * been parsed into various repeat monomer variants. Cycles of these variants tend to + * form higher order repeats. */ + +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "dystring.h" +#include "dlist.h" +#include "obscure.h" +#include "errabort.h" + +/* Global vars - all of which can be set by command line options. */ +int pseudoCount = 1; +int maxChainSize = 3; +int initialOutSize = 10000; +int maxOutSize; +int maxToMiss = 0; +boolean fullOnly = FALSE; +boolean betweens = FALSE; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n" + "of HuRef satellite graphs based on Sanger style reads.\n" + "usage:\n" + " alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n" + "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n" + "line, with the first word in the line being the read id and subsequent words the monomers\n" + "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n" + "each variant. The lines are in the same order the monomers appear, with the last line\n" + "being followed in order by the first since they repeat. The output.txt contains one line\n" + "for each monomer in the output, just containing the monomer symbol.\n" + "options:\n" + " -size=N - Set max chain size, default %d\n" + " -fullOnly - Only output chains of size above\n" + " -chain=fileName - Write out word chain to file\n" + " -afterChain=fileName - Write out word chain after faux generation to file for debugging\n" + " -initialOutSize=N - Output this many words. Default %d\n" + " -maxOutSize=N - Expand output up to this many words if not all monomers output initially\n" + " -maxToMiss=N - How many monomers may be missing from output. Default %d \n" + " -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n" + " observed. Default %d\n" + " -seed=N - Initialize random number with this seed for consistent results, otherwise\n" + " it will generate different results each time it's run.\n" + " -betweens - Add <m> lines in output that indicate level of markov model used join\n" + " -unsubbed=fileName - Write output before substitution of missing monomers to file\n" + , maxChainSize, initialOutSize, maxToMiss, pseudoCount + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"size", OPTION_INT}, + {"chain", OPTION_STRING}, + {"afterChain", OPTION_STRING}, + {"fullOnly", OPTION_BOOLEAN}, + {"initialOutSize", OPTION_INT}, + {"maxOutSize", OPTION_INT}, + {"maxToMiss", OPTION_INT}, + {"pseudoCount", OPTION_INT}, + {"seed", OPTION_INT}, + {"unsubbed", OPTION_STRING}, + {"betweens", OPTION_BOOLEAN}, + {NULL, 0}, +}; + +/* Some structures to keep track of words (which correspond to alpha satellight monomers) + * seen in input. */ + +struct monomer +/* Basic information on a monomer including how many times it is seen in input and output + * streams. Unlike the wordTree, this is flat, and does not include predecessors. */ + { + struct monomer *next; /* Next in list of all words. */ + char *word; /* The word used to represent monomer. Not allocated here. */ + int useCount; /* Number of times used. */ + int outTarget; /* Number of times want to output word. */ + int outCount; /* Number of times have output word so far. */ + struct monomerType *type; /* The type of the monomer. */ + struct slRef *readList; /* List of references to reads this is in. */ + int subbedOutCount; /* Output count after substitution. */ + }; + +struct monomerRef +/* A reference to a monomer. */ + { + struct monomerRef *next; /* Next in list */ + struct monomer *val; /* The word referred to. */ + }; + +struct monomerType +/* A collection of words of the same type - or monomers of same type. */ + { + struct monomerType *next; /* Next monomerType */ + char *name; /* Short name of type */ + struct monomerRef *list; /* List of all words of that type */ + }; + +struct alphaRead +/* A read that's been parsed into alpha variants. */ + { + struct alphaRead *next; /* Next in list */ + char *name; /* Id of read, not allocated here. */ + struct monomerRef *list; /* List of alpha subunits */ + }; + +struct alphaStore +/* Stores info on all monomers */ + { + struct monomer *monomerList; /* List of words, fairly arbitrary order. */ + struct hash *monomerHash; /* Hash of monomer, keyed by word. */ + struct alphaRead *readList; /* List of all reads. */ + struct hash *readHash; /* Hash of all reads keyed by read ID. */ + struct wordTree *markovChains; /* Tree of words that follow other words. */ + struct wordTree *markovChainsNoOrphans; /* Markov tree before orphans are added. */ + int maxChainSize; /* Maximum depth of tree. */ + struct monomerType *typeList; /* List of all types. */ + struct hash *typeHash; /* Hash with monomerType values, keyed by all words. */ + int typeCount; /* Count of types. */ + struct monomerType **typeArray; /* Array as opposed to list of types */ + }; + +/* The wordTree structure below is the central data structure for this program. It is + * used to build up a tree that contains all observed N-word-long sequences observed in + * the text, where N corresponds to the "size" command line option which defaults to 3, + * an option that in turn is stored in the maxChainSize variable. At this chain size the + * text + * this is the black dog and the black cat + * would have the chains + * this is the + * is the black + * the black dog + * black dog and + * dog and the + * and the black + * the black cat + * and turn into the tree + * this + * is + * the + * is + * the + * black + * the + * black + * dog + * cat + * black + * dog + * and + * dog + * and + * the + * and + * the + * black + * Note how the tree is able to compress the two chains "the black dog" and "the black cat." + * + * A node in the tree can have as many children as it needs to at each node. The depth of + * the tree is the same as the chain size, by default 3. At each node in the tree you get + * a word, and a list of all words that are observed in the text to follow that word. + * + * Once the program has build up the wordTree, it can output it in a couple of fashions. */ + +struct wordTree +/* A node in a tree of words. The head of the tree is a node with word value the empty string. */ + { + struct wordTree *next; /* Next sibling */ + struct wordTree *children; /* Children in list. */ + struct wordTree *parent; /* Parent of this node or NULL for root. */ + struct monomer *monomer; /* The info on the word itself. */ + int useCount; /* Number of times word used in input with given predecessors. */ + int outTarget; /* Number of times want to output word with given predecessors. */ + int outCount; /* Number of times actually output with given predecessors. */ + double normVal; /* value to place the normalization value */ + }; + +struct wordTree *wordTreeNew(struct monomer *monomer) +/* Create and return new wordTree element. */ +{ +struct wordTree *wt; +AllocVar(wt); +wt->monomer = monomer; +return wt; +} + +int wordTreeCmpWord(const void *va, const void *vb) +/* Compare two wordTree for slSort. */ +{ +const struct wordTree *a = *((struct wordTree **)va); +const struct wordTree *b = *((struct wordTree **)vb); +return cmpStringsWithEmbeddedNumbers(a->monomer->word, b->monomer->word); +} + +static void wordTreeSort(struct wordTree *wt) +/* Sort all children lists in tree. */ +{ +slSort(&wt->children, wordTreeCmpWord); +struct wordTree *child; +for (child = wt->children; child != NULL; child = child->next) + wordTreeSort(child); +} + +struct wordTree *wordTreeFindInList(struct wordTree *list, struct monomer *monomer) +/* Return wordTree element in list that has given word, or NULL if none. */ +{ +struct wordTree *wt; +for (wt = list; wt != NULL; wt = wt->next) + if (wt->monomer == monomer) + break; +return wt; +} + +struct wordTree *wordTreeAddFollowing(struct wordTree *wt, struct monomer *monomer) +/* Make word follow wt in tree. If word already exists among followers + * return it and bump use count. Otherwise create new one. */ +{ +struct wordTree *child = wordTreeFindInList(wt->children, monomer); +if (child == NULL) + { + child = wordTreeNew(monomer); + child->parent = wt; + slAddHead(&wt->children, child); + } +child->useCount += 1; +return child; +} + +int wordTreeSumUseCounts(struct wordTree *list) +/* Sum up useCounts in list */ +{ +int total = 0; +struct wordTree *wt; +for (wt = list; wt != NULL; wt = wt->next) + total += wt->useCount; +return total; +} + +int wordTreeSumOutTargets(struct wordTree *list) +/* Sum up useCounts in list */ +{ +int total = 0; +struct wordTree *wt; +for (wt = list; wt != NULL; wt = wt->next) + total += wt->outTarget; +return total; +} + +void addChainToTree(struct wordTree *wt, struct dlList *chain) +/* Add chain of words to tree. */ +{ +struct dlNode *node; +wt->useCount += 1; +for (node = chain->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + verbose(4, " adding %s\n", monomer->word); + wt = wordTreeAddFollowing(wt, monomer); + } +} + +int useCountInTree(struct wordTree *wt, struct dlNode *nodeList, int chainSize) +/* Return number of times chainSize successive nodes from nodeList are found + * in wt, 0 if not found. */ +{ +int i; +struct wordTree *subTree = wt; +struct dlNode *node = nodeList; +for (i=0; i<chainSize; ++i) + { + struct monomer *monomer = node->val; + subTree = wordTreeFindInList(subTree->children, monomer); + if (subTree == NULL) + return FALSE; + node = node->next; + } +return subTree->useCount; +} + + +void findLongestSupportingMarkovChain(struct wordTree *wt, struct dlNode *node, + int *retChainSize, int *retReadCount) +/* See if chain of words is in tree tree. */ +{ +struct dlNode *start = node; +int chainSize = 1; +int readCount = 0; +for (;;) + { + int useCount = useCountInTree(wt, start, chainSize); + if (useCount == 0) + break; + readCount = useCount; + chainSize += 1; + start = start->prev; + if (dlStart(start)) + break; + } +*retChainSize = chainSize; +*retReadCount = readCount; +} + +static void writeMonomerListAndBetweens(struct alphaStore *store, + char *fileName, struct dlList *ll) +/* Write out monomer list to file. */ +{ +FILE *f = mustOpen(fileName, "w"); +struct dlNode *node; +struct wordTree *origTree = store->markovChainsNoOrphans; +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + if (betweens) + { + int chainSize = 0, readCount = 0; + findLongestSupportingMarkovChain(origTree, node, + &chainSize, &readCount); + /* The -2 is for 1 extra for the empty tree root, and 1 extra to get + * from chain-size to markov model terminology. */ + char between[24]; + safef(between, sizeof(between), "<%d:%d:%d>", chainSize-2, + readCount, useCountInTree(origTree, node, 1)); + fprintf(f, "%-11s\t", between); + } + fprintf(f, "%s\n", monomer->word); + } +carefulClose(&f); +} + + +int wordTreeAddPseudoCount(struct wordTree *wt, int pseudo) +/* Add pseudo to all leaves of tree and propagate counts up to parents. */ +{ +if (wt->children == NULL) + { + wt->useCount += pseudo; + return wt->useCount; + } +else + { + struct wordTree *child; + int oldChildTotal = 0; + for (child = wt->children; child != NULL; child = child->next) + oldChildTotal += child->useCount; + int oldDiff = wt->useCount - oldChildTotal; + if (oldDiff < 0) oldDiff = 0; // Necessary in rare cases, not sure why. + + int total = 0; + for (child = wt->children; child != NULL; child = child->next) + total += wordTreeAddPseudoCount(child, pseudo); + wt->useCount = total + oldDiff + pseudo; + return total; + } +} + +void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal) +/* Recursively set wt->normVal and wt->outTarget so each branch gets its share */ +{ +if (pseudoCount > 0) + wordTreeAddPseudoCount(wt, pseudoCount); +wt->normVal = normVal; +wt->outTarget = outTarget; +wt->outCount = 0; +int childrenTotalUses = wordTreeSumUseCounts(wt->children); +struct wordTree *child; +for (child = wt->children; child != NULL; child = child->next) + { + double childRatio = (double)child->useCount / childrenTotalUses; + wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal); + } +} + +char *wordTreeString(struct wordTree *wt) +/* Return something like '(a b c)' where c would be the value at wt itself, and + * a and b would be gotten by following parents. */ +{ +struct slName *list = NULL, *el; +for (;wt != NULL; wt = wt->parent) + { + char *word = wt->monomer->word; + if (!isEmpty(word)) // Avoid blank great grandparent at root of tree. + slNameAddHead(&list, word); + } + +struct dyString *dy = dyStringNew(0); +dyStringAppendC(dy, '('); +for (el = list; el != NULL; el = el->next) + { + dyStringPrintf(dy, "%s", el->name); + if (el->next != NULL) + dyStringAppendC(dy, ' '); + } +dyStringAppendC(dy, ')'); +slFreeList(&list); +return dyStringCannibalize(&dy); +} + +char *dlListFragWords(struct dlNode *head) +/* Return string containing all words in list pointed to by head. */ +{ +struct dyString *dy = dyStringNew(0); +dyStringAppendC(dy, '{'); +struct dlNode *node; +for (node = head; !dlEnd(node); node = node->next) + { + if (node != head) + dyStringAppendC(dy, ' '); + struct monomer *monomer = node->val; + dyStringAppend(dy, monomer->word); + } +dyStringAppendC(dy, '}'); +return dyStringCannibalize(&dy); +} + +void wordTreeDump(int level, struct wordTree *wt, int maxChainSize, FILE *f) +/* Write out wordTree to file. */ +{ +static char *words[64]; +int i; +assert(level < ArraySize(words)); + +words[level] = wt->monomer->word; +if (!fullOnly || level == maxChainSize) + { + fprintf(f, "%d\t%d\t%d\t%d\t%f\t", + level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal); + + for (i=1; i<=level; ++i) + { + spaceOut(f, level*2); + fprintf(f, "%s ", words[i]); + } + fprintf(f, "\n"); + } +struct wordTree *child; +for (child = wt->children; child != NULL; child = child->next) + wordTreeDump(level+1, child, maxChainSize, f); +} + +struct wordTree *pickWeightedRandomFromList(struct wordTree *list) +/* Pick word from list randomly, but so that words with higher outTargets + * are picked more often. */ +{ +struct wordTree *picked = NULL; + +/* Debug output. */ + { + verbose(4, " pickWeightedRandomFromList("); + struct wordTree *wt; + for (wt = list; wt != NULL; wt = wt->next) + { + if (wt != list) + verbose(4, " "); + verbose(4, "%s:%d", wt->monomer->word, wt->outTarget); + } + verbose(4, ") total %d\n", wordTreeSumOutTargets(list)); + } + +/* Figure out total number of outputs left, and a random number between 0 and that total. */ +int total = wordTreeSumOutTargets(list); +if (total > 0) + { + int threshold = rand() % total; + verbose(4, " threshold %d\n", threshold); + + /* Loop through list returning selection corresponding to random threshold. */ + int binStart = 0; + struct wordTree *wt; + for (wt = list; wt != NULL; wt = wt->next) + { + int size = wt->outTarget; + int binEnd = binStart + size; + verbose(4, " %s size %d, binEnd %d\n", wt->monomer->word, size, binEnd); + if (threshold < binEnd) + { + picked = wt; + verbose(4, " picked %s\n", wt->monomer->word); + break; + } + binStart = binEnd; + } + } +return picked; +} + +int totUseZeroCount = 0; + +struct monomer *pickRandomFromType(struct monomerType *type) +/* Pick a random word, weighted by outTarget, from all available of given type */ +{ +/* Figure out total on list */ +int total = 0; +struct monomerRef *ref; +for (ref = type->list; ref != NULL; ref = ref->next) + total += ref->val->outTarget; +verbose(3, "pickRandomFromType %s total of outTarget=%d\n", type->name, total); + +/* Loop through list returning selection corresponding to random threshold. */ +if (total > 0) + { + int threshold = rand() % total; + int binStart = 0; + for (ref = type->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + int size = monomer->outTarget; + int binEnd = binStart + size; + if (threshold < binEnd) + return monomer; + binStart = binEnd; + } + } + +verbose(3, "Backup plan for %s in pickRandomFromType\n", type->name); + +/* Try again based on use counts. */ +total = 0; +for (ref = type->list; ref != NULL; ref = ref->next) + total += ref->val->useCount; +if (total > 0) + { + int threshold = rand() % total; + int binStart = 0; + for (ref = type->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + int size = monomer->useCount; + int binEnd = binStart + size; + if (threshold < binEnd) + return monomer; + binStart = binEnd; + } + } + +errAbort("Fell off end in pickRandomFromType on %s", type->name); +return NULL; +} + +struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) +/* Predict next word given tree and recently used word list. If tree doesn't + * have statistics for what comes next given the words in list, then it returns + * NULL. */ +{ +if (verboseLevel() >= 4) + verbose(4, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list)); +struct dlNode *node; +for (node = list; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + wt = wordTreeFindInList(wt->children, monomer); + if (verboseLevel() >= 4) + verbose(4, " wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt)); + if (wt == NULL || wt->children == NULL) + break; + } +struct wordTree *result = NULL; +if (wt != NULL && wt->children != NULL) + result = pickWeightedRandomFromList(wt->children); +return result; +} + +struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent) +/* Predict next word given tree and recently used word list. Will use all words in + * recent list if can, but if there is not data in tree, will back off, and use + * progressively less previous words. */ +{ +struct dlNode *node; +if (verboseLevel() >= 4) + verbose(4, " predictNextFromWordTree(%s)\n", wordTreeString(wt)); + +for (node = recent; !dlEnd(node); node = node->next) + { + struct wordTree *result = predictNextFromAllPredecessors(wt, node); + if (result != NULL) + return result; + } +return NULL; +} + +struct dlNode *nodesFromTail(struct dlList *list, int count) +/* Return count nodes from tail of list. If list is shorter than count, it returns the + * whole list. */ +{ +int i; +struct dlNode *node; +for (i=0, node = list->tail; i<count; ++i) + { + if (dlStart(node)) + return list->head; + node = node->prev; + } +return node; +} + +struct monomerType *typeAfter(struct alphaStore *store, struct monomerType *oldType, + int amount) +/* Skip forward amount in typeList from oldType. If get to end of list start + * again at the beginning */ +{ +int i; +struct monomerType *type = oldType; +for (i=0; i<amount; ++i) + { + type = type->next; + if (type == NULL) + type = store->typeList; + } +return type; +} + +struct monomerType *typeBefore(struct alphaStore *store, struct monomerType *oldType, int amount) +/* Pick type that comes amount before given type. */ +{ +int typeIx = ptArrayIx(oldType, store->typeArray, store->typeCount); +typeIx -= amount; +while (typeIx < 0) + typeIx += store->typeCount; +return store->typeArray[typeIx]; +} + +struct wordTree *predictFromPreviousTypes(struct alphaStore *store, struct dlList *past) +/* Predict next based on general pattern of monomer order. */ +{ +/* This routine is now a bit more complex than necessary but it still works. It + * these days only needs to look back 1 because of the read-based type fill-in. */ +int maxToLookBack = 3; + { /* Debugging block */ + struct dlNode *node = nodesFromTail(past, maxToLookBack); + verbose(4, "predictFromPreviousTypes("); + for (; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + verbose(4, "%s, ", monomer->word); + } + verbose(4, ")\n"); + } +int pastDepth; +struct dlNode *node; +for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth) + { + if (dlStart(node)) + { + verbose(3, "predictFromPreviousTypes with empty past\n"); + return NULL; + } + struct monomer *monomer = node->val; + struct monomerType *prevType = hashFindVal(store->typeHash, monomer->word); + if (prevType != NULL) + { + struct monomerType *curType = typeAfter(store, prevType, pastDepth); + struct monomer *curInfo = pickRandomFromType(curType); + struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo); + verbose(4, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", + pastDepth, curInfo->word, curTree, curTree->monomer->word); + return curTree; + } + node = node->prev; + } +verbose(3, "predictFromPreviousTypes past all unknown types\n"); +return NULL; +} + + +struct wordTree *predictNext(struct alphaStore *store, struct dlList *past) +/* Given input data store and what is known from the past, predict the next word. */ +{ +struct dlNode *recent = nodesFromTail(past, store->maxChainSize); +struct wordTree *pick = predictFromWordTree(store->markovChains, recent); +if (pick == NULL) + pick = predictFromPreviousTypes(store, past); +if (pick == NULL) + { + pick = pickWeightedRandomFromList(store->markovChains->children); + verbose(3, "in predictNext() last resort pick of %s\n", pick->monomer->word); + } +return pick; +} + +void decrementOutputCountsInTree(struct wordTree *wt) +/* Decrement output count of self and parents. */ +{ +while (wt != NULL) + { + /* Decrement target count, but don't let it fall below sum of counts of all children. + * This can happen with incomplete data if we don't prevent it. This + * same code also prevents us from having negative outTarget. */ + int outTarget = wt->outTarget - 1; + int kidSum = wordTreeSumOutTargets(wt->children); + if (outTarget < kidSum) + outTarget = kidSum; + wt->outTarget = outTarget; + + /* Always bump outCount to help us debug (only real use of outCount). */ + wt->outCount += 1; + wt = wt->parent; + } +} + +static struct dlList *wordTreeGenerateList(struct alphaStore *store, int maxSize, + struct wordTree *firstWord, int maxOutputWords) +/* Make a list of words based on probabilities in tree. */ +{ +struct dlList *ll = dlListNew(); +int chainSize = 0; +int outputWords = 0; +struct dlNode *chainStartNode = NULL; + +for (;;) + { + if (++outputWords > maxOutputWords) + break; + struct dlNode *node; + struct wordTree *picked; + + /* Get next predicted word. */ + AllocVar(node); + if (chainSize == 0) + { + chainStartNode = node; + ++chainSize; + picked = firstWord; + } + else if (chainSize >= maxSize) + { + chainStartNode = chainStartNode->next; + picked = predictNext(store, ll); + } + else + { + picked = predictNext(store, ll); + ++chainSize; + } + + if (picked == NULL) + break; + + /* Add word from whatever level we fetched back to our output chain. */ + struct monomer *monomer = picked->monomer; + node->val = monomer; + dlAddTail(ll, node); + + decrementOutputCountsInTree(picked); + monomer->outTarget -= 1; + monomer->outCount += 1; + } +verbose(3, "totUseZeroCount = %d\n", totUseZeroCount); +return ll; +} + + +struct slRef *listUnusedMonomers(struct alphaStore *store, struct dlList *ll) +/* Return list of references to monomers that are in store but not on ll. */ +{ +struct slRef *refList = NULL, *ref; +struct hash *usedHash = hashNew(0); +struct dlNode *node; +/* Make hash of used monomers. */ +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + hashAdd(usedHash, monomer->word, monomer); + } + +/* Stream through all monomers looking for ones not used and adding to list. */ +struct monomer *monomer; +for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) + { + if (isEmpty(monomer->word)) + continue; + if (!hashLookup(usedHash, monomer->word)) + { + AllocVar(ref); + ref->val = monomer; + slAddHead(&refList, ref); + } + } +hashFree(&usedHash); +slReverse(&refList); +return refList; +} + + +int monomerRefIx(struct monomerRef *list, struct monomer *monomer) +/* Return index of monomer in list. */ +{ +int i; +struct monomerRef *ref; +for (i=0, ref = list; ref != NULL; ref = ref->next, ++i) + { + if (ref->val == monomer) + return i; + } +errAbort("Monomer %s not on list\n", monomer->word); +return -1; +} + +struct monomerRef *findNeighborhoodFromReads(struct monomer *center) +/* Find if possible one monomer to either side of center */ +{ +struct slRef *readRef; +struct monomerRef *before = NULL, *after = NULL; + +/* Loop through reads hoping to find a case where center is flanked by two monomers in + * same read. As a fallback, keep track of a monomer before and a monomer after in + * any read. */ +for (readRef = center->readList; readRef != NULL; readRef = readRef->next) + { + struct alphaRead *read = readRef->val; + int readSize = slCount(read->list); + int centerIx = monomerRefIx(read->list, center); + if (readSize >= 3 && centerIx > 0 && centerIx < readSize-1) + { + before = slElementFromIx(read->list, centerIx-1); + after = slElementFromIx(read->list, centerIx+1); + break; + } + else if (readSize >= 2) + { + if (centerIx == 0) + after = slElementFromIx(read->list, centerIx+1); + else + before = slElementFromIx(read->list, centerIx-1); + } + } + +/* Make up list from end to start. */ +struct monomerRef *retList = NULL, *monoRef; +if (after) + { + AllocVar(monoRef); + monoRef->val = after->val; + slAddHead(&retList, monoRef); + } +AllocVar(monoRef); +monoRef->val = center; +slAddHead(&retList, monoRef); +if (before) + { + AllocVar(monoRef); + monoRef->val = before->val; + slAddHead(&retList, monoRef); + } +return retList; +} + +struct dlNode *matchExceptCenter(struct dlNode *node, struct monomerRef *neighborhood, struct monomer *center) +/* Return center node if node matches neighborhood except for center. */ +{ +struct dlNode *centerNode = NULL; +struct monomerRef *neighbor; +struct dlNode *n; +for (n = node, neighbor=neighborhood; ; n = n->next, neighbor = neighbor->next) + { + if (neighbor == NULL) + break; + if (dlEnd(n)) + return NULL; + if (neighbor->val == center) + centerNode = n; + else if (n->val != neighbor->val) + return NULL; + } +return centerNode; +} + +void printMonomerRefList(struct monomerRef *refList, FILE *f) +/* Print out a line to file with the list of monomers. */ +{ +struct monomerRef *ref; +for (ref = refList; ref != NULL; ref = ref->next) + fprintf(f, "%s ", ref->val->word); +fprintf(f, "\n"); +} + +struct slRef *refsToPossibleCenters(struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) +/* Return a list of dlNodes where neighborhood, but not center matches. */ +{ +struct slRef *list = NULL; +struct dlNode *node; +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct dlNode *centerNode = matchExceptCenter(node, neighborhood, center); + if (centerNode != NULL) + refAdd(&list, centerNode); + } +return list; +} + +void mostCommonMonomerWord(struct slRef *refList, char **retWord, int *retCount) +/* Given refs to dlNodes containing monomers, find word associated with most common monomer. */ +{ +/* Make up a hash that contains counts of all monomers. */ +struct hash *countHash = hashNew(0); +struct slRef *ref; +for (ref = refList; ref != NULL; ref = ref->next) + { + struct dlNode *node = ref->val; + struct monomer *monomer = node->val; + hashIncInt(countHash, monomer->word); + } + +/* Iterate through hash finding max value. */ +char *maxWord = NULL; +int maxCount = 0; +struct hashEl *hel; +struct hashCookie cookie = hashFirst(countHash); +while ((hel = hashNext(&cookie)) != NULL) + { + int count = ptToInt(hel->val); + if (count > maxCount) + { + maxCount = count; + maxWord = hel->name; + } + } +*retWord = maxWord; +*retCount = maxCount; +hashFree(&countHash); +} + +boolean subCommonCenter(struct alphaStore *store, + struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) +/* Scan list for places where have all items in neighborhood (except for center) matching. + * Substitute in center at one of these places chosen at random and return TRUE if possible. */ +{ +struct slRef *centerRefList = refsToPossibleCenters(center, neighborhood, ll); +verbose(4, "sub %s in neighborhood: ", center->word); +if (verboseLevel() >= 4) + printMonomerRefList(neighborhood, stderr); +verbose(4, "Got %d possible centers\n", slCount(centerRefList)); + +if (centerRefList == NULL) + return FALSE; +int commonCount = 0; +char *commonWord = NULL; +mostCommonMonomerWord(centerRefList, &commonWord, &commonCount); +struct monomer *commonMonomer = hashFindVal(store->monomerHash, commonWord); +verbose(4, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount); +if (commonMonomer->subbedOutCount < 2) + { + verbose(3, "Want to substitute %s for %s, but %s only occurs %d time.\n", + center->word, commonWord, commonWord, commonMonomer->subbedOutCount); + return FALSE; + } + +/* Select a random one of the most commonly occuring possible centers. */ +int targetIx = rand() % commonCount; +struct slRef *ref; +int currentIx = 0; +for (ref = centerRefList; ref != NULL; ref = ref->next) + { + struct dlNode *node = ref->val; + struct monomer *monomer = node->val; + if (sameString(monomer->word, commonWord)) + { + if (currentIx == targetIx) + { + verbose(3, "Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood)); + struct monomer *oldCenter = node->val; + if (oldCenter->type != center->type) + verbose(3, "Type mismatch subbig %s vs %s\n", oldCenter->word, center->word); + node->val = center; + oldCenter->subbedOutCount -= 1; + center->subbedOutCount += 1; + return TRUE; + } + ++currentIx; + } + } +internalErr(); // Should not get here. +return FALSE; +} + +boolean subCenterInNeighborhood(struct alphaStore *store, + struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) +/* Scan ll for cases where neighborhood around center matches. Replace one of these + * cases with center. */ +{ +int size = slCount(neighborhood); +assert(size == 3 || size == 2); +if (subCommonCenter(store, center, neighborhood, ll)) + return TRUE; +if (size == 3) + { + if (subCommonCenter(store, center, neighborhood->next, ll)) + return TRUE; + struct monomerRef *third = neighborhood->next->next; + neighborhood->next->next = NULL; + boolean ok = subCommonCenter(store, center, neighborhood, ll); + neighborhood->next->next = third; + return ok; + } +else + return FALSE; +} + +struct monomer *mostCommonInType(struct monomerType *type) +/* Return most common monomer of given type */ +{ +struct monomerRef *ref; +int commonCount = 0; +struct monomer *common = NULL; +for (ref = type->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + if (monomer->subbedOutCount > commonCount) + { + commonCount = monomer->subbedOutCount; + common = monomer; + } + } +return common; +} + +boolean subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, + struct dlList *ll) +/* Substitute unused for first occurence of most common monomer of same type. */ +{ +struct monomer *common = mostCommonInType(unused->type); +if (common == NULL) + { + verbose(3, "No monomers of type %s used at all!\n", unused->type->name); + return FALSE; + } +if (common->subbedOutCount < 2) + { + verbose(3, "Trying to sub in %s, but there's no monomers of type %s that are used more than once.\n", + unused->word, unused->type->name); + return FALSE; + } +struct dlNode *node; +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + if (monomer == common) + { + verbose(3, "Subbing %s for %s of type %s\n", unused->word, monomer->word, unused->type->name); + node->val = unused; + unused->subbedOutCount += 1; + monomer->subbedOutCount -= 1; + break; + } + } +return TRUE; +} + +void setInitialSubbedOutCount(struct alphaStore *store, struct dlList *ll) +/* Set subbedOutCount based on how many times monomers occur in list. */ +{ +struct dlNode *node; +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + monomer->subbedOutCount += 1; + } +#ifdef PARANOID +/* As a check see if subbedOutCount agrees with outCount. */ +int mismatchCount = 0; +struct monomer *monomer; +for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) + { + uglyf("%s %d %d\n", monomer->word, monomer->outCount, monomer->subbedOutCount); + if (monomer->outCount != monomer->subbedOutCount) + ++mismatchCount; + } +uglyf("mismatch count = %d\n", mismatchCount); +#endif /* PARANOID */ +} + +void subInMissing(struct alphaStore *store, struct dlList *ll) +/* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */ +{ +setInitialSubbedOutCount(store, ll); +struct slRef *unusedList = listUnusedMonomers(store, ll); +verbose(3, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList)); +struct slRef *unusedRef; +for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next) + { + struct monomer *unused = unusedRef->val; + struct monomerRef *neighborhood = findNeighborhoodFromReads(unused); + if (!subCenterInNeighborhood(store, unused, neighborhood, ll)) + { + verbose(3, "Couldn't substitute in %s with context, falling back to type logic\n", + unused->word); + subIntoFirstMostCommonOfType(store, unused, ll); + } + slFreeList(&neighborhood); + } +} + +static void writeMonomerList(char *fileName, struct dlList *ll) +/* Write out monomer list to file. */ +{ +FILE *f = mustOpen(fileName, "w"); +struct dlNode *node; +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + fprintf(f, "%s\n", monomer->word); + } +carefulClose(&f); +} + +static struct dlList *wordTreeMakeList(struct alphaStore *store, int maxSize, + struct wordTree *firstWord, int maxOutputWords) +/* Create word list base on tree probabilities, and then substituting in unused + * monomers if possible. */ +{ +struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords); +verbose(2, "Generated initial output list\n"); +char *unsubbedFile = optionVal("unsubbed", NULL); +if (unsubbedFile) + writeMonomerList(unsubbedFile, ll); +subInMissing(store, ll); +verbose(2, "Substituted in missing rare monomers\n"); +return ll; +} + + +struct alphaStore *alphaStoreNew(int maxChainSize) +/* Allocate and initialize a new word store. */ +{ +struct alphaStore *store; +AllocVar(store); +store->maxChainSize = maxChainSize; +store->monomerHash = hashNew(0); +store->typeHash = hashNew(0); +store->readHash = hashNew(0); +return store; +} + +struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word) +/* Add word to store, incrementing it's useCount if it's already there, otherwise + * making up a new record for it. */ +{ +struct monomer *monomer = hashFindVal(store->monomerHash, word); +if (monomer == NULL) + { + AllocVar(monomer); + hashAddSaveName(store->monomerHash, word, monomer, &monomer->word); + slAddHead(&store->monomerList, monomer); + } +monomer->useCount += 1; +monomer->outTarget = monomer->useCount; /* Set default value, may be renormalized. */ +return monomer; +} + +void connectReadsToMonomers(struct alphaStore *store) +/* Add each read monomer appears in to monomer's readList. */ +{ +struct alphaRead *read; +for (read = store->readList; read != NULL; read = read->next) + { + struct monomerRef *ref; + for (ref = read->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + refAdd(&monomer->readList, read); + } + } +} + +void alphaReadListFromFile(char *fileName, struct alphaStore *store) +/* Read in file and turn it into a list of alphaReads. */ +{ +/* Stuff for processing file a line at a time. */ +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *line, *word; + +/* Loop through each line of file making up a read for it. */ +struct alphaRead *readList = NULL; +while (lineFileNextReal(lf, &line)) + { + /* Parse out first word of line into name, and rest into list. */ + char *name = nextWord(&line); + struct monomerRef *list = NULL; + while ((word = nextWord(&line)) != NULL) + { + struct monomer *monomer = alphaStoreAddMonomer(store, word); + struct monomerRef *ref; + AllocVar(ref); + ref->val = monomer; + slAddHead(&list, ref); + } + slReverse(&list); + if (list == NULL) + errAbort("Line %d of %s has no alpha monomers.", lf->lineIx, lf->fileName); + + /* Create data structure and add read to list and hash */ + if (hashLookup(store->readHash, name)) + errAbort("Read ID %s is repeated line %d of %s", name, lf->lineIx, lf->fileName); + struct alphaRead *read; + AllocVar(read); + read->list = list; + hashAddSaveName(store->readHash, name, read, &read->name); + slAddHead(&readList, read); + } +slReverse(&readList); +store->readList = readList; +if (store->readList == NULL) + errAbort("%s contains no reads", lf->fileName); +lineFileClose(&lf); +connectReadsToMonomers(store); +} + +struct alphaOrphan +/* Information about an orphan - something without great joining information. */ + { + struct alphaOrphan *next; /* Next in list */ + struct monomer *monomer; /* Pointer to orphan */ + boolean paired; /* True if has been paired with somebody. */ + }; + +struct alphaOrphan *findOrphanStarts(struct alphaRead *readList, struct alphaStore *store) +/* Return list of monomers that are found only at start of reads. */ +{ +struct alphaOrphan *orphanList = NULL; + +/* Build up hash of things seen later in reads. */ +struct alphaRead *read; +struct hash *later = hashNew(0); /* Hash of monomers found later. */ +for (read = readList; read != NULL; read = read->next) + { + struct monomerRef *ref; + for (ref = read->list->next; ref != NULL; ref = ref->next) + { + hashAdd(later, ref->val->word, NULL); + } + } + +/* Pass through again collecting orphans. */ +struct hash *orphanHash = hashNew(0); +for (read = readList; read != NULL; read = read->next) + { + struct monomer *monomer = read->list->val; + char *word = monomer->word; + if (!hashLookup(later, word)) + { + struct alphaOrphan *orphan = hashFindVal(orphanHash, word); + if (orphan == NULL) + { + AllocVar(orphan); + orphan->monomer = monomer; + slAddHead(&orphanList, orphan); + hashAdd(orphanHash, word, orphan); + verbose(3, "Adding orphan start %s\n", word); + } + } + } +hashFree(&later); +hashFree(&orphanHash); +slReverse(&orphanList); +return orphanList; +} + +struct alphaOrphan *findOrphanEnds(struct alphaRead *readList, struct alphaStore *store) +/* Return list of monomers that are found only at end of reads. */ +{ +struct alphaOrphan *orphanList = NULL; + +/* Build up hash of things seen sooner in reads. */ +struct alphaRead *read; +struct hash *sooner = hashNew(0); /* Hash of monomers found sooner. */ +for (read = readList; read != NULL; read = read->next) + { + struct monomerRef *ref; + /* Loop over all but last. */ + for (ref = read->list; ref->next != NULL; ref = ref->next) + { + hashAdd(sooner, ref->val->word, NULL); + } + } + +/* Pass through again collecting orphans. */ +struct hash *orphanHash = hashNew(0); +for (read = readList; read != NULL; read = read->next) + { + struct monomerRef *lastRef = slLastEl(read->list); + struct monomer *monomer = lastRef->val; + if (!hashLookup(sooner, monomer->word)) + { + struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word); + if (orphan == NULL) + { + AllocVar(orphan); + orphan->monomer = monomer; + slAddHead(&orphanList, orphan); + hashAdd(orphanHash, monomer->word, orphan); + verbose(3, "Adding orphan end %s\n", monomer->word); + } + } + } +hashFree(&sooner); +slReverse(&orphanList); +hashFree(&orphanHash); +return orphanList; +} + +struct alphaRead *addReadOfTwo(struct alphaStore *store, struct monomer *a, struct monomer *b) +/* Make up a fake read that goes from a to b and add it to store. */ +{ +/* Create fake name and start fake read with it. */ +static int fakeId = 0; +char name[32]; +safef(name, sizeof(name), "fake%d", ++fakeId); + +/* Allocate fake read. */ +struct alphaRead *read; +AllocVar(read); + +/* Add a and b to the read. */ +struct monomerRef *aRef, *bRef; +AllocVar(aRef); +aRef->val = a; +AllocVar(bRef); +bRef->val = b; +aRef->next = bRef; +read->list = aRef; + +/* Add read to the store. */ +slAddHead(&store->readList, read); +hashAddSaveName(store->readHash, name, read, &read->name); + +/* Add read to the monomers. */ +refAdd(&a->readList, read); +refAdd(&b->readList, read); + +a->useCount += 1; +b->useCount += 1; +return read; +} + +void integrateOrphans(struct alphaStore *store) +/* Make up fake reads that integrate orphans (monomers only found at beginning or end + * of a read) better. */ +{ +struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store); +struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store); +verbose(3, "orphanStarts has %d items, orphanEnds %d\n", + slCount(orphanStarts), slCount(orphanEnds)); + +/* First look for the excellent situation where you can pair up orphans with each + * other. For Y at least this happens half the time. */ +struct alphaOrphan *start, *end; +for (start = orphanStarts; start != NULL; start = start->next) + { + struct monomerType *startType = start->monomer->type; + if (startType == NULL) + continue; + for (end = orphanEnds; end != NULL && !start->paired; end = end->next) + { + struct monomerType *endType = end->monomer->type; + if (endType == NULL) + continue; + if (!end->paired) + { + struct monomerType *nextType = typeAfter(store, endType, 1); + if (nextType == startType) + { + end->paired = TRUE; + start->paired = TRUE; + addReadOfTwo(store, end->monomer, start->monomer); + verbose(3, "Pairing %s with %s\n", end->monomer->word, start->monomer->word); + } + } + } + } + +/* Ok, now have to just manufacture other side of orphan starts out of thin air. */ +for (start = orphanStarts; start != NULL; start = start->next) + { + if (start->paired) + continue; + struct monomer *startMono = start->monomer; + struct monomerType *startType = startMono->type; + if (startType == NULL) + continue; + + struct monomerType *newType = typeBefore(store, startType, 1); + verbose(3, "Trying to find end of type %s\n", newType->name); + struct monomer *newMono = pickRandomFromType(newType); + addReadOfTwo(store, newMono, startMono); + verbose(3, "Pairing new %s with start %s\n", newMono->word, startMono->word); + } + +/* Ok, now have to just manufacture other side of orphan ends out of thin air. */ +for (end = orphanEnds; end != NULL; end = end->next) + { + if (end->paired) + continue; + struct monomer *endMono = end->monomer; + struct monomerType *endType = endMono->type; + if (endType == NULL) + continue; + + struct monomerType *newType = typeAfter(store, endType, 1); + verbose(3, "Trying to find start of type %s\n", newType->name); + struct monomer *newMono = pickRandomFromType(newType); + addReadOfTwo(store, endMono, newMono); + verbose(3, "Pairing end %s with new %s\n", endMono->word, newMono->word); + } +} + +struct wordTree *makeMarkovChains(struct alphaStore *store) +/* Return wordTree containing all words, and also all chains-of-words of length + * chainSize seen in file. */ +{ +/* We'll build up the tree starting with an empty root node. */ +struct wordTree *wt = wordTreeNew(alphaStoreAddMonomer(store, "")); +int chainSize = store->maxChainSize; + +/* Loop through each read. There's special cases at the beginning and end of read, and for + * short reads. In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize + * words, * popping off one word from the start, and adding one word to the end for each + * new word we encounter. This list is added to the tree each iteration. */ +struct alphaRead *read; +for (read = store->readList; read != NULL; read = read->next) + { + /* We'll keep a chain of three or so words in a doubly linked list. */ + struct dlNode *node; + struct dlList *chain = dlListNew(); + int curSize = 0; + int wordCount = 0; + + struct monomerRef *ref; + for (ref = read->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + /* For the first few words in the file after ID, we'll just build up the chain, + * only adding it to the tree when we finally do get to the desired + * chain size. Once past the initial section of the file we'll be + * getting rid of the first link in the chain as well as adding a new + * last link in the chain with each new word we see. */ + if (curSize < chainSize) + { + dlAddValTail(chain, monomer); + ++curSize; + if (curSize == chainSize) + addChainToTree(wt, chain); + } + else + { + /* Reuse doubly-linked-list node, but give it a new value, as we move + * it from head to tail of list. */ + node = dlPopHead(chain); + node->val = monomer; + dlAddTail(chain, node); + addChainToTree(wt, chain); + } + ++wordCount; + } + /* Handle last few words in line, where can't make a chain of full size. Also handles + * lines that have fewer than chain size words. */ + if (curSize < chainSize) + addChainToTree(wt, chain); + while ((node = dlPopHead(chain)) != NULL) + { + if (!dlEmpty(chain)) + addChainToTree(wt, chain); + freeMem(node); + } + dlListFree(&chain); + } +wordTreeSort(wt); // Make output of chain file prettier +return wt; +} + +void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName) +/* Write out tree to file */ +{ +FILE *f = mustOpen(fileName, "w"); +fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n"); +wordTreeDump(0, wt, maxChainSize, f); +carefulClose(&f); +} + +void alphaStoreLoadMonomerOrder(struct alphaStore *store, char *readsFile, char *fileName) +/* Read in a file with one line for each monomer type, where first word is type name, + * and rest of words are all the actually variants of that type. + * Requires all variants already be in store. The readsFile is passed + * just for nicer error reporting. */ +{ +/* Stuff for processing file a line at a time. */ +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *line, *word; +struct hash *uniq = hashNew(0); + +/* Set up variables we'll put results in in store. */ +store->typeList = NULL; + +while (lineFileNextReal(lf, &line)) + { + char *name = nextWord(&line); + if (hashLookup(uniq, name) != NULL) + errAbort("Type name '%s' repeated line %d of %s\n", name, lf->lineIx, lf->fileName); + struct monomerType *type; + AllocVar(type); + type->name = cloneString(name); + slAddHead(&store->typeList, type); + while ((word = nextWord(&line)) != NULL) + { + struct monomer *monomer = hashFindVal(store->monomerHash, word); + if (monomer == NULL) + errAbort("%s is in %s but not %s", word, lf->fileName, readsFile); + struct monomerRef *ref; + AllocVar(ref); + ref->val = monomer; + slAddHead(&type->list, ref); + hashAddUnique(store->typeHash, word, type); + } + if (type->list == NULL) + errAbort("Short line %d of %s. Format should be:\ntype list-of-monomers-of-type\n", + lf->lineIx, lf->fileName); + } +slReverse(&store->typeList); +if (store->typeList == NULL) + errAbort("%s is empty", lf->fileName); +lineFileClose(&lf); +hashFree(&uniq); +verbose(3, "Added %d types containing %d words from %s\n", + slCount(store->typeList), store->typeHash->elCount, fileName); + +/* Create type array */ +store->typeCount = slCount(store->typeList); +struct monomerType **types = AllocArray(store->typeArray, store->typeCount); +struct monomerType *type; +int i; +for (i=0, type = store->typeList; i<store->typeCount; ++i, type = type->next) + types[i] = type; +} + +void crossCheckMonomerOrderAndReads(struct alphaStore *store, char *orderedPrefix, char *readFile, char *typeFile) +/* Make sure all monomer that begin with ordered prefix are present in monomer order file. */ +{ +/* Make hash of all monomers that have type info. */ +struct hash *orderHash = hashNew(0); +struct monomerType *type; +for (type = store->typeList; type != NULL; type = type->next) + { + struct monomerRef *ref; + for (ref = type->list; ref != NULL; ref = ref->next) + hashAdd(orderHash, ref->val->word, ref->val); + } + +/* Go through all monomers and make sure ones with correct prefix are in list. */ +struct monomer *mon; +for (mon = store->monomerList; mon != NULL; mon = mon->next) + { + char *word = mon->word; + if (startsWith(orderedPrefix, word) && !hashLookup(orderHash, word)) + { + errAbort("%s is in %s but not %s", word, readFile, typeFile); + } + } +hashFree(&orderHash); +} + + +void monomerListNormalise(struct monomer *list, int totalCount, int outputSize) +/* Set outTarget field in all of list to be normalized to outputSize */ +{ +struct monomer *monomer; +double scale = (double)outputSize/totalCount; +for (monomer = list; monomer != NULL; monomer = monomer->next) + { + monomer->outCount = 0; + monomer->outTarget = round(scale * monomer->useCount); + } +} + +void alphaStoreNormalize(struct alphaStore *store, int outputSize) +/* Set up output counts on each word to make sure it gets it's share of output size */ +{ +struct wordTree *wt = store->markovChains; +wordTreeNormalize(wt, outputSize, 1.0); +monomerListNormalise(store->monomerList, wt->useCount, outputSize); +} + +struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store) +/* Look through read list trying to find best supported type for this monomer. */ +{ +/* The algorithm here could be better. It picks the closest nearby monomer that is typed + * to fill in from. When have information from both before and after the program arbitrarily + * uses before preferentially. When have information that is equally close from multiple reads, + * it just uses info from first read. Fortunately the data, at least on Y, shows a lot of + * consistency, so it's probably not actually worth a more sophisticated algorithm that would + * instead take the most popular choice rather than the first one. */ +verbose(3, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList)); +struct monomerType *bestType = NULL; +int bestPos = 0; +boolean bestIsAfter = FALSE; +struct slRef *readRef; +for (readRef = monomer->readList; readRef != NULL; readRef = readRef->next) + { + struct alphaRead *read = readRef->val; + struct monomerRef *ref; + { + verbose(3, " read %s (%d els):", read->name, slCount(read->list)); + for (ref = read->list; ref != NULL; ref = ref->next) + { + struct monomer *m = ref->val; + verbose(3, " %s", m->word); + } + verbose(3, "\n"); + } + struct monomerType *beforeType = NULL; + int beforePos = 0; + + /* Look before. */ + for (ref = read->list; ref != NULL; ref = ref->next) + { + struct monomer *m = ref->val; + if (m == monomer) + break; + if (m->type != NULL) + { + beforeType = m->type; + beforePos = 0; + } + ++beforePos; + } + + /* Look after. */ + struct monomerType *afterType = NULL; + int afterPos = 0; + assert(ref != NULL && ref->val == monomer); + for (ref = ref->next; ref != NULL; ref = ref->next) + { + struct monomer *m = ref->val; + ++afterPos; + if (m->type != NULL) + { + afterType = m->type; + break; + } + } + + if (beforeType != NULL) + { + if (bestType == NULL || beforePos < bestPos) + { + bestType = beforeType; + bestPos = beforePos; + bestIsAfter = FALSE; + } + } + + if (afterType != NULL) + { + if (bestType == NULL || afterPos < bestPos) + { + bestType = afterType; + bestPos = afterPos; + bestIsAfter = TRUE; + } + } + } + +/* Now have found a type that is at a known position relative to ourselves. From this + * infer our own type. */ +struct monomerType *chosenType = NULL; +if (bestType != NULL) + { + if (bestIsAfter) + chosenType = typeBefore(store, bestType, bestPos); + else + chosenType = typeAfter(store, bestType, bestPos); + } +verbose(3, "chosenType for %s is %s\n", monomer->word, + (bestType == NULL ? "(null)" : chosenType->name)); +return chosenType; +} + +void fillInTypes(struct alphaStore *store) +/* We have types for most but not all monomers. Try and fill out types for most of + * the rest by looking for position in reads they are in that do contain known types */ +{ +/* Do first pass on ones defined - just have to look them up in hash. */ +struct monomer *monomer; +for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) + monomer->type = hashFindVal(store->typeHash, monomer->word); + +/* Do second pass on ones that are not yet defined */ +for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) + { + if (monomer->type == NULL) + { + monomer->type = fillInTypeFromReads(monomer, store); + } + } +} + +struct dlList *alphaAsmOneSize(struct alphaStore *store, int outSize) +/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have + * been parsed into various repeat monomer variants. Cycles of these variants tend to + * form higher order repeats. Returns list of outSize monomers. */ +{ +struct wordTree *wt = store->markovChains; +alphaStoreNormalize(store, outSize); +verbose(2, "Normalized Markov Chains\n"); + +if (optionExists("chain")) + { + char *fileName = optionVal("chain", NULL); + wordTreeWrite(wt, store->maxChainSize, fileName); + } + +struct dlList *ll = wordTreeMakeList(store, store->maxChainSize, + pickWeightedRandomFromList(wt->children), outSize); + +if (optionExists("afterChain")) + { + char *fileName = optionVal("afterChain", NULL); + wordTreeWrite(wt, store->maxChainSize, fileName); + } + +return ll; +} + +int countMissingMonomers(struct dlList *ll, struct monomer *monomerList) +/* Set the outCounts in monomer list to match how often they occur in ll. + * Return number of monomers that do not occur at all in ll. */ +{ +/* Zero out output counts. */ +struct monomer *monomer; +for (monomer = monomerList; monomer != NULL; monomer = monomer->next) + monomer->outCount = 0; + +/* Increase output count each time a monomer is used. */ +struct dlNode *node; +for (node = ll->head; !dlEnd(node); node = node->next) + { + monomer = node->val; + monomer->outCount += 1; + } + +/* Count up unused. */ +int missing = 0; +for (monomer = monomerList; monomer != NULL; monomer = monomer->next) + if (monomer->outCount == 0 && !sameString(monomer->word, "")) + missing += 1; + +return missing; +} + +void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) +/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have + * been parsed into various repeat monomer variants. Cycles of these variants tend to + * form higher order repeats. */ +{ +/* This routine reads in the input, and then calls a routine that produces the + * output for a given size. If not all monomers are included in the output, then it + * will try to find the minimum output size needed to include all monomers. */ + +/* Read in inputs, and put in "store" */ +struct alphaStore *store = alphaStoreNew(maxChainSize); +alphaReadListFromFile(readsFile, store); +alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); +verbose(2, "Loaded input reads and monomer order\n"); + +if (betweens) + { + store->markovChainsNoOrphans = makeMarkovChains(store); + verbose(2, "Made initial markov chains\n"); + } + +/* Do some cross checking and filling out a bit for missing data. */ +crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile); +fillInTypes(store); +integrateOrphans(store); +verbose(2, "Filled in missing types and integrated orphan ends\n"); + +/* Make the main markov chains out of the reads. */ +store->markovChains = makeMarkovChains(store); +verbose(2, "Made Markov Chains\n"); + +/* Loop gradually increasing size of output we make until get to maxOutSize or until + * we generate output that includes all input monomers. */ +struct dlList *ll; +int outSize = initialOutSize; +while (outSize <= maxOutSize) + { + ll = alphaAsmOneSize(store, outSize); + assert(outSize == dlCount(ll)); + int missing = countMissingMonomers(ll, store->monomerList); + if (missing <= maxToMiss) + break; + if (outSize == maxOutSize) + { + errAbort("Could not include all monomers. Missing %d.\n" + "consider increasing maxOutSize (now %d) or increasing maxToMiss (now %d)", + missing, maxOutSize, maxToMiss); + break; + } + dlListFree(&ll); + outSize *= 1.1; /* Grow by 10% */ + if (outSize > maxOutSize) + outSize = maxOutSize; + verbose(1, "%d missing. Trying again with outSize=%d (initialOutSize %d, maxOutSize %d)\n", + missing, outSize, initialOutSize, maxOutSize); + } +writeMonomerListAndBetweens(store, outFile, ll); +verbose(2, "Wrote primary output\n"); +dlListFree(&ll); +} + + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +maxChainSize = optionInt("size", maxChainSize); +initialOutSize = optionInt("initialOutSize", initialOutSize); +maxOutSize = optionInt("maxOutSize", initialOutSize); // Defaults to same as initialOutSize +if (maxOutSize < initialOutSize) + errAbort("maxOutSize (%d) needs to be bigger than initialOutSize (%d)\n", + maxOutSize, initialOutSize); +maxToMiss = optionInt("maxToMiss", maxToMiss); +fullOnly = optionExists("fullOnly"); +pseudoCount = optionInt("pseudoCount", pseudoCount); +betweens = optionExists("betweens"); +int seed = optionInt("seed", 0); +srand(seed); +alphaAsm(argv[1], argv[2], argv[3]); +return 0; +}