9b3a97eac6e90e14eb908d1f790d25d57c8d4479 kent Tue Aug 28 17:18:35 2012 -0700
Loading in monomer order file.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index af954a7..45bc5c0 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -45,36 +45,52 @@
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct wordInfo
 /* Basic information on a word including how many times it is seen in input and output
  * streams. Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct wordInfo *next; /* Next in list of all words. */
     char *word; /* The word itself. Not allocated here. */
     int useCount; /* Number of times used. */
     int outTarget; /* Number of times want to output word. */
     int outCount; /* Number of times have output word so far. */
     };
 
+struct wordInfoRef
+/* A reference to a word. */
+    {
+    struct wordInfoRef *next; /* Next in list */
+    struct wordInfo *val; /* The word referred to. */
+    };
+
+struct wordType
+/* A collection of words of the same type - or monomers of same type. */
+    {
+    struct wordType *next; /* Next wordType */
+    struct wordInfoRef *list; /* List of all words of that type */
+    };
+
 struct wordStore
 /* Stores info on all words */
     {
-    struct wordInfo *list; /* List of words, fairly arbitrary order. */
-    struct hash *hash; /* Hash of wordInfo, keyed by word. */
+    struct wordInfo *infoList; /* List of words, fairly arbitrary order. */
+    struct hash *infoHash; /* Hash of wordInfo, keyed by word. */
     struct wordTree *markovChains; /* Tree of words that follow other words. */
+    struct wordType *typeList; /* List of all types. */
+    struct hash *typeHash; /* Hash with wordType values, keyed by all words. */
     };
 
 /* The wordTree structure below is the central data structure for this program. It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable. At this chain size the
  * text
  *     this is the black dog and the black cat
  * would have the chains
  *     this is the
  *     is the black
  *     the black dog
  *     black dog and
  *     dog and the
  *     and the black
@@ -487,43 +503,44 @@
 
 static void wordTreeSort(struct wordTree *wt)
 /* Sort all children lists in tree. */
 {
 slSort(&wt->children, wordTreeCmpWord);
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     wordTreeSort(child);
 }
 
 struct wordStore *wordStoreNew()
 /* Allocate and initialize a new word store. */
 {
 struct wordStore *store;
 AllocVar(store);
-store->hash = hashNew(0);
+store->infoHash = hashNew(0);
 return store;
 }
 
 struct wordInfo *wordStoreAdd(struct wordStore *store, char *word)
 /* Add word to store, incrementing it's useCount if it's already there, otherwise
  * making up a new record for it. */
 {
-struct wordInfo *info = hashFindVal(store->hash, word);
+struct wordInfo *info = hashFindVal(store->infoHash, word);
 if (info == NULL)
     {
     AllocVar(info);
-    hashAddSaveName(store->hash, word, info, &info->word);
+    hashAddSaveName(store->infoHash, word, info, &info->word);
+    slAddHead(&store->infoList, info);
     }
 info->useCount += 1;
 return info;
 }
 
 struct wordStore *wordStoreForChainsInFile(char *fileName, int chainSize)
 /* Return a wordStore containing all words, and also all chains-of-words of length
  * chainSize seen in file. */
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
 
 /* We'll build up the tree starting with an empty root node. */
 struct wordStore *store = wordStoreNew();
@@ -586,35 +603,71 @@
 lineFileClose(&lf);
 
 wordTreeSort(wt); // Make output of chain file prettier
 return store;
 }
 
 void wordTreeWrite(struct wordTree *wt, char *fileName)
 /* Write out tree to file */
 {
 FILE *f = mustOpen(fileName, "w");
 fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n");
 wordTreeDump(0, wt, f);
 carefulClose(&f);
 }
 
+void wordStoreLoadMonomerOrder(struct wordStore *store, char *readsFile, char *fileName)
+/* Read in a file with one line for each monomer type, containing a word for each
+ * monomer variant. Requires all variants already be in store. The readsFile is passed
+ * just for nicer error reporting. */
+{
+/* Stuff for processing file a line at a time. */
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *line, *word;
+
+/* Set up variables we'll put results in in store. */
+store->typeHash = hashNew(0);
+store->typeList = NULL;
+
+while (lineFileNextReal(lf, &line))
+    {
+    struct wordType *type;
+    AllocVar(type);
+    slAddHead(&store->typeList, type);
+    while ((word = nextWord(&line)) != NULL)
+        {
+        struct wordInfo *info = hashFindVal(store->infoHash, word);
+        if (info == NULL)
+            errAbort("%s is in %s but not %s", word, lf->fileName, readsFile);
+        struct wordInfoRef *ref;
+        AllocVar(ref);
+        ref->val = info;
+        slAddHead(&type->list, ref);
+        hashAddUnique(store->typeHash, word, type);
+        }
+    }
+lineFileClose(&lf);
+verbose(2, "Added %d types containing %d words from %s\n",
+    slCount(store->typeList), store->typeHash->elCount, fileName);
+}
+
 void alphaChain(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct wordStore *store = wordStoreForChainsInFile(readsFile, maxChainSize);
 struct wordTree *wt = store->markovChains;
+wordStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
 wordTreeNormalize(wt, outSize, 1.0);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, fileName);
     }
 
 wordTreeGenerateFaux(store, maxChainSize, pickRandom(wt->children), outSize, outFile);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, fileName);
     }