29abab691ecf705da7c642ccfedd3fd2e26b74b1 kent Tue Sep 11 13:35:18 2012 -0700 Starting to do orphan beginning/end processing. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c index 7a39680..ca818d4 100644 --- src/kehayden/alphaAsm/alphaAsm.c +++ src/kehayden/alphaAsm/alphaAsm.c @@ -45,55 +45,55 @@ {"afterChain", OPTION_STRING}, {"fullOnly", OPTION_BOOLEAN}, {"outSize", OPTION_INT}, {"seed", OPTION_INT}, {NULL, 0}, }; /* Some structures to keep track of words (which correspond to alpha satellight monomers) * seen in input. */ struct monomer /* Basic information on a monomer including how many times it is seen in input and output * streams. Unlike the wordTree, this is flat, and does not include predecessors. */ { struct monomer *next; /* Next in list of all words. */ - char *word; /* The word itself. Not allocated here. */ + char *word; /* The word used to represent monomer. Not allocated here. */ int useCount; /* Number of times used. */ int outTarget; /* Number of times want to output word. */ int outCount; /* Number of times have output word so far. */ }; struct monomerRef /* A reference to a word. */ { struct monomerRef *next; /* Next in list */ struct monomer *val; /* The word referred to. */ }; struct monomerType /* A collection of words of the same type - or monomers of same type. */ { struct monomerType *next; /* Next monomerType */ struct monomerRef *list; /* List of all words of that type */ }; struct alphaRead /* A read that's been parsed into alpha variants. */ { struct alphaRead *next; /* Next in list */ - char *name; /* Id of read */ + char *name; /* Id of read, not allocated here. */ struct monomerRef *list; /* List of alpha subunits */ }; struct alphaStore /* Stores info on all words */ { struct monomer *monomerList; /* List of words, fairly arbitrary order. */ struct hash *monomerHash; /* Hash of monomer, keyed by word. */ struct alphaRead *readList; /* List of all reads. */ struct hash *readHash; /* Hash of all reads keyed by read ID. */ struct wordTree *markovChains; /* Tree of words that follow other words. */ int maxChainSize; /* Maximum depth of tree. */ struct monomerType *typeList; /* List of all types. */ struct hash *typeHash; /* Hash with monomerType values, keyed by all words. */ }; @@ -709,30 +709,133 @@ /* Create data structure and add read to list and hash */ if (hashLookup(store->readHash, name)) errAbort("Read ID %s is repeated line %d of %s", name, lf->lineIx, lf->fileName); struct alphaRead *read; AllocVar(read); read->list = list; hashAddSaveName(store->readHash, name, read, &read->name); slAddHead(&readList, read); } slReverse(&readList); store->readList = readList; lineFileClose(&lf); } +struct alphaOrphan +/* Information about an orphan - something without great joining information. */ + { + struct alphaOrphan *next; + struct monomer *monomer; + struct monomerType *type; + }; + +struct alphaOrphan *findOrphanStarts(struct alphaRead *readList) +/* Return list of monomers that are found only at start of reads. */ +{ +struct alphaOrphan *orphanList = NULL; + +/* Build up hash of things seen later in reads. */ +struct alphaRead *read; +struct hash *later = hashNew(0); /* Hash of monomers found later. */ +for (read = readList; read != NULL; read = read->next) + { + struct monomerRef *ref; + for (ref = read->list->next; ref != NULL; ref = ref->next) + { + hashAdd(later, ref->val->word, NULL); + } + } + +/* Pass through again collecting orphans. */ +struct hash *orphanHash = hashNew(0); +for (read = readList; read != NULL; read = read->next) + { + struct monomer *monomer = read->list->val; + if (!hashLookup(later, monomer->word)) + { + struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word); + if (orphan == NULL) + { + AllocVar(orphan); + orphan->monomer = monomer; + slAddHead(&orphanList, orphan); + hashAdd(orphanHash, monomer->word, orphan); + uglyf("Adding orphan start %s\n", monomer->word); + } + else uglyf("Repeating orphan start %s\n", monomer->word); + } + } +hashFree(&later); +hashFree(&orphanHash); +slReverse(&orphanList); +return orphanList; +} + +struct alphaOrphan *findOrphanEnds(struct alphaRead *readList) +/* Return list of monomers that are found only at end of reads. */ +{ +struct alphaOrphan *orphanList = NULL; + +/* Build up hash of things seen sooner in reads. */ +struct alphaRead *read; +struct hash *sooner = hashNew(0); /* Hash of monomers found sooner. */ +for (read = readList; read != NULL; read = read->next) + { + struct monomerRef *ref; + /* Loop over all but last. */ + for (ref = read->list; ref->next != NULL; ref = ref->next) + { + hashAdd(sooner, ref->val->word, NULL); + } + } + +/* Pass through again collecting orphans. */ +struct hash *orphanHash = hashNew(0); +for (read = readList; read != NULL; read = read->next) + { + struct monomerRef *lastRef = slLastEl(read->list); + struct monomer *monomer = lastRef->val; + if (!hashLookup(sooner, monomer->word)) + { + struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word); + if (orphan == NULL) + { + AllocVar(orphan); + orphan->monomer = monomer; + slAddHead(&orphanList, orphan); + hashAdd(orphanHash, monomer->word, orphan); + uglyf("Adding orphan end %s\n", monomer->word); + } + else uglyf("Repeating orphan end %s\n", monomer->word); + } + } +hashFree(&sooner); +slReverse(&orphanList); +hashFree(&orphanHash); +return orphanList; +} + +void matchUpOrphans(struct alphaStore *store) +/* Make up fake reads that integrate orphans (monomers only found at beginning or end + * of a read) better. */ +{ +struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList); +struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList); +uglyf("orphanStarts has %d items, orphanEnds %d\n", slCount(orphanStarts), slCount(orphanEnds)); +} + void makeMarkovChains(struct alphaStore *store) /* Return a alphaStore containing all words, and also all chains-of-words of length * chainSize seen in file. */ { /* We'll build up the tree starting with an empty root node. */ struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, "")); int chainSize = store->maxChainSize; /* Loop through each read. There's special cases at the beginning and end of read, and for * short reads. In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize * words, * popping off one word from the start, and adding one word to the end for each * new word we encounter. This list is added to the tree each iteration. */ struct alphaRead *read; for (read = store->readList; read != NULL; read = read->next) { @@ -839,33 +942,33 @@ void alphaStoreNormalize(struct alphaStore *store, int outputSize) /* Set up output counts on each word to make sure it gets it's share of output size */ { struct wordTree *wt = store->markovChains; wordTreeNormalize(wt, outputSize, 1.0); monomerListNormalise(store->monomerList, wt->useCount, outputSize); } void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) /* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */ { struct alphaStore *store = alphaStoreNew(maxChainSize); alphaReadListFromFile(readsFile, store); makeMarkovChains(store); -// struct alphaStore *store = alphaStoreForChainsInFile(readsFile, maxChainSize); struct wordTree *wt = store->markovChains; alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); +matchUpOrphans(store); alphaStoreNormalize(store, outSize); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); } wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile); if (optionExists("afterChain")) { char *fileName = optionVal("afterChain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); }