a1d692a299235963e9864f15c8225d18fadb7a42 kent Fri Dec 14 15:34:08 2012 -0800 Starting work on a second pass of processing to put in monomers that did not get omitted by main pass. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c index f67c6a7..1a4ec29 100644 --- src/kehayden/alphaAsm/alphaAsm.c +++ src/kehayden/alphaAsm/alphaAsm.c @@ -651,36 +651,165 @@ /* Add word from whatever level we fetched back to our output chain. */ struct monomer *monomer = picked->monomer; node->val = monomer; dlAddTail(ll, node); decrementOutputCountsInTree(picked); monomer->outTarget -= 1; monomer->outCount += 1; } verbose(2, "totUseZeroCount = %d\n", totUseZeroCount); return ll; } + +struct slRef *listUnusedMonomers(struct alphaStore *store, struct dlList *ll) +/* Return list of references to monomers that are in store but not on ll. */ +{ +struct slRef *refList = NULL, *ref; +struct hash *usedHash = hashNew(0); +struct dlNode *node; +/* Make hash of used monomers. */ +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + hashAdd(usedHash, monomer->word, monomer); + } + +/* Stream through all monomers looking for ones not used and adding to list. */ +struct monomer *monomer; +for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) + { + if (isEmpty(monomer->word)) + continue; + if (!hashLookup(usedHash, monomer->word)) + { + AllocVar(ref); + ref->val = monomer; + slAddHead(&refList, ref); + } + } +hashFree(&usedHash); +slReverse(&refList); +return refList; +} + + +int monomerRefIx(struct monomerRef *list, struct monomer *monomer) +/* Return index of monomer in list. */ +{ +int i; +struct monomerRef *ref; +for (i=0, ref = list; ref != NULL; ref = ref->next, ++i) + { + if (ref->val == monomer) + return i; + } +errAbort("Monomer %s not on list\n", monomer->word); +return -1; // ugly +} + +struct monomerRef *findNeighborhoodFromReads(struct monomer *center) +/* Find if possible one monomer to either side of center */ +{ +struct slRef *readRef; +struct monomerRef *before = NULL, *after = NULL; + +/* Loop through reads hoping to find a case where center is flanked by two monomers in + * same read. As a fallback, keep track of a monomer before and a monomer after in + * any read. */ +for (readRef = center->readList; readRef != NULL; readRef = readRef->next) + { + struct alphaRead *read = readRef->val; + int readSize = slCount(read->list); + int centerIx = monomerRefIx(read->list, center); + if (readSize >= 3 && centerIx > 0 && centerIx < readSize-1) + { + uglyf("Whoopie found central for %s\n", center->word); + before = slElementFromIx(read->list, centerIx-1); + after = slElementFromIx(read->list, centerIx+1); + break; + } + else if (readSize >= 2) + { + if (centerIx == 0) + { + after = slElementFromIx(read->list, centerIx+1); + uglyf("read %s, centerIx %d, after %p\n", center->word, centerIx, after); + } + else + { + before = slElementFromIx(read->list, centerIx-1); + uglyf("read %s, centerIx %d, before %p\n", center->word, centerIx, before); + } + } + } +uglyf("before %p, center %p, after %p\n", before, center, after); +/* Make up list. */ +struct monomerRef *retList = NULL, *monoRef; +if (after) + { + AllocVar(monoRef); + monoRef->val = after->val; + slAddHead(&retList, monoRef); + } +AllocVar(monoRef); +monoRef->val = center; +slAddHead(&retList, monoRef); +if (before) + { + AllocVar(monoRef); + monoRef->val = before->val; + slAddHead(&retList, monoRef); + } +uglyf("%d in retList\n", slCount(retList)); +return retList; +} + +void subInMissing(struct alphaStore *store, struct dlList *ll) +/* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */ +{ +struct slRef *unusedList = listUnusedMonomers(store, ll); +uglyf("%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList)); +struct slRef *unusedRef; +for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next) + { + struct monomer *unused = unusedRef->val; + struct monomerRef *neighborhood = findNeighborhoodFromReads(unused); + uglyf("%s(%d):", unused->word, slCount(neighborhood)); + { + struct monomerRef *monomerRef; + for (monomerRef = neighborhood; monomerRef != NULL; monomerRef = monomerRef->next) + { + struct monomer *monomer = monomerRef->val; + uglyf(" %s", monomer->word); + } + uglyf("\n"); + } + slFreeList(&neighborhood); + } +} + static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, int maxOutputWords, char *fileName) /* Create file containing words base on tree probabilities. The wordTreeGenerateList does * most of work. */ { struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords); +subInMissing(store, ll); FILE *f = mustOpen(fileName, "w"); struct dlNode *node; for (node = ll->head; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; fprintf(f, "%s\n", monomer->word); } carefulClose(&f); dlListFree(&ll); } struct alphaStore *alphaStoreNew(int maxChainSize) /* Allocate and initialize a new word store. */ { @@ -875,31 +1004,31 @@ /* Add a and b to the read. */ struct monomerRef *aRef, *bRef; AllocVar(aRef); aRef->val = a; AllocVar(bRef); bRef->val = b; aRef->next = bRef; read->list = aRef; /* Add read to the store. */ slAddHead(&store->readList, read); hashAddSaveName(store->readHash, name, read, &read->name); /* Add read to the monomers. */ refAdd(&a->readList, read); -refAdd(&a->readList, read); +refAdd(&b->readList, read); a->useCount += 1; b->useCount += 1; return read; } void integrateOrphans(struct alphaStore *store) /* Make up fake reads that integrate orphans (monomers only found at beginning or end * of a read) better. */ { struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store); struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store); verbose(2, "orphanStarts has %d items, orphanEnds %d\n", slCount(orphanStarts), slCount(orphanEnds));