542ffb6988062175e53503be8d4fd817be89e388 kent Wed Jan 30 17:14:50 2013 -0800 Improving way unused monomers get inserted back in output a little. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c index 449b51b..f761e1d 100644 --- src/kehayden/alphaAsm/alphaAsm.c +++ src/kehayden/alphaAsm/alphaAsm.c @@ -61,30 +61,31 @@ /* Some structures to keep track of words (which correspond to alpha satellight monomers) * seen in input. */ struct monomer /* Basic information on a monomer including how many times it is seen in input and output * streams. Unlike the wordTree, this is flat, and does not include predecessors. */ { struct monomer *next; /* Next in list of all words. */ char *word; /* The word used to represent monomer. Not allocated here. */ int useCount; /* Number of times used. */ int outTarget; /* Number of times want to output word. */ int outCount; /* Number of times have output word so far. */ struct monomerType *type; /* The type of the monomer. */ struct slRef *readList; /* List of references to reads this is in. */ + int subbedOutCount; /* Output count after substitution. */ }; struct monomerRef /* A reference to a monomer. */ { struct monomerRef *next; /* Next in list */ struct monomer *val; /* The word referred to. */ }; struct monomerType /* A collection of words of the same type - or monomers of same type. */ { struct monomerType *next; /* Next monomerType */ char *name; /* Short name of type */ struct monomerRef *list; /* List of all words of that type */ @@ -843,55 +844,60 @@ /* Scan list for places where have all items in neighborhood (except for center) matching. * Substitute in center at one of these places chosen at random and return TRUE if possible. */ { struct slRef *centerRefList = refsToPossibleCenters(center, neighborhood, ll); verbose(3, "sub %s in neighborhood: ", center->word); if (verboseLevel() >= 3) printMonomerRefList(neighborhood, stderr); verbose(3, "Got %d possible centers\n", slCount(centerRefList)); if (centerRefList == NULL) return FALSE; int commonCount = 0; char *commonWord = NULL; mostCommonMonomerWord(centerRefList, &commonWord, &commonCount); struct monomer *commonMonomer = hashFindVal(store->monomerHash, commonWord); -verbose(3, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->outCount); -if (commonMonomer->outCount < 2) +verbose(3, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount); +uglyf("Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount); +if (commonMonomer->subbedOutCount < 2) { verbose(2, "Want to substitute %s for %s, but %s only occurs %d time.\n", - center->word, commonWord, commonWord, commonMonomer->outCount); + center->word, commonWord, commonWord, commonMonomer->subbedOutCount); + uglyf("Want to substitute %s for %s, but %s only occurs %d time.\n", + center->word, commonWord, commonWord, commonMonomer->subbedOutCount); return FALSE; } /* Select a random one of the most commonly occuring possible centers. */ int targetIx = rand() % commonCount; struct slRef *ref; int currentIx = 0; for (ref = centerRefList; ref != NULL; ref = ref->next) { struct dlNode *node = ref->val; struct monomer *monomer = node->val; if (sameString(monomer->word, commonWord)) { if (currentIx == targetIx) { verbose(2, "Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood)); struct monomer *oldCenter = node->val; if (oldCenter->type != center->type) verbose(2, "Type mismatch subbig %s vs %s\n", oldCenter->word, center->word); node->val = center; + oldCenter->subbedOutCount -= 1; + center->subbedOutCount += 1; return TRUE; } ++currentIx; } } internalErr(); // Should not get here. return FALSE; } boolean subCenterInNeighborhood(struct alphaStore *store, struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) /* Scan ll for cases where neighborhood around center matches. Replace one of these * cases with center. */ { int size = slCount(neighborhood); @@ -909,70 +915,105 @@ return ok; } else return FALSE; } struct monomer *mostCommonInType(struct monomerType *type) /* Return most common monomer of given type */ { struct monomerRef *ref; int commonCount = 0; struct monomer *common = NULL; for (ref = type->list; ref != NULL; ref = ref->next) { struct monomer *monomer = ref->val; - if (monomer->outCount > commonCount) + if (monomer->subbedOutCount > commonCount) { - commonCount = monomer->outCount; + commonCount = monomer->subbedOutCount; common = monomer; } } return common; } -void subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, +boolean subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, struct dlList *ll) /* Substitute unused for first occurence of most common monomer of same type. */ { struct monomer *common = mostCommonInType(unused->type); +if (common->subbedOutCount < 2) + { + uglyf("Trying to sub in %s, but there's no monomers of type %s that are used more than once.\n", unused->word, unused->type->name); + return FALSE; + } struct dlNode *node; for (node = ll->head; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; if (monomer == common) { verbose(2, "Subbing %s for %s of type %s\n", unused->word, monomer->word, unused->type->name); + uglyf("Subbing %s for %s (used %d times) of type %s\n", unused->word, monomer->word, monomer->subbedOutCount, unused->type->name); node->val = unused; + unused->subbedOutCount += 1; + monomer->subbedOutCount -= 1; break; } } +return TRUE; +} + +void setInitialSubbedOutCount(struct alphaStore *store, struct dlList *ll) +/* Set subbedOutCount based on how many times monomers occur in list. */ +{ +struct dlNode *node; +for (node = ll->head; !dlEnd(node); node = node->next) + { + struct monomer *monomer = node->val; + monomer->subbedOutCount += 1; + } +#ifdef PARANOID +/* As a check see if subbedOutCount agrees with outCount. */ +int mismatchCount = 0; +struct monomer *monomer; +for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) + { + uglyf("%s %d %d\n", monomer->word, monomer->outCount, monomer->subbedOutCount); + if (monomer->outCount != monomer->subbedOutCount) + ++mismatchCount; + } +uglyf("mismatch count = %d\n", mismatchCount); +#endif /* PARANOID */ } void subInMissing(struct alphaStore *store, struct dlList *ll) /* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */ { +setInitialSubbedOutCount(store, ll); struct slRef *unusedList = listUnusedMonomers(store, ll); verbose(2, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList)); +uglyf("%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList)); struct slRef *unusedRef; for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next) { struct monomer *unused = unusedRef->val; struct monomerRef *neighborhood = findNeighborhoodFromReads(unused); if (!subCenterInNeighborhood(store, unused, neighborhood, ll)) { + uglyf("Couldn't substitute in %s with context, falling back to type logic\n", unused->word); verbose(2, "Couldn't substitute in %s with context, falling back to type logic\n", unused->word); subIntoFirstMostCommonOfType(store, unused, ll); } slFreeList(&neighborhood); } } static void writeMonomerList(char *fileName, struct dlList *ll) /* Write out monomer list to file. */ { FILE *f = mustOpen(fileName, "w"); struct dlNode *node; for (node = ll->head; !dlEnd(node); node = node->next) {