542ffb6988062175e53503be8d4fd817be89e388
kent
  Wed Jan 30 17:14:50 2013 -0800
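Slightly improving the way unused monomers get inserted back into the output.  A per-monomer subbedOutCount is now tracked so a monomer is only displaced while it still occurs at least twice.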
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 449b51b..f761e1d 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -61,30 +61,31 @@
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
     struct monomerType *type;	/* The type of the monomer. */
     struct slRef *readList;	/* List of references to reads this is in. */
+    int subbedOutCount;		/* Output count, kept current as monomers are substituted in or out. */
     };
 
 struct monomerRef
 /* A reference to a monomer. */
     {
     struct monomerRef *next;	/* Next in list */
     struct monomer *val;	/* The word referred to. */
     };
 
 struct monomerType
 /* A collection of words of the same type - or monomers of same type. */
     {
     struct monomerType *next;   /* Next monomerType */
     char *name;			/* Short name of type */
     struct monomerRef *list;	    /* List of all words of that type */
@@ -843,55 +844,60 @@
 /* Scan list for places where have all items in neighborhood (except for center) matching. 
  * Substitute in center at one of these places chosen at random and return TRUE if possible. */
 {
 struct slRef *centerRefList = refsToPossibleCenters(center, neighborhood, ll);
 verbose(3, "sub %s in neighborhood: ", center->word);
 if (verboseLevel() >= 3)
     printMonomerRefList(neighborhood, stderr);
 verbose(3, "Got %d possible centers\n", slCount(centerRefList));
 
 if (centerRefList == NULL)
     return FALSE;
 int commonCount = 0;
 char *commonWord = NULL;
 mostCommonMonomerWord(centerRefList, &commonWord, &commonCount);
 struct monomer *commonMonomer = hashFindVal(store->monomerHash, commonWord);
-verbose(3, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->outCount);
-if (commonMonomer->outCount < 2)
+verbose(3, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount);
+uglyf("Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount);
+if (commonMonomer->subbedOutCount < 2)
     {
     verbose(2, "Want to substitute %s for %s, but %s only occurs %d time.\n", 
-	center->word, commonWord, commonWord, commonMonomer->outCount);
+	center->word, commonWord, commonWord, commonMonomer->subbedOutCount);
+    uglyf("Want to substitute %s for %s, but %s only occurs %d time.\n", 
+	center->word, commonWord, commonWord, commonMonomer->subbedOutCount);
     return FALSE;
     }
 
 /* Select a random one of the most commonly occuring possible centers. */
 int targetIx = rand() % commonCount;
 struct slRef *ref;
 int currentIx = 0;
 for (ref = centerRefList; ref != NULL; ref = ref->next)
     {
     struct dlNode *node = ref->val;
     struct monomer *monomer = node->val;
     if (sameString(monomer->word, commonWord))
          {
 	 if (currentIx == targetIx)
 	     {
 	     verbose(2, "Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood));
 	     struct monomer *oldCenter = node->val;
 	     if (oldCenter->type != center->type)
 		 verbose(2, "Type mismatch subbig %s vs %s\n", oldCenter->word, center->word);
 	     node->val = center;
+	     oldCenter->subbedOutCount -= 1;
+	     center->subbedOutCount += 1;
 	     return TRUE;
 	     }
 	 ++currentIx;
 	 }
     }
 internalErr();	// Should not get here.
 return FALSE;	
 }
 
 boolean subCenterInNeighborhood(struct alphaStore *store, 
     struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
 /* Scan ll for cases where neighborhood around center matches.  Replace one of these 
  * cases with center. */
 {
 int size = slCount(neighborhood);
@@ -909,70 +915,105 @@
     return ok;
     }
 else
     return FALSE;
 }
 
 struct monomer *mostCommonInType(struct monomerType *type)
 /* Return most common monomer of given type */
 {
 struct monomerRef *ref;
 int commonCount = 0;
 struct monomer *common = NULL;
 for (ref = type->list; ref != NULL; ref = ref->next)
     {
     struct monomer *monomer = ref->val;
-    if (monomer->outCount > commonCount)
+    if (monomer->subbedOutCount > commonCount)	/* Strict >: ties keep the earlier monomer; result stays NULL if no count is positive. */
         {
-	commonCount = monomer->outCount;
+	commonCount = monomer->subbedOutCount;
 	common = monomer;
 	}
     }
 return common;
 }
 
-void subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, 
+boolean subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, 
     struct dlList *ll)
 /* Substitute unused for first occurence of most common monomer of same type. */
 {
 struct monomer *common = mostCommonInType(unused->type);
+if (common == NULL || common->subbedOutCount < 2)  /* common is NULL when no monomer of this type has a positive count. */
+    {
+    uglyf("Trying to sub in %s, but there's no monomers of type %s that are used more than once.\n", unused->word, unused->type->name);
+    return FALSE;
+    }
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     if (monomer == common)
         {
 	verbose(2, "Subbing %s for %s of type %s\n", unused->word, monomer->word, 
 	    unused->type->name);
+	uglyf("Subbing %s for %s (used %d times) of type %s\n", unused->word, monomer->word, monomer->subbedOutCount, unused->type->name);
 	node->val = unused;
+	unused->subbedOutCount += 1;	/* Keep counts in step with the list after the swap. */
+	monomer->subbedOutCount -= 1;
 	break;
 	}
     }
+return TRUE;	/* NOTE(review): also reached if common is never found in ll. */
+}
+
+void setInitialSubbedOutCount(struct alphaStore *store, struct dlList *ll)
+/* Add one to subbedOutCount per occurrence in ll.  Counts are not zeroed here, so this assumes they start at 0. */
+{
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    monomer->subbedOutCount += 1;
+    }
+#ifdef PARANOID
+/* As a check, print both counts for each monomer and tally how often subbedOutCount and outCount disagree. */
+int mismatchCount = 0;
+struct monomer *monomer;
+for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
+    {
+    uglyf("%s %d %d\n", monomer->word, monomer->outCount, monomer->subbedOutCount);
+    if (monomer->outCount != monomer->subbedOutCount)
+        ++mismatchCount;
+    }
+uglyf("mismatch count = %d\n", mismatchCount);
+#endif /* PARANOID */
 }
 
 void subInMissing(struct alphaStore *store, struct dlList *ll)
 /* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */
 {
+setInitialSubbedOutCount(store, ll);	/* Tally occurrences in ll before any substitution is made. */
 struct slRef *unusedList = listUnusedMonomers(store, ll);
 verbose(2, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList));
+uglyf("%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList));
 struct slRef *unusedRef;
 for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next)
     {
     struct monomer *unused = unusedRef->val;
     struct monomerRef *neighborhood = findNeighborhoodFromReads(unused);
     if (!subCenterInNeighborhood(store, unused, neighborhood, ll))
 	{
+	uglyf("Couldn't substitute in %s with context, falling back to type logic\n", unused->word);
 	verbose(2, "Couldn't substitute in %s with context, falling back to type logic\n", 
 	    unused->word);
         subIntoFirstMostCommonOfType(store, unused, ll);
 	}
     slFreeList(&neighborhood);
     }
 }
 
 static void writeMonomerList(char *fileName, struct dlList *ll)
 /* Write out monomer list to file. */
 {
 FILE *f = mustOpen(fileName, "w");
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {