src/kehayden/alphaAsm/alphaAsm.c 11d2f86002bf9faac8d8e6acd839050196859366

11d2f86002bf9faac8d8e6acd839050196859366
kent
  Mon Dec 17 17:59:58 2012 -0800
Adding fallback logic to place unused monomers by type if they can't be placed by context.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 03b7673..d02b7a3 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -780,32 +780,30 @@
 return centerNode;
 }
 
 void printMonomerRefList(struct monomerRef *refList, FILE *f)
 /* Print out a line to file with the list of monomers. */
 {
 struct monomerRef *ref;
 for (ref = refList; ref != NULL; ref = ref->next)
     fprintf(f, "%s ", ref->val->word);
 fprintf(f, "\n");
 }
 
 struct slRef *refsToPossibleCenters(struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
 /* Return a list of dlNodes where neighborhood, but not center matches. */
 {
-uglyf("refsToPossibleCenters ll(%d) neighborhood: ", dlCount(ll));
-printMonomerRefList(neighborhood, uglyOut);
 struct slRef *list = NULL;
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {
     struct dlNode *centerNode = matchExceptCenter(node, neighborhood, center);
     if (centerNode != NULL)
 	refAdd(&list, centerNode);
     }
 return list;
 }
 
 void mostCommonMonomerWord(struct slRef *refList, char **retWord, int *retCount)
 /* Given refs to dlNodes containing monomers, find word associated with most common monomer. */
 {
 /* Make up a hash that contains counts of all monomers. */
@@ -831,105 +829,145 @@
 	maxCount = count;
 	maxWord = hel->name;
 	}
     }
 *retWord = maxWord;
 *retCount = maxCount;
 hashFree(&countHash);
 }
 
 boolean subCommonCenter(struct alphaStore *store,
     struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
 /* Scan list for places where have all items in neighborhood (except for center) matching. 
  * Substitute in center at one of these places chosen at random and return TRUE if possible. */
 {
 struct slRef *centerRefList = refsToPossibleCenters(center, neighborhood, ll);
-uglyf("sub %s in neighborhood: ", center->word);
-printMonomerRefList(neighborhood, uglyOut);
-uglyf("Got %d possible centers\n", slCount(centerRefList));
+verbose(3, "sub %s in neighborhood: ", center->word);
+if (verboseLevel() >= 3)
+    printMonomerRefList(neighborhood, stderr);
+verbose(3, "Got %d possible centers\n", slCount(centerRefList));
 
 if (centerRefList == NULL)
     return FALSE;
 int commonCount = 0;
 char *commonWord = NULL;
 mostCommonMonomerWord(centerRefList, &commonWord, &commonCount);
 struct monomer *commonMonomer = hashFindVal(store->monomerHash, commonWord);
-uglyf("Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->outCount);
+verbose(3, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->outCount);
 if (commonMonomer->outCount < 2)
     {
-    uglyf("Want to substitute %s for %s, but %s only occurs %d time.\n", center->word, commonWord, commonWord, commonMonomer->outCount);
+    verbose(2, "Want to substitute %s for %s, but %s only occurs %d time.\n", 
+	center->word, commonWord, commonWord, commonMonomer->outCount);
     return FALSE;
     }
 
 /* Select a random one of the most commonly occuring possible centers. */
 int targetIx = rand() % commonCount;
 struct slRef *ref;
 int currentIx = 0;
 for (ref = centerRefList; ref != NULL; ref = ref->next)
     {
     struct dlNode *node = ref->val;
     struct monomer *monomer = node->val;
     if (sameString(monomer->word, commonWord))
          {
 	 if (currentIx == targetIx)
 	     {
-	     uglyf("Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood));
+	     verbose(2, "Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood));
 	     struct monomer *oldCenter = node->val;
 	     if (oldCenter->type != center->type)
-	         {
-		 uglyAbort("Type mismatch subbig %s vs %s\n", oldCenter->word, center->word);
-		 }
+		 verbose(2, "Type mismatch subbig %s vs %s\n", oldCenter->word, center->word);
 	     node->val = center;
 	     return TRUE;
 	     }
 	 ++currentIx;
 	 }
     }
-assert(FALSE);
+internalErr();	// Should not get here.
 return FALSE;	
 }
 
-void subCenterInNeighborhood(struct alphaStore *store, 
+boolean subCenterInNeighborhood(struct alphaStore *store, 
     struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
-/* Scan ll for cases where neighborhood around center matches.  Replace one of these cases with center. */
+/* Scan ll for cases where neighborhood around center matches.  Replace one of these 
+ * cases with center.  Return TRUE if a substitution was made, FALSE otherwise. */
 {
 assert(slCount(neighborhood) == 3);	// Simplifies things and is true for now.
 if (subCommonCenter(store, center, neighborhood, ll))
-   return;
+   return TRUE;
 if (subCommonCenter(store, center, neighborhood->next, ll))
-   return;
+   return TRUE;
 struct monomerRef *third = neighborhood->next->next;
 neighborhood->next->next = NULL;
 boolean ok = subCommonCenter(store, center, neighborhood, ll);
 neighborhood->next->next = third;
-if (!ok)
+return ok;
+}
+
+struct monomer *mostCommonInType(struct monomerType *type)
+/* Return most common monomer of given type */
+{
+struct monomerRef *ref;
+int commonCount = 0;
+struct monomer *common = NULL;
+for (ref = type->list; ref != NULL; ref = ref->next)
+    {
+    struct monomer *monomer = ref->val;
+    if (monomer->outCount > commonCount)
+        {
+	commonCount = monomer->outCount;
+	common = monomer;
+	}
+    }
+return common;
+}
+
+void subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, 
+    struct dlList *ll)
+/* Substitute unused for first occurrence of most common monomer of same type. */
+{
+struct monomer *common = mostCommonInType(unused->type);
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
     {
-    warn("Couldn't substitute in %s", center->word);
+    struct monomer *monomer = node->val;
+    if (monomer == common)
+        {
+	verbose(2, "Subbing %s for %s of type %s\n", unused->word, monomer->word, 
+	    unused->type->name);
+	node->val = unused;
+	break;
+	}
     }
 }
 
 void subInMissing(struct alphaStore *store, struct dlList *ll)
 /* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */
 {
 struct slRef *unusedList = listUnusedMonomers(store, ll);
 verbose(2, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList));
 struct slRef *unusedRef;
 for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next)
     {
     struct monomer *unused = unusedRef->val;
     struct monomerRef *neighborhood = findNeighborhoodFromReads(unused);
-    subCenterInNeighborhood(store, unused, neighborhood, ll);
+    if (!subCenterInNeighborhood(store, unused, neighborhood, ll))
+	{
+	verbose(2, "Couldn't substitute in %s with context, falling back to type logic", 
+	    unused->word);
+        subIntoFirstMostCommonOfType(store, unused, ll);
+	}
     slFreeList(&neighborhood);
     }
 }
 
 static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, 
 	int maxOutputWords, char *fileName)
 /* Create file containing words base on tree probabilities.  The wordTreeGenerateList does
  * most of work. */
 {
 struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
 subInMissing(store, ll);
 FILE *f = mustOpen(fileName, "w");
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {