src/kehayden/alphaAsm/alphaAsm.c fe4a41bc1db9c0b4152303169ab700ef410a3929

fe4a41bc1db9c0b4152303169ab700ef410a3929
kent
  Fri Dec 14 17:59:03 2012 -0800
Fixing end condition in matchExceptCenter.  Rearranging debugging code.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 3d44798..03b7673 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -1,25 +1,26 @@
 /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
+#include "obscure.h"
 
 /* Global vars - all of which can be set by command line options. */
 int pseudoCount = 1;
 int maxChainSize = 3;
 int outSize = 10000;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs based on Sanger style reads.\n"
   "usage:\n"
   "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
@@ -747,50 +748,188 @@
     monoRef->val = after->val;
     slAddHead(&retList, monoRef);
     }
 AllocVar(monoRef);
 monoRef->val = center;
 slAddHead(&retList, monoRef);
 if (before)
     {
     AllocVar(monoRef);
     monoRef->val = before->val;
     slAddHead(&retList, monoRef);
     }
 return retList;
 }
 
+struct dlNode *matchExceptCenter(struct dlNode *node, struct monomerRef *neighborhood, struct monomer *center)
+/* Return center node if node matches neighborhood except for center. */
+{
+struct dlNode *centerNode = NULL;
+struct monomerRef *neighbor;
+struct dlNode *n;
+for (n = node, neighbor=neighborhood; ; n = n->next, neighbor = neighbor->next)
+    {
+    if (neighbor == NULL)
+        break;
+    if (dlEnd(n))
+        return NULL;
+    if (neighbor->val == center)
+        centerNode = n;
+    else if (n->val != neighbor->val)
+        return NULL;
+    }
+return centerNode;
+}
+
+void printMonomerRefList(struct monomerRef *refList, FILE *f)
+/* Print out a line to file with the list of monomers. */
+{
+struct monomerRef *ref;
+for (ref = refList; ref != NULL; ref = ref->next)
+    fprintf(f, "%s ", ref->val->word);
+fprintf(f, "\n");
+}
+
+struct slRef *refsToPossibleCenters(struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
+/* Return a list of dlNodes where neighborhood, but not center matches. */
+{
+uglyf("refsToPossibleCenters ll(%d) neighborhood: ", dlCount(ll));
+printMonomerRefList(neighborhood, uglyOut);
+struct slRef *list = NULL;
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct dlNode *centerNode = matchExceptCenter(node, neighborhood, center);
+    if (centerNode != NULL)
+	refAdd(&list, centerNode);
+    }
+return list;
+}
+
+void mostCommonMonomerWord(struct slRef *refList, char **retWord, int *retCount)
+/* Given refs to dlNodes containing monomers, find word associated with most common monomer. */
+{
+/* Make up a hash that contains counts of all monomers. */
+struct hash *countHash = hashNew(0);
+struct slRef *ref;
+for (ref = refList; ref != NULL; ref = ref->next)
+    {
+    struct dlNode *node = ref->val;
+    struct monomer *monomer = node->val;
+    hashIncInt(countHash, monomer->word);
+    }
+
+/* Iterate through hash finding max value. */
+char *maxWord = NULL;
+int maxCount = 0;
+struct hashEl *hel;
+struct hashCookie cookie = hashFirst(countHash);
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    int count = ptToInt(hel->val);
+    if (count > maxCount)
+        {
+	maxCount = count;
+	maxWord = hel->name;
+	}
+    }
+*retWord = maxWord;
+*retCount = maxCount;
+hashFree(&countHash);
+}
+
+boolean subCommonCenter(struct alphaStore *store,
+    struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
+/* Scan list for places where have all items in neighborhood (except for center) matching. 
+ * Substitute in center at one of these places chosen at random and return TRUE if possible. */
+{
+struct slRef *centerRefList = refsToPossibleCenters(center, neighborhood, ll);
+uglyf("sub %s in neighborhood: ", center->word);
+printMonomerRefList(neighborhood, uglyOut);
+uglyf("Got %d possible centers\n", slCount(centerRefList));
+
+if (centerRefList == NULL)
+    return FALSE;
+int commonCount = 0;
+char *commonWord = NULL;
+mostCommonMonomerWord(centerRefList, &commonWord, &commonCount);
+struct monomer *commonMonomer = hashFindVal(store->monomerHash, commonWord);
+uglyf("Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->outCount);
+if (commonMonomer->outCount < 2)
+    {
+    uglyf("Want to substitute %s for %s, but %s only occurs %d time.\n", center->word, commonWord, commonWord, commonMonomer->outCount);
+    return FALSE;
+    }
+
+/* Select a random one of the most commonly occuring possible centers. */
+int targetIx = rand() % commonCount;
+struct slRef *ref;
+int currentIx = 0;
+for (ref = centerRefList; ref != NULL; ref = ref->next)
+    {
+    struct dlNode *node = ref->val;
+    struct monomer *monomer = node->val;
+    if (sameString(monomer->word, commonWord))
+         {
+	 if (currentIx == targetIx)
+	     {
+	     uglyf("Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood));
+	     struct monomer *oldCenter = node->val;
+	     if (oldCenter->type != center->type)
+	         {
+		 uglyAbort("Type mismatch subbig %s vs %s\n", oldCenter->word, center->word);
+		 }
+	     node->val = center;
+	     return TRUE;
+	     }
+	 ++currentIx;
+	 }
+    }
+assert(FALSE);
+return FALSE;	
+}
+
+void subCenterInNeighborhood(struct alphaStore *store, 
+    struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
+/* Scan ll for cases where neighborhood around center matches.  Replace one of these cases with center. */
+{
+assert(slCount(neighborhood) == 3);	// Simplifies things and is true for now.
+if (subCommonCenter(store, center, neighborhood, ll))
+   return;
+if (subCommonCenter(store, center, neighborhood->next, ll))
+   return;
+struct monomerRef *third = neighborhood->next->next;
+neighborhood->next->next = NULL;
+boolean ok = subCommonCenter(store, center, neighborhood, ll);
+neighborhood->next->next = third;
+if (!ok)
+    {
+    warn("Couldn't substitute in %s", center->word);
+    }
+}
+
 void subInMissing(struct alphaStore *store, struct dlList *ll)
 /* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */
 {
 struct slRef *unusedList = listUnusedMonomers(store, ll);
 verbose(2, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList));
 struct slRef *unusedRef;
 for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next)
     {
     struct monomer *unused = unusedRef->val;
     struct monomerRef *neighborhood = findNeighborhoodFromReads(unused);
-    uglyf("%s(%d):", unused->word, slCount(neighborhood));
-        {
-	struct monomerRef *monomerRef;
-	for (monomerRef = neighborhood; monomerRef != NULL; monomerRef = monomerRef->next)
-	    {
-	    struct monomer *monomer = monomerRef->val;
-	    uglyf(" %s", monomer->word);
-	    }
-	uglyf("\n");
-	}
+    subCenterInNeighborhood(store, unused, neighborhood, ll);
     slFreeList(&neighborhood);
     }
 }
 
 static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, 
 	int maxOutputWords, char *fileName)
 /* Create file containing words base on tree probabilities.  The wordTreeGenerateList does
  * most of work. */
 {
 struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
 subInMissing(store, ll);
 FILE *f = mustOpen(fileName, "w");
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {