src/kehayden/alphaAsm/alphaAsm.c a1d692a299235963e9864f15c8225d18fadb7a42

a1d692a299235963e9864f15c8225d18fadb7a42
kent
  Fri Dec 14 15:34:08 2012 -0800
Starting work on a second pass of processing to put in monomers that did not get omitted by main pass.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index f67c6a7..1a4ec29 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -651,36 +651,165 @@
 
 
     /* Add word from whatever level we fetched back to our output chain. */
     struct monomer *monomer = picked->monomer;
     node->val = monomer;
     dlAddTail(ll, node);
 
     decrementOutputCountsInTree(picked);
     monomer->outTarget -= 1;
     monomer->outCount += 1;
     }
 verbose(2, "totUseZeroCount = %d\n", totUseZeroCount);
 return ll;
 }
 
+
+struct slRef *listUnusedMonomers(struct alphaStore *store, struct dlList *ll)
+/* Return list of references to monomers that are in store but not on ll. */
+{
+struct slRef *refList = NULL, *ref;
+struct hash *usedHash = hashNew(0);
+struct dlNode *node;
+/* Make hash of used monomers. */
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    hashAdd(usedHash, monomer->word, monomer);
+    }
+
+/* Stream through all monomers looking for ones not used and adding to list. */
+struct monomer *monomer;
+for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
+    {
+    if (isEmpty(monomer->word))
+        continue;
+    if (!hashLookup(usedHash, monomer->word))
+	{
+	AllocVar(ref);
+	ref->val = monomer;
+	slAddHead(&refList, ref);
+	}
+    }
+hashFree(&usedHash);
+slReverse(&refList);
+return refList;
+}
+
+
+int monomerRefIx(struct monomerRef *list, struct monomer *monomer)
+/* Return index of monomer in list. */
+{
+int i;
+struct monomerRef *ref;
+for (i=0, ref = list; ref != NULL; ref = ref->next, ++i)
+    {
+    if (ref->val == monomer)
+        return i;
+    }
+errAbort("Monomer %s not on list\n", monomer->word);
+return -1;   // ugly
+}
+
+struct monomerRef *findNeighborhoodFromReads(struct monomer *center)
+/* Find if possible one monomer to either side of center */
+{
+struct slRef *readRef;
+struct monomerRef *before = NULL, *after = NULL;
+
+/* Loop through reads hoping to find a case where center is flanked by two monomers in
+ * same read.   As a fallback, keep track of a monomer before and a monomer after in
+ * any read. */
+for (readRef = center->readList; readRef != NULL; readRef = readRef->next)
+    {
+    struct alphaRead *read = readRef->val;
+    int readSize = slCount(read->list);
+    int centerIx = monomerRefIx(read->list, center);
+    if (readSize >= 3 && centerIx > 0 && centerIx < readSize-1)
+	 {
+         uglyf("Whoopie found central for %s\n", center->word);
+	 before = slElementFromIx(read->list, centerIx-1);
+	 after = slElementFromIx(read->list, centerIx+1);
+	 break;
+	 }
+    else if (readSize >= 2)
+         {
+	 if (centerIx == 0)
+	     {
+	     after = slElementFromIx(read->list, centerIx+1);
+	     uglyf("read %s, centerIx %d, after %p\n", center->word, centerIx, after);
+	     }
+	 else
+	     {
+	     before = slElementFromIx(read->list, centerIx-1);
+	     uglyf("read %s, centerIx %d, before %p\n", center->word, centerIx, before);
+	     }
+	 }
+    }
+uglyf("before %p, center %p, after %p\n", before, center, after);
+/* Make up list. */
+struct monomerRef *retList = NULL, *monoRef;
+if (after)
+    {
+    AllocVar(monoRef);
+    monoRef->val = after->val;
+    slAddHead(&retList, monoRef);
+    }
+AllocVar(monoRef);
+monoRef->val = center;
+slAddHead(&retList, monoRef);
+if (before)
+    {
+    AllocVar(monoRef);
+    monoRef->val = before->val;
+    slAddHead(&retList, monoRef);
+    }
+uglyf("%d in retList\n", slCount(retList));
+return retList;
+}
+
+void subInMissing(struct alphaStore *store, struct dlList *ll)
+/* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */
+{
+struct slRef *unusedList = listUnusedMonomers(store, ll);
+uglyf("%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList));
+struct slRef *unusedRef;
+for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next)
+    {
+    struct monomer *unused = unusedRef->val;
+    struct monomerRef *neighborhood = findNeighborhoodFromReads(unused);
+    uglyf("%s(%d):", unused->word, slCount(neighborhood));
+        {
+	struct monomerRef *monomerRef;
+	for (monomerRef = neighborhood; monomerRef != NULL; monomerRef = monomerRef->next)
+	    {
+	    struct monomer *monomer = monomerRef->val;
+	    uglyf(" %s", monomer->word);
+	    }
+	uglyf("\n");
+	}
+    slFreeList(&neighborhood);
+    }
+}
+
 static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, 
 	int maxOutputWords, char *fileName)
 /* Create file containing words base on tree probabilities.  The wordTreeGenerateList does
  * most of work. */
 {
 struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
+subInMissing(store, ll);
 FILE *f = mustOpen(fileName, "w");
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     fprintf(f, "%s\n", monomer->word);
     }
 carefulClose(&f);
 dlListFree(&ll);
 }
 
 
 struct alphaStore *alphaStoreNew(int maxChainSize)
 /* Allocate and initialize a new word store. */
 {
@@ -875,31 +1004,31 @@
 /* Add a and b to the read. */
 struct monomerRef *aRef, *bRef;
 AllocVar(aRef);
 aRef->val = a;
 AllocVar(bRef);
 bRef->val = b;
 aRef->next = bRef;
 read->list = aRef;
 
 /* Add read to the store. */
 slAddHead(&store->readList, read);
 hashAddSaveName(store->readHash, name, read, &read->name);
 
 /* Add read to the monomers. */
 refAdd(&a->readList, read);
-refAdd(&a->readList, read);
+refAdd(&b->readList, read);
 
 a->useCount += 1;
 b->useCount += 1;
 return read;
 }
 
 void integrateOrphans(struct alphaStore *store)
 /* Make up fake reads that integrate orphans (monomers only found at beginning or end
  * of a read) better. */
 {
 struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store);
 struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store);
 verbose(2, "orphanStarts has %d items, orphanEnds %d\n", 
     slCount(orphanStarts), slCount(orphanEnds));