src/kehayden/alphaAsm/alphaAsm.c a3628f6cda9c870182b206496b6edf8a22094910

a3628f6cda9c870182b206496b6edf8a22094910
kent
  Tue Sep 11 18:26:47 2012 -0700
Integrating orphans (things that are only at beginning or end of any read) by adding fake reads.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 104f156..7280e49 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -476,74 +476,86 @@
 struct dlNode *nodesFromTail(struct dlList *list, int count)
 /* Return count nodes from tail of list.  If list is shorter than count, it returns the
  * whole list. */
 {
 int i;
 struct dlNode *node;
 for (i=0, node = list->tail; i<count; ++i)
     {
     if (dlStart(node))
         return list->head;
     node = node->prev;
     }
 return node;
 }
 
-struct monomerType *advanceTypeWithWrap(struct monomerType *typeList, struct monomerType *startType,
-    int amountToAdvance)
-/* Skip forward amountToAdvance in typeList from startType.  If get to end of list start
+struct monomerType *typeAfter(struct alphaStore *store, struct monomerType *oldType,
+    int amount)
+/* Skip forward amount in typeList from oldType.  If get to end of list start
  * again at the beginning */
 {
 int i;
-struct monomerType *type = startType;
-for (i=0; i<amountToAdvance; ++i)
+struct monomerType *type = oldType;
+for (i=0; i<amount; ++i)
     {
     type = type->next;
     if (type == NULL)
-	type = typeList;
+	type = store->typeList;
     }
 return type;
 }
 
+struct monomerType *typeBefore(struct alphaStore *store, struct monomerType *oldType, int amount)
+/* Pick type that comes amount before given type. */
+{
+int typeIx = ptArrayIx(oldType, store->typeArray, store->typeCount);
+typeIx -= amount;
+while (typeIx < 0)
+    typeIx += store->typeCount;
+return store->typeArray[typeIx];
+}
+
 struct wordTree *predictFromPreviousTypes(struct alphaStore *store, struct dlList *past)
 /* Predict next based on general pattern of monomer order. */
 {
+/* This routine is now a bit more complex than necessary but it still works.  It
+ * these days only needs to look back 1 because of the read-based type fill-in. */
 int maxToLookBack = 3;
     { /* Debugging block */
     struct dlNode *node = nodesFromTail(past, maxToLookBack);
     verbose(3, "predictFromPreviousTypes(");
     for (; !dlEnd(node); node = node->next)
         {
 	struct monomer *monomer = node->val;
 	verbose(3, "%s, ", monomer->word);
 	}
     verbose(3, ")\n");
     }
 int pastDepth;
 struct dlNode *node;
 for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth)
     {
     if (dlStart(node))
 	{
 	verbose(2, "predictFromPreviousTypes with empty past\n");
 	return NULL;
 	}
     struct monomer *monomer = node->val;
     struct monomerType *prevType = hashFindVal(store->typeHash, monomer->word);
     if (prevType != NULL)
         {
-	struct monomerType *curType = advanceTypeWithWrap(store->typeList, prevType, pastDepth);
+	struct monomerType *curType = typeAfter(store, prevType, pastDepth);
 	struct monomer *curInfo = pickRandomFromType(curType);
 	struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo);
 	verbose(3, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", 
 	    pastDepth, curInfo->word, curTree, curTree->monomer->word);
 	return curTree;
 	}
     node = node->prev;
     }
 verbose(2, "predictFromPreviousTypes past all unknown types\n");
 return NULL;
 }
 
 
 struct wordTree *predictNext(struct alphaStore *store, struct dlList *past)
 /* Given input data store and what is known from the past, predict the next word. */
@@ -671,30 +683,31 @@
 return store;
 }
 
 struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word)
 /* Add word to store,  incrementing it's useCount if it's already there, otherwise
  * making up a new record for it. */
 {
 struct monomer *monomer = hashFindVal(store->monomerHash, word);
 if (monomer == NULL)
     {
     AllocVar(monomer);
     hashAddSaveName(store->monomerHash, word, monomer, &monomer->word);
     slAddHead(&store->monomerList, monomer);
     }
 monomer->useCount += 1;
+monomer->outTarget = monomer->useCount;	    /* Set default value, may be renormalized. */
 return monomer;
 }
 
 void connectReadsToMonomers(struct alphaStore *store)
 /* Add each read monomer appears in to monomer's readList. */
 {
 struct alphaRead *read;
 for (read = store->readList; read != NULL; read = read->next)
     {
     struct monomerRef *ref;
     for (ref = read->list; ref != NULL; ref = ref->next)
         {
 	struct monomer *monomer = ref->val;
 	refAdd(&monomer->readList, read);
 	}
@@ -733,33 +746,33 @@
     struct alphaRead *read;
     AllocVar(read);
     read->list = list;
     hashAddSaveName(store->readHash, name, read, &read->name);
     slAddHead(&readList, read);
     }
 slReverse(&readList);
 store->readList = readList;
 lineFileClose(&lf);
 connectReadsToMonomers(store);
 }
 
 struct alphaOrphan
 /* Information about an orphan - something without great joining information. */
     {
-    struct alphaOrphan *next;
-    struct monomer *monomer;
-    struct monomerType *type;
+    struct alphaOrphan *next;	/* Next in list */
+    struct monomer *monomer;	/* Pointer to orphan */
+    boolean paired;		/* True if has been paired with somebody. */
     };
 
 struct alphaOrphan *findOrphanStarts(struct alphaRead *readList, struct alphaStore *store)
 /* Return list of monomers that are found only at start of reads. */
 {
 struct alphaOrphan *orphanList = NULL;
 
 /* Build up hash of things seen later in reads. */
 struct alphaRead *read;
 struct hash *later = hashNew(0);    /* Hash of monomers found later. */
 for (read = readList; read != NULL; read = read->next)
     {
     struct monomerRef *ref;
     for (ref = read->list->next; ref != NULL; ref = ref->next)
         {
@@ -770,43 +783,31 @@
 /* Pass through again collecting orphans. */
 struct hash *orphanHash = hashNew(0);
 for (read = readList; read != NULL; read = read->next)
     {
     struct monomer *monomer = read->list->val;
     char *word = monomer->word;
     if (!hashLookup(later, word))
         {
 	struct alphaOrphan *orphan = hashFindVal(orphanHash, word);
 	if (orphan == NULL)
 	    {
 	    AllocVar(orphan);
 	    orphan->monomer = monomer;
 	    slAddHead(&orphanList, orphan);
 	    hashAdd(orphanHash, word, orphan);
-	    uglyf("Adding orphan start %s\n", word);
-	    }
-
-	/* Try and find orphan type by consulting either itself or previous items in read. */
-	if (orphan->type == NULL)
-	    {
-	    struct monomerType *type = monomer->type;
-	    if (type == NULL) 
-	        {
-		uglyf("Can't find type for %s\n", word);
-		}
-	    else uglyf("%s is type %s\n", word, type->name);
-	    orphan->type = type;
+	    verbose(2, "Adding orphan start %s\n", word);
 	    }
 	}
     }
 hashFree(&later);
 hashFree(&orphanHash);
 slReverse(&orphanList);
 return orphanList;
 }
 
 struct alphaOrphan *findOrphanEnds(struct alphaRead *readList, struct alphaStore *store)
 /* Return list of monomers that are found only at end of reads. */
 {
 struct alphaOrphan *orphanList = NULL;
 
 /* Build up hash of things seen sooner in reads. */
@@ -825,48 +826,141 @@
 /* Pass through again collecting orphans. */
 struct hash *orphanHash = hashNew(0);
 for (read = readList; read != NULL; read = read->next)
     {
     struct monomerRef *lastRef = slLastEl(read->list);
     struct monomer *monomer = lastRef->val;
     if (!hashLookup(sooner, monomer->word))
         {
 	struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word);
 	if (orphan == NULL)
 	    {
 	    AllocVar(orphan);
 	    orphan->monomer = monomer;
 	    slAddHead(&orphanList, orphan);
 	    hashAdd(orphanHash, monomer->word, orphan);
-	    uglyf("Adding orphan end %s\n", monomer->word);
+	    verbose(2, "Adding orphan end %s\n", monomer->word);
 	    }
-	else uglyf("Repeating orphan end %s\n", monomer->word);
 	}
     }
 hashFree(&sooner);
 slReverse(&orphanList);
 hashFree(&orphanHash);
 return orphanList;
 }
 
-void matchUpOrphans(struct alphaStore *store)
+struct alphaRead *addReadOfTwo(struct alphaStore *store, struct monomer *a, struct monomer *b)
+/* Make up a fake read that goes from a to b and add it to store. */
+{
+/* Create fake name and start fake read with it. */
+static int fakeId = 0;
+char name[32];
+safef(name, sizeof(name), "fake%d", ++fakeId);
+
+/* Allocate fake read. */
+struct alphaRead *read;
+AllocVar(read);
+
+/* Add a and b to the read. */
+struct monomerRef *aRef, *bRef;
+AllocVar(aRef);
+aRef->val = a;
+AllocVar(bRef);
+bRef->val = b;
+aRef->next = bRef;
+read->list = aRef;
+
+/* Add read to the store. */
+slAddHead(&store->readList, read);
+hashAddSaveName(store->readHash, name, read, &read->name);
+
+/* Add read to the monomers. */
+refAdd(&a->readList, read);
+refAdd(&a->readList, read);
+
+a->useCount += 1;
+b->useCount += 1;
+return read;
+}
+
+void integrateOrphans(struct alphaStore *store)
 /* Make up fake reads that integrate orphans (monomers only found at beginning or end
  * of a read) better. */
 {
 struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store);
 struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store);
-uglyf("orphanStarts has %d items, orphanEnds %d\n", slCount(orphanStarts), slCount(orphanEnds));
+verbose(2, "orphanStarts has %d items, orphanEnds %d\n", 
+    slCount(orphanStarts), slCount(orphanEnds));
+
+/* First look for the excellent situation where you can pair up orphans with each 
+ * other.  For Y at least this happens half the time. */
+struct alphaOrphan *start, *end;
+for (start = orphanStarts; start != NULL; start = start->next)
+    {
+    struct monomerType *startType = start->monomer->type;
+    if (startType == NULL)
+        continue;
+    for (end = orphanEnds; end != NULL && !start->paired; end = end->next)
+        {
+	struct monomerType *endType = end->monomer->type;
+	if (endType == NULL)
+	    continue;
+	if (!end->paired)
+	    {
+	    struct monomerType *nextType = typeAfter(store, endType, 1);
+	    if (nextType == startType)
+	        {
+		end->paired = TRUE;
+		start->paired = TRUE;
+		addReadOfTwo(store, end->monomer, start->monomer);
+		verbose(2, "Pairing %s with %s\n", end->monomer->word, start->monomer->word);
+		}
+	    }
+	}
+    }
+
+/* Ok, now have to just manufacture other side of orphan starts out of thin air. */
+for (start = orphanStarts; start != NULL; start = start->next)
+    {
+    if (start->paired)
+        continue;
+    struct monomer *startMono = start->monomer;
+    struct monomerType *startType = startMono->type;
+    if (startType == NULL)
+        continue;
+
+    struct monomerType *newType = typeBefore(store, startType, 1);
+    struct monomer *newMono = pickRandomFromType(newType);
+    addReadOfTwo(store, newMono, startMono);
+    verbose(2, "Pairing new %s with start %s\n", newMono->word, startMono->word);
+    }
+
+/* Ok, now have to just manufacture other side of orphan ends out of thin air. */
+for (end = orphanEnds; end != NULL; end = end->next)
+    {
+    if (end->paired)
+        continue;
+    struct monomer *endMono = end->monomer;
+    struct monomerType *endType = endMono->type;
+    if (endType == NULL)
+        continue;
+
+    struct monomerType *newType = typeAfter(store, endType, 1);
+    struct monomer *newMono = pickRandomFromType(newType);
+    addReadOfTwo(store, endMono, newMono);
+    verbose(2, "Pairing end %s with new %s\n", endMono->word, newMono->word);
+    }
 }
 
 void makeMarkovChains(struct alphaStore *store)
 /* Return a alphaStore containing all words, and also all chains-of-words of length 
  * chainSize seen in file.  */
 {
 /* We'll build up the tree starting with an empty root node. */
 struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, ""));	
 int chainSize = store->maxChainSize;
 
 /* Loop through each read. There's special cases at the beginning and end of read, and for 
  * short reads.  In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize 
  * words, * popping off one word from the start, and adding one word to the end for each
  * new word we encounter. This list is added to the tree each iteration. */
 struct alphaRead *read;
@@ -1067,44 +1161,34 @@
 
     if (afterType != NULL)
         {
 	if (bestType == NULL || afterPos < bestPos)
 	    {
 	    bestType = afterType;
 	    bestPos = afterPos;
 	    bestIsAfter = TRUE;
 	    }
 	}
     }
 
 struct monomerType *chosenType = NULL;
 if (bestType != NULL)
     {
-    int bestIx = ptArrayIx(bestType, store->typeArray, store->typeCount);
     if (bestIsAfter)
-        {
-	bestIx -= bestPos;
-	if (bestIx < 0)
-	    bestIx += store->typeCount;
-	}
+	chosenType = typeBefore(store, bestType, bestPos);
     else
-        {
-	bestIx += bestPos;
-	if (bestIx > store->typeCount)
-	     bestIx -= store->typeCount;
-	}
-    chosenType = store->typeArray[bestIx];
+	chosenType = typeAfter(store, bestType, bestPos);
     }
 
 return chosenType;
 }
 
 void fillInTypes(struct alphaStore *store)
 /* We have types for most but not all monomers. Try and fill out types for most of
  * the rest by looking for position in reads they are in that do contain known types */
 {
 /* Do first pass on ones defined - just have to look them up in hash. */
 struct monomer *monomer;
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     monomer->type = hashFindVal(store->typeHash, monomer->word);
 
 /* Do second pass on ones that are not yet defined */
@@ -1115,31 +1199,31 @@
 	struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store);
 	verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a"));
 	}
     }
 }
 
 void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct alphaStore *store = alphaStoreNew(maxChainSize);
 alphaReadListFromFile(readsFile, store);
 makeMarkovChains(store);
 struct wordTree *wt = store->markovChains;
 alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
 fillInTypes(store);
-matchUpOrphans(store);
+integrateOrphans(store);
 alphaStoreNormalize(store, outSize);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
 wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }