src/kehayden/alphaAsm/alphaAsm.c 001f4847f07fad411ebc491290d337185cf0a1c3

001f4847f07fad411ebc491290d337185cf0a1c3
kent
  Wed Sep 12 08:21:33 2012 -0700
Making it use lower order model once higher order model has reached outTarget.  Improving top level comment.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 0b801f4..4f76dcd 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -1,16 +1,18 @@
-/* alphaAsm - Predicts faux centromere sequences using a probablistic model. */
+/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+ * been parsed into various repeat monomer variants.  Cycles of these variants tend to
+ * form higher order repeats. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
 
 /* Global vars - all of which can be set by command line options. */
 int maxChainSize = 3;
 int outSize = 10000;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
@@ -302,39 +304,39 @@
     fprintf(f, "%d\t%d\t%d\t%d\t%f\t", 
 	    level, wt->useCount, wt->outTarget, wt->outCount, wt->normVal);
     
     for (i=1; i<=level; ++i)
 	{
 	spaceOut(f, level*2);
 	fprintf(f, "%s ", words[i]);
 	}
     fprintf(f, "\n");
     }
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     wordTreeDump(level+1, child, maxChainSize, f);
 }
 
-struct wordTree *pickRandomOnOutTarget(struct wordTree *list)
+struct wordTree *pickRandom(struct wordTree *list)
 /* Pick word from list randomly, but so that words with higher outTargets
  * are picked more often. */
 {
 struct wordTree *picked = NULL;
 
 /* Debug output. */
     {
-    verbose(2, "   pickRandomOnOutTarget(");
+    verbose(2, "   pickRandom(");
     struct wordTree *wt;
     for (wt = list; wt != NULL; wt = wt->next)
         {
 	if (wt != list)
 	    verbose(2, " ");
 	verbose(2, "%s:%d", wt->monomer->word, wt->outTarget);
 	}
     verbose(2, ") total %d\n", wordTreeSumOutTargets(list));
     }
 
 /* Figure out total number of outputs left, and a random number between 0 and that total. */
 int total = wordTreeSumOutTargets(list);
 if (total > 0)
     {
     int threshold = rand() % total; 
@@ -348,102 +350,79 @@
 	int size = wt->outTarget;
 	int binEnd = binStart + size;
 	verbose(2, "      %s size %d, binEnd %d\n", wt->monomer->word, size, binEnd);
 	if (threshold < binEnd)
 	    {
 	    picked = wt;
 	    verbose(2, "      picked %s\n", wt->monomer->word);
 	    break;
 	    }
 	binStart = binEnd;
 	}
     }
 return picked;
 }
 
-struct wordTree *pickRandomOnUseCounts(struct wordTree *list)
-/* Pick word from list randomly, but so that words with higher useCounts
- * are picked more often.  Much like above routine, but a little simple
- * since we know useCounts are non-zero. */
-{
-struct wordTree *picked = NULL;
-
-/* Figure out total number and a random number between 0 and that total. */
-int total = wordTreeSumUseCounts(list);
-assert(total > 0);
-int threshold = rand() % total; 
-
-/* Loop through list returning selection corresponding to random threshold. */
-int binStart = 0;
-struct wordTree *wt;
-for (wt = list; wt != NULL; wt = wt->next)
-    {
-    int size = wt->useCount;
-    int binEnd = binStart + size;
-    if (threshold < binEnd)
-	{
-	picked = wt;
-	break;
-	}
-    binStart = binEnd;
-    }
-assert(picked != NULL);
-return picked;
-}
-
 int totUseZeroCount = 0;
 
-struct wordTree *pickRandom(struct wordTree *list)
-/* Pick word from list randomly, but so that words more
- * commonly seen are picked more often. */
-{
-struct wordTree *picked = pickRandomOnOutTarget(list);
-
-/* If did not find anything, that's ok. It can happen on legitimate input due to unevenness
- * of read coverage.  In this case we pick a random number based on original counts
- * rather than normalized/counted down counts. */
-if (picked == NULL)
-    {
-    picked = pickRandomOnUseCounts(list);
-    ++totUseZeroCount;
-    }
-return picked;
-}
-
 struct monomer *pickRandomFromType(struct monomerType *type)
 /* Pick a random word, weighted by outTarget, from all available of given type */
 {
 /* Figure out total on list */
 int total = 0;
 struct monomerRef *ref;
 for (ref = type->list; ref != NULL; ref = ref->next)
     total += ref->val->outTarget;
 
 /* Loop through list returning selection corresponding to random threshold. */
+if (total > 0)
+    {
 int threshold = rand() % total; 
 int binStart = 0;
 for (ref = type->list; ref != NULL; ref = ref->next)
     {
     struct monomer *monomer = ref->val;
     int size = monomer->outTarget;
     int binEnd = binStart + size;
     if (threshold < binEnd)
 	return monomer;
     binStart = binEnd;
     }
+    }
+
+verbose(2, "Backup plan for %s in pickRandomFromType\n", type->name);
 
-verbose(2, "Fell off end in pickRandomFromType\n");
+/* Try again based on use counts. */
+total = 0;
+for (ref = type->list; ref != NULL; ref = ref->next)
+    total += ref->val->useCount;
+if (total > 0)
+    {
+    int threshold = rand() % total; 
+    int binStart = 0;
+    for (ref = type->list; ref != NULL; ref = ref->next)
+	{
+	struct monomer *monomer = ref->val;
+	int size = monomer->useCount;
+	int binEnd = binStart + size;
+	if (threshold < binEnd)
+	    return monomer;
+	binStart = binEnd;
+	}
+    }
+
+warn("Fell off end in pickRandomFromType on %s", type->name);
 return type->list->val;	    // Fall back position (necessary?) just first in list
 }
 
 struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
 /* Predict next word given tree and recently used word list.  If tree doesn't
  * have statistics for what comes next given the words in list, then it returns
  * NULL. */
 {
 verbose(2, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list));
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     wt = wordTreeFindInList(wt->children, monomer);
     verbose(2, "   wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt));
@@ -451,30 +430,31 @@
         break;
     }
 struct wordTree *result = NULL;
 if (wt != NULL && wt->children != NULL)
     result = pickRandom(wt->children);
 return result;
 }
 
 struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent)
 /* Predict next word given tree and recently used word list.  Will use all words in
  * recent list if can,  but if there is not data in tree, will back off, and use
  * progressively less previous words until ultimately it just picks a random
  * word. */
 {
 struct dlNode *node;
+verbose(2, " predictNextFromWordTree(%s)\n", wordTreeString(wt));
 
 for (node = recent; !dlEnd(node); node = node->next)
     {
     struct wordTree *result = predictNextFromAllPredecessors(wt, node);
     if (result != NULL)
         return result;
     }
 return NULL;
 }
 
 struct dlNode *nodesFromTail(struct dlList *list, int count)
 /* Return count nodes from tail of list.  If list is shorter than count, it returns the
  * whole list. */
 {
 int i;
@@ -1082,35 +1062,35 @@
     monomer->outTarget = round(scale * monomer->useCount);
 }
 
 void alphaStoreNormalize(struct alphaStore *store, int outputSize)
 /* Set up output counts on each word to make sure it gets it's share of output size */
 {
 struct wordTree *wt = store->markovChains;
 wordTreeNormalize(wt, outputSize, 1.0);
 monomerListNormalise(store->monomerList, wt->useCount, outputSize);
 }
 
 struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store)
 /* Look through read list trying to find best supported type for this monomer. */
 {
 /* The algorithm here could be better.  It picks the closest nearby monomer that is typed
- * to fill in from.  When have information from both before and after uses arbitrarily 
- * before preferentially.  When have information that is equally close from multiple reads,
- * just uses info from first read.  Fortunately the data, at least on Y, shows a lot of
- * consistency, so probably not actually worth a more sophisticated algorithm that could
- * take the most popular choice if data were inconsistent. */
+ * to fill in from.  When have information from both before and after the program arbitrarily 
+ * uses before preferentially.  When have information that is equally close from multiple reads,
+ * it just uses info from first read.  Fortunately the data, at least on Y, shows a lot of
+ * consistency, so it's probably not actually worth a more sophisticated algorithm that would
+ * instead take the most popular choice rather than the first one. */
 verbose(2, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList));
 struct monomerType *bestType = NULL;
 int bestPos = 0;
 boolean bestIsAfter = FALSE;
 struct slRef *readRef;
 for (readRef = monomer->readList; readRef != NULL; readRef = readRef->next)
     {
     struct alphaRead *read = readRef->val;
     struct monomerRef *ref;
 	{
 	verbose(2, "  read %s (%d els):", read->name, slCount(read->list));
 	for (ref = read->list; ref != NULL; ref = ref->next)
 	    {
 	    struct monomer *m = ref->val;
 	    verbose(2, " %s", m->word);
@@ -1158,64 +1138,67 @@
 	    bestIsAfter = FALSE;
 	    }
 	}
 
     if (afterType != NULL)
         {
 	if (bestType == NULL || afterPos < bestPos)
 	    {
 	    bestType = afterType;
 	    bestPos = afterPos;
 	    bestIsAfter = TRUE;
 	    }
 	}
     }
 
+/* Now have found a type that is at a known position relative to ourselves.  From this
+ * infer our own type. */
 struct monomerType *chosenType = NULL;
 if (bestType != NULL)
     {
     if (bestIsAfter)
 	chosenType = typeBefore(store, bestType, bestPos);
     else
 	chosenType = typeAfter(store, bestType, bestPos);
     }
-
 return chosenType;
 }
 
 void fillInTypes(struct alphaStore *store)
 /* We have types for most but not all monomers. Try and fill out types for most of
  * the rest by looking for position in reads they are in that do contain known types */
 {
 /* Do first pass on ones defined - just have to look them up in hash. */
 struct monomer *monomer;
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     monomer->type = hashFindVal(store->typeHash, monomer->word);
 
 /* Do second pass on ones that are not yet defined */
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     {
     if (monomer->type == NULL)
 	{
 	struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store);
 	verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a"));
 	}
     }
 }
 
 void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
-/* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */
+/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+ * been parsed into various repeat monomer variants.  Cycles of these variants tend to
+ * form higher order repeats. */
 {
 struct alphaStore *store = alphaStoreNew(maxChainSize);
 alphaReadListFromFile(readsFile, store);
 alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
 fillInTypes(store);
 integrateOrphans(store);
 makeMarkovChains(store);
 struct wordTree *wt = store->markovChains;
 alphaStoreNormalize(store, outSize);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }