src/kehayden/alphaAsm/alphaAsm.c cb59d1c01ec38183b8fecd0aebf7a2135fb04985

cb59d1c01ec38183b8fecd0aebf7a2135fb04985
kent
  Wed Sep 12 13:43:13 2012 -0700
Polishing code and comments without changing meaning at all.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 46fdb2b..93ac2ba 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -8,48 +8,50 @@
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
 
 /* Global vars - all of which can be set by command line options. */
 int pseudoCount = 1;
 int maxChainSize = 3;
 int outSize = 10000;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
-  "of HuRef satellite graphs\n"
+  "of HuRef satellite graphs based on Sanger style reads.\n"
   "usage:\n"
   "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
   "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n"
   "line, with the first word in the line being the read id and subsequent words the monomers\n"
   "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
   "each variant. The lines are in the same order the monomers appear, with the last line\n"
   "being followed in order by the first since they repeat.   The output.txt contains one line\n"
   "for each monomer in the output, just containing the monomer symbol.\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -fullOnly - Only output chains of size above\n"
   "   -chain=fileName - Write out word chain to file\n"
   "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
-  "   -outSize=N - Output this many words.\n"
+  "   -outSize=N - Output this many words. Default %d\n"
+  "   -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n"
+  "                observed.  Default %d\n"
   "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
   "             it will generate different results each time it's run.\n"
-  , maxChainSize
+  , maxChainSize, outSize, pseudoCount
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"chain", OPTION_STRING},
    {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"outSize", OPTION_INT},
    {"seed", OPTION_INT},
    {"pseudoCount", OPTION_INT},
    {NULL, 0},
 };
 
@@ -58,54 +60,54 @@
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
     struct monomerType *type;	/* The type of the monomer. */
     struct slRef *readList;	/* List of references to reads this is in. */
     };
 
 struct monomerRef
-/* A reference to a word. */
+/* A reference to a monomer. */
     {
     struct monomerRef *next;	/* Next in list */
     struct monomer *val;	/* The word referred to. */
     };
 
 struct monomerType
 /* A collection of words of the same type - or monomers of same type. */
     {
     struct monomerType *next;   /* Next monomerType */
     char *name;			/* Short name of type */
     struct monomerRef *list;	    /* List of all words of that type */
     };
 
 struct alphaRead 
 /* A read that's been parsed into alpha variants. */
     {
     struct alphaRead *next;	/* Next in list */
     char *name;			/* Id of read, not allocated here. */
     struct monomerRef *list;	/* List of alpha subunits */
     };
 
 struct alphaStore
-/* Stores info on all words */
+/* Stores info on all monomers */
     {
     struct monomer *monomerList;   /* List of words, fairly arbitrary order. */
     struct hash *monomerHash;       /* Hash of monomer, keyed by word. */
     struct alphaRead *readList;	 /* List of all reads. */
     struct hash *readHash;	 /* Hash of all reads keyed by read ID. */
     struct wordTree *markovChains;   /* Tree of words that follow other words. */
     int maxChainSize;		/* Maximum depth of tree. */
     struct monomerType *typeList;	/* List of all types. */
     struct hash *typeHash;	/* Hash with monomerType values, keyed by all words. */
     int typeCount;		/* Count of types. */
     struct monomerType **typeArray;  /* Array as opposed to list of types */
     };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
@@ -167,30 +169,39 @@
 {
 struct wordTree *wt;
 AllocVar(wt);
 wt->monomer = monomer;
 return wt;
 }
 
 int wordTreeCmpWord(const void *va, const void *vb)
 /* Compare two wordTree for slSort. */
 {
 const struct wordTree *a = *((struct wordTree **)va);
 const struct wordTree *b = *((struct wordTree **)vb);
 return cmpStringsWithEmbeddedNumbers(a->monomer->word, b->monomer->word);
 }
 
+static void wordTreeSort(struct wordTree *wt)
+/* Sort all children lists in tree. */
+{
+slSort(&wt->children, wordTreeCmpWord);
+struct wordTree *child;
+for (child = wt->children; child != NULL; child = child->next)
+    wordTreeSort(child);
+}
+
 struct wordTree *wordTreeFindInList(struct wordTree *list, struct monomer *monomer)
 /* Return wordTree element in list that has given word, or NULL if none. */
 {
 struct wordTree *wt;
 for (wt = list; wt != NULL; wt = wt->next)
     if (wt->monomer == monomer)
         break;
 return wt;
 }
 
 struct wordTree *wordTreeAddFollowing(struct wordTree *wt, struct monomer *monomer)
 /* Make word follow wt in tree.  If word already exists among followers
  * return it and bump use count.  Otherwise create new one. */
 {
 struct wordTree *child = wordTreeFindInList(wt->children, monomer);
@@ -255,32 +266,30 @@
     if (oldDiff < 0) oldDiff = 0;  // Necessary in rare cases, not sure why.
 
     int total = 0;
     for (child = wt->children; child != NULL; child = child->next)
 	total += wordTreeAddPseudoCount(child, pseudo);
     wt->useCount = total + oldDiff + pseudo;
     return total;
     }
 }
 
 void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal)
 /* Recursively set wt->normVal  and wt->outTarget so each branch gets its share */
 {
 if (pseudoCount > 0)
     wordTreeAddPseudoCount(wt, pseudoCount);
-#ifdef SOON
-#endif /* SOON */
 wt->normVal = normVal;
 wt->outTarget = outTarget;
 int childrenTotalUses = wordTreeSumUseCounts(wt->children);
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     {
     double childRatio = (double)child->useCount / childrenTotalUses;
     wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal);
     }
 }
 
 char *wordTreeString(struct wordTree *wt)
 /* Return something like '(a b c)' where c would be the value at wt itself, and
  * a and b would be gotten by following parents. */
 {
@@ -429,60 +438,59 @@
 if (total > 0)
     {
     int threshold = rand() % total; 
     int binStart = 0;
     for (ref = type->list; ref != NULL; ref = ref->next)
 	{
 	struct monomer *monomer = ref->val;
 	int size = monomer->useCount;
 	int binEnd = binStart + size;
 	if (threshold < binEnd)
 	    return monomer;
 	binStart = binEnd;
 	}
     }
 
-warn("Fell off end in pickRandomFromType on %s", type->name);
-return type->list->val;	    // Fall back position (necessary?) just first in list
+errAbort("Fell off end in pickRandomFromType on %s", type->name);
+return NULL;
 }
 
 struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list)
 /* Predict next word given tree and recently used word list.  If tree doesn't
  * have statistics for what comes next given the words in list, then it returns
  * NULL. */
 {
 verbose(2, "  predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list));
 struct dlNode *node;
 for (node = list; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     wt = wordTreeFindInList(wt->children, monomer);
     verbose(2, "   wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt));
     if (wt == NULL || wt->children == NULL)
         break;
     }
 struct wordTree *result = NULL;
 if (wt != NULL && wt->children != NULL)
     result = pickRandom(wt->children);
 return result;
 }
 
 struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent)
 /* Predict next word given tree and recently used word list.  Will use all words in
  * recent list if can,  but if there is not data in tree, will back off, and use
- * progressively less previous words until ultimately it just picks a random
- * word. */
+ * progressively less previous words. */
 {
 struct dlNode *node;
 verbose(2, " predictNextFromWordTree(%s)\n", wordTreeString(wt));
 
 for (node = recent; !dlEnd(node); node = node->next)
     {
     struct wordTree *result = predictNextFromAllPredecessors(wt, node);
     if (result != NULL)
         return result;
     }
 return NULL;
 }
 
 struct dlNode *nodesFromTail(struct dlList *list, int count)
 /* Return count nodes from tail of list.  If list is shorter than count, it returns the
@@ -585,31 +593,31 @@
 
 void decrementOutputCountsInTree(struct wordTree *wt)
 /* Decrement output count of self and parents. */
 {
 while (wt != NULL)
     {
     /* Decrement target count, but don't let it fall below sum of counts of all children. 
      * This can happen with incomplete data if we don't prevent it.  This
      * same code also prevents us from having negative outTarget. */
     int outTarget = wt->outTarget - 1;
     int kidSum = wordTreeSumOutTargets(wt->children);
     if (outTarget < kidSum)
         outTarget = kidSum;
     wt->outTarget = outTarget;
 
-    /* Always bump outCount for debugging. */
+    /* Always bump outCount to help us debug (only real use of outCount). */
     wt->outCount += 1;
     wt = wt->parent;
     }
 }
 
 static struct dlList *wordTreeGenerateList(struct alphaStore *store, int maxSize, 
     struct wordTree *firstWord, int maxOutputWords)
 /* Make a list of words based on probabilities in tree. */
 {
 struct dlList *ll = dlListNew();
 int chainSize = 0;
 int outputWords = 0;
 struct dlNode *chainStartNode = NULL;
 
 for (;;)
@@ -661,39 +669,30 @@
  * most of work. */
 {
 struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
 FILE *f = mustOpen(fileName, "w");
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     fprintf(f, "%s\n", monomer->word);
     }
 carefulClose(&f);
 dlListFree(&ll);
 }
 
 
-static void wordTreeSort(struct wordTree *wt)
-/* Sort all children lists in tree. */
-{
-slSort(&wt->children, wordTreeCmpWord);
-struct wordTree *child;
-for (child = wt->children; child != NULL; child = child->next)
-    wordTreeSort(child);
-}
-
 struct alphaStore *alphaStoreNew(int maxChainSize)
 /* Allocate and initialize a new word store. */
 {
 struct alphaStore *store;
 AllocVar(store);
 store->maxChainSize = maxChainSize;
 store->monomerHash = hashNew(0);
 store->typeHash = hashNew(0);
 store->readHash = hashNew(0);
 return store;
 }
 
 struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word)
 /* Add word to store,  incrementing it's useCount if it's already there, otherwise
  * making up a new record for it. */