src/kehayden/alphaAsm/alphaAsm.c 9dd4efa12926f4ade41f47304d086cd86b35a11d

9dd4efa12926f4ade41f47304d086cd86b35a11d
kent
  Wed Mar 6 14:05:38 2013 -0800
Some refactoring in anticipation of expanding output if need be to accommodate full monomer diversity.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 7c12be2..5a0e540 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -2,69 +2,72 @@
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
 #include "obscure.h"
 #include "errabort.h"
 
 /* Global vars - all of which can be set by command line options. */
 int pseudoCount = 1;
 int maxChainSize = 3;
-int outSize = 10000;
+int initialOutSize = 10000;
+int maxOutSize;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs based on Sanger style reads.\n"
   "usage:\n"
   "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
   "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n"
   "line, with the first word in the line being the read id and subsequent words the monomers\n"
   "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
   "each variant. The lines are in the same order the monomers appear, with the last line\n"
   "being followed in order by the first since they repeat.   The output.txt contains one line\n"
   "for each monomer in the output, just containing the monomer symbol.\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -fullOnly - Only output chains of size above\n"
   "   -chain=fileName - Write out word chain to file\n"
   "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
-  "   -outSize=N - Output this many words. Default %d\n"
+  "   -initialOutSize=N - Output this many words. Default %d\n"
+  "   -maxOutSize=N - Expand output up to this many words if not all monomers output initially\n"
   "   -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n"
   "                observed.  Default %d\n"
   "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
   "             it will generate different results each time it's run.\n"
   "   -unsubbed=fileName - Write output before substitution of missing monomers to file\n"
-  , maxChainSize, outSize, pseudoCount
+  , maxChainSize, initialOutSize, pseudoCount
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"chain", OPTION_STRING},
    {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
-   {"outSize", OPTION_INT},
+   {"initialOutSize", OPTION_INT},
+   {"maxOUtSize", OPTION_INT},
    {"pseudoCount", OPTION_INT},
    {"seed", OPTION_INT},
    {"unsubbed", OPTION_STRING},
    {NULL, 0},
 };
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
@@ -273,30 +276,31 @@
     int total = 0;
     for (child = wt->children; child != NULL; child = child->next)
 	total += wordTreeAddPseudoCount(child, pseudo);
     wt->useCount = total + oldDiff + pseudo;
     return total;
     }
 }
 
 void wordTreeNormalize(struct wordTree *wt, double outTarget, double normVal)
 /* Recursively set wt->normVal  and wt->outTarget so each branch gets its share */
 {
 if (pseudoCount > 0)
     wordTreeAddPseudoCount(wt, pseudoCount);
 wt->normVal = normVal;
 wt->outTarget = outTarget;
+wt->outCount = 0;
 int childrenTotalUses = wordTreeSumUseCounts(wt->children);
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     {
     double childRatio = (double)child->useCount / childrenTotalUses;
     wordTreeNormalize(child, childRatio*outTarget, childRatio*normVal);
     }
 }
 
 char *wordTreeString(struct wordTree *wt)
 /* Return something like '(a b c)' where c would be the value at wt itself, and
  * a and b would be gotten by following parents. */
 {
 struct slName *list = NULL, *el;
 for (;wt != NULL; wt = wt->parent)
@@ -1012,45 +1016,43 @@
 }
 
 static void writeMonomerList(char *fileName, struct dlList *ll)
 /* Write out monomer list to file. */
 {
 FILE *f = mustOpen(fileName, "w");
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     fprintf(f, "%s\n", monomer->word);
     }
 carefulClose(&f);
 }
 
-static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, 
-	int maxOutputWords, char *fileName)
-/* Create file containing words base on tree probabilities.  The wordTreeGenerateList does
- * most of work. */
+static struct dlList *wordTreeMakeList(struct alphaStore *store, int maxSize, 
+    struct wordTree *firstWord, int maxOutputWords)
+/* Create word list base on tree probabilities, and then substituting in unused
+ * monomers if possible. */
 {
 struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
 verbose(2, "Generated initial output list\n");
 char *unsubbedFile = optionVal("unsubbed", NULL);
 if (unsubbedFile)
     writeMonomerList(unsubbedFile, ll);
 subInMissing(store, ll);
 verbose(2, "Substituted in missing rare monomers\n");
-writeMonomerList(fileName, ll);
-verbose(2, "Wrote primary output\n");
-dlListFree(&ll);
+return ll;
 }
 
 
 struct alphaStore *alphaStoreNew(int maxChainSize)
 /* Allocate and initialize a new word store. */
 {
 struct alphaStore *store;
 AllocVar(store);
 store->maxChainSize = maxChainSize;
 store->monomerHash = hashNew(0);
 store->typeHash = hashNew(0);
 store->readHash = hashNew(0);
 return store;
 }
 
@@ -1474,32 +1476,35 @@
     if (startsWith(orderedPrefix, word) && !hashLookup(orderHash, word))
         {
 	errAbort("%s is in %s but not %s", word, readFile, typeFile);
 	}
     }
 hashFree(&orderHash);
 }
 
 
 void monomerListNormalise(struct monomer *list, int totalCount, int outputSize)
 /* Set outTarget field in all of list to be normalized to outputSize */
 {
 struct monomer *monomer;
 double scale = outputSize/totalCount;
 for (monomer = list; monomer != NULL; monomer = monomer->next)
+    {
+    monomer->outCount = 0;
     monomer->outTarget = round(scale * monomer->useCount);
 }
+}
 
 void alphaStoreNormalize(struct alphaStore *store, int outputSize)
 /* Set up output counts on each word to make sure it gets it's share of output size */
 {
 struct wordTree *wt = store->markovChains;
 wordTreeNormalize(wt, outputSize, 1.0);
 monomerListNormalise(store->monomerList, wt->useCount, outputSize);
 }
 
 struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store)
 /* Look through read list trying to find best supported type for this monomer. */
 {
 /* The algorithm here could be better.  It picks the closest nearby monomer that is typed
  * to fill in from.  When have information from both before and after the program arbitrarily 
  * uses before preferentially.  When have information that is equally close from multiple reads,
@@ -1599,65 +1604,91 @@
 /* Do first pass on ones defined - just have to look them up in hash. */
 struct monomer *monomer;
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     monomer->type = hashFindVal(store->typeHash, monomer->word);
 
 /* Do second pass on ones that are not yet defined */
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     {
     if (monomer->type == NULL)
 	{
 	monomer->type = fillInTypeFromReads(monomer, store);
 	}
     }
 }
 
-void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
+void alphaAsmOneSize(struct alphaStore *store, int outSize, char *outFile)
 /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 {
-struct alphaStore *store = alphaStoreNew(maxChainSize);
-alphaReadListFromFile(readsFile, store);
-alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
-verbose(2, "Loaded input reads and monomer order\n");
-crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile);
-fillInTypes(store);
-integrateOrphans(store);
-verbose(2, "Filled in missing types and integrated orphan ends\n");
-makeMarkovChains(store);
-verbose(2, "Made Markov Chains\n");
 struct wordTree *wt = store->markovChains;
 alphaStoreNormalize(store, outSize);
 verbose(2, "Normalized Markov Chains\n");
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
-wordTreeGenerateFile(store, store->maxChainSize, 
-    pickWeightedRandomFromList(wt->children), outSize, outFile);
+struct dlList *ll = wordTreeMakeList(store, store->maxChainSize, 
+    pickWeightedRandomFromList(wt->children), outSize);
+writeMonomerList(outFile, ll);
+verbose(2, "Wrote primary output\n");
+dlListFree(&ll);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 }
 
+void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
+/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+ * been parsed into various repeat monomer variants.  Cycles of these variants tend to
+ * form higher order repeats. */
+{
+/* This routine reads in the input,  and then calls a routine that produces the
+ * output for a given size.  If not all monomers are included in the output, then it
+ * will try to find the minimum output size needed to include all monomers. */
+
+/* Read in inputs, and put in "store" */
+struct alphaStore *store = alphaStoreNew(maxChainSize);
+alphaReadListFromFile(readsFile, store);
+alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
+verbose(2, "Loaded input reads and monomer order\n");
+
+/* Do some cross checking and filling out a bit for missing data. */
+crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile);
+fillInTypes(store);
+integrateOrphans(store);
+verbose(2, "Filled in missing types and integrated orphan ends\n");
+
+/* Make the main markov chains out of the reads. */
+makeMarkovChains(store);
+verbose(2, "Made Markov Chains\n");
+
+alphaAsmOneSize(store, initialOutSize, outFile);
+}
+
+
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
-outSize = optionInt("outSize", outSize);
+initialOutSize = optionInt("initialOutSize", initialOutSize);
+maxOutSize = optionInt("maxOutSize", initialOutSize);  // Defaults to same as initialOutSize
+if (maxOutSize < initialOutSize)
+    errAbort("maxOutSize (%d) needs to be bigger than initialOutSize (%d)\n", 
+	maxOutSize, initialOutSize);
 fullOnly = optionExists("fullOnly");
 pseudoCount = optionInt("pseudoCount", pseudoCount);
 int seed = optionInt("seed", 0);
 srand(seed);
 alphaAsm(argv[1], argv[2], argv[3]);
 return 0;
 }