src/kehayden/alphaAsm/alphaAsm.c ddc2f8d86b21acf0665c9264e9232cdba313ae12

ddc2f8d86b21acf0665c9264e9232cdba313ae12
kent
  Wed Mar 6 14:30:23 2013 -0800
Adding maxToMiss and maxOutSize parameters to let program expand output size if need be to get in all monomers, or to miss a few monomers.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 5a0e540..a697bae 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -4,70 +4,73 @@
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
 #include "obscure.h"
 #include "errabort.h"
 
 /* Global vars - all of which can be set by command line options. */
 int pseudoCount = 1;
 int maxChainSize = 3;
 int initialOutSize = 10000;
 int maxOutSize;
+int maxToMiss = 0;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs based on Sanger style reads.\n"
   "usage:\n"
   "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
   "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n"
   "line, with the first word in the line being the read id and subsequent words the monomers\n"
   "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
   "each variant. The lines are in the same order the monomers appear, with the last line\n"
   "being followed in order by the first since they repeat.   The output.txt contains one line\n"
   "for each monomer in the output, just containing the monomer symbol.\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -fullOnly - Only output chains of size above\n"
   "   -chain=fileName - Write out word chain to file\n"
   "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
   "   -initialOutSize=N - Output this many words. Default %d\n"
   "   -maxOutSize=N - Expand output up to this many words if not all monomers output initially\n"
+  "   -maxToMiss=N - How many monomers may be missing from output. Default %d \n"
   "   -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n"
   "                observed.  Default %d\n"
   "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
   "             it will generate different results each time it's run.\n"
   "   -unsubbed=fileName - Write output before substitution of missing monomers to file\n"
-  , maxChainSize, initialOutSize, pseudoCount
+  , maxChainSize, initialOutSize, maxToMiss, pseudoCount
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"chain", OPTION_STRING},
    {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"initialOutSize", OPTION_INT},
-   {"maxOUtSize", OPTION_INT},
+   {"maxOutSize", OPTION_INT},
+   {"maxToMiss", OPTION_INT},
    {"pseudoCount", OPTION_INT},
    {"seed", OPTION_INT},
    {"unsubbed", OPTION_STRING},
    {NULL, 0},
 };
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
@@ -1604,91 +1607,145 @@
 /* Do first pass on ones defined - just have to look them up in hash. */
 struct monomer *monomer;
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     monomer->type = hashFindVal(store->typeHash, monomer->word);
 
 /* Do second pass on ones that are not yet defined */
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     {
     if (monomer->type == NULL)
 	{
 	monomer->type = fillInTypeFromReads(monomer, store);
 	}
     }
 }
 
-void alphaAsmOneSize(struct alphaStore *store, int outSize, char *outFile)
+struct dlList *alphaAsmOneSize(struct alphaStore *store, int outSize)
 /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
- * form higher order repeats. */
+ * form higher order repeats. Returns list of outSize monomers. */
 {
 struct wordTree *wt = store->markovChains;
 alphaStoreNormalize(store, outSize);
 verbose(2, "Normalized Markov Chains\n");
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
 struct dlList *ll = wordTreeMakeList(store, store->maxChainSize, 
     pickWeightedRandomFromList(wt->children), outSize);
-writeMonomerList(outFile, ll);
-verbose(2, "Wrote primary output\n");
-dlListFree(&ll);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
+
+return ll;
+}
+
+int countMissingMonomers(struct dlList *ll, struct monomer *monomerList)
+/* Set the outCounts in monomer list to match how often they occur in ll.
+ * Return number of monomers that do not occur at all in ll. */
+{
+/* Zero out output counts. */
+struct monomer *monomer;
+for (monomer = monomerList; monomer != NULL; monomer = monomer->next)
+    monomer->outCount = 0;
+
+/* Increase output count each time a monomer is used. */
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    monomer = node->val;
+    monomer->outCount += 1;
+    }
+
+/* Count up unused. */
+int missing = 0;
+for (monomer = monomerList; monomer != NULL; monomer = monomer->next)
+    if (monomer->outCount == 0)
+        missing += 1;
+
+return missing;
 }
 
 void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 {
 /* This routine reads in the input,  and then calls a routine that produces the
  * output for a given size.  If not all monomers are included in the output, then it
  * will try to find the minimum output size needed to include all monomers. */
 
 /* Read in inputs, and put in "store" */
 struct alphaStore *store = alphaStoreNew(maxChainSize);
 alphaReadListFromFile(readsFile, store);
 alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
 verbose(2, "Loaded input reads and monomer order\n");
 
 /* Do some cross checking and filling out a bit for missing data. */
 crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile);
 fillInTypes(store);
 integrateOrphans(store);
 verbose(2, "Filled in missing types and integrated orphan ends\n");
 
 /* Make the main markov chains out of the reads. */
 makeMarkovChains(store);
 verbose(2, "Made Markov Chains\n");
 
-alphaAsmOneSize(store, initialOutSize, outFile);
+/* Loop gradually increasing size of output we make until get to maxOutSize or until
+ * we generate output that includes all input monomers. */
+struct dlList *ll;
+int outSize = initialOutSize;
+while (outSize <= maxOutSize)
+    {
+    ll = alphaAsmOneSize(store, outSize);
+    assert(outSize == dlCount(ll));
+    int missing = countMissingMonomers(ll, store->monomerList);
+    if (missing <= maxToMiss)
+        break;
+    if (outSize == maxOutSize)
+	{
+	errAbort("Could not include all monomers. Missing %d.\n"
+	         "consider increasing maxOutSize (now %d) or increasing maxToMiss (now %d)",
+	         missing, maxOutSize, maxToMiss);
+        break;
+	}
+    dlListFree(&ll);
+    outSize *= 1.1;	/* Grow by 10% */
+    if (outSize > maxOutSize)
+        outSize = maxOutSize;
+    verbose(1, "%d missing. Trying again with outSize=%d (initialOutSize %d, maxOutSize %d)\n",
+	missing, outSize, initialOutSize, maxOutSize);
+    }
+    
+writeMonomerList(outFile, ll);
+verbose(2, "Wrote primary output\n");
+dlListFree(&ll);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
 initialOutSize = optionInt("initialOutSize", initialOutSize);
 maxOutSize = optionInt("maxOutSize", initialOutSize);  // Defaults to same as initialOutSize
 if (maxOutSize < initialOutSize)
     errAbort("maxOutSize (%d) needs to be bigger than initialOutSize (%d)\n", 
 	maxOutSize, initialOutSize);
+maxToMiss = optionInt("maxToMiss", maxToMiss);
 fullOnly = optionExists("fullOnly");
 pseudoCount = optionInt("pseudoCount", pseudoCount);
 int seed = optionInt("seed", 0);
 srand(seed);
 alphaAsm(argv[1], argv[2], argv[3]);
 return 0;
 }