f34c13227dfebde7e3d30032e235f88c25a8ae99
kent
  Wed Jun 19 13:20:52 2013 -0700
Moving alphaAsm to linearSat, a better name.
diff --git src/kehayden/linearSat/linearSat.c src/kehayden/linearSat/linearSat.c
index 9ce4ab6..edaf2c2 100644
--- src/kehayden/linearSat/linearSat.c
+++ src/kehayden/linearSat/linearSat.c
@@ -1,46 +1,46 @@
-/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+/* linearSat - assemble repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
 #include "obscure.h"
 #include "errabort.h"
 
 /* Global vars - all of which can be set by command line options. */
 int pseudoCount = 1;
 int maxChainSize = 3;
 int initialOutSize = 10000;
 int maxOutSize;
 int maxToMiss = 0;
 boolean fullOnly = FALSE;
 boolean betweens = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
+  "linearSat - create a linear projection of satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs based on Sanger style reads.\n"
   "usage:\n"
-  "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
-  "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n"
+  "   linearSat monomerFile.txt monomerOrder.txt output.txt\n"
+  "Where monomerFile is a list of reads with monomers parsed out, one read per\n"
   "line, with the first word in the line being the read id and subsequent words the monomers\n"
   "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
   "each variant. The lines are in the same order the monomers appear, with the last line\n"
   "being followed in order by the first since they repeat.   The output.txt contains one line\n"
   "for each monomer in the output, just containing the monomer symbol.\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -fullOnly - Only output chains of size above\n"
   "   -chain=fileName - Write out word chain to file\n"
   "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
   "   -initialOutSize=N - Output this many words. Default %d\n"
   "   -maxOutSize=N - Expand output up to this many words if not all monomers output initially\n"
   "   -maxToMiss=N - How many monomers may be missing from output. Default %d \n"
   "   -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n"
   "                observed.  Default %d\n"
@@ -56,31 +56,31 @@
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"chain", OPTION_STRING},
    {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"initialOutSize", OPTION_INT},
    {"maxOutSize", OPTION_INT},
    {"maxToMiss", OPTION_INT},
    {"pseudoCount", OPTION_INT},
    {"seed", OPTION_INT},
    {"unsubbed", OPTION_STRING},
    {"betweens", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
-/* Some structures to keep track of words (which correspond to alpha satellight monomers)
+/* Some structures to keep track of words (which correspond to satellite monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
     struct monomerType *type;	/* The type of the monomer. */
     struct slRef *readList;	/* List of references to reads this is in. */
     int subbedOutCount;		/* Output count after substitution. */
     };
@@ -1682,32 +1682,32 @@
 /* Do first pass on ones defined - just have to look them up in hash. */
 struct monomer *monomer;
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     monomer->type = hashFindVal(store->typeHash, monomer->word);
 
 /* Do second pass on ones that are not yet defined */
 for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
     {
     if (monomer->type == NULL)
 	{
 	monomer->type = fillInTypeFromReads(monomer, store);
 	}
     }
 }
 
-struct dlList *alphaAsmOneSize(struct alphaStore *store, int outSize)
-/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+struct dlList *linearSatOneSize(struct alphaStore *store, int outSize)
+/* linearSat - assemble repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. Returns list of outSize monomers. */
 {
 struct wordTree *wt = store->markovChains;
 alphaStoreNormalize(store, outSize);
 verbose(2, "Normalized Markov Chains\n");
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
 struct dlList *ll = wordTreeMakeList(store, store->maxChainSize, 
     pickWeightedRandomFromList(wt->children), outSize);
@@ -1735,32 +1735,32 @@
 for (node = ll->head; !dlEnd(node); node = node->next)
     {
     monomer = node->val;
     monomer->outCount += 1;
     }
 
 /* Count up unused. */
 int missing = 0;
 for (monomer = monomerList; monomer != NULL; monomer = monomer->next)
     if (monomer->outCount == 0 && !sameString(monomer->word, ""))
         missing += 1;
 
 return missing;
 }
 
-void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
-/* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
+void linearSat(char *readsFile, char *monomerOrderFile, char *outFile)
+/* linearSat - assemble repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 {
 /* This routine reads in the input,  and then calls a routine that produces the
  * output for a given size.  If not all monomers are included in the output, then it
  * will try to find the minimum output size needed to include all monomers. */
 
 /* Read in inputs, and put in "store" */
 struct alphaStore *store = alphaStoreNew(maxChainSize);
 alphaReadListFromFile(readsFile, store);
 alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
 verbose(2, "Loaded input reads and monomer order\n");
 
 if (betweens)
     {
@@ -1772,31 +1772,31 @@
 crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile);
 fillInTypes(store);
 integrateOrphans(store);
 verbose(2, "Filled in missing types and integrated orphan ends\n");
 
 /* Make the main markov chains out of the reads. */
 store->markovChains = makeMarkovChains(store);
 verbose(2, "Made Markov Chains\n");
 
 /* Loop gradually increasing size of output we make until get to maxOutSize or until
  * we generate output that includes all input monomers. */
 struct dlList *ll;
 int outSize = initialOutSize;
 while (outSize <= maxOutSize)
     {
-    ll = alphaAsmOneSize(store, outSize);
+    ll = linearSatOneSize(store, outSize);
     assert(outSize == dlCount(ll));
     int missing = countMissingMonomers(ll, store->monomerList);
     if (missing <= maxToMiss)
         break;
     if (outSize == maxOutSize)
 	{
 	errAbort("Could not include all monomers. Missing %d.\n"
 	         "consider increasing maxOutSize (now %d) or increasing maxToMiss (now %d)",
 	         missing, maxOutSize, maxToMiss);
         break;
 	}
     dlListFree(&ll);
     outSize *= 1.1;	/* Grow by 10% */
     if (outSize > maxOutSize)
         outSize = maxOutSize;
@@ -1815,18 +1815,18 @@
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
 initialOutSize = optionInt("initialOutSize", initialOutSize);
 maxOutSize = optionInt("maxOutSize", initialOutSize);  // Defaults to same as initialOutSize
 if (maxOutSize < initialOutSize)
     errAbort("maxOutSize (%d) needs to be bigger than initialOutSize (%d)\n", 
 	maxOutSize, initialOutSize);
 maxToMiss = optionInt("maxToMiss", maxToMiss);
 fullOnly = optionExists("fullOnly");
 pseudoCount = optionInt("pseudoCount", pseudoCount);
 betweens = optionExists("betweens");
 int seed = optionInt("seed", 0);
 srand(seed);
-alphaAsm(argv[1], argv[2], argv[3]);
+linearSat(argv[1], argv[2], argv[3]);
 return 0;
 }