src/kehayden/alphaAsm/alphaAsm.c 718e8066b72f9745865bd19cdd986b3debee1a71

718e8066b72f9745865bd19cdd986b3debee1a71
kent
  Wed Dec 19 15:56:03 2012 -0800
Making subCenterInNeighborhood routine work on a wider variety of input and putting in some additional logging/debugging options.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index d02b7a3..449b51b 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -1,26 +1,27 @@
 /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
 #include "obscure.h"
+#include "errabort.h"
 
 /* Global vars - all of which can be set by command line options. */
 int pseudoCount = 1;
 int maxChainSize = 3;
 int outSize = 10000;
 boolean fullOnly = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs based on Sanger style reads.\n"
   "usage:\n"
   "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
@@ -28,43 +29,45 @@
   "line, with the first word in the line being the read id and subsequent words the monomers\n"
   "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
   "each variant. The lines are in the same order the monomers appear, with the last line\n"
   "being followed in order by the first since they repeat.   The output.txt contains one line\n"
   "for each monomer in the output, just containing the monomer symbol.\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -fullOnly - Only output chains of size above\n"
   "   -chain=fileName - Write out word chain to file\n"
   "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
   "   -outSize=N - Output this many words. Default %d\n"
   "   -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n"
   "                observed.  Default %d\n"
   "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
   "             it will generate different results each time it's run.\n"
+  "   -unsubbed=fileName - Write output before substitution of missing monomers to file\n"
   , maxChainSize, outSize, pseudoCount
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"chain", OPTION_STRING},
    {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"outSize", OPTION_INT},
-   {"seed", OPTION_INT},
    {"pseudoCount", OPTION_INT},
+   {"seed", OPTION_INT},
+   {"unsubbed", OPTION_STRING},
    {NULL, 0},
 };
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
@@ -879,41 +882,47 @@
 	     node->val = center;
 	     return TRUE;
 	     }
 	 ++currentIx;
 	 }
     }
 internalErr();	// Should not get here.
 return FALSE;	
 }
 
 boolean subCenterInNeighborhood(struct alphaStore *store, 
     struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll)
 /* Scan ll for cases where neighborhood around center matches.  Replace one of these 
  * cases with center. */
 {
-assert(slCount(neighborhood) == 3);	// Simplifies things and is true for now.
+int size = slCount(neighborhood);
+assert(size == 3 || size == 2);
 if (subCommonCenter(store, center, neighborhood, ll))
    return TRUE;
+if (size == 3)
+    {
 if (subCommonCenter(store, center, neighborhood->next, ll))
    return TRUE;
 struct monomerRef *third = neighborhood->next->next;
 neighborhood->next->next = NULL;
 boolean ok = subCommonCenter(store, center, neighborhood, ll);
 neighborhood->next->next = third;
 return ok;
 }
+else
+    return FALSE;
+}
 
 struct monomer *mostCommonInType(struct monomerType *type)
 /* Return most common monomer of given type */
 {
 struct monomerRef *ref;
 int commonCount = 0;
 struct monomer *common = NULL;
 for (ref = type->list; ref != NULL; ref = ref->next)
     {
     struct monomer *monomer = ref->val;
     if (monomer->outCount > commonCount)
         {
 	commonCount = monomer->outCount;
 	common = monomer;
 	}
@@ -940,53 +949,62 @@
     }
 }
 
 void subInMissing(struct alphaStore *store, struct dlList *ll)
 /* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */
 {
 struct slRef *unusedList = listUnusedMonomers(store, ll);
 verbose(2, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList));
 struct slRef *unusedRef;
 for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next)
     {
     struct monomer *unused = unusedRef->val;
     struct monomerRef *neighborhood = findNeighborhoodFromReads(unused);
     if (!subCenterInNeighborhood(store, unused, neighborhood, ll))
 	{
-	verbose(2, "Couldn't substitute in %s with context, falling back to type logic", 
+	verbose(2, "Couldn't substitute in %s with context, falling back to type logic\n", 
 	    unused->word);
         subIntoFirstMostCommonOfType(store, unused, ll);
 	}
     slFreeList(&neighborhood);
     }
 }
 
-static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, 
-	int maxOutputWords, char *fileName)
-/* Create file containing words base on tree probabilities.  The wordTreeGenerateList does
- * most of work. */
+static void writeMonomerList(char *fileName, struct dlList *ll)
+/* Write out monomer list to file. */
 {
-struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
-subInMissing(store, ll);
 FILE *f = mustOpen(fileName, "w");
 struct dlNode *node;
 for (node = ll->head; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     fprintf(f, "%s\n", monomer->word);
     }
 carefulClose(&f);
+}
+
+static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, 
+	int maxOutputWords, char *fileName)
+/* Create file containing words base on tree probabilities.  The wordTreeGenerateList does
+ * most of work. */
+{
+struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords);
+char *unsubbedFile = optionVal("unsubbed", NULL);
+if (unsubbedFile)
+    writeMonomerList(unsubbedFile, ll);
+subInMissing(store, ll);
+writeMonomerList(fileName, ll);
 dlListFree(&ll);
 }
 
 
 struct alphaStore *alphaStoreNew(int maxChainSize)
 /* Allocate and initialize a new word store. */
 {
 struct alphaStore *store;
 AllocVar(store);
 store->maxChainSize = maxChainSize;
 store->monomerHash = hashNew(0);
 store->typeHash = hashNew(0);
 store->readHash = hashNew(0);
 return store;
 }