src/kehayden/alphaAsm/alphaAsm.c c61f4e79c1c7cfd78b37d4bfa13e00fd8789b043

c61f4e79c1c7cfd78b37d4bfa13e00fd8789b043
kent
  Wed Apr 3 16:32:52 2013 -0700
Addint betweens option.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 77875ed..d338108 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -6,74 +6,77 @@
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "dlist.h"
 #include "obscure.h"
 #include "errabort.h"
 
 /* Global vars - all of which can be set by command line options. */
 int pseudoCount = 1;
 int maxChainSize = 3;
 int initialOutSize = 10000;
 int maxOutSize;
 int maxToMiss = 0;
 boolean fullOnly = FALSE;
+boolean betweens = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "alphaAsm - create a linear projection of alpha satellite arrays using the probablistic model\n"
   "of HuRef satellite graphs based on Sanger style reads.\n"
   "usage:\n"
   "   alphaAsm alphaMonFile.txt monomerOrder.txt output.txt\n"
   "Where alphaMonFile is a list of reads with alpha chains monomers parsed out, one read per\n"
   "line, with the first word in the line being the read id and subsequent words the monomers\n"
   "within the read. The monomerOrder.txt has one line per major monomer type with a word for\n"
   "each variant. The lines are in the same order the monomers appear, with the last line\n"
   "being followed in order by the first since they repeat.   The output.txt contains one line\n"
   "for each monomer in the output, just containing the monomer symbol.\n"
   "options:\n"
   "   -size=N - Set max chain size, default %d\n"
   "   -fullOnly - Only output chains of size above\n"
   "   -chain=fileName - Write out word chain to file\n"
   "   -afterChain=fileName - Write out word chain after faux generation to file for debugging\n"
   "   -initialOutSize=N - Output this many words. Default %d\n"
   "   -maxOutSize=N - Expand output up to this many words if not all monomers output initially\n"
   "   -maxToMiss=N - How many monomers may be missing from output. Default %d \n"
   "   -pseudoCount=N - Add this number to observed quantities - helps compensate for what's not\n"
   "                observed.  Default %d\n"
   "   -seed=N - Initialize random number with this seed for consistent results, otherwise\n"
   "             it will generate different results each time it's run.\n"
+  "   -betweens - Add <m> lines in output that indicate level of markov model used join\n"
   "   -unsubbed=fileName - Write output before substitution of missing monomers to file\n"
   , maxChainSize, initialOutSize, maxToMiss, pseudoCount
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"size", OPTION_INT},
    {"chain", OPTION_STRING},
    {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"initialOutSize", OPTION_INT},
    {"maxOutSize", OPTION_INT},
    {"maxToMiss", OPTION_INT},
    {"pseudoCount", OPTION_INT},
    {"seed", OPTION_INT},
    {"unsubbed", OPTION_STRING},
+   {"betweens", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
@@ -101,30 +104,31 @@
 /* A read that's been parsed into alpha variants. */
     {
     struct alphaRead *next;	/* Next in list */
     char *name;			/* Id of read, not allocated here. */
     struct monomerRef *list;	/* List of alpha subunits */
     };
 
 struct alphaStore
 /* Stores info on all monomers */
     {
     struct monomer *monomerList;   /* List of words, fairly arbitrary order. */
     struct hash *monomerHash;       /* Hash of monomer, keyed by word. */
     struct alphaRead *readList;	 /* List of all reads. */
     struct hash *readHash;	 /* Hash of all reads keyed by read ID. */
     struct wordTree *markovChains;   /* Tree of words that follow other words. */
+    struct wordTree *markovChainsNoOrphans; /* Markov tree before orphans are added. */
     int maxChainSize;		/* Maximum depth of tree. */
     struct monomerType *typeList;	/* List of all types. */
     struct hash *typeHash;	/* Hash with monomerType values, keyed by all words. */
     int typeCount;		/* Count of types. */
     struct monomerType **typeArray;  /* Array as opposed to list of types */
     };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable.  At this chain size the
  * text 
  *     this is the black dog and the black cat
  * would have the chains 
  *     this is the 
@@ -247,30 +251,91 @@
 }
 
 void addChainToTree(struct wordTree *wt, struct dlList *chain)
 /* Add chain of words to tree. */
 {
 struct dlNode *node;
 wt->useCount += 1;
 for (node = chain->head; !dlEnd(node); node = node->next)
     {
     struct monomer *monomer = node->val;
     verbose(4, "  adding %s\n", monomer->word);
     wt = wordTreeAddFollowing(wt, monomer);
     }
 }
 
+boolean gotMatchInTree(struct wordTree *wt, struct dlNode *nodeList, int chainSize)
+/* Return TRUE if find node list in tree*/
+{
+int i;
+struct wordTree *subTree = wt;
+struct dlNode *node = nodeList;
+for (i=0; i<chainSize; ++i)
+    {
+    struct monomer *monomer = node->val;
+    subTree = wordTreeFindInList(subTree->children, monomer);
+    if (subTree == NULL)
+        return FALSE;
+    node = node->next;
+    }
+return TRUE;
+}
+
+
+void findLongestSupportingMarkovChain(struct wordTree *wt, struct dlNode *node,
+    int *retChainSize, int *retReadCount)
+/* See if chain of words is in tree tree. */
+{
+struct dlNode *start = node;
+int chainSize = 1;
+for (;;)
+    {
+    if (!gotMatchInTree(wt, start, chainSize))
+        break;
+    chainSize += 1;
+    start = start->prev;
+    if (dlStart(start))
+        break;
+    }
+*retChainSize = chainSize;
+*retReadCount = 0;	// Not implemented.
+}
+
+static void writeMonomerListAndBetweens(struct alphaStore *store, 
+    char *fileName, struct dlList *ll)
+/* Write out monomer list to file. */
+{
+FILE *f = mustOpen(fileName, "w");
+struct dlNode *node;
+for (node = ll->head; !dlEnd(node); node = node->next)
+    {
+    struct monomer *monomer = node->val;
+    if (betweens)
+	{
+	int chainSize = 0, readCount = 0;
+	findLongestSupportingMarkovChain(store->markovChainsNoOrphans, node, 
+	    &chainSize, &readCount);
+	/* The -2 is for 1 extra for the empty tree root, and 1 extra to get
+	 * from chain-size to markov model terminology. */
+	fprintf(f, "<%d> ", chainSize-2); 
+	}
+    fprintf(f, "%s\n", monomer->word);
+    }
+carefulClose(&f);
+}
+
+
 int wordTreeAddPseudoCount(struct wordTree *wt, int pseudo)
 /* Add pseudo to all leaves of tree and propagate counts up to parents. */
 {
 if (wt->children == NULL)
     {
     wt->useCount += pseudo;
     return wt->useCount;
     }
 else
     {
     struct wordTree *child;
     int oldChildTotal = 0;
     for (child = wt->children; child != NULL; child = child->next)
 	oldChildTotal += child->useCount;
     int oldDiff = wt->useCount - oldChildTotal;
@@ -1319,36 +1384,36 @@
     if (end->paired)
         continue;
     struct monomer *endMono = end->monomer;
     struct monomerType *endType = endMono->type;
     if (endType == NULL)
         continue;
 
     struct monomerType *newType = typeAfter(store, endType, 1);
     verbose(3, "Trying to find start of type %s\n", newType->name);
     struct monomer *newMono = pickRandomFromType(newType);
     addReadOfTwo(store, endMono, newMono);
     verbose(3, "Pairing end %s with new %s\n", endMono->word, newMono->word);
     }
 }
 
-void makeMarkovChains(struct alphaStore *store)
-/* Return a alphaStore containing all words, and also all chains-of-words of length 
+struct wordTree *makeMarkovChains(struct alphaStore *store)
+/* Return wordTree containing all words, and also all chains-of-words of length 
  * chainSize seen in file.  */
 {
 /* We'll build up the tree starting with an empty root node. */
-struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, ""));	
+struct wordTree *wt = wordTreeNew(alphaStoreAddMonomer(store, ""));	
 int chainSize = store->maxChainSize;
 
 /* Loop through each read. There's special cases at the beginning and end of read, and for 
  * short reads.  In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize 
  * words, * popping off one word from the start, and adding one word to the end for each
  * new word we encounter. This list is added to the tree each iteration. */
 struct alphaRead *read;
 for (read = store->readList; read != NULL; read = read->next)
     {
     /* We'll keep a chain of three or so words in a doubly linked list. */
     struct dlNode *node;
     struct dlList *chain = dlListNew();
     int curSize = 0;
     int wordCount = 0;
 
@@ -1380,30 +1445,31 @@
 	++wordCount;
 	}
     /* Handle last few words in line, where can't make a chain of full size.  Also handles       
     * lines that have fewer than chain size words. */
     if (curSize < chainSize)
  	addChainToTree(wt, chain);
     while ((node = dlPopHead(chain)) != NULL)
 	{
 	if (!dlEmpty(chain))
 	    addChainToTree(wt, chain);
 	freeMem(node);
 	}
     dlListFree(&chain);
     }
 wordTreeSort(wt);  // Make output of chain file prettier
+return wt;
 }
 
 void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName)
 /* Write out tree to file */
 {
 FILE *f = mustOpen(fileName, "w");
 fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n");
 wordTreeDump(0, wt, maxChainSize, f);
 carefulClose(&f);
 }
 
 void alphaStoreLoadMonomerOrder(struct alphaStore *store, char *readsFile, char *fileName)
 /* Read in a file with one line for each monomer type, where first word is type name,
  * and rest of words are all the actually variants of that type.
  * Requires all variants already be in store.  The readsFile is passed
@@ -1675,77 +1741,83 @@
 void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 {
 /* This routine reads in the input,  and then calls a routine that produces the
  * output for a given size.  If not all monomers are included in the output, then it
  * will try to find the minimum output size needed to include all monomers. */
 
 /* Read in inputs, and put in "store" */
 struct alphaStore *store = alphaStoreNew(maxChainSize);
 alphaReadListFromFile(readsFile, store);
 alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
 verbose(2, "Loaded input reads and monomer order\n");
 
+if (betweens)
+    {
+    store->markovChainsNoOrphans = makeMarkovChains(store);
+    verbose(2, "Made initial markov chains\n");
+    }
+
 /* Do some cross checking and filling out a bit for missing data. */
 crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile);
 fillInTypes(store);
 integrateOrphans(store);
 verbose(2, "Filled in missing types and integrated orphan ends\n");
 
 /* Make the main markov chains out of the reads. */
-makeMarkovChains(store);
+store->markovChains = makeMarkovChains(store);
 verbose(2, "Made Markov Chains\n");
 
 /* Loop gradually increasing size of output we make until get to maxOutSize or until
  * we generate output that includes all input monomers. */
 struct dlList *ll;
 int outSize = initialOutSize;
 while (outSize <= maxOutSize)
     {
     ll = alphaAsmOneSize(store, outSize);
     assert(outSize == dlCount(ll));
     int missing = countMissingMonomers(ll, store->monomerList);
     if (missing <= maxToMiss)
         break;
     if (outSize == maxOutSize)
 	{
 	errAbort("Could not include all monomers. Missing %d.\n"
 	         "consider increasing maxOutSize (now %d) or increasing maxToMiss (now %d)",
 	         missing, maxOutSize, maxToMiss);
         break;
 	}
     dlListFree(&ll);
     outSize *= 1.1;	/* Grow by 10% */
     if (outSize > maxOutSize)
         outSize = maxOutSize;
     verbose(1, "%d missing. Trying again with outSize=%d (initialOutSize %d, maxOutSize %d)\n",
 	missing, outSize, initialOutSize, maxOutSize);
     }
-    
-writeMonomerList(outFile, ll);
+writeMonomerListAndBetweens(store, outFile, ll);
 verbose(2, "Wrote primary output\n");
 dlListFree(&ll);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 maxChainSize = optionInt("size", maxChainSize);
 initialOutSize = optionInt("initialOutSize", initialOutSize);
 maxOutSize = optionInt("maxOutSize", initialOutSize);  // Defaults to same as initialOutSize
 if (maxOutSize < initialOutSize)
     errAbort("maxOutSize (%d) needs to be bigger than initialOutSize (%d)\n", 
 	maxOutSize, initialOutSize);
 maxToMiss = optionInt("maxToMiss", maxToMiss);
 fullOnly = optionExists("fullOnly");
 pseudoCount = optionInt("pseudoCount", pseudoCount);
+betweens = optionExists("betweens");
 int seed = optionInt("seed", 0);
 srand(seed);
 alphaAsm(argv[1], argv[2], argv[3]);
 return 0;
 }