9b3a97eac6e90e14eb908d1f790d25d57c8d4479
kent
  Tue Aug 28 17:18:35 2012 -0700
Loading in monomer order file.
diff --git src/kehayden/alphaChain/alphaChain.c src/kehayden/alphaChain/alphaChain.c
index af954a7..45bc5c0 100644
--- src/kehayden/alphaChain/alphaChain.c
+++ src/kehayden/alphaChain/alphaChain.c
@@ -45,36 +45,52 @@
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct wordInfo
 /* Basic information on a word including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct wordInfo *next;	/* Next in list of all words. */
     char *word;			/* The word itself.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
     };
 
+struct wordInfoRef
+/* A list node that points to a wordInfo rather than containing one. */
+    {
+    struct wordInfoRef *next;	/* Next in list */
+    struct wordInfo *val;	/* The word referred to; points at an existing wordInfo. */
+    };
+
+struct wordType
+/* A collection of words of the same type - that is, all variants of one monomer type. */
+    {
+    struct wordType *next;   /* Next wordType in list */
+    struct wordInfoRef *list;	    /* List of references to all words of that type */
+    };
+
 struct wordStore
 /* Stores info on all words */
     {
-    struct wordInfo *list;   /* List of words, fairly arbitrary order. */
-    struct hash *hash;	     /* Hash of wordInfo, keyed by word. */
+    struct wordInfo *infoList;   /* List of words, fairly arbitrary order. */
+    struct hash *infoHash;	     /* Hash of wordInfo, keyed by word. */
     struct wordTree *markovChains;   /* Tree of words that follow other words. */
+    struct wordType *typeList;	/* List of all types, one per line of the monomer order file. */
+    struct hash *typeHash;	/* Hash keyed by word; value is the wordType containing that word. */
     };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable.  At this chain size the
  * text 
  *     this is the black dog and the black cat
  * would have the chains 
  *     this is the 
  *     is the black
  *     the black dog
  *     black dog and
  *     dog and the
  *     and the black
@@ -487,43 +503,44 @@
 
 static void wordTreeSort(struct wordTree *wt)
 /* Sort all children lists in tree. */
 {
 slSort(&wt->children, wordTreeCmpWord);
 struct wordTree *child;
 for (child = wt->children; child != NULL; child = child->next)
     wordTreeSort(child);
 }
 
 struct wordStore *wordStoreNew()
 /* Allocate and initialize a new word store. */
 {
 struct wordStore *store;
 AllocVar(store);
-store->hash = hashNew(0);
+store->infoHash = hashNew(0);
 return store;
 }
 
 struct wordInfo *wordStoreAdd(struct wordStore *store, char *word)
 /* Add word to store,  incrementing it's useCount if it's already there, otherwise
  * making up a new record for it. */
 {
-struct wordInfo *info = hashFindVal(store->hash, word);
+struct wordInfo *info = hashFindVal(store->infoHash, word);
 if (info == NULL)
     {
     AllocVar(info);
-    hashAddSaveName(store->hash, word, info, &info->word);
+    hashAddSaveName(store->infoHash, word, info, &info->word);
+    slAddHead(&store->infoList, info);
     }
 info->useCount += 1;
 return info;
 }
 
 struct wordStore *wordStoreForChainsInFile(char *fileName, int chainSize)
 /* Return a wordStore containing all words, and also all chains-of-words of length 
  * chainSize seen in file.  */
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
 
 /* We'll build up the tree starting with an empty root node. */
 struct wordStore *store = wordStoreNew();
@@ -586,35 +603,71 @@
 lineFileClose(&lf);
 
 wordTreeSort(wt);  // Make output of chain file prettier
 return store;
 }
 
 void wordTreeWrite(struct wordTree *wt, char *fileName)
 /* Write out tree to file */
 {
 FILE *f = mustOpen(fileName, "w");
 fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n");
 wordTreeDump(0, wt, f);
 carefulClose(&f);
 }
 
+void wordStoreLoadMonomerOrder(struct wordStore *store, char *readsFile, char *fileName)
+/* Read fileName, which has one line for each monomer type containing a word for each monomer
+ * variant, and fill in store->typeList and store->typeHash from it.  Aborts if a word is not
+ * already in store->infoHash.  The readsFile is used only in that error message. */
+{
+/* Stuff for processing file a line at a time. */
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *line, *word;
+
+/* Start store off with a fresh type hash and an empty type list, replacing any old values. */
+store->typeHash = hashNew(0);
+store->typeList = NULL;
+
+while (lineFileNextReal(lf, &line))	/* Each line returned becomes one wordType. */
+    {
+    struct wordType *type;
+    AllocVar(type);
+    slAddHead(&store->typeList, type);	/* NOTE(review): head insert, see note on type->list. */
+    while ((word = nextWord(&line)) != NULL)	/* Each word on the line is one variant. */
+        {
+	struct wordInfo *info = hashFindVal(store->infoHash, word);
+	if (info == NULL)
+	    errAbort("%s is in %s but not %s", word, lf->fileName, readsFile);
+	struct wordInfoRef *ref;
+	AllocVar(ref);
+	ref->val = info;
+	slAddHead(&type->list, ref);	/* NOTE(review): presumably leaves list in reverse file order - confirm. */
+	hashAddUnique(store->typeHash, word, type);	/* Presumably rejects a repeated word - confirm. */
+	}
+    }
+lineFileClose(&lf);
+verbose(2, "Added %d types containing %d words from %s\n", 
+    slCount(store->typeList), store->typeHash->elCount, fileName);
+}
+
 void alphaChain(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaChain - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct wordStore *store = wordStoreForChainsInFile(readsFile, maxChainSize);
 struct wordTree *wt = store->markovChains;
+wordStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
 wordTreeNormalize(wt, outSize, 1.0);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, fileName);
     }
 
 wordTreeGenerateFaux(store, maxChainSize, pickRandom(wt->children), outSize, outFile);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, fileName);
     }