src/kehayden/alphaAsm/alphaAsm.c c0c04675b1d4865752984e71a0a6d9034a45d622

c0c04675b1d4865752984e71a0a6d9034a45d622
kent
  Tue Sep 11 16:11:30 2012 -0700
Adding missing types based on reads.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index ca818d4..104f156 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -49,65 +49,70 @@
    {NULL, 0},
 };
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
     char *word;			/* The word used to represent monomer.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
+    struct monomerType *type;	/* The type of the monomer. */
+    struct slRef *readList;	/* List of references to reads this is in. */
     };
 
 struct monomerRef
 /* A reference to a word. */
     {
     struct monomerRef *next;	/* Next in list */
     struct monomer *val;	/* The word referred to. */
     };
 
 struct monomerType
 /* A collection of words of the same type - or monomers of same type. */
     {
     struct monomerType *next;   /* Next monomerType */
+    char *name;			/* Short name of type */
     struct monomerRef *list;	    /* List of all words of that type */
     };
 
 struct alphaRead 
 /* A read that's been parsed into alpha variants. */
     {
     struct alphaRead *next;	/* Next in list */
     char *name;			/* Id of read, not allocated here. */
     struct monomerRef *list;	/* List of alpha subunits */
     };
 
 struct alphaStore
 /* Stores info on all words */
     {
     struct monomer *monomerList;   /* List of words, fairly arbitrary order. */
     struct hash *monomerHash;       /* Hash of monomer, keyed by word. */
     struct alphaRead *readList;	 /* List of all reads. */
     struct hash *readHash;	 /* Hash of all reads keyed by read ID. */
     struct wordTree *markovChains;   /* Tree of words that follow other words. */
     int maxChainSize;		/* Maximum depth of tree. */
     struct monomerType *typeList;	/* List of all types. */
     struct hash *typeHash;	/* Hash with monomerType values, keyed by all words. */
+    int typeCount;		/* Count of types. */
+    struct monomerType **typeArray;  /* Array as opposed to list of types */
     };
 
 /* The wordTree structure below is the central data structure for this program.  It is
  * used to build up a tree that contains all observed N-word-long sequences observed in
  * the text, where N corresponds to the "size" command line option which defaults to 3,
  * an option that in turn is stored in the maxChainSize variable.  At this chain size the
  * text 
  *     this is the black dog and the black cat
  * would have the chains 
  *     this is the 
  *     is the black
  *     the black dog
  *     black dog and
  *     dog and the
  *     and the black
@@ -669,30 +674,45 @@
 struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word)
 /* Add word to store,  incrementing it's useCount if it's already there, otherwise
  * making up a new record for it. */
 {
 struct monomer *monomer = hashFindVal(store->monomerHash, word);
 if (monomer == NULL)
     {
     AllocVar(monomer);
     hashAddSaveName(store->monomerHash, word, monomer, &monomer->word);
     slAddHead(&store->monomerList, monomer);
     }
 monomer->useCount += 1;
 return monomer;
 }
 
+void connectReadsToMonomers(struct alphaStore *store)
+/* Add each read monomer appears in to monomer's readList. */
+{
+struct alphaRead *read;
+for (read = store->readList; read != NULL; read = read->next)
+    {
+    struct monomerRef *ref;
+    for (ref = read->list; ref != NULL; ref = ref->next)
+        {
+	struct monomer *monomer = ref->val;
+	refAdd(&monomer->readList, read);
+	}
+    }
+}
+
 void alphaReadListFromFile(char *fileName, struct alphaStore *store)
 /* Read in file and turn it into a list of alphaReads. */
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
 
 /* Loop through each line of file making up a read for it. */
 struct alphaRead *readList = NULL;
 while (lineFileNextReal(lf, &line))
     {
     /* Parse out first word of line into name, and rest into list. */
     char *name = nextWord(&line);
     struct monomerRef *list = NULL;
     while ((word = nextWord(&line)) != NULL)
@@ -707,83 +727,96 @@
     if (list == NULL)
         errAbort("Line %d of %s has no alpha monomers.", lf->lineIx, lf->fileName);
 
     /* Create data structure and add read to list and hash */
     if (hashLookup(store->readHash, name))
         errAbort("Read ID %s is repeated line %d of %s", name, lf->lineIx, lf->fileName);
     struct alphaRead *read;
     AllocVar(read);
     read->list = list;
     hashAddSaveName(store->readHash, name, read, &read->name);
     slAddHead(&readList, read);
     }
 slReverse(&readList);
 store->readList = readList;
 lineFileClose(&lf);
+connectReadsToMonomers(store);
 }
 
 struct alphaOrphan
 /* Information about an orphan - something without great joining information. */
     {
     struct alphaOrphan *next;
     struct monomer *monomer;
     struct monomerType *type;
     };
 
-struct alphaOrphan *findOrphanStarts(struct alphaRead *readList)
+struct alphaOrphan *findOrphanStarts(struct alphaRead *readList, struct alphaStore *store)
 /* Return list of monomers that are found only at start of reads. */
 {
 struct alphaOrphan *orphanList = NULL;
 
 /* Build up hash of things seen later in reads. */
 struct alphaRead *read;
 struct hash *later = hashNew(0);    /* Hash of monomers found later. */
 for (read = readList; read != NULL; read = read->next)
     {
     struct monomerRef *ref;
     for (ref = read->list->next; ref != NULL; ref = ref->next)
         {
 	hashAdd(later, ref->val->word, NULL);
 	}
     }
 
 /* Pass through again collecting orphans. */
 struct hash *orphanHash = hashNew(0);
 for (read = readList; read != NULL; read = read->next)
     {
     struct monomer *monomer = read->list->val;
-    if (!hashLookup(later, monomer->word))
+    char *word = monomer->word;
+    if (!hashLookup(later, word))
         {
-	struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word);
+	struct alphaOrphan *orphan = hashFindVal(orphanHash, word);
 	if (orphan == NULL)
 	    {
 	    AllocVar(orphan);
 	    orphan->monomer = monomer;
 	    slAddHead(&orphanList, orphan);
-	    hashAdd(orphanHash, monomer->word, orphan);
-	    uglyf("Adding orphan start %s\n", monomer->word);
+	    hashAdd(orphanHash, word, orphan);
+	    uglyf("Adding orphan start %s\n", word);
+	    }
+
+	/* Try and find orphan type by consulting either itself or previous items in read. */
+	if (orphan->type == NULL)
+	    {
+	    struct monomerType *type = monomer->type;
+	    if (type == NULL) 
+	        {
+		uglyf("Can't find type for %s\n", word);
+		}
+	    else uglyf("%s is type %s\n", word, type->name);
+	    orphan->type = type;
 	    }
-	else uglyf("Repeating orphan start %s\n", monomer->word);
 	}
     }
 hashFree(&later);
 hashFree(&orphanHash);
 slReverse(&orphanList);
 return orphanList;
 }
 
-struct alphaOrphan *findOrphanEnds(struct alphaRead *readList)
+struct alphaOrphan *findOrphanEnds(struct alphaRead *readList, struct alphaStore *store)
 /* Return list of monomers that are found only at end of reads. */
 {
 struct alphaOrphan *orphanList = NULL;
 
 /* Build up hash of things seen sooner in reads. */
 struct alphaRead *read;
 struct hash *sooner = hashNew(0);    /* Hash of monomers found sooner. */
 for (read = readList; read != NULL; read = read->next)
     {
     struct monomerRef *ref;
     /* Loop over all but last. */
     for (ref = read->list; ref->next != NULL; ref = ref->next)
         {
 	hashAdd(sooner, ref->val->word, NULL);
 	}
@@ -807,32 +840,32 @@
 	    uglyf("Adding orphan end %s\n", monomer->word);
 	    }
 	else uglyf("Repeating orphan end %s\n", monomer->word);
 	}
     }
 hashFree(&sooner);
 slReverse(&orphanList);
 hashFree(&orphanHash);
 return orphanList;
 }
 
 void matchUpOrphans(struct alphaStore *store)
 /* Make up fake reads that integrate orphans (monomers only found at beginning or end
  * of a read) better. */
 {
-struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList);
-struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList);
+struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store);
+struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store);
 uglyf("orphanStarts has %d items, orphanEnds %d\n", slCount(orphanStarts), slCount(orphanEnds));
 }
 
 void makeMarkovChains(struct alphaStore *store)
 /* Return a alphaStore containing all words, and also all chains-of-words of length 
  * chainSize seen in file.  */
 {
 /* We'll build up the tree starting with an empty root node. */
 struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, ""));	
 int chainSize = store->maxChainSize;
 
 /* Loop through each read. There's special cases at the beginning and end of read, and for 
  * short reads.  In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize 
  * words, * popping off one word from the start, and adding one word to the end for each
  * new word we encounter. This list is added to the tree each iteration. */
@@ -885,89 +918,227 @@
     dlListFree(&chain);
     }
 wordTreeSort(wt);  // Make output of chain file prettier
 }
 
 void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName)
 /* Write out tree to file */
 {
 FILE *f = mustOpen(fileName, "w");
 fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n");
 wordTreeDump(0, wt, maxChainSize, f);
 carefulClose(&f);
 }
 
 void alphaStoreLoadMonomerOrder(struct alphaStore *store, char *readsFile, char *fileName)
-/* Read in a file with one line for each monomer type, containing a word for each
- * monomer variant.  Requires all variants already be in store.  The readsFile is passed
+/* Read in a file with one line for each monomer type, where first word is type name,
+ * and rest of words are all the actually variants of that type.
+ * Requires all variants already be in store.  The readsFile is passed
  * just for nicer error reporting. */
 {
 /* Stuff for processing file a line at a time. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *word;
+struct hash *uniq = hashNew(0);
 
 /* Set up variables we'll put results in in store. */
 store->typeList = NULL;
 
 while (lineFileNextReal(lf, &line))
     {
+    char *name = nextWord(&line);
+    if (hashLookup(uniq, name) != NULL)
+        errAbort("Type name '%s' repeated line %d of %s\n", name, lf->lineIx, lf->fileName);
     struct monomerType *type;
     AllocVar(type);
+    type->name = cloneString(name);
     slAddHead(&store->typeList, type);
     while ((word = nextWord(&line)) != NULL)
         {
 	struct monomer *monomer = hashFindVal(store->monomerHash, word);
 	if (monomer == NULL)
 	    errAbort("%s is in %s but not %s", word, lf->fileName, readsFile);
 	struct monomerRef *ref;
 	AllocVar(ref);
 	ref->val = monomer;
 	slAddHead(&type->list, ref);
 	hashAddUnique(store->typeHash, word, type);
 	}
     }
 slReverse(&store->typeList);
 lineFileClose(&lf);
+hashFree(&uniq);
 verbose(2, "Added %d types containing %d words from %s\n", 
     slCount(store->typeList), store->typeHash->elCount, fileName);
+
+/* Create type array */
+store->typeCount = slCount(store->typeList);
+struct monomerType **types = AllocArray(store->typeArray, store->typeCount);
+struct monomerType *type;
+int i;
+for (i=0, type = store->typeList; i<store->typeCount; ++i, type = type->next)
+    types[i] = type;
 }
 
 void monomerListNormalise(struct monomer *list, int totalCount, int outputSize)
 /* Set outTarget field in all of list to be normalized to outputSize */
 {
 struct monomer *monomer;
 double scale = outputSize/totalCount;
 for (monomer = list; monomer != NULL; monomer = monomer->next)
     monomer->outTarget = round(scale * monomer->useCount);
 }
 
 void alphaStoreNormalize(struct alphaStore *store, int outputSize)
 /* Set up output counts on each word to make sure it gets it's share of output size */
 {
 struct wordTree *wt = store->markovChains;
 wordTreeNormalize(wt, outputSize, 1.0);
 monomerListNormalise(store->monomerList, wt->useCount, outputSize);
 }
 
+struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store)
+/* Look through read list trying to find best supported type for this monomer. */
+{
+/* The algorithm here could be better.  It picks the closest nearby monomer that is typed
+ * to fill in from.  When have information from both before and after uses arbitrarily 
+ * before preferentially.  When have information that is equally close from multiple reads,
+ * just uses info from first read.  Fortunately the data, at least on Y, shows a lot of
+ * consistency, so probably not actually worth a more sophisticated algorithm that could
+ * take the most popular choice if data were inconsistent. */
+verbose(2, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList));
+struct monomerType *bestType = NULL;
+int bestPos = 0;
+boolean bestIsAfter = FALSE;
+struct slRef *readRef;
+for (readRef = monomer->readList; readRef != NULL; readRef = readRef->next)
+    {
+    struct alphaRead *read = readRef->val;
+    struct monomerRef *ref;
+	{
+	verbose(2, "  read %s (%d els):", read->name, slCount(read->list));
+	for (ref = read->list; ref != NULL; ref = ref->next)
+	    {
+	    struct monomer *m = ref->val;
+	    verbose(2, " %s", m->word);
+	    }
+	verbose(2, "\n");
+	}
+    struct monomerType *beforeType = NULL;
+    int beforePos = 0;
+
+    /* Look before. */
+    for (ref = read->list; ref != NULL; ref = ref->next)
+        {
+	struct monomer *m = ref->val;
+	if (m == monomer)
+	     break;
+	if (m->type != NULL)
+	    {
+	    beforeType = m->type;
+	    beforePos = 0;
+	    }
+	++beforePos;
+	}
+
+    /* Look after. */
+    struct monomerType *afterType = NULL;
+    int afterPos = 0;
+    assert(ref != NULL && ref->val == monomer);   
+    for (ref = ref->next; ref != NULL; ref = ref->next)
+        {
+	struct monomer *m = ref->val;
+	++afterPos;
+	if (m->type != NULL)
+	    {
+	    afterType = m->type;
+	    break;
+	    }
+	}
+
+    if (beforeType != NULL)
+        {
+	if (bestType == NULL || beforePos < bestPos)
+	    {
+	    bestType = beforeType;
+	    bestPos = beforePos;
+	    bestIsAfter = FALSE;
+	    }
+	}
+
+    if (afterType != NULL)
+        {
+	if (bestType == NULL || afterPos < bestPos)
+	    {
+	    bestType = afterType;
+	    bestPos = afterPos;
+	    bestIsAfter = TRUE;
+	    }
+	}
+    }
+
+struct monomerType *chosenType = NULL;
+if (bestType != NULL)
+    {
+    int bestIx = ptArrayIx(bestType, store->typeArray, store->typeCount);
+    if (bestIsAfter)
+        {
+	bestIx -= bestPos;
+	if (bestIx < 0)
+	    bestIx += store->typeCount;
+	}
+    else
+        {
+	bestIx += bestPos;
+	if (bestIx > store->typeCount)
+	     bestIx -= store->typeCount;
+	}
+    chosenType = store->typeArray[bestIx];
+    }
+
+return chosenType;
+}
+
+void fillInTypes(struct alphaStore *store)
+/* We have types for most but not all monomers. Try and fill out types for most of
+ * the rest by looking for position in reads they are in that do contain known types */
+{
+/* Do first pass on ones defined - just have to look them up in hash. */
+struct monomer *monomer;
+for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
+    monomer->type = hashFindVal(store->typeHash, monomer->word);
+
+/* Do second pass on ones that are not yet defined */
+for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next)
+    {
+    if (monomer->type == NULL)
+	{
+	struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store);
+	verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a"));
+	}
+    }
+}
+
 void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct alphaStore *store = alphaStoreNew(maxChainSize);
 alphaReadListFromFile(readsFile, store);
 makeMarkovChains(store);
 struct wordTree *wt = store->markovChains;
 alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
+fillInTypes(store);
 matchUpOrphans(store);
 alphaStoreNormalize(store, outSize);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
 wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);