29abab691ecf705da7c642ccfedd3fd2e26b74b1
kent
  Tue Sep 11 13:35:18 2012 -0700
Starting to do orphan beginning/end processing.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 7a39680..ca818d4 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -45,55 +45,55 @@
    {"afterChain", OPTION_STRING},
    {"fullOnly", OPTION_BOOLEAN},
    {"outSize", OPTION_INT},
    {"seed", OPTION_INT},
    {NULL, 0},
 };
 
 /* Some structures to keep track of words (which correspond to alpha satellight monomers)
  * seen in input. */
 
 struct monomer
 /* Basic information on a monomer including how many times it is seen in input and output
  * streams.  Unlike the wordTree, this is flat, and does not include predecessors. */
     {
     struct monomer *next;	/* Next in list of all words. */
-    char *word;			/* The word itself.  Not allocated here. */
+    char *word;			/* The word used to represent monomer.  Not allocated here. */
     int useCount;		/* Number of times used. */
     int outTarget;		/* Number of times want to output word. */
     int outCount;		/* Number of times have output word so far. */
     };
 
 struct monomerRef
 /* A reference to a word. */
     {
     struct monomerRef *next;	/* Next in list */
     struct monomer *val;	/* The word referred to. */
     };
 
 struct monomerType
 /* A collection of words of the same type - or monomers of same type. */
     {
     struct monomerType *next;   /* Next monomerType */
     struct monomerRef *list;	    /* List of all words of that type */
     };
 
 struct alphaRead 
 /* A read that's been parsed into alpha variants. */
     {
     struct alphaRead *next;	/* Next in list */
-    char *name;			/* Id of read */
+    char *name;			/* Id of read, not allocated here. */
     struct monomerRef *list;	/* List of alpha subunits */
     };
 
 struct alphaStore
 /* Stores info on all words */
     {
     struct monomer *monomerList;   /* List of words, fairly arbitrary order. */
     struct hash *monomerHash;       /* Hash of monomer, keyed by word. */
     struct alphaRead *readList;	 /* List of all reads. */
     struct hash *readHash;	 /* Hash of all reads keyed by read ID. */
     struct wordTree *markovChains;   /* Tree of words that follow other words. */
     int maxChainSize;		/* Maximum depth of tree. */
     struct monomerType *typeList;	/* List of all types. */
     struct hash *typeHash;	/* Hash with monomerType values, keyed by all words. */
     };
@@ -709,30 +709,133 @@
 
     /* Create data structure and add read to list and hash */
     if (hashLookup(store->readHash, name))
         errAbort("Read ID %s is repeated line %d of %s", name, lf->lineIx, lf->fileName);
     struct alphaRead *read;
     AllocVar(read);
     read->list = list;
     hashAddSaveName(store->readHash, name, read, &read->name);
     slAddHead(&readList, read);
     }
 slReverse(&readList);
 store->readList = readList;
 lineFileClose(&lf);
 }
 
+struct alphaOrphan
+/* Information about an orphan - something without great joining information. */
+    {
+    struct alphaOrphan *next;
+    struct monomer *monomer;
+    struct monomerType *type;
+    };
+
+struct alphaOrphan *findOrphanStarts(struct alphaRead *readList)
+/* Return list of monomers that are found only at start of reads. */
+{
+struct alphaOrphan *orphanList = NULL;
+
+/* Build up hash of things seen later in reads. */
+struct alphaRead *read;
+struct hash *later = hashNew(0);    /* Hash of monomers found later. */
+for (read = readList; read != NULL; read = read->next)
+    {
+    struct monomerRef *ref;
+    for (ref = read->list->next; ref != NULL; ref = ref->next)
+        {
+	hashAdd(later, ref->val->word, NULL);
+	}
+    }
+
+/* Pass through again collecting orphans. */
+struct hash *orphanHash = hashNew(0);
+for (read = readList; read != NULL; read = read->next)
+    {
+    struct monomer *monomer = read->list->val;
+    if (!hashLookup(later, monomer->word))
+        {
+	struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word);
+	if (orphan == NULL)
+	    {
+	    AllocVar(orphan);
+	    orphan->monomer = monomer;
+	    slAddHead(&orphanList, orphan);
+	    hashAdd(orphanHash, monomer->word, orphan);
+	    uglyf("Adding orphan start %s\n", monomer->word);
+	    }
+	else uglyf("Repeating orphan start %s\n", monomer->word);
+	}
+    }
+hashFree(&later);
+hashFree(&orphanHash);
+slReverse(&orphanList);
+return orphanList;
+}
+
+struct alphaOrphan *findOrphanEnds(struct alphaRead *readList)
+/* Return list of monomers that are found only at end of reads. */
+{
+struct alphaOrphan *orphanList = NULL;
+
+/* Build up hash of things seen sooner in reads. */
+struct alphaRead *read;
+struct hash *sooner = hashNew(0);    /* Hash of monomers found sooner. */
+for (read = readList; read != NULL; read = read->next)
+    {
+    struct monomerRef *ref;
+    /* Loop over all but last. */
+    for (ref = read->list; ref->next != NULL; ref = ref->next)
+        {
+	hashAdd(sooner, ref->val->word, NULL);
+	}
+    }
+
+/* Pass through again collecting orphans. */
+struct hash *orphanHash = hashNew(0);
+for (read = readList; read != NULL; read = read->next)
+    {
+    struct monomerRef *lastRef = slLastEl(read->list);
+    struct monomer *monomer = lastRef->val;
+    if (!hashLookup(sooner, monomer->word))
+        {
+	struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word);
+	if (orphan == NULL)
+	    {
+	    AllocVar(orphan);
+	    orphan->monomer = monomer;
+	    slAddHead(&orphanList, orphan);
+	    hashAdd(orphanHash, monomer->word, orphan);
+	    uglyf("Adding orphan end %s\n", monomer->word);
+	    }
+	else uglyf("Repeating orphan end %s\n", monomer->word);
+	}
+    }
+hashFree(&sooner);
+slReverse(&orphanList);
+hashFree(&orphanHash);
+return orphanList;
+}
+
+void matchUpOrphans(struct alphaStore *store)
+/* Make up fake reads that integrate orphans (monomers only found at beginning or end
+ * of a read) better. */
+{
+struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList);
+struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList);
+uglyf("orphanStarts has %d items, orphanEnds %d\n", slCount(orphanStarts), slCount(orphanEnds));
+}
+
 void makeMarkovChains(struct alphaStore *store)
 /* Return a alphaStore containing all words, and also all chains-of-words of length 
  * chainSize seen in file.  */
 {
 /* We'll build up the tree starting with an empty root node. */
 struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, ""));	
 int chainSize = store->maxChainSize;
 
 /* Loop through each read. There's special cases at the beginning and end of read, and for 
  * short reads.  In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize 
  * words, * popping off one word from the start, and adding one word to the end for each
  * new word we encounter. This list is added to the tree each iteration. */
 struct alphaRead *read;
 for (read = store->readList; read != NULL; read = read->next)
     {
@@ -839,33 +942,33 @@
 
 void alphaStoreNormalize(struct alphaStore *store, int outputSize)
 /* Set up output counts on each word to make sure it gets it's share of output size */
 {
 struct wordTree *wt = store->markovChains;
 wordTreeNormalize(wt, outputSize, 1.0);
 monomerListNormalise(store->monomerList, wt->useCount, outputSize);
 }
 
 void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */
 {
 struct alphaStore *store = alphaStoreNew(maxChainSize);
 alphaReadListFromFile(readsFile, store);
 makeMarkovChains(store);
-// struct alphaStore *store = alphaStoreForChainsInFile(readsFile, maxChainSize);
 struct wordTree *wt = store->markovChains;
 alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
+matchUpOrphans(store);
 alphaStoreNormalize(store, outSize);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
 wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile);
 
 if (optionExists("afterChain"))
     {
     char *fileName = optionVal("afterChain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }