src/kehayden/alphaAsm/alphaAsm.c d4c63af3897de8657a064989f55337a78646220a

d4c63af3897de8657a064989f55337a78646220a
kent
  Fri Dec 14 15:54:30 2012 -0800
Adding a new check on input.
diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c
index 1a4ec29..18084dc 100644
--- src/kehayden/alphaAsm/alphaAsm.c
+++ src/kehayden/alphaAsm/alphaAsm.c
@@ -707,30 +707,31 @@
         return i;
     }
 errAbort("Monomer %s not on list\n", monomer->word);
 return -1;   // ugly
 }
 
 struct monomerRef *findNeighborhoodFromReads(struct monomer *center)
 /* Find if possible one monomer to either side of center */
 {
 struct slRef *readRef;
 struct monomerRef *before = NULL, *after = NULL;
 
 /* Loop through reads hoping to find a case where center is flanked by two monomers in
  * same read.   As a fallback, keep track of a monomer before and a monomer after in
  * any read. */
+uglyf("findNeighborhood of %s from %d reads\n", center->word, slCount(center->readList));
 for (readRef = center->readList; readRef != NULL; readRef = readRef->next)
     {
     struct alphaRead *read = readRef->val;
     int readSize = slCount(read->list);
     int centerIx = monomerRefIx(read->list, center);
     if (readSize >= 3 && centerIx > 0 && centerIx < readSize-1)
 	 {
          uglyf("Whoopie found central for %s\n", center->word);
 	 before = slElementFromIx(read->list, centerIx-1);
 	 after = slElementFromIx(read->list, centerIx+1);
 	 break;
 	 }
     else if (readSize >= 2)
          {
 	 if (centerIx == 0)
@@ -1209,30 +1210,57 @@
     errAbort("%s is empty", lf->fileName);
 lineFileClose(&lf);
 hashFree(&uniq);
 verbose(2, "Added %d types containing %d words from %s\n", 
     slCount(store->typeList), store->typeHash->elCount, fileName);
 
 /* Create type array */
 store->typeCount = slCount(store->typeList);
 struct monomerType **types = AllocArray(store->typeArray, store->typeCount);
 struct monomerType *type;
 int i;
 for (i=0, type = store->typeList; i<store->typeCount; ++i, type = type->next)
     types[i] = type;
 }
 
+void crossCheckMonomerOrderAndReads(struct alphaStore *store, char *orderedPrefix, char *readFile, char *typeFile)
+/* Make sure all monomer that begin with ordered prefix are present in monomer order file. */
+{
+/* Make hash of all monomers that have type info. */
+struct hash *orderHash = hashNew(0);
+struct monomerType *type;
+for (type = store->typeList; type != NULL; type = type->next)
+     {
+     struct monomerRef *ref;
+     for (ref = type->list; ref != NULL; ref = ref->next)
+         hashAdd(orderHash, ref->val->word, ref->val);
+     }
+
+/* Go through all monomers and make sure ones with correct prefix are in list. */
+struct monomer *mon;
+for (mon = store->monomerList; mon != NULL; mon = mon->next)
+    {
+    char *word = mon->word;
+    if (startsWith(orderedPrefix, word) && !hashLookup(orderHash, word))
+        {
+	errAbort("%s is in %s but not %s", word, readFile, typeFile);
+	}
+    }
+hashFree(&orderHash);
+}
+
+
 void monomerListNormalise(struct monomer *list, int totalCount, int outputSize)
 /* Set outTarget field in all of list to be normalized to outputSize */
 {
 struct monomer *monomer;
 double scale = outputSize/totalCount;
 for (monomer = list; monomer != NULL; monomer = monomer->next)
     monomer->outTarget = round(scale * monomer->useCount);
 }
 
 void alphaStoreNormalize(struct alphaStore *store, int outputSize)
 /* Set up output counts on each word to make sure it gets it's share of output size */
 {
 struct wordTree *wt = store->markovChains;
 wordTreeNormalize(wt, outputSize, 1.0);
 monomerListNormalise(store->monomerList, wt->useCount, outputSize);
@@ -1347,30 +1375,31 @@
 	{
 	struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store);
 	verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a"));
 	}
     }
 }
 
 void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile)
 /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have
  * been parsed into various repeat monomer variants.  Cycles of these variants tend to
  * form higher order repeats. */
 {
 struct alphaStore *store = alphaStoreNew(maxChainSize);
 alphaReadListFromFile(readsFile, store);
 alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile);
+crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile);
 fillInTypes(store);
 integrateOrphans(store);
 makeMarkovChains(store);
 struct wordTree *wt = store->markovChains;
 alphaStoreNormalize(store, outSize);
 
 if (optionExists("chain"))
     {
     char *fileName = optionVal("chain", NULL);
     wordTreeWrite(wt, store->maxChainSize, fileName);
     }
 
 wordTreeGenerateFile(store, store->maxChainSize, 
     pickWeightedRandomFromList(wt->children), outSize, outFile);