d4c63af3897de8657a064989f55337a78646220a kent Fri Dec 14 15:54:30 2012 -0800 Adding a new check on input. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c index 1a4ec29..18084dc 100644 --- src/kehayden/alphaAsm/alphaAsm.c +++ src/kehayden/alphaAsm/alphaAsm.c @@ -707,30 +707,31 @@ return i; } errAbort("Monomer %s not on list\n", monomer->word); return -1; // ugly } struct monomerRef *findNeighborhoodFromReads(struct monomer *center) /* Find if possible one monomer to either side of center */ { struct slRef *readRef; struct monomerRef *before = NULL, *after = NULL; /* Loop through reads hoping to find a case where center is flanked by two monomers in * same read. As a fallback, keep track of a monomer before and a monomer after in * any read. */ +uglyf("findNeighborhood of %s from %d reads\n", center->word, slCount(center->readList)); for (readRef = center->readList; readRef != NULL; readRef = readRef->next) { struct alphaRead *read = readRef->val; int readSize = slCount(read->list); int centerIx = monomerRefIx(read->list, center); if (readSize >= 3 && centerIx > 0 && centerIx < readSize-1) { uglyf("Whoopie found central for %s\n", center->word); before = slElementFromIx(read->list, centerIx-1); after = slElementFromIx(read->list, centerIx+1); break; } else if (readSize >= 2) { if (centerIx == 0) @@ -1209,30 +1210,57 @@ errAbort("%s is empty", lf->fileName); lineFileClose(&lf); hashFree(&uniq); verbose(2, "Added %d types containing %d words from %s\n", slCount(store->typeList), store->typeHash->elCount, fileName); /* Create type array */ store->typeCount = slCount(store->typeList); struct monomerType **types = AllocArray(store->typeArray, store->typeCount); struct monomerType *type; int i; for (i=0, type = store->typeList; i<store->typeCount; ++i, type = type->next) types[i] = type; } +void crossCheckMonomerOrderAndReads(struct alphaStore *store, char *orderedPrefix, char *readFile, char *typeFile) +/* Make sure all monomer that begin with ordered prefix are present in monomer order file. */ +{ +/* Make hash of all monomers that have type info. */ +struct hash *orderHash = hashNew(0); +struct monomerType *type; +for (type = store->typeList; type != NULL; type = type->next) + { + struct monomerRef *ref; + for (ref = type->list; ref != NULL; ref = ref->next) + hashAdd(orderHash, ref->val->word, ref->val); + } + +/* Go through all monomers and make sure ones with correct prefix are in list. */ +struct monomer *mon; +for (mon = store->monomerList; mon != NULL; mon = mon->next) + { + char *word = mon->word; + if (startsWith(orderedPrefix, word) && !hashLookup(orderHash, word)) + { + errAbort("%s is in %s but not %s", word, readFile, typeFile); + } + } +hashFree(&orderHash); +} + + void monomerListNormalise(struct monomer *list, int totalCount, int outputSize) /* Set outTarget field in all of list to be normalized to outputSize */ { struct monomer *monomer; double scale = outputSize/totalCount; for (monomer = list; monomer != NULL; monomer = monomer->next) monomer->outTarget = round(scale * monomer->useCount); } void alphaStoreNormalize(struct alphaStore *store, int outputSize) /* Set up output counts on each word to make sure it gets it's share of output size */ { struct wordTree *wt = store->markovChains; wordTreeNormalize(wt, outputSize, 1.0); monomerListNormalise(store->monomerList, wt->useCount, outputSize); @@ -1347,30 +1375,31 @@ { struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store); verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a")); } } } void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have * been parsed into various repeat monomer variants. Cycles of these variants tend to * form higher order repeats. */ { struct alphaStore *store = alphaStoreNew(maxChainSize); alphaReadListFromFile(readsFile, store); alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); +crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile); fillInTypes(store); integrateOrphans(store); makeMarkovChains(store); struct wordTree *wt = store->markovChains; alphaStoreNormalize(store, outSize); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); } wordTreeGenerateFile(store, store->maxChainSize, pickWeightedRandomFromList(wt->children), outSize, outFile);