2c49b0bd798bdf2198d76cecd817928cdda45658 kent Thu Feb 14 13:15:57 2013 -0800 Rearranging logging levels so verbose(2) prints a handful of status messages one for each major phase. Some of the logging leaks memory, so put in a test to avoid this unless logging level is high. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c index d1d92ee..7c12be2 100644 --- src/kehayden/alphaAsm/alphaAsm.c +++ src/kehayden/alphaAsm/alphaAsm.c @@ -236,31 +236,31 @@ int total = 0; struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) total += wt->outTarget; return total; } void addChainToTree(struct wordTree *wt, struct dlList *chain) /* Add chain of words to tree. */ { struct dlNode *node; wt->useCount += 1; for (node = chain->head; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; - verbose(2, " adding %s\n", monomer->word); + verbose(4, " adding %s\n", monomer->word); wt = wordTreeAddFollowing(wt, monomer); } } int wordTreeAddPseudoCount(struct wordTree *wt, int pseudo) /* Add pseudo to all leaves of tree and propagate counts up to parents. */ { if (wt->children == NULL) { wt->useCount += pseudo; return wt->useCount; } else { struct wordTree *child; @@ -357,60 +357,60 @@ fprintf(f, "\n"); } struct wordTree *child; for (child = wt->children; child != NULL; child = child->next) wordTreeDump(level+1, child, maxChainSize, f); } struct wordTree *pickWeightedRandomFromList(struct wordTree *list) /* Pick word from list randomly, but so that words with higher outTargets * are picked more often. */ { struct wordTree *picked = NULL; /* Debug output. */ { - verbose(2, " pickWeightedRandomFromList("); + verbose(4, " pickWeightedRandomFromList("); struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) { if (wt != list) - verbose(2, " "); - verbose(2, "%s:%d", wt->monomer->word, wt->outTarget); + verbose(4, " "); + verbose(4, "%s:%d", wt->monomer->word, wt->outTarget); } - verbose(2, ") total %d\n", wordTreeSumOutTargets(list)); + verbose(4, ") total %d\n", wordTreeSumOutTargets(list)); } /* Figure out total number of outputs left, and a random number between 0 and that total. */ int total = wordTreeSumOutTargets(list); if (total > 0) { int threshold = rand() % total; - verbose(2, " threshold %d\n", threshold); + verbose(4, " threshold %d\n", threshold); /* Loop through list returning selection corresponding to random threshold. */ int binStart = 0; struct wordTree *wt; for (wt = list; wt != NULL; wt = wt->next) { int size = wt->outTarget; int binEnd = binStart + size; - verbose(2, " %s size %d, binEnd %d\n", wt->monomer->word, size, binEnd); + verbose(4, " %s size %d, binEnd %d\n", wt->monomer->word, size, binEnd); if (threshold < binEnd) { picked = wt; - verbose(2, " picked %s\n", wt->monomer->word); + verbose(4, " picked %s\n", wt->monomer->word); break; } binStart = binEnd; } } return picked; } int totUseZeroCount = 0; struct monomer *pickRandomFromType(struct monomerType *type) /* Pick a random word, weighted by outTarget, from all available of given type */ { /* Figure out total on list */ int total = 0; @@ -422,83 +422,86 @@ if (total > 0) { int threshold = rand() % total; int binStart = 0; for (ref = type->list; ref != NULL; ref = ref->next) { struct monomer *monomer = ref->val; int size = monomer->outTarget; int binEnd = binStart + size; if (threshold < binEnd) return monomer; binStart = binEnd; } } -verbose(2, "Backup plan for %s in pickRandomFromType\n", type->name); +verbose(3, "Backup plan for %s in pickRandomFromType\n", type->name); /* Try again based on use counts. */ total = 0; for (ref = type->list; ref != NULL; ref = ref->next) total += ref->val->useCount; if (total > 0) { int threshold = rand() % total; int binStart = 0; for (ref = type->list; ref != NULL; ref = ref->next) { struct monomer *monomer = ref->val; int size = monomer->useCount; int binEnd = binStart + size; if (threshold < binEnd) return monomer; binStart = binEnd; } } errAbort("Fell off end in pickRandomFromType on %s", type->name); return NULL; } struct wordTree *predictNextFromAllPredecessors(struct wordTree *wt, struct dlNode *list) /* Predict next word given tree and recently used word list. If tree doesn't * have statistics for what comes next given the words in list, then it returns * NULL. */ { -verbose(2, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list)); +if (verboseLevel() >= 4) + verbose(4, " predictNextFromAllPredecessors(%s, %s)\n", wordTreeString(wt), dlListFragWords(list)); struct dlNode *node; for (node = list; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; wt = wordTreeFindInList(wt->children, monomer); - verbose(2, " wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt)); + if (verboseLevel() >= 4) + verbose(4, " wordTreeFindInList(%s) = %p %s\n", monomer->word, wt, wordTreeString(wt)); if (wt == NULL || wt->children == NULL) break; } struct wordTree *result = NULL; if (wt != NULL && wt->children != NULL) result = pickWeightedRandomFromList(wt->children); return result; } struct wordTree *predictFromWordTree(struct wordTree *wt, struct dlNode *recent) /* Predict next word given tree and recently used word list. Will use all words in * recent list if can, but if there is not data in tree, will back off, and use * progressively less previous words. */ { struct dlNode *node; -verbose(2, " predictNextFromWordTree(%s)\n", wordTreeString(wt)); +if (verboseLevel() >= 4) + verbose(4, " predictNextFromWordTree(%s)\n", wordTreeString(wt)); for (node = recent; !dlEnd(node); node = node->next) { struct wordTree *result = predictNextFromAllPredecessors(wt, node); if (result != NULL) return result; } return NULL; } struct dlNode *nodesFromTail(struct dlList *list, int count) /* Return count nodes from tail of list. If list is shorter than count, it returns the * whole list. */ { int i; @@ -534,76 +537,76 @@ int typeIx = ptArrayIx(oldType, store->typeArray, store->typeCount); typeIx -= amount; while (typeIx < 0) typeIx += store->typeCount; return store->typeArray[typeIx]; } struct wordTree *predictFromPreviousTypes(struct alphaStore *store, struct dlList *past) /* Predict next based on general pattern of monomer order. */ { /* This routine is now a bit more complex than necessary but it still works. It * these days only needs to look back 1 because of the read-based type fill-in. */ int maxToLookBack = 3; { /* Debugging block */ struct dlNode *node = nodesFromTail(past, maxToLookBack); - verbose(3, "predictFromPreviousTypes("); + verbose(4, "predictFromPreviousTypes("); for (; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; - verbose(3, "%s, ", monomer->word); + verbose(4, "%s, ", monomer->word); } - verbose(3, ")\n"); + verbose(4, ")\n"); } int pastDepth; struct dlNode *node; for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth) { if (dlStart(node)) { - verbose(2, "predictFromPreviousTypes with empty past\n"); + verbose(3, "predictFromPreviousTypes with empty past\n"); return NULL; } struct monomer *monomer = node->val; struct monomerType *prevType = hashFindVal(store->typeHash, monomer->word); if (prevType != NULL) { struct monomerType *curType = typeAfter(store, prevType, pastDepth); struct monomer *curInfo = pickRandomFromType(curType); struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo); - verbose(3, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", + verbose(4, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", pastDepth, curInfo->word, curTree, curTree->monomer->word); return curTree; } node = node->prev; } -verbose(2, "predictFromPreviousTypes past all unknown types\n"); +verbose(3, "predictFromPreviousTypes past all unknown types\n"); return NULL; } struct wordTree *predictNext(struct alphaStore *store, struct dlList *past) /* Given input data store and what is known from the past, predict the next word. */ { struct dlNode *recent = nodesFromTail(past, store->maxChainSize); struct wordTree *pick = predictFromWordTree(store->markovChains, recent); if (pick == NULL) pick = predictFromPreviousTypes(store, past); if (pick == NULL) { pick = pickWeightedRandomFromList(store->markovChains->children); - verbose(2, "in predictNext() last resort pick of %s\n", pick->monomer->word); + verbose(3, "in predictNext() last resort pick of %s\n", pick->monomer->word); } return pick; } void decrementOutputCountsInTree(struct wordTree *wt) /* Decrement output count of self and parents. */ { while (wt != NULL) { /* Decrement target count, but don't let it fall below sum of counts of all children. * This can happen with incomplete data if we don't prevent it. This * same code also prevents us from having negative outTarget. */ int outTarget = wt->outTarget - 1; int kidSum = wordTreeSumOutTargets(wt->children); if (outTarget < kidSum) @@ -642,41 +645,40 @@ } else if (chainSize >= maxSize) { chainStartNode = chainStartNode->next; picked = predictNext(store, ll); } else { picked = predictNext(store, ll); ++chainSize; } if (picked == NULL) break; - /* Add word from whatever level we fetched back to our output chain. */ struct monomer *monomer = picked->monomer; node->val = monomer; dlAddTail(ll, node); decrementOutputCountsInTree(picked); monomer->outTarget -= 1; monomer->outCount += 1; } -verbose(2, "totUseZeroCount = %d\n", totUseZeroCount); +verbose(3, "totUseZeroCount = %d\n", totUseZeroCount); return ll; } struct slRef *listUnusedMonomers(struct alphaStore *store, struct dlList *ll) /* Return list of references to monomers that are in store but not on ll. */ { struct slRef *refList = NULL, *ref; struct hash *usedHash = hashNew(0); struct dlNode *node; /* Make hash of used monomers. */ for (node = ll->head; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; hashAdd(usedHash, monomer->word, monomer); @@ -833,65 +835,65 @@ maxCount = count; maxWord = hel->name; } } *retWord = maxWord; *retCount = maxCount; hashFree(&countHash); } boolean subCommonCenter(struct alphaStore *store, struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) /* Scan list for places where have all items in neighborhood (except for center) matching. * Substitute in center at one of these places chosen at random and return TRUE if possible. */ { struct slRef *centerRefList = refsToPossibleCenters(center, neighborhood, ll); -verbose(3, "sub %s in neighborhood: ", center->word); +verbose(4, "sub %s in neighborhood: ", center->word); if (verboseLevel() >= 3) printMonomerRefList(neighborhood, stderr); -verbose(3, "Got %d possible centers\n", slCount(centerRefList)); +verbose(4, "Got %d possible centers\n", slCount(centerRefList)); if (centerRefList == NULL) return FALSE; int commonCount = 0; char *commonWord = NULL; mostCommonMonomerWord(centerRefList, &commonWord, &commonCount); struct monomer *commonMonomer = hashFindVal(store->monomerHash, commonWord); -verbose(3, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount); +verbose(4, "Commonest word to displace with %s is %s which occurs %d times in context and %d overall\n", center->word, commonWord, commonCount, commonMonomer->subbedOutCount); if (commonMonomer->subbedOutCount < 2) { - verbose(2, "Want to substitute %s for %s, but %s only occurs %d time.\n", + verbose(3, "Want to substitute %s for %s, but %s only occurs %d time.\n", center->word, commonWord, commonWord, commonMonomer->subbedOutCount); return FALSE; } /* Select a random one of the most commonly occuring possible centers. */ int targetIx = rand() % commonCount; struct slRef *ref; int currentIx = 0; for (ref = centerRefList; ref != NULL; ref = ref->next) { struct dlNode *node = ref->val; struct monomer *monomer = node->val; if (sameString(monomer->word, commonWord)) { if (currentIx == targetIx) { - verbose(2, "Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood)); + verbose(3, "Substituting %s for %s in context of %d\n", center->word, commonWord, slCount(neighborhood)); struct monomer *oldCenter = node->val; if (oldCenter->type != center->type) - verbose(2, "Type mismatch subbig %s vs %s\n", oldCenter->word, center->word); + verbose(3, "Type mismatch subbig %s vs %s\n", oldCenter->word, center->word); node->val = center; oldCenter->subbedOutCount -= 1; center->subbedOutCount += 1; return TRUE; } ++currentIx; } } internalErr(); // Should not get here. return FALSE; } boolean subCenterInNeighborhood(struct alphaStore *store, struct monomer *center, struct monomerRef *neighborhood, struct dlList *ll) /* Scan ll for cases where neighborhood around center matches. Replace one of these @@ -928,46 +930,46 @@ { commonCount = monomer->subbedOutCount; common = monomer; } } return common; } boolean subIntoFirstMostCommonOfType(struct alphaStore *store, struct monomer *unused, struct dlList *ll) /* Substitute unused for first occurence of most common monomer of same type. */ { struct monomer *common = mostCommonInType(unused->type); if (common == NULL) { - verbose(2, "No monomers of type %s used at all!\n", unused->type->name); + verbose(3, "No monomers of type %s used at all!\n", unused->type->name); return FALSE; } if (common->subbedOutCount < 2) { - verbose(2, "Trying to sub in %s, but there's no monomers of type %s that are used more than once.\n", + verbose(3, "Trying to sub in %s, but there's no monomers of type %s that are used more than once.\n", unused->word, unused->type->name); return FALSE; } struct dlNode *node; for (node = ll->head; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; if (monomer == common) { - verbose(2, "Subbing %s for %s of type %s\n", unused->word, monomer->word, unused->type->name); + verbose(3, "Subbing %s for %s of type %s\n", unused->word, monomer->word, unused->type->name); node->val = unused; unused->subbedOutCount += 1; monomer->subbedOutCount -= 1; break; } } return TRUE; } void setInitialSubbedOutCount(struct alphaStore *store, struct dlList *ll) /* Set subbedOutCount based on how many times monomers occur in list. */ { struct dlNode *node; for (node = ll->head; !dlEnd(node); node = node->next) { @@ -981,70 +983,73 @@ for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) { uglyf("%s %d %d\n", monomer->word, monomer->outCount, monomer->subbedOutCount); if (monomer->outCount != monomer->subbedOutCount) ++mismatchCount; } uglyf("mismatch count = %d\n", mismatchCount); #endif /* PARANOID */ } void subInMissing(struct alphaStore *store, struct dlList *ll) /* Go figure out missing monomers in ll, and attempt to substitute them in somewhere they would fit. */ { setInitialSubbedOutCount(store, ll); struct slRef *unusedList = listUnusedMonomers(store, ll); -verbose(2, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList)); +verbose(3, "%d monomers, %d unused\n", slCount(store->monomerList), slCount(unusedList)); struct slRef *unusedRef; for (unusedRef = unusedList; unusedRef != NULL; unusedRef = unusedRef->next) { struct monomer *unused = unusedRef->val; struct monomerRef *neighborhood = findNeighborhoodFromReads(unused); if (!subCenterInNeighborhood(store, unused, neighborhood, ll)) { - verbose(2, "Couldn't substitute in %s with context, falling back to type logic\n", + verbose(3, "Couldn't substitute in %s with context, falling back to type logic\n", unused->word); subIntoFirstMostCommonOfType(store, unused, ll); } slFreeList(&neighborhood); } } static void writeMonomerList(char *fileName, struct dlList *ll) /* Write out monomer list to file. */ { FILE *f = mustOpen(fileName, "w"); struct dlNode *node; for (node = ll->head; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; fprintf(f, "%s\n", monomer->word); } carefulClose(&f); } static void wordTreeGenerateFile(struct alphaStore *store, int maxSize, struct wordTree *firstWord, int maxOutputWords, char *fileName) /* Create file containing words base on tree probabilities. The wordTreeGenerateList does * most of work. */ { struct dlList *ll = wordTreeGenerateList(store, maxSize, firstWord, maxOutputWords); +verbose(2, "Generated initial output list\n"); char *unsubbedFile = optionVal("unsubbed", NULL); if (unsubbedFile) writeMonomerList(unsubbedFile, ll); subInMissing(store, ll); +verbose(2, "Substituted in missing rare monomers\n"); writeMonomerList(fileName, ll); +verbose(2, "Wrote primary output\n"); dlListFree(&ll); } struct alphaStore *alphaStoreNew(int maxChainSize) /* Allocate and initialize a new word store. */ { struct alphaStore *store; AllocVar(store); store->maxChainSize = maxChainSize; store->monomerHash = hashNew(0); store->typeHash = hashNew(0); store->readHash = hashNew(0); return store; } @@ -1151,31 +1156,31 @@ /* Pass through again collecting orphans. */ struct hash *orphanHash = hashNew(0); for (read = readList; read != NULL; read = read->next) { struct monomer *monomer = read->list->val; char *word = monomer->word; if (!hashLookup(later, word)) { struct alphaOrphan *orphan = hashFindVal(orphanHash, word); if (orphan == NULL) { AllocVar(orphan); orphan->monomer = monomer; slAddHead(&orphanList, orphan); hashAdd(orphanHash, word, orphan); - verbose(2, "Adding orphan start %s\n", word); + verbose(3, "Adding orphan start %s\n", word); } } } hashFree(&later); hashFree(&orphanHash); slReverse(&orphanList); return orphanList; } struct alphaOrphan *findOrphanEnds(struct alphaRead *readList, struct alphaStore *store) /* Return list of monomers that are found only at end of reads. */ { struct alphaOrphan *orphanList = NULL; /* Build up hash of things seen sooner in reads. */ @@ -1194,31 +1199,31 @@ /* Pass through again collecting orphans. */ struct hash *orphanHash = hashNew(0); for (read = readList; read != NULL; read = read->next) { struct monomerRef *lastRef = slLastEl(read->list); struct monomer *monomer = lastRef->val; if (!hashLookup(sooner, monomer->word)) { struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word); if (orphan == NULL) { AllocVar(orphan); orphan->monomer = monomer; slAddHead(&orphanList, orphan); hashAdd(orphanHash, monomer->word, orphan); - verbose(2, "Adding orphan end %s\n", monomer->word); + verbose(3, "Adding orphan end %s\n", monomer->word); } } } hashFree(&sooner); slReverse(&orphanList); hashFree(&orphanHash); return orphanList; } struct alphaRead *addReadOfTwo(struct alphaStore *store, struct monomer *a, struct monomer *b) /* Make up a fake read that goes from a to b and add it to store. */ { /* Create fake name and start fake read with it. */ static int fakeId = 0; char name[32]; @@ -1244,92 +1249,92 @@ /* Add read to the monomers. */ refAdd(&a->readList, read); refAdd(&b->readList, read); a->useCount += 1; b->useCount += 1; return read; } void integrateOrphans(struct alphaStore *store) /* Make up fake reads that integrate orphans (monomers only found at beginning or end * of a read) better. */ { struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store); struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store); -verbose(2, "orphanStarts has %d items, orphanEnds %d\n", +verbose(3, "orphanStarts has %d items, orphanEnds %d\n", slCount(orphanStarts), slCount(orphanEnds)); /* First look for the excellent situation where you can pair up orphans with each * other. For Y at least this happens half the time. */ struct alphaOrphan *start, *end; for (start = orphanStarts; start != NULL; start = start->next) { struct monomerType *startType = start->monomer->type; if (startType == NULL) continue; for (end = orphanEnds; end != NULL && !start->paired; end = end->next) { struct monomerType *endType = end->monomer->type; if (endType == NULL) continue; if (!end->paired) { struct monomerType *nextType = typeAfter(store, endType, 1); if (nextType == startType) { end->paired = TRUE; start->paired = TRUE; addReadOfTwo(store, end->monomer, start->monomer); - verbose(2, "Pairing %s with %s\n", end->monomer->word, start->monomer->word); + verbose(3, "Pairing %s with %s\n", end->monomer->word, start->monomer->word); } } } } /* Ok, now have to just manufacture other side of orphan starts out of thin air. */ for (start = orphanStarts; start != NULL; start = start->next) { if (start->paired) continue; struct monomer *startMono = start->monomer; struct monomerType *startType = startMono->type; if (startType == NULL) continue; struct monomerType *newType = typeBefore(store, startType, 1); - verbose(2, "Trying to find end of type %s\n", newType->name); + verbose(3, "Trying to find end of type %s\n", newType->name); struct monomer *newMono = pickRandomFromType(newType); addReadOfTwo(store, newMono, startMono); - verbose(2, "Pairing new %s with start %s\n", newMono->word, startMono->word); + verbose(3, "Pairing new %s with start %s\n", newMono->word, startMono->word); } /* Ok, now have to just manufacture other side of orphan ends out of thin air. */ for (end = orphanEnds; end != NULL; end = end->next) { if (end->paired) continue; struct monomer *endMono = end->monomer; struct monomerType *endType = endMono->type; if (endType == NULL) continue; struct monomerType *newType = typeAfter(store, endType, 1); - verbose(2, "Trying to find start of type %s\n", newType->name); + verbose(3, "Trying to find start of type %s\n", newType->name); struct monomer *newMono = pickRandomFromType(newType); addReadOfTwo(store, endMono, newMono); - verbose(2, "Pairing end %s with new %s\n", endMono->word, newMono->word); + verbose(3, "Pairing end %s with new %s\n", endMono->word, newMono->word); } } void makeMarkovChains(struct alphaStore *store) /* Return a alphaStore containing all words, and also all chains-of-words of length * chainSize seen in file. */ { /* We'll build up the tree starting with an empty root node. */ struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, "")); int chainSize = store->maxChainSize; /* Loop through each read. There's special cases at the beginning and end of read, and for * short reads. In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize * words, * popping off one word from the start, and adding one word to the end for each * new word we encounter. This list is added to the tree each iteration. */ @@ -1424,31 +1429,31 @@ struct monomerRef *ref; AllocVar(ref); ref->val = monomer; slAddHead(&type->list, ref); hashAddUnique(store->typeHash, word, type); } if (type->list == NULL) errAbort("Short line %d of %s. Format should be:\ntype list-of-monomers-of-type\n", lf->lineIx, lf->fileName); } slReverse(&store->typeList); if (store->typeList == NULL) errAbort("%s is empty", lf->fileName); lineFileClose(&lf); hashFree(&uniq); -verbose(2, "Added %d types containing %d words from %s\n", +verbose(3, "Added %d types containing %d words from %s\n", slCount(store->typeList), store->typeHash->elCount, fileName); /* Create type array */ store->typeCount = slCount(store->typeList); struct monomerType **types = AllocArray(store->typeArray, store->typeCount); struct monomerType *type; int i; for (i=0, type = store->typeList; i<store->typeCount; ++i, type = type->next) types[i] = type; } void crossCheckMonomerOrderAndReads(struct alphaStore *store, char *orderedPrefix, char *readFile, char *typeFile) /* Make sure all monomer that begin with ordered prefix are present in monomer order file. */ { /* Make hash of all monomers that have type info. */ @@ -1489,47 +1494,47 @@ { struct wordTree *wt = store->markovChains; wordTreeNormalize(wt, outputSize, 1.0); monomerListNormalise(store->monomerList, wt->useCount, outputSize); } struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store) /* Look through read list trying to find best supported type for this monomer. */ { /* The algorithm here could be better. It picks the closest nearby monomer that is typed * to fill in from. When have information from both before and after the program arbitrarily * uses before preferentially. When have information that is equally close from multiple reads, * it just uses info from first read. Fortunately the data, at least on Y, shows a lot of * consistency, so it's probably not actually worth a more sophisticated algorithm that would * instead take the most popular choice rather than the first one. */ -verbose(2, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList)); +verbose(3, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList)); struct monomerType *bestType = NULL; int bestPos = 0; boolean bestIsAfter = FALSE; struct slRef *readRef; for (readRef = monomer->readList; readRef != NULL; readRef = readRef->next) { struct alphaRead *read = readRef->val; struct monomerRef *ref; { - verbose(2, " read %s (%d els):", read->name, slCount(read->list)); + verbose(3, " read %s (%d els):", read->name, slCount(read->list)); for (ref = read->list; ref != NULL; ref = ref->next) { struct monomer *m = ref->val; - verbose(2, " %s", m->word); + verbose(3, " %s", m->word); } - verbose(2, "\n"); + verbose(3, "\n"); } struct monomerType *beforeType = NULL; int beforePos = 0; /* Look before. */ for (ref = read->list; ref != NULL; ref = ref->next) { struct monomer *m = ref->val; if (m == monomer) break; if (m->type != NULL) { beforeType = m->type; beforePos = 0; } @@ -1570,67 +1575,72 @@ bestIsAfter = TRUE; } } } /* Now have found a type that is at a known position relative to ourselves. From this * infer our own type. */ struct monomerType *chosenType = NULL; if (bestType != NULL) { if (bestIsAfter) chosenType = typeBefore(store, bestType, bestPos); else chosenType = typeAfter(store, bestType, bestPos); } +verbose(3, "chosenType for %s is %s\n", monomer->word, + (bestType == NULL ? "(null)" : chosenType->name)); return chosenType; } void fillInTypes(struct alphaStore *store) /* We have types for most but not all monomers. Try and fill out types for most of * the rest by looking for position in reads they are in that do contain known types */ { /* Do first pass on ones defined - just have to look them up in hash. */ struct monomer *monomer; for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) monomer->type = hashFindVal(store->typeHash, monomer->word); /* Do second pass on ones that are not yet defined */ for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) { if (monomer->type == NULL) { - struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store); - verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a")); + monomer->type = fillInTypeFromReads(monomer, store); } } } void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) /* alphaAsm - assemble alpha repeat regions such as centromeres from reads that have * been parsed into various repeat monomer variants. Cycles of these variants tend to * form higher order repeats. */ { struct alphaStore *store = alphaStoreNew(maxChainSize); alphaReadListFromFile(readsFile, store); alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); +verbose(2, "Loaded input reads and monomer order\n"); crossCheckMonomerOrderAndReads(store, "m", readsFile, monomerOrderFile); fillInTypes(store); integrateOrphans(store); +verbose(2, "Filled in missing types and integrated orphan ends\n"); makeMarkovChains(store); +verbose(2, "Made Markov Chains\n"); struct wordTree *wt = store->markovChains; alphaStoreNormalize(store, outSize); +verbose(2, "Normalized Markov Chains\n"); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); } wordTreeGenerateFile(store, store->maxChainSize, pickWeightedRandomFromList(wt->children), outSize, outFile); if (optionExists("afterChain")) { char *fileName = optionVal("afterChain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); }