a3628f6cda9c870182b206496b6edf8a22094910 kent Tue Sep 11 18:26:47 2012 -0700 Integrating orphans (things that are only at beginning or end of any read) by adding fake reads. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c index 104f156..7280e49 100644 --- src/kehayden/alphaAsm/alphaAsm.c +++ src/kehayden/alphaAsm/alphaAsm.c @@ -476,74 +476,86 @@ struct dlNode *nodesFromTail(struct dlList *list, int count) /* Return count nodes from tail of list. If list is shorter than count, it returns the * whole list. */ { int i; struct dlNode *node; for (i=0, node = list->tail; ihead; node = node->prev; } return node; } -struct monomerType *advanceTypeWithWrap(struct monomerType *typeList, struct monomerType *startType, - int amountToAdvance) -/* Skip forward amountToAdvance in typeList from startType. If get to end of list start +struct monomerType *typeAfter(struct alphaStore *store, struct monomerType *oldType, + int amount) +/* Skip forward amount in typeList from oldType. If get to end of list start * again at the beginning */ { int i; -struct monomerType *type = startType; -for (i=0; inext; if (type == NULL) - type = typeList; + type = store->typeList; } return type; } +struct monomerType *typeBefore(struct alphaStore *store, struct monomerType *oldType, int amount) +/* Pick type that comes amount before given type. */ +{ +int typeIx = ptArrayIx(oldType, store->typeArray, store->typeCount); +typeIx -= amount; +while (typeIx < 0) + typeIx += store->typeCount; +return store->typeArray[typeIx]; +} + struct wordTree *predictFromPreviousTypes(struct alphaStore *store, struct dlList *past) /* Predict next based on general pattern of monomer order. */ { +/* This routine is now a bit more complex than necessary but it still works. It + * these days only needs to look back 1 because of the read-based type fill-in. */ int maxToLookBack = 3; { /* Debugging block */ struct dlNode *node = nodesFromTail(past, maxToLookBack); verbose(3, "predictFromPreviousTypes("); for (; !dlEnd(node); node = node->next) { struct monomer *monomer = node->val; verbose(3, "%s, ", monomer->word); } verbose(3, ")\n"); } int pastDepth; struct dlNode *node; for (pastDepth = 1, node = past->tail; pastDepth <= maxToLookBack; ++pastDepth) { if (dlStart(node)) { verbose(2, "predictFromPreviousTypes with empty past\n"); return NULL; } struct monomer *monomer = node->val; struct monomerType *prevType = hashFindVal(store->typeHash, monomer->word); if (prevType != NULL) { - struct monomerType *curType = advanceTypeWithWrap(store->typeList, prevType, pastDepth); + struct monomerType *curType = typeAfter(store, prevType, pastDepth); struct monomer *curInfo = pickRandomFromType(curType); struct wordTree *curTree = wordTreeFindInList(store->markovChains->children, curInfo); verbose(3, "predictFromPreviousType pastDepth %d, curInfo %s, curTree %p %s\n", pastDepth, curInfo->word, curTree, curTree->monomer->word); return curTree; } node = node->prev; } verbose(2, "predictFromPreviousTypes past all unknown types\n"); return NULL; } struct wordTree *predictNext(struct alphaStore *store, struct dlList *past) /* Given input data store and what is known from the past, predict the next word. */ @@ -671,30 +683,31 @@ return store; } struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word) /* Add word to store, incrementing it's useCount if it's already there, otherwise * making up a new record for it. */ { struct monomer *monomer = hashFindVal(store->monomerHash, word); if (monomer == NULL) { AllocVar(monomer); hashAddSaveName(store->monomerHash, word, monomer, &monomer->word); slAddHead(&store->monomerList, monomer); } monomer->useCount += 1; +monomer->outTarget = monomer->useCount; /* Set default value, may be renormalized. */ return monomer; } void connectReadsToMonomers(struct alphaStore *store) /* Add each read monomer appears in to monomer's readList. */ { struct alphaRead *read; for (read = store->readList; read != NULL; read = read->next) { struct monomerRef *ref; for (ref = read->list; ref != NULL; ref = ref->next) { struct monomer *monomer = ref->val; refAdd(&monomer->readList, read); } @@ -733,33 +746,33 @@ struct alphaRead *read; AllocVar(read); read->list = list; hashAddSaveName(store->readHash, name, read, &read->name); slAddHead(&readList, read); } slReverse(&readList); store->readList = readList; lineFileClose(&lf); connectReadsToMonomers(store); } struct alphaOrphan /* Information about an orphan - something without great joining information. */ { - struct alphaOrphan *next; - struct monomer *monomer; - struct monomerType *type; + struct alphaOrphan *next; /* Next in list */ + struct monomer *monomer; /* Pointer to orphan */ + boolean paired; /* True if has been paired with somebody. */ }; struct alphaOrphan *findOrphanStarts(struct alphaRead *readList, struct alphaStore *store) /* Return list of monomers that are found only at start of reads. */ { struct alphaOrphan *orphanList = NULL; /* Build up hash of things seen later in reads. */ struct alphaRead *read; struct hash *later = hashNew(0); /* Hash of monomers found later. */ for (read = readList; read != NULL; read = read->next) { struct monomerRef *ref; for (ref = read->list->next; ref != NULL; ref = ref->next) { @@ -770,43 +783,31 @@ /* Pass through again collecting orphans. */ struct hash *orphanHash = hashNew(0); for (read = readList; read != NULL; read = read->next) { struct monomer *monomer = read->list->val; char *word = monomer->word; if (!hashLookup(later, word)) { struct alphaOrphan *orphan = hashFindVal(orphanHash, word); if (orphan == NULL) { AllocVar(orphan); orphan->monomer = monomer; slAddHead(&orphanList, orphan); hashAdd(orphanHash, word, orphan); - uglyf("Adding orphan start %s\n", word); - } - - /* Try and find orphan type by consulting either itself or previous items in read. */ - if (orphan->type == NULL) - { - struct monomerType *type = monomer->type; - if (type == NULL) - { - uglyf("Can't find type for %s\n", word); - } - else uglyf("%s is type %s\n", word, type->name); - orphan->type = type; + verbose(2, "Adding orphan start %s\n", word); } } } hashFree(&later); hashFree(&orphanHash); slReverse(&orphanList); return orphanList; } struct alphaOrphan *findOrphanEnds(struct alphaRead *readList, struct alphaStore *store) /* Return list of monomers that are found only at end of reads. */ { struct alphaOrphan *orphanList = NULL; /* Build up hash of things seen sooner in reads. */ @@ -825,48 +826,141 @@ /* Pass through again collecting orphans. */ struct hash *orphanHash = hashNew(0); for (read = readList; read != NULL; read = read->next) { struct monomerRef *lastRef = slLastEl(read->list); struct monomer *monomer = lastRef->val; if (!hashLookup(sooner, monomer->word)) { struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word); if (orphan == NULL) { AllocVar(orphan); orphan->monomer = monomer; slAddHead(&orphanList, orphan); hashAdd(orphanHash, monomer->word, orphan); - uglyf("Adding orphan end %s\n", monomer->word); + verbose(2, "Adding orphan end %s\n", monomer->word); } - else uglyf("Repeating orphan end %s\n", monomer->word); } } hashFree(&sooner); slReverse(&orphanList); hashFree(&orphanHash); return orphanList; } -void matchUpOrphans(struct alphaStore *store) +struct alphaRead *addReadOfTwo(struct alphaStore *store, struct monomer *a, struct monomer *b) +/* Make up a fake read that goes from a to b and add it to store. */ +{ +/* Create fake name and start fake read with it. */ +static int fakeId = 0; +char name[32]; +safef(name, sizeof(name), "fake%d", ++fakeId); + +/* Allocate fake read. */ +struct alphaRead *read; +AllocVar(read); + +/* Add a and b to the read. */ +struct monomerRef *aRef, *bRef; +AllocVar(aRef); +aRef->val = a; +AllocVar(bRef); +bRef->val = b; +aRef->next = bRef; +read->list = aRef; + +/* Add read to the store. */ +slAddHead(&store->readList, read); +hashAddSaveName(store->readHash, name, read, &read->name); + +/* Add read to the monomers. */ +refAdd(&a->readList, read); +refAdd(&a->readList, read); + +a->useCount += 1; +b->useCount += 1; +return read; +} + +void integrateOrphans(struct alphaStore *store) /* Make up fake reads that integrate orphans (monomers only found at beginning or end * of a read) better. */ { struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store); struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store); -uglyf("orphanStarts has %d items, orphanEnds %d\n", slCount(orphanStarts), slCount(orphanEnds)); +verbose(2, "orphanStarts has %d items, orphanEnds %d\n", + slCount(orphanStarts), slCount(orphanEnds)); + +/* First look for the excellent situation where you can pair up orphans with each + * other. For Y at least this happens half the time. */ +struct alphaOrphan *start, *end; +for (start = orphanStarts; start != NULL; start = start->next) + { + struct monomerType *startType = start->monomer->type; + if (startType == NULL) + continue; + for (end = orphanEnds; end != NULL && !start->paired; end = end->next) + { + struct monomerType *endType = end->monomer->type; + if (endType == NULL) + continue; + if (!end->paired) + { + struct monomerType *nextType = typeAfter(store, endType, 1); + if (nextType == startType) + { + end->paired = TRUE; + start->paired = TRUE; + addReadOfTwo(store, end->monomer, start->monomer); + verbose(2, "Pairing %s with %s\n", end->monomer->word, start->monomer->word); + } + } + } + } + +/* Ok, now have to just manufacture other side of orphan starts out of thin air. */ +for (start = orphanStarts; start != NULL; start = start->next) + { + if (start->paired) + continue; + struct monomer *startMono = start->monomer; + struct monomerType *startType = startMono->type; + if (startType == NULL) + continue; + + struct monomerType *newType = typeBefore(store, startType, 1); + struct monomer *newMono = pickRandomFromType(newType); + addReadOfTwo(store, newMono, startMono); + verbose(2, "Pairing new %s with start %s\n", newMono->word, startMono->word); + } + +/* Ok, now have to just manufacture other side of orphan ends out of thin air. */ +for (end = orphanEnds; end != NULL; end = end->next) + { + if (end->paired) + continue; + struct monomer *endMono = end->monomer; + struct monomerType *endType = endMono->type; + if (endType == NULL) + continue; + + struct monomerType *newType = typeAfter(store, endType, 1); + struct monomer *newMono = pickRandomFromType(newType); + addReadOfTwo(store, endMono, newMono); + verbose(2, "Pairing end %s with new %s\n", endMono->word, newMono->word); + } } void makeMarkovChains(struct alphaStore *store) /* Return a alphaStore containing all words, and also all chains-of-words of length * chainSize seen in file. */ { /* We'll build up the tree starting with an empty root node. */ struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, "")); int chainSize = store->maxChainSize; /* Loop through each read. There's special cases at the beginning and end of read, and for * short reads. In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize * words, * popping off one word from the start, and adding one word to the end for each * new word we encounter. This list is added to the tree each iteration. */ struct alphaRead *read; @@ -1067,44 +1161,34 @@ if (afterType != NULL) { if (bestType == NULL || afterPos < bestPos) { bestType = afterType; bestPos = afterPos; bestIsAfter = TRUE; } } } struct monomerType *chosenType = NULL; if (bestType != NULL) { - int bestIx = ptArrayIx(bestType, store->typeArray, store->typeCount); if (bestIsAfter) - { - bestIx -= bestPos; - if (bestIx < 0) - bestIx += store->typeCount; - } + chosenType = typeBefore(store, bestType, bestPos); else - { - bestIx += bestPos; - if (bestIx > store->typeCount) - bestIx -= store->typeCount; - } - chosenType = store->typeArray[bestIx]; + chosenType = typeAfter(store, bestType, bestPos); } return chosenType; } void fillInTypes(struct alphaStore *store) /* We have types for most but not all monomers. Try and fill out types for most of * the rest by looking for position in reads they are in that do contain known types */ { /* Do first pass on ones defined - just have to look them up in hash. */ struct monomer *monomer; for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) monomer->type = hashFindVal(store->typeHash, monomer->word); /* Do second pass on ones that are not yet defined */ @@ -1115,31 +1199,31 @@ struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store); verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a")); } } } void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) /* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */ { struct alphaStore *store = alphaStoreNew(maxChainSize); alphaReadListFromFile(readsFile, store); makeMarkovChains(store); struct wordTree *wt = store->markovChains; alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); fillInTypes(store); -matchUpOrphans(store); +integrateOrphans(store); alphaStoreNormalize(store, outSize); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); } wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile); if (optionExists("afterChain")) { char *fileName = optionVal("afterChain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); }