c0c04675b1d4865752984e71a0a6d9034a45d622 kent Tue Sep 11 16:11:30 2012 -0700 Adding missing types based on reads. diff --git src/kehayden/alphaAsm/alphaAsm.c src/kehayden/alphaAsm/alphaAsm.c index ca818d4..104f156 100644 --- src/kehayden/alphaAsm/alphaAsm.c +++ src/kehayden/alphaAsm/alphaAsm.c @@ -49,65 +49,70 @@ {NULL, 0}, }; /* Some structures to keep track of words (which correspond to alpha satellight monomers) * seen in input. */ struct monomer /* Basic information on a monomer including how many times it is seen in input and output * streams. Unlike the wordTree, this is flat, and does not include predecessors. */ { struct monomer *next; /* Next in list of all words. */ char *word; /* The word used to represent monomer. Not allocated here. */ int useCount; /* Number of times used. */ int outTarget; /* Number of times want to output word. */ int outCount; /* Number of times have output word so far. */ + struct monomerType *type; /* The type of the monomer. */ + struct slRef *readList; /* List of references to reads this is in. */ }; struct monomerRef /* A reference to a word. */ { struct monomerRef *next; /* Next in list */ struct monomer *val; /* The word referred to. */ }; struct monomerType /* A collection of words of the same type - or monomers of same type. */ { struct monomerType *next; /* Next monomerType */ + char *name; /* Short name of type */ struct monomerRef *list; /* List of all words of that type */ }; struct alphaRead /* A read that's been parsed into alpha variants. */ { struct alphaRead *next; /* Next in list */ char *name; /* Id of read, not allocated here. */ struct monomerRef *list; /* List of alpha subunits */ }; struct alphaStore /* Stores info on all words */ { struct monomer *monomerList; /* List of words, fairly arbitrary order. */ struct hash *monomerHash; /* Hash of monomer, keyed by word. */ struct alphaRead *readList; /* List of all reads. */ struct hash *readHash; /* Hash of all reads keyed by read ID. */ struct wordTree *markovChains; /* Tree of words that follow other words. */ int maxChainSize; /* Maximum depth of tree. */ struct monomerType *typeList; /* List of all types. */ struct hash *typeHash; /* Hash with monomerType values, keyed by all words. */ + int typeCount; /* Count of types. */ + struct monomerType **typeArray; /* Array as opposed to list of types */ }; /* The wordTree structure below is the central data structure for this program. It is * used to build up a tree that contains all observed N-word-long sequences observed in * the text, where N corresponds to the "size" command line option which defaults to 3, * an option that in turn is stored in the maxChainSize variable. At this chain size the * text * this is the black dog and the black cat * would have the chains * this is the * is the black * the black dog * black dog and * dog and the * and the black @@ -669,30 +674,45 @@ struct monomer *alphaStoreAddMonomer(struct alphaStore *store, char *word) /* Add word to store, incrementing it's useCount if it's already there, otherwise * making up a new record for it. */ { struct monomer *monomer = hashFindVal(store->monomerHash, word); if (monomer == NULL) { AllocVar(monomer); hashAddSaveName(store->monomerHash, word, monomer, &monomer->word); slAddHead(&store->monomerList, monomer); } monomer->useCount += 1; return monomer; } +void connectReadsToMonomers(struct alphaStore *store) +/* Add each read monomer appears in to monomer's readList. */ +{ +struct alphaRead *read; +for (read = store->readList; read != NULL; read = read->next) + { + struct monomerRef *ref; + for (ref = read->list; ref != NULL; ref = ref->next) + { + struct monomer *monomer = ref->val; + refAdd(&monomer->readList, read); + } + } +} + void alphaReadListFromFile(char *fileName, struct alphaStore *store) /* Read in file and turn it into a list of alphaReads. */ { /* Stuff for processing file a line at a time. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *word; /* Loop through each line of file making up a read for it. */ struct alphaRead *readList = NULL; while (lineFileNextReal(lf, &line)) { /* Parse out first word of line into name, and rest into list. */ char *name = nextWord(&line); struct monomerRef *list = NULL; while ((word = nextWord(&line)) != NULL) @@ -707,83 +727,96 @@ if (list == NULL) errAbort("Line %d of %s has no alpha monomers.", lf->lineIx, lf->fileName); /* Create data structure and add read to list and hash */ if (hashLookup(store->readHash, name)) errAbort("Read ID %s is repeated line %d of %s", name, lf->lineIx, lf->fileName); struct alphaRead *read; AllocVar(read); read->list = list; hashAddSaveName(store->readHash, name, read, &read->name); slAddHead(&readList, read); } slReverse(&readList); store->readList = readList; lineFileClose(&lf); +connectReadsToMonomers(store); } struct alphaOrphan /* Information about an orphan - something without great joining information. */ { struct alphaOrphan *next; struct monomer *monomer; struct monomerType *type; }; -struct alphaOrphan *findOrphanStarts(struct alphaRead *readList) +struct alphaOrphan *findOrphanStarts(struct alphaRead *readList, struct alphaStore *store) /* Return list of monomers that are found only at start of reads. */ { struct alphaOrphan *orphanList = NULL; /* Build up hash of things seen later in reads. */ struct alphaRead *read; struct hash *later = hashNew(0); /* Hash of monomers found later. */ for (read = readList; read != NULL; read = read->next) { struct monomerRef *ref; for (ref = read->list->next; ref != NULL; ref = ref->next) { hashAdd(later, ref->val->word, NULL); } } /* Pass through again collecting orphans. */ struct hash *orphanHash = hashNew(0); for (read = readList; read != NULL; read = read->next) { struct monomer *monomer = read->list->val; - if (!hashLookup(later, monomer->word)) + char *word = monomer->word; + if (!hashLookup(later, word)) { - struct alphaOrphan *orphan = hashFindVal(orphanHash, monomer->word); + struct alphaOrphan *orphan = hashFindVal(orphanHash, word); if (orphan == NULL) { AllocVar(orphan); orphan->monomer = monomer; slAddHead(&orphanList, orphan); - hashAdd(orphanHash, monomer->word, orphan); - uglyf("Adding orphan start %s\n", monomer->word); + hashAdd(orphanHash, word, orphan); + uglyf("Adding orphan start %s\n", word); + } + + /* Try and find orphan type by consulting either itself or previous items in read. */ + if (orphan->type == NULL) + { + struct monomerType *type = monomer->type; + if (type == NULL) + { + uglyf("Can't find type for %s\n", word); + } + else uglyf("%s is type %s\n", word, type->name); + orphan->type = type; } - else uglyf("Repeating orphan start %s\n", monomer->word); } } hashFree(&later); hashFree(&orphanHash); slReverse(&orphanList); return orphanList; } -struct alphaOrphan *findOrphanEnds(struct alphaRead *readList) +struct alphaOrphan *findOrphanEnds(struct alphaRead *readList, struct alphaStore *store) /* Return list of monomers that are found only at end of reads. */ { struct alphaOrphan *orphanList = NULL; /* Build up hash of things seen sooner in reads. */ struct alphaRead *read; struct hash *sooner = hashNew(0); /* Hash of monomers found sooner. */ for (read = readList; read != NULL; read = read->next) { struct monomerRef *ref; /* Loop over all but last. */ for (ref = read->list; ref->next != NULL; ref = ref->next) { hashAdd(sooner, ref->val->word, NULL); } @@ -807,32 +840,32 @@ uglyf("Adding orphan end %s\n", monomer->word); } else uglyf("Repeating orphan end %s\n", monomer->word); } } hashFree(&sooner); slReverse(&orphanList); hashFree(&orphanHash); return orphanList; } void matchUpOrphans(struct alphaStore *store) /* Make up fake reads that integrate orphans (monomers only found at beginning or end * of a read) better. */ { -struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList); -struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList); +struct alphaOrphan *orphanStarts = findOrphanStarts(store->readList, store); +struct alphaOrphan *orphanEnds = findOrphanEnds(store->readList, store); uglyf("orphanStarts has %d items, orphanEnds %d\n", slCount(orphanStarts), slCount(orphanEnds)); } void makeMarkovChains(struct alphaStore *store) /* Return a alphaStore containing all words, and also all chains-of-words of length * chainSize seen in file. */ { /* We'll build up the tree starting with an empty root node. */ struct wordTree *wt = store->markovChains = wordTreeNew(alphaStoreAddMonomer(store, "")); int chainSize = store->maxChainSize; /* Loop through each read. There's special cases at the beginning and end of read, and for * short reads. In the main case we'll be maintaining a chain (doubly linked list) of maxChainSize * words, * popping off one word from the start, and adding one word to the end for each * new word we encounter. This list is added to the tree each iteration. */ @@ -885,89 +918,227 @@ dlListFree(&chain); } wordTreeSort(wt); // Make output of chain file prettier } void wordTreeWrite(struct wordTree *wt, int maxChainSize, char *fileName) /* Write out tree to file */ { FILE *f = mustOpen(fileName, "w"); fprintf(f, "#level\tuseCount\toutTarget\toutCount\tnormVal\tmonomers\n"); wordTreeDump(0, wt, maxChainSize, f); carefulClose(&f); } void alphaStoreLoadMonomerOrder(struct alphaStore *store, char *readsFile, char *fileName) -/* Read in a file with one line for each monomer type, containing a word for each - * monomer variant. Requires all variants already be in store. The readsFile is passed +/* Read in a file with one line for each monomer type, where first word is type name, + * and rest of words are all the actually variants of that type. + * Requires all variants already be in store. The readsFile is passed * just for nicer error reporting. */ { /* Stuff for processing file a line at a time. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *word; +struct hash *uniq = hashNew(0); /* Set up variables we'll put results in in store. */ store->typeList = NULL; while (lineFileNextReal(lf, &line)) { + char *name = nextWord(&line); + if (hashLookup(uniq, name) != NULL) + errAbort("Type name '%s' repeated line %d of %s\n", name, lf->lineIx, lf->fileName); struct monomerType *type; AllocVar(type); + type->name = cloneString(name); slAddHead(&store->typeList, type); while ((word = nextWord(&line)) != NULL) { struct monomer *monomer = hashFindVal(store->monomerHash, word); if (monomer == NULL) errAbort("%s is in %s but not %s", word, lf->fileName, readsFile); struct monomerRef *ref; AllocVar(ref); ref->val = monomer; slAddHead(&type->list, ref); hashAddUnique(store->typeHash, word, type); } } slReverse(&store->typeList); lineFileClose(&lf); +hashFree(&uniq); verbose(2, "Added %d types containing %d words from %s\n", slCount(store->typeList), store->typeHash->elCount, fileName); + +/* Create type array */ +store->typeCount = slCount(store->typeList); +struct monomerType **types = AllocArray(store->typeArray, store->typeCount); +struct monomerType *type; +int i; +for (i=0, type = store->typeList; i<store->typeCount; ++i, type = type->next) + types[i] = type; } void monomerListNormalise(struct monomer *list, int totalCount, int outputSize) /* Set outTarget field in all of list to be normalized to outputSize */ { struct monomer *monomer; double scale = outputSize/totalCount; for (monomer = list; monomer != NULL; monomer = monomer->next) monomer->outTarget = round(scale * monomer->useCount); } void alphaStoreNormalize(struct alphaStore *store, int outputSize) /* Set up output counts on each word to make sure it gets it's share of output size */ { struct wordTree *wt = store->markovChains; wordTreeNormalize(wt, outputSize, 1.0); monomerListNormalise(store->monomerList, wt->useCount, outputSize); } +struct monomerType *fillInTypeFromReads(struct monomer *monomer, struct alphaStore *store) +/* Look through read list trying to find best supported type for this monomer. */ +{ +/* The algorithm here could be better. It picks the closest nearby monomer that is typed + * to fill in from. When have information from both before and after uses arbitrarily + * before preferentially. When have information that is equally close from multiple reads, + * just uses info from first read. Fortunately the data, at least on Y, shows a lot of + * consistency, so probably not actually worth a more sophisticated algorithm that could + * take the most popular choice if data were inconsistent. */ +verbose(2, "fillInTypeFromReads on %s from %d reads\n", monomer->word, slCount(monomer->readList)); +struct monomerType *bestType = NULL; +int bestPos = 0; +boolean bestIsAfter = FALSE; +struct slRef *readRef; +for (readRef = monomer->readList; readRef != NULL; readRef = readRef->next) + { + struct alphaRead *read = readRef->val; + struct monomerRef *ref; + { + verbose(2, " read %s (%d els):", read->name, slCount(read->list)); + for (ref = read->list; ref != NULL; ref = ref->next) + { + struct monomer *m = ref->val; + verbose(2, " %s", m->word); + } + verbose(2, "\n"); + } + struct monomerType *beforeType = NULL; + int beforePos = 0; + + /* Look before. */ + for (ref = read->list; ref != NULL; ref = ref->next) + { + struct monomer *m = ref->val; + if (m == monomer) + break; + if (m->type != NULL) + { + beforeType = m->type; + beforePos = 0; + } + ++beforePos; + } + + /* Look after. */ + struct monomerType *afterType = NULL; + int afterPos = 0; + assert(ref != NULL && ref->val == monomer); + for (ref = ref->next; ref != NULL; ref = ref->next) + { + struct monomer *m = ref->val; + ++afterPos; + if (m->type != NULL) + { + afterType = m->type; + break; + } + } + + if (beforeType != NULL) + { + if (bestType == NULL || beforePos < bestPos) + { + bestType = beforeType; + bestPos = beforePos; + bestIsAfter = FALSE; + } + } + + if (afterType != NULL) + { + if (bestType == NULL || afterPos < bestPos) + { + bestType = afterType; + bestPos = afterPos; + bestIsAfter = TRUE; + } + } + } + +struct monomerType *chosenType = NULL; +if (bestType != NULL) + { + int bestIx = ptArrayIx(bestType, store->typeArray, store->typeCount); + if (bestIsAfter) + { + bestIx -= bestPos; + if (bestIx < 0) + bestIx += store->typeCount; + } + else + { + bestIx += bestPos; + if (bestIx > store->typeCount) + bestIx -= store->typeCount; + } + chosenType = store->typeArray[bestIx]; + } + +return chosenType; +} + +void fillInTypes(struct alphaStore *store) +/* We have types for most but not all monomers. Try and fill out types for most of + * the rest by looking for position in reads they are in that do contain known types */ +{ +/* Do first pass on ones defined - just have to look them up in hash. */ +struct monomer *monomer; +for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) + monomer->type = hashFindVal(store->typeHash, monomer->word); + +/* Do second pass on ones that are not yet defined */ +for (monomer = store->monomerList; monomer != NULL; monomer = monomer->next) + { + if (monomer->type == NULL) + { + struct monomerType *type = monomer->type = fillInTypeFromReads(monomer, store); + verbose(2, "Got %p %s\n", type, (type != NULL ? type->name : "n/a")); + } + } +} + void alphaAsm(char *readsFile, char *monomerOrderFile, char *outFile) /* alphaAsm - Create Markov chain of words and optionally output chain in two formats. */ { struct alphaStore *store = alphaStoreNew(maxChainSize); alphaReadListFromFile(readsFile, store); makeMarkovChains(store); struct wordTree *wt = store->markovChains; alphaStoreLoadMonomerOrder(store, readsFile, monomerOrderFile); +fillInTypes(store); matchUpOrphans(store); alphaStoreNormalize(store, outSize); if (optionExists("chain")) { char *fileName = optionVal("chain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName); } wordTreeGenerateFile(store, store->maxChainSize, pickRandom(wt->children), outSize, outFile); if (optionExists("afterChain")) { char *fileName = optionVal("afterChain", NULL); wordTreeWrite(wt, store->maxChainSize, fileName);