6557cfc305545ee5c43930fb7548b3310061c2d6 kent Wed Jun 22 20:32:53 2022 -0700 First full genome functioning build of dynamic programming version to pick best chains to use for liftOver purposes out of a a pairwise genome alignment to allow best liftovers from query to target maximizing coverage of target while minimizing coverage of query. An improvement over the greedy algorithm used in netChain/chainNetSubset. diff --git src/hg/oneShot/testLiftRechainer/testLiftRechainer.c src/hg/oneShot/testLiftRechainer/testLiftRechainer.c index 1cb65435..29bd0fa 100644 --- src/hg/oneShot/testLiftRechainer/testLiftRechainer.c +++ src/hg/oneShot/testLiftRechainer/testLiftRechainer.c @@ -1,79 +1,95 @@ /* testLiftRechainer - Work out which chains to use for lift-over purposes using dynamic * programming rather than greedy fill-in-the-gaps method of nets.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "dlist.h" #include "options.h" #include "sqlNum.h" #include "chain.h" #include "twoBit.h" #include "obscure.h" #include "dnaseq.h" +boolean noAlt = FALSE, sameChrom = FALSE; + void usage() /* Explain usage and exit. */ { errAbort( "testLiftRechainer - Work out which chains to use for lift-over purposes using dynamic \n" "programming rather than greedy fill-in-the-gaps method of nets.\n" "usage:\n" " testLiftRechainer in.chain target.2bit query.2bit out.chain\n" "options:\n" - " -xxx=XXX\n" + " -noAlt - don't include chains involving _alt or _fix chromosome fragments\n" + " -sameChrom - only include chains from same chromosome\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { + {"noAlt", OPTION_BOOLEAN}, + {"sameChrom", OPTION_BOOLEAN}, {NULL, 0}, }; struct chainTarget /* All the chains associated with a target chromosome. */ { struct chainTarget *next; char *name; /* Target sequence name */ struct chain *chainList; /* Sorted by tStart */ + struct chain *bigChain; /* Biggest scoring chain on list */ }; +boolean isAltName(char *name) +/* Return true if ends with _alt or _fix */ +{ +return endsWith(name, "_alt") || endsWith(name, "_fix"); +} + struct chainTarget *readChainTargets(char *fileName, struct hash *tHash, struct hash *qHash) /* Return list of chainTargets read from file. Returned hash will be keyed by * target name */ { struct hash *hash = hashNew(0); struct lineFile *lf = lineFileOpen(fileName, TRUE); struct chainTarget *targetList = NULL, *target; struct chain *chain; while ((chain = chainRead(lf)) != NULL) { + if (noAlt && (isAltName(chain->qName) || isAltName(chain->tName))) + continue; char *tName = chain->tName; target = hashFindVal(hash, tName); if (target == NULL) { if (hashLookup(tHash, tName) == NULL) errAbort("Target sequence %s is in chain but not target.2bit", tName); AllocVar(target); target->name = cloneString(tName); hashAdd(hash, tName, target); slAddHead(&targetList, target); } if (hashLookup(qHash, chain->qName) == NULL) errAbort("Query sequence %s is in chain but not target.2bit", chain->qName); slAddHead(&target->chainList, chain); + if (target->bigChain == NULL || chain->score > target->bigChain->score) + target->bigChain = chain; } /* Sort chains within all targets */ for (target = targetList; target != NULL; target = target->next) slSort(&target->chainList, chainCmpTarget); /* Clean up and return */ hashFree(&hash); lineFileClose(&lf); slReverse(&targetList); return targetList; } struct crossover /* List of positions where one chain flips to another */ @@ -267,45 +283,48 @@ next = el->next; struct rechain *rechain = el->val; if (tPos >= rechain->tEnd) { dlRemove(el); dlAddTail(archiveList, el); } } /* Add new chains */ struct rechain *bestChain = NULL; long long bestScore = -BIGNUM; /* Add new chains to active list */ while (chain != NULL && chain->tStart == tPos) { + if (!sameChrom || sameString(chain->qName, target->bigChain->qName)) + { if (bestChain == NULL) { struct dlNode *el; for (el = activeList->head; !dlEnd(el); el = el->next) { struct rechain *rechain = el->val; if (rechain->score > bestScore) { bestScore = rechain->score; bestChain = rechain; } } } addNewRechain(chain, tSeq, qSeqHash, qRcSeqHash, bestChain, bestScore, activeList); + } chain = chain->next; } /* Go through active list updating scores */ for (el = activeList->head; !dlEnd(el); el = el->next) { /* Get rechain structure and point block to relevant one for current tPos */ struct rechain *rechain = el->val; struct cBlock *block = rechain->curBlock; while (block != NULL && tPos >= block->tEnd) { block = block->next; } rechain->curBlock = block; @@ -442,33 +461,35 @@ toLowerN(qSeq->dna, qSeq->size); hashAdd(qSeqHash, qSeq->name, qSeq); qTotal += qSeq->size; } verbose(1, "Read %lld query bases in %d sequences\n", qTotal, qSeqHash->elCount); /* Open output now that all input has been at least checked */ FILE *f = mustOpen(out, "w"); struct twoBitFile *targetTwoBit = twoBitOpen(tTwoBit); struct chainTarget *target; for (target = targetList; target != NULL; target = target->next) { int tSize = hashIntVal(tSizeHash, target->name); struct dnaSeq *tSeq = twoBitReadSeqFragLower(targetTwoBit, target->name, 0, tSize); - verbose(1, "%s has %d bases and %d chains\n", - target->name, tSeq->size, slCount(target->chainList)); + verbose(1, "%s has %d bases and %d chains, biggest going to %s\n", + target->name, tSeq->size, slCount(target->chainList), target->bigChain->qName); rechainOneTarget(target, tSeq, qSeqHash, qRcSeqHash, f); dnaSeqFree(&tSeq); } carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); +noAlt = optionExists("noAlt"); +sameChrom = optionExists("sameChrom"); testLiftRechainer(argv[1], argv[2], argv[3], argv[4]); return 0; }