29f88e9827e33081a9e4c4db1c6dcca6542f7ccd kent Sun Jun 19 22:29:39 2022 -0700 First cut of next generation liftOver chain maker. This one is meant to handle swapping in alternate haplotypes properly, even when, as is the case with the HLA locus in the t2t, more than one alt is moved. Still needs lots of testing and polishing, but finishes and makes something semi-plausable on t2t's chr6 going against hg38's chr6 + alts. diff --git src/hg/oneShot/testLiftRechainer/testLiftRechainer.c src/hg/oneShot/testLiftRechainer/testLiftRechainer.c new file mode 100644 index 0000000..890052d --- /dev/null +++ src/hg/oneShot/testLiftRechainer/testLiftRechainer.c @@ -0,0 +1,455 @@ +/* testLiftRechainer - Work out which chains to use for lift-over purposes using dynamic + * programming rather than greedy fill-in-the-gaps method of nets.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "dlist.h" +#include "options.h" +#include "sqlNum.h" +#include "chain.h" +#include "twoBit.h" +#include "obscure.h" +#include "dnaseq.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "testLiftRechainer - Work out which chains to use for lift-over purposes using dynamic \n" + "programming rather than greedy fill-in-the-gaps method of nets.\n" + "usage:\n" + " testLiftRechainer in.chain target.2bit query.2bit out.chain\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +struct chainTarget +/* All the chains associated with a target chromosome. */ + { + struct chainTarget *next; + char *name; /* Target sequence name */ + struct chain *chainList; /* Sorted by tStart */ + }; + +struct chainTarget *readChainTargets(char *fileName, struct hash *tHash, struct hash *qHash) +/* Return list of chainTargets read from file. Returned hash will be keyed by + * target name */ +{ +struct hash *hash = hashNew(0); +struct lineFile *lf = lineFileOpen(fileName, TRUE); +struct chainTarget *targetList = NULL, *target; + +struct chain *chain; +while ((chain = chainRead(lf)) != NULL) + { + char *tName = chain->tName; + target = hashFindVal(hash, tName); + if (target == NULL) + { + if (hashLookup(tHash, tName) == NULL) + errAbort("Target sequence %s is in chain but not target.2bit", tName); + AllocVar(target); + target->name = cloneString(tName); + hashAdd(hash, tName, target); + slAddHead(&targetList, target); + } + if (hashLookup(qHash, chain->qName) == NULL) + errAbort("Query sequence %s is in chain but not target.2bit", chain->qName); + slAddHead(&target->chainList, chain); + } + +/* Sort chains within all targets */ +for (target = targetList; target != NULL; target = target->next) + slSort(&target->chainList, chainCmpTarget); + +/* Clean up and return */ +hashFree(&hash); +lineFileClose(&lf); +slReverse(&targetList); +return targetList; +} + +struct crossover +/* List of positions where one chain flips to another */ + { + struct crossover *next; + int tPos; /* Position in target where crossover occurs */ + struct rechain *rechain; /* Rechain at this point */ + }; + +struct crossover *crossoverNew(int tPos, struct rechain *rechain) +/* Allocate and fill in a new crossover */ +{ +struct crossover *cross; +AllocVar(cross); +cross->tPos= tPos; +cross->rechain = rechain; +return cross; +} + + +struct rechain +/* An active chain. Lives on double-linked list */ + { + int tStart, tEnd; /* Position covered in target */ + struct chain *chain; /* Associated full chain */ + struct cBlock *curBlock; /* Current block */ + struct dlNode *node; /* dlNode associated with chain */ + DNA *tDna; /* Target DNA seq */ + DNA *qDna; /* Query DNA seq */ + long long score; /* Current score */ + long long lastScore; /* Score up to last position */ + struct crossover *crossList; /* List of crossings around chain */ + }; + +struct rechain *rechainZero(int tSize, struct dlList *list) +/* Make up a psuedo-chain that corresponds to no alignment to target */ +{ +struct rechain *rechain; +AllocVar(rechain); +rechain->tEnd = tSize; +rechain->node = dlAddValTail(list, rechain); +rechain->crossList = crossoverNew(0, rechain); +return rechain; +} + +const long long newChainPenalty = 500; + +int chainBlockTotalSize(struct chain *chain) +/* Add up total size of all blocks */ +{ +int total = 0; +struct cBlock *block; +for (block = chain->blockList; block != NULL; block = block->next) + total += block->tEnd - block->tStart; +return total; +} + +boolean chainOverMinScore(struct chain *chain, struct dnaSeq *tSeq, struct dnaSeq *qSeq, int minScore) +/* Add up total size of all blocks */ +{ +int total = 0; +DNA *tDna = tSeq->dna, *qDna = qSeq->dna; + +struct cBlock *block; +for (block = chain->blockList; block != NULL; block = block->next) + { + DNA *t = tDna + block->tStart; + DNA *q = qDna + block->qStart; + int blockSize = block->tEnd - block->tStart; + int i; + for (i=0; i<blockSize; ++i) + { + if (t[i] == q[i]) + total += 1; + else + total -= 1; + } + if (total > minScore) + return TRUE; + } +return FALSE; +} + +void addNewRechain(struct chain *chain, struct dnaSeq *tSeq, struct hash *qSeqHash, + struct hash *qRcSeqHash, struct rechain *bestExisting, long long bestExistingScore, + struct dlList *list) +/* Make up a real rechainer and put it on list */ +{ +/* First do some optimizations to see if chain could possibly help */ +int minScore = newChainPenalty; +if (bestExisting->tEnd > chain->tEnd) + minScore += newChainPenalty; // Will go back to old chain we assume + +if (chain->tEnd - chain->tStart <= minScore) + return; +if (chainBlockTotalSize(chain) <= minScore) + return; + +/* Get query sequence */ +struct dnaSeq *qSeq = NULL; +if (chain->qStrand == '+') + { + qSeq = hashMustFindVal(qSeqHash, chain->qName); + } +else + { + char *qName = chain->qName; + qSeq = hashFindVal(qRcSeqHash, qName); + if (qSeq == NULL) + { + qSeq = hashMustFindVal(qSeqHash, chain->qName); + qSeq = cloneDnaSeq(qSeq); + reverseComplement(qSeq->dna, qSeq->size); + hashAdd(qRcSeqHash, qName, qSeq); + } + } + +/* Do more chain scoreing */ +if (!chainOverMinScore(chain, tSeq, qSeq, minScore)) + return; + +struct rechain *rechain; +AllocVar(rechain); +rechain->tStart = chain->tStart; +rechain->tEnd = chain->tEnd; +rechain->tStart = chain->tStart; +rechain->tEnd = chain->tEnd; +rechain->chain = chain; +rechain->curBlock = chain->blockList; +rechain->tDna = tSeq->dna; +rechain->qDna = qSeq->dna; +rechain->node = dlAddValTail(list, rechain); +rechain->crossList = crossoverNew(chain->tStart, bestExisting); +rechain->score = bestExistingScore - newChainPenalty; +} + +int rechainId(struct rechain *rechain) +/* Return ID associated with rechain */ +{ +struct chain *chain = rechain->chain; +if (chain == NULL) + return 0; +else + return chain->id; +} + +struct chainPart +/* A part of a chain */ + { + struct chainPart *next; + struct chain *chain; + int tStart, tEnd; + }; + +void rechainOneTarget(struct chainTarget *target, struct dnaSeq *tSeq, struct hash *qSeqHash, + struct hash *qRcSeqHash, FILE *f) +/* Do rechaining on a single target */ +{ +int tSize = tSeq->size; + +/* Make rechain lists and put in the "no chain" on on the active list. */ +struct dlList *archiveList = dlListNew(); /* List of rechains no longer active */ +struct dlList *activeList = dlListNew(); /* List of active rechains. */ +rechainZero(tSize, activeList); + +struct chain *chain = target->chainList; +int tPos; +for (tPos=0; tPos < tSize; ++tPos) + { + /* Go through active list removing chains that we are past */ + DNA tBase = tSeq->dna[tPos]; + struct dlNode *el, *next; + for (el = activeList->head; !dlEnd(el); el = next) + { + next = el->next; + struct rechain *rechain = el->val; + if (tPos >= rechain->tEnd) + { + dlRemove(el); + dlAddTail(archiveList, el); + } + } + + /* Add new chains */ + struct rechain *bestChain = NULL; + long long bestScore = -BIGNUM; + /* Add new chains to active list */ + while (chain != NULL && chain->tStart == tPos) + { + if (bestChain == NULL) + { + struct dlNode *el; + for (el = activeList->head; !dlEnd(el); el = el->next) + { + struct rechain *rechain = el->val; + if (rechain->score > bestScore) + { + bestScore = rechain->score; + bestChain = rechain; + } + } + } + + addNewRechain(chain, tSeq, qSeqHash, qRcSeqHash, bestChain, bestScore, activeList); + chain = chain->next; + } + + /* Go through active list updating scores */ + for (el = activeList->head; !dlEnd(el); el = el->next) + { + /* Get rechain structure and point block to relevant one for current tPos */ + struct rechain *rechain = el->val; + struct cBlock *block = rechain->curBlock; + while (block != NULL && tPos >= block->tEnd) + { + block = block->next; + } + rechain->curBlock = block; + + /* Figure out score we get if we match current sequence */ + long long matchScore = 0; + DNA qBase = ' '; + if (block != NULL && block->tStart <= tPos) + { + int qIx = tPos - block->tStart + block->qStart; + qBase = rechain->qDna[qIx]; + if (qBase == tBase) + matchScore += 1; + else + matchScore -= 1; + } + + /* Find best previous sequence to link to */ + long long bestScore = -BIGNUM; + struct rechain *bestSource = NULL; + struct dlNode *el2; + for (el2 = activeList->head; !dlEnd(el2); el2 = el2->next) + { + struct rechain *rechain2 = el2->val; + long long newScore = rechain2->lastScore + matchScore; + if (rechain2 != rechain) + newScore -= newChainPenalty; + if (bestScore < newScore) + { + bestScore = newScore; + bestSource = rechain2; + } + } + + /* Update current score and links if any */ + rechain->score = bestScore; + if (rechain->crossList->rechain != bestSource) + { + struct crossover *cross = crossoverNew(tPos, bestSource); + slAddHead(&rechain->crossList, cross); + } + } + + /* Go through active list one last time updating last scores */ + for (el = activeList->head; !dlEnd(el); el = el->next) + { + struct rechain *rechain = el->val; + rechain->lastScore = rechain->score; + } + } + +verbose(2, "qRcSeqHash has %d elements\n", qRcSeqHash->elCount); +verbose(1, "%d chains on active list, %d on archive list\n", + dlCount(activeList), dlCount(archiveList)); + + +/* Find best scoreing chain that is still active at end */ +struct dlNode *el; +struct rechain *bestChain = NULL; +long long bestScore = -BIGNUM; +for (el = activeList->head; !dlEnd(el); el = el->next) + { + struct rechain *rechain = el->val; + if (rechain->score > bestScore) + { + bestChain = rechain; + bestScore = rechain->score; + verbose(1, "active %d: score %lld cross %d last from %d\n", rechainId(rechain), rechain->score, + slCount(rechain->crossList), rechainId(rechain->crossList->rechain)); + } + } + +/* Do traceback */ +struct chainPart *partList = NULL, *part; +struct rechain *rechain; +int tEnd = tSeq->size; +for (rechain = bestChain; rechain != NULL; ) + { + // uglyf("rechain id %d tEnd %d\n", rechainId(rechain), tEnd); + struct rechain *prev = NULL; + struct crossover *cross; + int tStart = 0; + for (cross = rechain->crossList; cross != NULL; cross = cross->next) + { + // uglyf(" crossing back to chain %d at %d\n", rechainId(cross->rechain), cross->tPos); + if (cross->tPos < tEnd && cross->rechain != rechain) + { + tStart = cross->tPos; + prev = cross->rechain; + rechain->crossList = cross; + break; + } + } + chain = rechain->chain; + if (chain != NULL) + { + AllocVar(part); + part->chain = chain; + part->tStart = tStart; + part->tEnd = tEnd; + slAddHead(&partList, part); + uglyf("chainId %d (%d-%d) qSeq (%s:%d-%d) [%s:%d-%d] %d\n", chain->id, chain->tStart, chain->tEnd, chain->qName, chain->qStart, chain->qEnd, chain->tName, tStart, tEnd, tEnd-tStart); + } + rechain = prev; + tEnd = tStart; + } +for (part = partList; part != NULL; part = part->next) + { + struct chain *chainToFree = NULL, *subchain; + chainSubsetOnT(part->chain, part->tStart, part->tEnd, &subchain, &chainToFree); + chainWrite(subchain, f); + chainFree(&chainToFree); + } +slFreeList(&partList); +} + +void testLiftRechainer(char *chainFile, char *tTwoBit, char *qTwoBit, char *out) +/* Rework chains into lift only chain */ +{ +dnaUtilOpen(); +struct hash *tSizeHash = twoBitChromHash(tTwoBit); +struct hash *qSizeHash = twoBitChromHash(qTwoBit); +struct chainTarget *targetList = readChainTargets(chainFile, tSizeHash, qSizeHash); +verbose(1, "Read %d chains targets to go against %d target sequences and %d query sequences\n", + slCount(targetList), tSizeHash->elCount, qSizeHash->elCount); + +/* Load up query DNA */ +struct dnaSeq *qSeqList = twoBitLoadAll(qTwoBit); +struct hash *qSeqHash = hashNew(0), *qRcSeqHash = hashNew(0); +struct dnaSeq *qSeq; +long long qTotal = 0; +for (qSeq = qSeqList; qSeq != NULL; qSeq = qSeq->next) + { + toLowerN(qSeq->dna, qSeq->size); + hashAdd(qSeqHash, qSeq->name, qSeq); + qTotal += qSeq->size; + } +verbose(1, "Read %lld query bases in %d sequences\n", qTotal, qSeqHash->elCount); + +/* Open output now that all input has been at least checked */ +FILE *f = mustOpen(out, "w"); + +struct twoBitFile *targetTwoBit = twoBitOpen(tTwoBit); +struct chainTarget *target; +for (target = targetList; target != NULL; target = target->next) + { + int tSize = hashIntVal(tSizeHash, target->name); + struct dnaSeq *tSeq = twoBitReadSeqFragLower(targetTwoBit, target->name, 0, tSize); + verbose(1, "Read %d bases in %s\n", tSeq->size, target->name); + rechainOneTarget(target, tSeq, qSeqHash, qRcSeqHash, f); + dnaSeqFree(&tSeq); + } +carefulClose(&f); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 5) + usage(); +testLiftRechainer(argv[1], argv[2], argv[3], argv[4]); +return 0; +}