0e15377650ad1a2f32303ac3323b6a998b9ff937 angie Mon Feb 3 14:14:20 2020 -0800 chainBridge: add -diffTolerance option; run bandExt in both directions in case best alignment is at end not start. refs #21650 When using chainBridge to extend through "gaps" in AGP correspondences, use large -diffTolerance, no need to require that t & q have similar sizes. diff --git src/hg/mouseStuff/chainBridge/chainBridge.c src/hg/mouseStuff/chainBridge/chainBridge.c index 3b6c270..9b236a2 100644 --- src/hg/mouseStuff/chainBridge/chainBridge.c +++ src/hg/mouseStuff/chainBridge/chainBridge.c @@ -1,401 +1,491 @@ /* chainBridge - Attempt to extend alignments through double-sided gaps of similar size. */ #include "common.h" #include "axt.h" #include "bandExt.h" #include "chainBlock.h" #include "chainConnect.h" #include "dnaseq.h" #include "gapCalc.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "twoBit.h" /* Command line parameters & defaults */ unsigned maxGapSize = 6000; double diffTolerance = 0.3; struct gapCalc *gapCalc = NULL; /* Gap scoring scheme to use. */ struct axtScoreScheme *scoreScheme = NULL; void usage() /* Explain usage and exit. */ { errAbort( "chainBridge - Attempt to extend alignments through double-sided gaps of similar size\n" "usage:\n" " chainBridge in.chain target.2bit query.2bit out.chain\n" "options:\n" + " -diffTolerance=D Don't try to extend when a 2-sided gap's longer side is Dx\n" + " longer than its shorter side (default: %0.1f, i.e. %d%% longer)\n" " -maxGap=N Maximum size of double-sided gap to try to bridge (default: %d)\n" " Note: there is no size limit for exact sequence matches\n" " -scoreScheme=fileName Read the scoring matrix from a blastz-format file\n" " -linearGap=<medium|loose|filename> Specify type of linearGap to use.\n" " loose is chicken/human linear gap costs.\n" " medium is mouse/human linear gap costs.\n" " Or specify a piecewise linearGap tab delimited file.\n" " (default: loose)\n" - , maxGapSize + , diffTolerance, (int)(diffTolerance * 100), maxGapSize ); } /* Command line validation table. */ static struct optionSpec options[] = { + { "diffTolerance", OPTION_DOUBLE }, { "maxGap", OPTION_INT }, { "scoreScheme", OPTION_STRING }, { "linearGap", OPTION_STRING }, { NULL, 0 }, }; struct chromStrandSeq /* Cache whole chromosome sequences on the strand specified. * Lots of mem, but quicker than thousands of twoBitReadSeqFrag's and simpler than * reversing tweaked coords and sequence; we can just use chain q coords as-is. */ { struct hash *seqHash; struct twoBitFile *tbf; }; static struct chromStrandSeq *chromStrandSeqNew(char *twoBitFileName) /* Create a cache for tbf sequences. */ { struct chromStrandSeq *css; AllocVar(css); css->seqHash = hashNew(0); css->tbf = twoBitOpen(twoBitFileName); return css; } static struct dnaSeq *chromStrandSeqGetDnaSeq(struct chromStrandSeq *css, char *chrom, char strand) /* Return the lowercased sequence of chrom on specified strand, '+' or '-'. Do not modify/free! */ { if (strand != '-' && strand != '+') errAbort("chromStrandSeqGetSeq: strand must be '+' or '-' not '%c'", strand); int nameLen = strlen(chrom) + 1; char chromStrand[nameLen+1]; safef(chromStrand, sizeof(chromStrand), "%s%c", chrom, strand); struct dnaSeq *seq = hashFindVal(css->seqHash, chromStrand); if (seq == NULL) { // See if we already have sequence for the other strand, if so just copy and rc char chromOtherStrand[nameLen+1]; safef(chromOtherStrand, sizeof(chromOtherStrand), "%s%c", chrom, (strand == '-' ? '+' : '-')); struct dnaSeq *otherStrandSeq = hashFindVal(css->seqHash, chromOtherStrand); if (otherStrandSeq) { seq = cloneDnaSeq(otherStrandSeq); reverseComplement(seq->dna, seq->size); } else { seq = twoBitReadSeqFragLower(css->tbf, chrom, 0, 0); if (strand == '-') reverseComplement(seq->dna, seq->size); } hashAdd(css->seqHash, chromStrand, seq); } return seq; } static char *chromStrandSeqGetSeq(struct chromStrandSeq *css, char *chrom, char strand) /* Return the lowercased sequence of chrom on specified strand, '+' or '-'. Do not modify/free! */ { struct dnaSeq *seq = chromStrandSeqGetDnaSeq(css, chrom, strand); return seq->dna; } static boolean canTrivialExtend(struct chain *chain, struct cBlock *blk, struct chromStrandSeq *tCss, struct chromStrandSeq *qCss, int *retFromStart, int *retFromEnd) /* Find the number of bases between blk and blk->next that are identical in t and q, * starting at the beginning of the gap and, if bases are left over, from the end of the gap. * Return TRUE if gap has equal size and completely identical sequence on t and q. */ { if (blk == NULL || blk->next == NULL) { *retFromStart = 0; *retFromEnd = 0; return FALSE; } int fromStart = 0, fromEnd = 0; int tGapStart = blk->tEnd, tGapEnd = blk->next->tStart; int qGapStart = blk->qEnd, qGapEnd = blk->next->qStart; int tGapSize = tGapEnd - tGapStart; int qGapSize = qGapEnd - qGapStart; int smaller = (tGapSize < qGapSize) ? tGapSize : qGapSize; if (smaller == 0) { *retFromStart = 0; *retFromEnd = 0; return FALSE; } else if (smaller < 0) errAbort("Negative gap length found: chain %d, t %s[%d,%d) q %s[%d,%d)", chain->id, chain->tName, tGapStart, tGapEnd, chain->qName, qGapStart, qGapEnd); char *tChrom = chromStrandSeqGetSeq(tCss, chain->tName, '+'); char *qChrom = chromStrandSeqGetSeq(qCss, chain->qName, chain->qStrand); char *tSeq = tChrom + tGapStart; char *qSeq = qChrom + qGapStart; while (fromStart < smaller && tSeq[fromStart] == qSeq[fromStart]) fromStart++; int basesAtEnd = smaller - fromStart; tSeq = tChrom + tGapEnd - basesAtEnd; qSeq = qChrom + qGapEnd - basesAtEnd; while (fromEnd < basesAtEnd && tSeq[basesAtEnd-1-fromEnd] == qSeq[basesAtEnd-1-fromEnd]) fromEnd++; *retFromStart = fromStart; *retFromEnd = fromEnd; return (fromStart == tGapSize && tGapSize == qGapSize); } static boolean tryTrivialExtend(struct chain *chain, struct cBlock *blk, struct chromStrandSeq *tCss, struct chromStrandSeq *qCss, struct cBlock **retNextBlk) /* Sometimes double-sided gaps of equal length have identical sequence on t and q -- * in that trivial case, just merge blk and blk->next and see if the next gap is trivial too. * If a double-sided gap is not trivial but has identical sequence at the start and/or end, * adjust blk and blk->next coords to shrink the gap to just the mismatching sequence. * Return TRUE if any changes were made. */ { boolean changed = FALSE; struct cBlock *nextBlk = blk->next; int fromStart = 0, fromEnd = 0; while (canTrivialExtend(chain, blk, tCss, qCss, &fromStart, &fromEnd)) { // The most trivial case: entire gap matches perfectly on t and q. // Merge nextBlk into blk, repeat to see if we can keep merging. blk->tEnd = nextBlk->tEnd; blk->qEnd = nextBlk->qEnd; blk->next = nextBlk->next; freeMem(nextBlk); nextBlk = blk->next; changed = TRUE; } // Not completely trivial, but we can shave some bases off the beginning and/or // end of the gap. if (fromStart > 0) { blk->tEnd += fromStart; blk->qEnd += fromStart; changed = TRUE; } if (fromEnd > 0) { nextBlk->tStart -= fromEnd; nextBlk->qStart -= fromEnd; changed = TRUE; } *retNextBlk = nextBlk; return changed; } static boolean canExtend(struct cBlock *blk) /* Return TRUE if blk is followed by a gap of similar size on t and q and not too large. */ { if (blk == NULL || blk->next == NULL) return FALSE; int tGapSize = blk->next->tStart - blk->tEnd; int qGapSize = blk->next->qStart - blk->qEnd; if (tGapSize == 0 || qGapSize == 0) return FALSE; int smaller, larger; if (tGapSize < qGapSize) { smaller = tGapSize; larger = qGapSize; } else { smaller = qGapSize; larger = tGapSize; } if (smaller > maxGapSize) { verbose(2, "gap size %d is too big\n", smaller); return FALSE; } double sizeRatio = (double)larger / (double)smaller; verbose(2, "tGapSize=%d, qGapSize=%d, sizeRatio=%f\n", tGapSize, qGapSize, sizeRatio); if (sizeRatio < 1.0 + diffTolerance) return TRUE; return FALSE; } static boolean maybeMergeBlocks(struct cBlock *blk0, struct cBlock *blk1) /* Merge blk1 into blk0 if possible. I.e. if blk1's t and q start at the end of blk0 * then update blk0's t and q end to blk1's t and q end, and return TRUE. * blk0 and blk1 must be on same t and q chroms and blk1 must start after blk0 on t and q. */ { if (blk0 && blk1) { int tOverlap = blk0->tEnd - blk1->tStart; int qOverlap = blk0->qEnd - blk1->qStart; if (tOverlap == qOverlap && tOverlap >= 0 && qOverlap >= 0) { blk0->tEnd = blk1->tEnd; blk0->qEnd = blk1->qEnd; return TRUE; } } return FALSE; } static void trimAndAddBlock(struct cBlock **pBlockList, struct cBlock *blk) /* Trim overlap between blk and head of *pBlockList, then add blk to head if anything is left. */ { struct cBlock *curBlk = *pBlockList; if (curBlk != NULL) { int overlap = curBlk->tEnd - blk->tStart; if (overlap > 0) { blk->tStart = curBlk->tEnd; blk->qStart += overlap; } overlap = curBlk->qEnd - blk->qStart; if (overlap > 0) { blk->tStart += overlap; blk->qStart = curBlk->qEnd; } } if (blk->tEnd > blk->tStart && blk->qEnd > blk->qStart) slAddHead(pBlockList, blk); } +static int countNonGap(char *s) +/* Return the number of non-gap characters in s. */ +{ +int nonGap = 0; +while (*s != '\0') + if (*s++ != '-') + nonGap++; +return nonGap; +} + +static void fillInFromBlocks(struct chain *chainNew, struct chain *chainOld, struct cBlock *blocks) +/* Fill in chainNew using seq data from chainOld and coord data from blocks (don't free anything). */ +{ +ZeroVar(chainNew); +struct cBlock *lastBlk = blocks; +while (lastBlk->next) + lastBlk = lastBlk->next; +chainNew->blockList = blocks; +chainNew->tName = chainOld->tName; +chainNew->tSize = chainOld->tSize; +chainNew->tStart = blocks->tStart; +chainNew->tEnd = lastBlk->tEnd; +chainNew->qName = chainOld->qName; +chainNew->qSize = chainOld->qSize; +chainNew->qStrand = chainOld->qStrand; +chainNew->qStart = blocks->qStart; +chainNew->qEnd = lastBlk->qEnd; +chainNew->id = chainOld->id; +} + +static struct cBlock *extendThroughGap(struct chain *chain, struct cBlock *blk, + struct chromStrandSeq *tCss, struct chromStrandSeq *qCss) +/* Use bandExt (both forward and reverse) to extend the alignment through the gap after blk, + * if possible. Return alignment block(s) for best attempt. */ +{ +struct cBlock *extBlocks = NULL; +// Provide a little context for bandExt input sequences: +int overlap = 5; +// Alignment parameters for bandExt: +boolean global = FALSE; +int bandExtForward = 1, bandExtReverse = -1; +int maxInsert = maxGapSize/10; +int symAlloc = 2 * maxGapSize; +// Get gap sequence and coords for alignment. +struct dnaSeq *tSeq = chromStrandSeqGetDnaSeq(tCss, chain->tName, '+'); +struct dnaSeq *qSeq = chromStrandSeqGetDnaSeq(qCss, chain->qName, chain->qStrand); +int tGapStart = blk->tEnd, tGapEnd = blk->next->tStart; +int qGapStart = blk->qEnd, qGapEnd = blk->next->qStart; +int tAliStart = tGapStart - overlap, tAliEnd = tGapEnd + overlap; +int qAliStart = qGapStart - overlap, qAliEnd = qGapEnd + overlap; +if (tAliStart < 0 || tAliEnd > chain->tSize || qAliStart < 0 || qAliEnd > chain->qSize) + errAbort("chainBridgeOne: program error, need more sophisticated overlap arithmetic"); +// Align both forward and reverse since bandExt won't find a good match way at the other end +// past a long gap (it's meant for seeded alignments). +char tSymF[symAlloc+1], qSymF[symAlloc+1], tSymR[symAlloc+1], qSymR[symAlloc+1]; +int symCountF = 0, symCountR = 0; +struct cBlock *extBlocksF = NULL, *extBlocksR = NULL; +if (bandExt(global, scoreScheme, maxInsert, tSeq->dna + tAliStart, tAliEnd - tAliStart, + qSeq->dna + qAliStart, qAliEnd - qAliStart, + bandExtForward, symAlloc, &symCountF, tSymF, qSymF, NULL, NULL)) + extBlocksF = cBlocksFromAliSym(symCountF, qSymF, tSymF, qAliStart, tAliStart); +if (bandExt(global, scoreScheme, maxInsert, tSeq->dna + tAliStart, tAliEnd - tAliStart, + qSeq->dna + qAliStart, qAliEnd - qAliStart, + bandExtReverse, symAlloc, &symCountR, tSymR, qSymR, NULL, NULL)) + { + // tSymR and qSymR are anchored at the end, not the start, of aligned sequences. + int tSymStart = tAliEnd - countNonGap(tSymR); + int qSymStart = qAliEnd - countNonGap(qSymR); + extBlocksR = cBlocksFromAliSym(symCountR, qSymR, tSymR, qSymStart, tSymStart); + } +if (extBlocksF == NULL || (extBlocksF->next == NULL && extBlocksF->tEnd == tGapStart)) + { + extBlocks = extBlocksR; + slFreeList(&extBlocksF); + } +else if (extBlocksR == NULL || (extBlocksR->next == NULL && extBlocksR->tStart == tGapEnd)) + { + extBlocks = extBlocksF; + slFreeList(&extBlocksR); + } +else + { + // Something was found in both directions. Treat both as chains; compare bounds and scores. + struct chain chainF, chainR; + fillInFromBlocks(&chainF, chain, extBlocksF); + fillInFromBlocks(&chainR, chain, extBlocksR); + if (chainR.tStart >= chainF.tEnd && chainR.qStart >= chainF.qEnd) + extBlocks = slCat(extBlocksF, extBlocksR); + else + { + // Could look for a crossover point but nah - just use the higher scoring chain. + chainF.score = chainCalcScore(&chainF, scoreScheme, gapCalc, qSeq, tSeq); + chainR.score = chainCalcScore(&chainR, scoreScheme, gapCalc, qSeq, tSeq); + if (chainF.score >= chainR.score) + { + extBlocks = extBlocksF; + slFreeList(&extBlocksR); + } + else + { + extBlocks = extBlocksR; + slFreeList(&extBlocksF); + } + } + } +return extBlocks; +} + static void chainBridgeOne(struct chain *chain, struct chromStrandSeq *tCss, struct chromStrandSeq *qCss, FILE *outF) /* Iterate through gaps; merge blocks separated by double-sided gaps with identical sequence * on t and q. When we find a double-sided gap that is approximately the same size on t and q, * try to bridge the gap with a banded S-W alignment. */ { if (chain == NULL || chain->blockList == NULL || chain->blockList->next == NULL) return; -// Provide a little context for bandExt input sequences: -int overlap = 5; -// Alignment parameters for bandExt: -boolean global = FALSE; -int dir = 1; // +1 for forward, -1 for reverse -int maxInsert = maxGapSize/10; -int symAlloc = 2 * maxGapSize; -int symCount = 0; boolean changed = FALSE; struct cBlock *newBlockList = NULL; struct cBlock *blk = chain->blockList, *nextBlk = blk->next; for ( ; blk != NULL; blk = nextBlk) { nextBlk = blk->next; changed |= tryTrivialExtend(chain, blk, tCss, qCss, &nextBlk); if (canExtend(blk)) { - char *tChrom = chromStrandSeqGetSeq(tCss, chain->tName, '+'); - char *qChrom = chromStrandSeqGetSeq(qCss, chain->qName, chain->qStrand); - int tGapStart = blk->tEnd, tGapEnd = blk->next->tStart; - int qGapStart = blk->qEnd, qGapEnd = blk->next->qStart; - int tAliStart = tGapStart - overlap, tAliEnd = tGapEnd + overlap; - int qAliStart = qGapStart - overlap, qAliEnd = qGapEnd + overlap; - if (tAliStart < 0 || tAliEnd > chain->tSize || qAliStart < 0 || qAliEnd > chain->qSize) - errAbort("chainBridgeOne: program error, need more sophisticated overlap arithmetic"); - char tSym[symAlloc+1], qSym[symAlloc+1]; - if (bandExt(global, scoreScheme, maxInsert, tChrom + tAliStart, tAliEnd - tAliStart, - qChrom + qAliStart, qAliEnd - qAliStart, - dir, symAlloc, &symCount, tSym, qSym, NULL, NULL)) - { - struct cBlock *extBlocks = cBlocksFromAliSym(symCount, qSym, tSym, - qAliStart, tAliStart); + struct cBlock *extBlocks = extendThroughGap(chain, blk, tCss, qCss); if (maybeMergeBlocks(blk, extBlocks)) { // The first extended block merged with blk; if there was only one ext block, // can we also merge in nextBlk? freeMem(slPopHead(&extBlocks)); if (!extBlocks && maybeMergeBlocks(blk, nextBlk)) { // A single extBlock completely bridged the gap between blk and nextBlk. // Splice out nextBlk and repeat with blk. blk->next = nextBlk->next; freeMem(nextBlk); nextBlk = blk; + changed = TRUE; continue; } } // Done with blk. trimAndAddBlock(&newBlockList, blk); if (extBlocks) { // Add additional extBlocks before the final extBlock while (extBlocks->next) trimAndAddBlock(&newBlockList, slPopHead(&extBlocks)); if (maybeMergeBlocks(extBlocks, nextBlk)) { // The final extended block merged with nextBlk; splice out nextBlk // and repeat with final extended block. extBlocks->next = nextBlk->next; freeMem(nextBlk); nextBlk = extBlocks; } else { // The final extended block didn't merge with nextBlk. Done with final // extended block, on to nextBlk. trimAndAddBlock(&newBlockList, extBlocks); } } changed = TRUE; - continue; - } } + else trimAndAddBlock(&newBlockList, blk); } slReverse(&newBlockList); chain->blockList = newBlockList; if (changed) { // Update chain score. struct dnaSeq *tSeq = chromStrandSeqGetDnaSeq(tCss, chain->tName, '+'); struct dnaSeq *qSeq = chromStrandSeqGetDnaSeq(qCss, chain->qName, chain->qStrand); chain->score = chainCalcScore(chain, scoreScheme, gapCalc, qSeq, tSeq); } } void chainBridge(char *inFile, char *tTwoBitFile, char *qTwoBitFile, char *outFile) /* chainBridge - Attempt to extend alignments through double-sided gaps of similar size. */ { struct lineFile *lf = lineFileOpen(inFile, TRUE); struct chromStrandSeq *tCss = chromStrandSeqNew(tTwoBitFile); struct chromStrandSeq *qCss = chromStrandSeqNew(qTwoBitFile); FILE *outF = mustOpen(outFile, "w"); axtScoreSchemeDnaWrite(scoreScheme, outF, "chainBridge"); struct chain *chain, *chainList = NULL; while ((chain = chainRead(lf)) != NULL) { chainBridgeOne(chain, tCss, qCss, outF); slAddHead(&chainList, chain); } lineFileClose(&lf); // Since scores may have changed, re-sort to maintain score order (required by chainNet). slReverse(&chainList); slSort(&chainList, chainCmpScore); for (chain = chainList; chain != NULL; chain = chain->next) chainWrite(chain, outF); carefulClose(&outF); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); +diffTolerance = optionDouble("diffTolerance", diffTolerance); maxGapSize = optionInt("maxGap", maxGapSize); char *gapFileName = optionVal("linearGap", NULL); if (isNotEmpty(gapFileName)) gapCalc = gapCalcFromFile(gapFileName); else gapCalc = gapCalcDefault(); char *scoreSchemeName = optionVal("scoreScheme", NULL); if (isNotEmpty(scoreSchemeName)) scoreScheme = axtScoreSchemeRead(scoreSchemeName); else scoreScheme = axtScoreSchemeDefault(); chainBridge(argv[1], argv[2], argv[3], argv[4]); return 0; }