02336754147822f5aa61ba13277123b2cc629001 markd Thu May 20 08:38:55 2021 -0700 Moved pslMap, pslMapPostChain, pslRc, pslSwap to src/utils, as they do not have hg/lib dependencies. diff --git src/utils/pslMapPostChain/pslMapPostChain.c src/utils/pslMapPostChain/pslMapPostChain.c new file mode 100644 index 0000000..13e6d74 --- /dev/null +++ src/utils/pslMapPostChain/pslMapPostChain.c @@ -0,0 +1,323 @@ +/* pslMapPostChain - Post genomic pslMap (TransMap) chaining. */ + +#include "common.h" +#include "linefile.h" +#include "options.h" +#include "hash.h" +#include "psl.h" +#include + +void usage() +/* Explain usage and exit. */ +{ +errAbort("pslMapPostChain - Post genomic pslMap (TransMap) chaining.\n" + "usage:\n" + " pslMapPostChain [options] inPsl outPsl\n" + "\n" + "Post genomic pslMap (TransMap) chaining. This takes transcripts\n" + "that have been mapped via genomic chains adds back in\n" + "blocks that didn't get include in genomic chains due\n" + "to complex rearrangements or other issues.\n" + "\n" + "This program has not seen much use and may not do what you want\n"); +} + +static struct optionSpec optionSpecs[] = +{ + {"h", OPTION_BOOLEAN}, + {"help", OPTION_BOOLEAN}, + {NULL, 0} +}; +/* max distance to attempt to chain over */ +static int maxAdjacentDistance = 50*1000*1000; // 50mb + +/* special status used to in place of chain */ +static const int CHAIN_NOT_HERE = -1; +static const int CHAIN_CAN_NOT = -2; + +static struct hash* loadPslByQname(char* inPslFile) +/* load PSLs in to hash by qName. Make sure target strand is positive + * to make process easier later. */ +{ +struct hash* pslsByQName = hashNew(0); +struct psl *psls = pslLoadAll(inPslFile); +struct psl *psl; +while ((psl = slPopHead(&psls)) != NULL) + { + if (pslTStrand(psl) != '+') + pslRc(psl); + struct hashEl *hel = hashStore(pslsByQName, psl->qName); + struct psl** queryPsls = (struct psl**)&hel->val; + slAddHead(queryPsls, psl); + } +return pslsByQName; +} + +static int pslTSpan(const struct psl* psl) +/* get target span */ +{ +return psl->tEnd - psl->tStart; +} + +static int pslCmpTargetAndStrandSpan(const void *va, const void *vb) +/* Compare to sort based on target, strand, tStart, and then span */ +{ +const struct psl *a = *((struct psl **)va); +const struct psl *b = *((struct psl **)vb); +int dif; +dif = strcmp(a->tName, b->tName); +if (dif == 0) + dif = strcmp(a->strand, b->strand); +if (dif == 0) + dif = -(pslTSpan(a) - pslTSpan(b)); +return dif; +} + +static char normStrand(char strand) +/* return strand as stored in psl, converting implicit `\0' to `+' */ +{ +return (strand == '\0') ? '+' : strand; +} + +static unsigned pslQStartStrand(struct psl *psl, int blkIdx, char strand) +/* return query start for the given block, mapped to specified strand, + * which can be `\0' for `+' */ +{ +if (psl->strand[0] == normStrand(strand)) + return psl->qStarts[blkIdx]; +else + return psl->qSize - pslQEnd(psl, blkIdx); +} + +static unsigned pslQEndStrand(struct psl *psl, int blkIdx, char strand) +/* return query end for the given block, mapped to specified strand, + * which can be `\0' for `+' */ +{ +if (psl->strand[0] == normStrand(strand)) + return pslQEnd(psl, blkIdx); +else + return psl->qSize - pslQStart(psl, blkIdx); +} + +static unsigned pslTStartStrand(struct psl *psl, int blkIdx, char strand) +/* return target start for the given block, mapped to specified strand, + * which can be `\0' for `+' */ +{ +if (normStrand(psl->strand[1]) == normStrand(strand)) + return psl->tStarts[blkIdx]; +else + return psl->tSize - pslTEnd(psl, blkIdx); +} + +static unsigned pslTEndStrand(struct psl *psl, int blkIdx, char strand) +/* return target end for the given block, mapped to specified strand, + * which can be `\0' for `+' */ +{ +if (normStrand(psl->strand[1]) == normStrand(strand)) + return pslTEnd(psl, blkIdx); +else + return psl->tSize - pslTStart(psl, blkIdx); +} + +static int findChainPointUpstream(struct psl* chainedPsl, + struct psl* nextPsl) +/* findChainPoint upstream check. */ +{ +// check next being query upstream of chain +if ((pslQEnd(nextPsl, nextPsl->blockCount-1) <= pslQStart(chainedPsl, 0)) + && (pslTEnd(nextPsl, nextPsl->blockCount-1) <= pslTStart(chainedPsl, 0))) + { + if ((pslTStart(chainedPsl, 0) - pslTEnd(nextPsl, nextPsl->blockCount-1)) > maxAdjacentDistance) + return CHAIN_CAN_NOT; // too far before + else + return 0; + } +else + return CHAIN_NOT_HERE; +} + +static int findChainPointDownstream(struct psl* chainedPsl, + struct psl* nextPsl) +/* findChainPoint downstream check. */ +{ +if ((pslQStart(nextPsl, 0) >= pslQEnd(chainedPsl, chainedPsl->blockCount-1)) + && (pslTStart(nextPsl, 0) >= pslTEnd(chainedPsl, chainedPsl->blockCount-1))) + { + if ((pslTStart(nextPsl, 0) - pslTEnd(chainedPsl, chainedPsl->blockCount-1)) > maxAdjacentDistance) + return CHAIN_CAN_NOT; // too far after + else + return chainedPsl->blockCount; + } +else + return CHAIN_NOT_HERE; +} + +static int isBetweenBlocks(struct psl* chainedPsl, + struct psl* nextPsl, + int insertIdx) +/* does nextPsl fit between two chained PSL blocks */ +{ +return (pslQEnd(chainedPsl, insertIdx-1) <= pslQStart(nextPsl, 0)) + && (pslTEnd(chainedPsl, insertIdx-1) <= pslTStart(nextPsl, 0)) + && (pslQEnd(nextPsl, nextPsl->blockCount-1) <= pslQStart(chainedPsl, insertIdx)) + && (pslTEnd(nextPsl, nextPsl->blockCount-1) <= pslTStart(chainedPsl, insertIdx)); +} + +static int findChainPointInternal(struct psl* chainedPsl, + struct psl* nextPsl) +/* find internal insertion point between block of chainedPsl. */ +{ +int insertIdx; +for (insertIdx = 1; insertIdx < chainedPsl->blockCount; insertIdx++) + { + if (isBetweenBlocks(chainedPsl, nextPsl, insertIdx)) + return insertIdx; + } +return CHAIN_CAN_NOT; +} + +static int findChainPoint(struct psl* chainedPsl, + struct psl* nextPsl) +/* Return the position in the chained PSL where a PSL can be added. This + * returns the block-index of where to insert the PSL, blockCount if to added + * after, or one of the negative constants if it should not be chained. + * assumes it chainedPsl has longest target span so we only look for insert + * in chainPsl and not the other way around. */ +{ +if (!(sameString(chainedPsl->tName, nextPsl->tName) + && sameString(chainedPsl->strand, nextPsl->strand))) + return CHAIN_CAN_NOT; // not on same seq/strand +int insertIdx = findChainPointUpstream(chainedPsl, nextPsl); +if (insertIdx != CHAIN_NOT_HERE) + return insertIdx; +insertIdx = findChainPointDownstream(chainedPsl, nextPsl); +if (insertIdx != CHAIN_NOT_HERE) + return insertIdx; +insertIdx = findChainPointInternal(chainedPsl, nextPsl); +return insertIdx; +} + +static void pslArrayInsert(unsigned **destArray, + int numDestBlocks, + unsigned *srcArray, + int numSrcBlocks, + int insertIdx) +/* Grow one of the psl arrays and move contents down + * to make room, and copy new entries */ +{ +*destArray = needMoreMem(*destArray, numDestBlocks*sizeof(unsigned), (numDestBlocks+numSrcBlocks)*sizeof(unsigned)); +int iBlk; +for (iBlk = numDestBlocks-1; iBlk >= insertIdx; iBlk--) + { + assert(iBlk+numSrcBlocks < numDestBlocks+numSrcBlocks); + (*destArray)[iBlk+numSrcBlocks] = (*destArray)[iBlk]; + } +for (iBlk = 0; iBlk < numSrcBlocks; iBlk++) + { + assert(iBlk+insertIdx < numDestBlocks+numSrcBlocks); + (*destArray)[iBlk+insertIdx] = srcArray[iBlk]; + } +} + +static void addPslToChained(struct psl* chainedPsl, + struct psl* nextPsl, + int insertIdx) +/* add blocks form a psl to the chained psl at the given point */ +{ +assert(insertIdx <= chainedPsl->blockCount); +// expand arrays and copy in entries +pslArrayInsert(&chainedPsl->blockSizes, chainedPsl->blockCount, nextPsl->blockSizes, nextPsl->blockCount, insertIdx); +pslArrayInsert(&chainedPsl->qStarts, chainedPsl->blockCount, nextPsl->qStarts, nextPsl->blockCount, insertIdx); +pslArrayInsert(&chainedPsl->tStarts, chainedPsl->blockCount, nextPsl->tStarts, nextPsl->blockCount, insertIdx); +chainedPsl->blockCount += nextPsl->blockCount; + +// update bounds if needed +if (pslQStrand(chainedPsl) == '+') + { + chainedPsl->qStart = pslQStartStrand(chainedPsl, 0, '+'); + chainedPsl->qEnd = pslQEndStrand(chainedPsl, chainedPsl->blockCount-1, '+'); + } +else + { + chainedPsl->qStart = pslQStartStrand(chainedPsl, chainedPsl->blockCount-1, '+'); + chainedPsl->qEnd = pslQEndStrand(chainedPsl, 0, '+'); + } +assert(pslTStrand(chainedPsl) == '+'); +chainedPsl->tStart = pslTStartStrand(chainedPsl, 0, '+'); +chainedPsl->tEnd = pslTEndStrand(chainedPsl, chainedPsl->blockCount-1, '+'); + +// update counts +chainedPsl->match += nextPsl->match; +chainedPsl->misMatch += nextPsl->misMatch; +chainedPsl->repMatch += nextPsl->repMatch; +chainedPsl->nCount += nextPsl->nCount; +} + +static void chainToLongest(struct psl** queryPsls, + FILE* outPslFh) +/* pull off one or more psls from the list and chain them */ +{ +struct psl* chainedPsl = slPopHead(queryPsls); +struct psl* unchainedPsls = NULL; // ones not included +struct psl* nextPsl; +while ((nextPsl = slPopHead(queryPsls)) != NULL) + { + int insertIdx = findChainPoint(chainedPsl, nextPsl); + if (insertIdx >= 0) + { + addPslToChained(chainedPsl, nextPsl, insertIdx); + pslFree(&nextPsl); + } + else + { + slAddHead(&unchainedPsls, nextPsl); + } + } +pslComputeInsertCounts(chainedPsl); +pslTabOut(chainedPsl, outPslFh); +pslFree(&chainedPsl); +slReverse(&unchainedPsls); // preserve longest to shortest order +*queryPsls = unchainedPsls; +} + +static void chainQuery(struct psl** queryPsls, + FILE* outPslFh) +/* make chains for a single query. Takes over ownership + * of PSLs */ +{ +// sort by genomic location, then pull of a group to +// to chain +slSort(queryPsls, pslCmpTargetAndStrandSpan); +while (*queryPsls != NULL) + chainToLongest(queryPsls, outPslFh); +} + +static void pslMapPostChain(char* inPslFile, + char* outPslFile) +/* do chaining */ +{ +struct hash* pslsByQName = loadPslByQname(inPslFile); +FILE* outPslFh = mustOpen(outPslFile, "w"); +struct hashEl *hel; +struct hashCookie cookie = hashFirst(pslsByQName); +while ((hel = hashNext(&cookie)) != NULL) + { + struct psl** queryPsls = (struct psl**)&hel->val; + chainQuery(queryPsls, outPslFh); + } +carefulClose(&outPslFh); +} + +int main(int argc, char **argv) +/* entry */ +{ +optionInit(&argc, argv, optionSpecs); +if (optionExists("h") || optionExists("help")) + usage(); +if (argc != 3) + usage(); +pslMapPostChain(argv[1], argv[2]); +return 0; +} + +