02336754147822f5aa61ba13277123b2cc629001 markd Thu May 20 08:38:55 2021 -0700 Moved pslMap, pslMapPostChain, pslRc, pslSwap to src/utils, as they do not have hg/lib dependencies. diff --git src/hg/utils/pslMapPostChain/pslMapPostChain.c src/hg/utils/pslMapPostChain/pslMapPostChain.c deleted file mode 100644 index 13e6d74..0000000 --- src/hg/utils/pslMapPostChain/pslMapPostChain.c +++ /dev/null @@ -1,323 +0,0 @@ -/* pslMapPostChain - Post genomic pslMap (TransMap) chaining. */ - -#include "common.h" -#include "linefile.h" -#include "options.h" -#include "hash.h" -#include "psl.h" -#include <string.h> - -void usage() -/* Explain usage and exit. */ -{ -errAbort("pslMapPostChain - Post genomic pslMap (TransMap) chaining.\n" - "usage:\n" - " pslMapPostChain [options] inPsl outPsl\n" - "\n" - "Post genomic pslMap (TransMap) chaining. This takes transcripts\n" - "that have been mapped via genomic chains adds back in\n" - "blocks that didn't get include in genomic chains due\n" - "to complex rearrangements or other issues.\n" - "\n" - "This program has not seen much use and may not do what you want\n"); -} - -static struct optionSpec optionSpecs[] = -{ - {"h", OPTION_BOOLEAN}, - {"help", OPTION_BOOLEAN}, - {NULL, 0} -}; -/* max distance to attempt to chain over */ -static int maxAdjacentDistance = 50*1000*1000; // 50mb - -/* special status used to in place of chain */ -static const int CHAIN_NOT_HERE = -1; -static const int CHAIN_CAN_NOT = -2; - -static struct hash* loadPslByQname(char* inPslFile) -/* load PSLs in to hash by qName. Make sure target strand is positive - * to make process easier later. */ -{ -struct hash* pslsByQName = hashNew(0); -struct psl *psls = pslLoadAll(inPslFile); -struct psl *psl; -while ((psl = slPopHead(&psls)) != NULL) - { - if (pslTStrand(psl) != '+') - pslRc(psl); - struct hashEl *hel = hashStore(pslsByQName, psl->qName); - struct psl** queryPsls = (struct psl**)&hel->val; - slAddHead(queryPsls, psl); - } -return pslsByQName; -} - -static int pslTSpan(const struct psl* psl) -/* get target span */ -{ -return psl->tEnd - psl->tStart; -} - -static int pslCmpTargetAndStrandSpan(const void *va, const void *vb) -/* Compare to sort based on target, strand, tStart, and then span */ -{ -const struct psl *a = *((struct psl **)va); -const struct psl *b = *((struct psl **)vb); -int dif; -dif = strcmp(a->tName, b->tName); -if (dif == 0) - dif = strcmp(a->strand, b->strand); -if (dif == 0) - dif = -(pslTSpan(a) - pslTSpan(b)); -return dif; -} - -static char normStrand(char strand) -/* return strand as stored in psl, converting implicit `\0' to `+' */ -{ -return (strand == '\0') ? '+' : strand; -} - -static unsigned pslQStartStrand(struct psl *psl, int blkIdx, char strand) -/* return query start for the given block, mapped to specified strand, - * which can be `\0' for `+' */ -{ -if (psl->strand[0] == normStrand(strand)) - return psl->qStarts[blkIdx]; -else - return psl->qSize - pslQEnd(psl, blkIdx); -} - -static unsigned pslQEndStrand(struct psl *psl, int blkIdx, char strand) -/* return query end for the given block, mapped to specified strand, - * which can be `\0' for `+' */ -{ -if (psl->strand[0] == normStrand(strand)) - return pslQEnd(psl, blkIdx); -else - return psl->qSize - pslQStart(psl, blkIdx); -} - -static unsigned pslTStartStrand(struct psl *psl, int blkIdx, char strand) -/* return target start for the given block, mapped to specified strand, - * which can be `\0' for `+' */ -{ -if (normStrand(psl->strand[1]) == normStrand(strand)) - return psl->tStarts[blkIdx]; -else - return psl->tSize - pslTEnd(psl, blkIdx); -} - -static unsigned pslTEndStrand(struct psl *psl, int blkIdx, char strand) -/* return target end for the given block, mapped to specified strand, - * which can be `\0' for `+' */ -{ -if (normStrand(psl->strand[1]) == normStrand(strand)) - return pslTEnd(psl, blkIdx); -else - return psl->tSize - pslTStart(psl, blkIdx); -} - -static int findChainPointUpstream(struct psl* chainedPsl, - struct psl* nextPsl) -/* findChainPoint upstream check. */ -{ -// check next being query upstream of chain -if ((pslQEnd(nextPsl, nextPsl->blockCount-1) <= pslQStart(chainedPsl, 0)) - && (pslTEnd(nextPsl, nextPsl->blockCount-1) <= pslTStart(chainedPsl, 0))) - { - if ((pslTStart(chainedPsl, 0) - pslTEnd(nextPsl, nextPsl->blockCount-1)) > maxAdjacentDistance) - return CHAIN_CAN_NOT; // too far before - else - return 0; - } -else - return CHAIN_NOT_HERE; -} - -static int findChainPointDownstream(struct psl* chainedPsl, - struct psl* nextPsl) -/* findChainPoint downstream check. */ -{ -if ((pslQStart(nextPsl, 0) >= pslQEnd(chainedPsl, chainedPsl->blockCount-1)) - && (pslTStart(nextPsl, 0) >= pslTEnd(chainedPsl, chainedPsl->blockCount-1))) - { - if ((pslTStart(nextPsl, 0) - pslTEnd(chainedPsl, chainedPsl->blockCount-1)) > maxAdjacentDistance) - return CHAIN_CAN_NOT; // too far after - else - return chainedPsl->blockCount; - } -else - return CHAIN_NOT_HERE; -} - -static int isBetweenBlocks(struct psl* chainedPsl, - struct psl* nextPsl, - int insertIdx) -/* does nextPsl fit between two chained PSL blocks */ -{ -return (pslQEnd(chainedPsl, insertIdx-1) <= pslQStart(nextPsl, 0)) - && (pslTEnd(chainedPsl, insertIdx-1) <= pslTStart(nextPsl, 0)) - && (pslQEnd(nextPsl, nextPsl->blockCount-1) <= pslQStart(chainedPsl, insertIdx)) - && (pslTEnd(nextPsl, nextPsl->blockCount-1) <= pslTStart(chainedPsl, insertIdx)); -} - -static int findChainPointInternal(struct psl* chainedPsl, - struct psl* nextPsl) -/* find internal insertion point between block of chainedPsl. */ -{ -int insertIdx; -for (insertIdx = 1; insertIdx < chainedPsl->blockCount; insertIdx++) - { - if (isBetweenBlocks(chainedPsl, nextPsl, insertIdx)) - return insertIdx; - } -return CHAIN_CAN_NOT; -} - -static int findChainPoint(struct psl* chainedPsl, - struct psl* nextPsl) -/* Return the position in the chained PSL where a PSL can be added. This - * returns the block-index of where to insert the PSL, blockCount if to added - * after, or one of the negative constants if it should not be chained. - * assumes it chainedPsl has longest target span so we only look for insert - * in chainPsl and not the other way around. */ -{ -if (!(sameString(chainedPsl->tName, nextPsl->tName) - && sameString(chainedPsl->strand, nextPsl->strand))) - return CHAIN_CAN_NOT; // not on same seq/strand -int insertIdx = findChainPointUpstream(chainedPsl, nextPsl); -if (insertIdx != CHAIN_NOT_HERE) - return insertIdx; -insertIdx = findChainPointDownstream(chainedPsl, nextPsl); -if (insertIdx != CHAIN_NOT_HERE) - return insertIdx; -insertIdx = findChainPointInternal(chainedPsl, nextPsl); -return insertIdx; -} - -static void pslArrayInsert(unsigned **destArray, - int numDestBlocks, - unsigned *srcArray, - int numSrcBlocks, - int insertIdx) -/* Grow one of the psl arrays and move contents down - * to make room, and copy new entries */ -{ -*destArray = needMoreMem(*destArray, numDestBlocks*sizeof(unsigned), (numDestBlocks+numSrcBlocks)*sizeof(unsigned)); -int iBlk; -for (iBlk = numDestBlocks-1; iBlk >= insertIdx; iBlk--) - { - assert(iBlk+numSrcBlocks < numDestBlocks+numSrcBlocks); - (*destArray)[iBlk+numSrcBlocks] = (*destArray)[iBlk]; - } -for (iBlk = 0; iBlk < numSrcBlocks; iBlk++) - { - assert(iBlk+insertIdx < numDestBlocks+numSrcBlocks); - (*destArray)[iBlk+insertIdx] = srcArray[iBlk]; - } -} - -static void addPslToChained(struct psl* chainedPsl, - struct psl* nextPsl, - int insertIdx) -/* add blocks form a psl to the chained psl at the given point */ -{ -assert(insertIdx <= chainedPsl->blockCount); -// expand arrays and copy in entries -pslArrayInsert(&chainedPsl->blockSizes, chainedPsl->blockCount, nextPsl->blockSizes, nextPsl->blockCount, insertIdx); -pslArrayInsert(&chainedPsl->qStarts, chainedPsl->blockCount, nextPsl->qStarts, nextPsl->blockCount, insertIdx); -pslArrayInsert(&chainedPsl->tStarts, chainedPsl->blockCount, nextPsl->tStarts, nextPsl->blockCount, insertIdx); -chainedPsl->blockCount += nextPsl->blockCount; - -// update bounds if needed -if (pslQStrand(chainedPsl) == '+') - { - chainedPsl->qStart = pslQStartStrand(chainedPsl, 0, '+'); - chainedPsl->qEnd = pslQEndStrand(chainedPsl, chainedPsl->blockCount-1, '+'); - } -else - { - chainedPsl->qStart = pslQStartStrand(chainedPsl, chainedPsl->blockCount-1, '+'); - chainedPsl->qEnd = pslQEndStrand(chainedPsl, 0, '+'); - } -assert(pslTStrand(chainedPsl) == '+'); -chainedPsl->tStart = pslTStartStrand(chainedPsl, 0, '+'); -chainedPsl->tEnd = pslTEndStrand(chainedPsl, chainedPsl->blockCount-1, '+'); - -// update counts -chainedPsl->match += nextPsl->match; -chainedPsl->misMatch += nextPsl->misMatch; -chainedPsl->repMatch += nextPsl->repMatch; -chainedPsl->nCount += nextPsl->nCount; -} - -static void chainToLongest(struct psl** queryPsls, - FILE* outPslFh) -/* pull off one or more psls from the list and chain them */ -{ -struct psl* chainedPsl = slPopHead(queryPsls); -struct psl* unchainedPsls = NULL; // ones not included -struct psl* nextPsl; -while ((nextPsl = slPopHead(queryPsls)) != NULL) - { - int insertIdx = findChainPoint(chainedPsl, nextPsl); - if (insertIdx >= 0) - { - addPslToChained(chainedPsl, nextPsl, insertIdx); - pslFree(&nextPsl); - } - else - { - slAddHead(&unchainedPsls, nextPsl); - } - } -pslComputeInsertCounts(chainedPsl); -pslTabOut(chainedPsl, outPslFh); -pslFree(&chainedPsl); -slReverse(&unchainedPsls); // preserve longest to shortest order -*queryPsls = unchainedPsls; -} - -static void chainQuery(struct psl** queryPsls, - FILE* outPslFh) -/* make chains for a single query. Takes over ownership - * of PSLs */ -{ -// sort by genomic location, then pull of a group to -// to chain -slSort(queryPsls, pslCmpTargetAndStrandSpan); -while (*queryPsls != NULL) - chainToLongest(queryPsls, outPslFh); -} - -static void pslMapPostChain(char* inPslFile, - char* outPslFile) -/* do chaining */ -{ -struct hash* pslsByQName = loadPslByQname(inPslFile); -FILE* outPslFh = mustOpen(outPslFile, "w"); -struct hashEl *hel; -struct hashCookie cookie = hashFirst(pslsByQName); -while ((hel = hashNext(&cookie)) != NULL) - { - struct psl** queryPsls = (struct psl**)&hel->val; - chainQuery(queryPsls, outPslFh); - } -carefulClose(&outPslFh); -} - -int main(int argc, char **argv) -/* entry */ -{ -optionInit(&argc, argv, optionSpecs); -if (optionExists("h") || optionExists("help")) - usage(); -if (argc != 3) - usage(); -pslMapPostChain(argv[1], argv[2]); -return 0; -} - -