5711ac6c2bcaeda814ae1049a0661f499775885e markd Wed Dec 13 00:36:12 2023 -0800 change pslProtToRnaCoords to use common code with pslMap to convert protein/NA to CDS-coordinate/NA alignments. This will correctly remove overlapping blocks after conversion diff --git src/utils/pslProtToRnaCoords/pslProtToRnaCoords.c src/utils/pslProtToRnaCoords/pslProtToRnaCoords.c index 5f8034c..d3bc6f2 100644 --- src/utils/pslProtToRnaCoords/pslProtToRnaCoords.c +++ src/utils/pslProtToRnaCoords/pslProtToRnaCoords.c @@ -1,139 +1,52 @@ /* pslProtToRnaCoords - Convert protein alignments to RNA coordinates. */ #include "common.h" #include "psl.h" +#include "pslTransMap.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort( "pslProtToRnaCoords - Convert protein alignments to RNA coordinates\n" "usage:\n" " pslProtToRnaCoords inPsl outPsl\n" "\n" "Convert either a protein/protein or protein/NA PSL to NA/NA PSL. This\n" "multiplies coordinates and statistics by three. As this can occasionally\n" "results in blocks overlapping, overlap is trimmed as needed.\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; -static void trimOverlapping(bool isProtNaPsl, struct psl *psl, int oBlk) -/* if this block overlaps the previous block, trim accordingly. Overlaps - * can be created with protein to DNA PSLs. */ -// check for overlap; use int so we can go negative -{ -int prevQEnd = pslQEnd(psl, oBlk - 1); -int prevTEnd = pslTEnd(psl, oBlk - 1); - -// trim, possibly setting to zero-length -int overAmt = max((prevQEnd - (int)psl->qStarts[oBlk]), (prevTEnd - (int)psl->tStarts[oBlk])); -if (overAmt < 0) - overAmt = 0; -else if (overAmt > psl->blockSizes[oBlk]) - overAmt = psl->blockSizes[oBlk]; - -psl->qStarts[oBlk] += overAmt; -psl->tStarts[oBlk] += overAmt; -psl->blockSizes[oBlk] -= overAmt; -psl->match -= overAmt; -} - - -static int updateBlk(bool isProtNaPsl, struct psl *psl, int iBlk, int oBlk) -/* Update sizes in a block, adjusting start of block if it now - * overlaps previous block due previous block size expansion. - * return updated oBlk */ -{ -// update to NA coordinates -psl->blockSizes[oBlk] = 3 * psl->blockSizes[iBlk]; -psl->qStarts[oBlk] = 3 * psl->qStarts[iBlk]; -if (isProtNaPsl) - psl->tStarts[oBlk] = psl->tStarts[iBlk]; // already in NA coords -else - psl->tStarts[oBlk] = 3 * psl->tStarts[iBlk]; -psl->match += psl->blockSizes[oBlk]; - -if (oBlk > 0) - trimOverlapping(isProtNaPsl, psl, oBlk); - -// only advance if something left in the block -if (psl->blockSizes[oBlk] > 0) - oBlk++; -return oBlk; -} - -static void pslConvertCoords(struct psl *psl) -/* convert psl coordinates */ -{ -boolean isProtNaPsl = pslIsProtein(psl); -// overall sizes and coordinates -psl->qSize *= 3; -psl->qStart *= 3; -psl->qEnd *= 3; -if (!isProtNaPsl) - { - psl->tSize *= 3; - psl->tStart *= 3; - psl->tEnd *= 3; - } - -/* set counts to be all match, since we don't know actual answer and - * it makes it easier to trim overlapping blocks */ -psl->match = psl->misMatch = psl->repMatch = psl->nCount = 0; - -// blocks -int oBlk = 0; -for (int iBlk = 0; iBlk < psl->blockCount; iBlk += 1) - { - oBlk = updateBlk(isProtNaPsl, psl, iBlk, oBlk); - } -psl->blockCount = oBlk; - -// need to update overall coordinates if last block was full remove due to -// overlap -psl->qStart = psl->qStarts[0]; -psl->qEnd = psl->qStarts[oBlk - 1] + psl->blockSizes[oBlk - 1]; -if (psl->strand[0] == '-') - reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize); -psl->tStart = psl->tStarts[0]; -psl->tEnd = psl->tStarts[oBlk - 1] + psl->blockSizes[oBlk - 1]; -if (psl->strand[0] == '-') - reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize); - -// indels -pslComputeInsertCounts(psl); -} - - static void pslProtToRnaCoords(char *inPsl, char *outPsl) /* pslProtToRnaCoords - Convert protein alignments to RNA coordinates. */ { struct lineFile *inFh = pslFileOpen(inPsl); FILE *outFh = mustOpen(outPsl, "w"); struct psl *psl; while ((psl = pslNext(inFh)) != NULL) { - pslConvertCoords(psl); + pslProtToNAConvert(psl); pslTabOut(psl, outFh); pslFree(&psl); } carefulClose(&outFh); lineFileClose(&inFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); pslProtToRnaCoords(argv[1], argv[2]); return 0; }