a97f8a0fffb31a2b3521e573a03f454fb60275d3 markd Tue May 30 21:30:12 2023 -0700 Add mechnaism to fix problem with mapping prot-prot via prot-na alignment by explictly specifying the type of alignments diff --git src/lib/pslTransMap.c src/lib/pslTransMap.c index 76f8cbb..3661d0f 100644 --- src/lib/pslTransMap.c +++ src/lib/pslTransMap.c @@ -10,30 +10,50 @@ /* * Notes: * - This code is used with both large and small mapping psls. Large * psls used for doing cross-species mappings and small psl are used for * doing protein to mRNA mappings. This introduces some speed issues. For * chain mapping, a large amount of time is spent in getBlockMapping() * searching for the starting point of a mapping. However an optimization * to find the starting point, such as a binKeeper, could be inefficient * for a large number of small psls. Implementing such an optimzation * would have to be dependent on the type of mapping. The code was made * 16x faster for genome mappings by remembering the current location in * the mapping psl between blocks (iMapBlkPtr arg). This will do for a * while. */ +static char *pslTypeDesc[] = +/* description of pslType */ +{ + "unspecified", // pslTypeUnspecified + "protein-protein", // pslTypeProtProt + "protein-NA", // pslTypeProtNa + "NA-NA" // pslTypeNaNa +}; + +static boolean pslTypeQueryIsProtein(enum pslType pslType) +/* is the pslType indicate the query is a protein? */ +{ +return (pslType == pslTypeProtProt) || (pslType == pslTypeProtNa); +} + +static boolean pslTypeTargetIsProtein(enum pslType pslType) +/* is the pslType indicate the target is a protein? */ +{ +return (pslType == pslTypeProtProt); +} struct block /* Coordinates of a block, which might be aligned or a gap on one side */ { int qStart; /* Query start position. */ int qEnd; /* Query end position. */ int tStart; /* Target start position. */ int tEnd; /* Target end position. */ }; static struct block ZERO_BLOCK = {0, 0, 0, 0}; static void assertBlockAligned(struct block *blk) /* Make sure both sides are same size and not negative length. */ { @@ -43,43 +63,68 @@ } static int blockIsAligned(struct block *blk) /* check that both the query and target side have alignment */ { return (blk->qEnd != 0) && (blk->tEnd != 0); // can start at zero } static int blockLength(struct block *blk) /* get length of an aligned block */ { assertBlockAligned(blk); return (blk->qEnd - blk->qStart); } -static void pslProtToNA(struct psl *psl) -/* convert a protein/NA alignment to a NA/NA alignment */ +static void setPslBoundsCounts(struct psl* psl) +/* set sequences bounds and counts from blocks on a PSL */ { +int lastBlk = psl->blockCount-1; + +/* set start/end of sequences */ +psl->qStart = psl->qStarts[0]; +psl->qEnd = psl->qStarts[lastBlk] + psl->blockSizes[lastBlk]; +if (pslQStrand(psl) == '-') + reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize); + +psl->tStart = psl->tStarts[0]; +psl->tEnd = psl->tStarts[lastBlk] + psl->blockSizes[lastBlk]; +if (pslTStrand(psl) == '-') + reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize); + +for (int iBlk = 0; iBlk < psl->blockCount; iBlk++) + psl->match += psl->blockSizes[iBlk]; +pslComputeInsertCounts(psl); +} + +static void pslProtToNAConvert(struct psl *psl) +/* convert a protein/NA or protein/protein alignment to a NA/NA alignment */ +{ +boolean isProtNa = pslIsProtein(psl); int iBlk; psl->qStart *= 3; psl->qEnd *= 3; psl->qSize *= 3; for (iBlk = 0; iBlk < psl->blockCount; iBlk++) { psl->blockSizes[iBlk] *= 3; psl->qStarts[iBlk] *= 3; + if (!isProtNa) + psl->tStarts[iBlk] *= 3; } +setPslBoundsCounts(psl); } static struct psl* createMappedPsl(struct psl* inPsl, struct psl *mapPsl, int mappedPslMax) /* setup a PSL for the output alignment */ { char strand[3]; assert(pslTStrand(inPsl) == pslQStrand(mapPsl)); /* strand can be taken from both alignments, since common sequence is in same * orientation. */ strand[0] = pslQStrand(inPsl); strand[1] = pslTStrand(mapPsl); strand[2] = '\0'; @@ -102,51 +147,30 @@ static void addPslBlock(struct psl* psl, struct block* blk, int* pslMax) /* add a block to a psl */ { unsigned newIBlk = psl->blockCount; assertBlockAligned(blk); assert((blk->qEnd - blk->qStart) == (blk->tEnd - blk->tStart)); if (newIBlk >= *pslMax) pslGrow(psl, pslMax); psl->qStarts[newIBlk] = blk->qStart; psl->tStarts[newIBlk] = blk->tStart; psl->blockSizes[newIBlk] = blk->qEnd - blk->qStart; psl->blockCount++; } -static void setPslBoundsCounts(struct psl* mappedPsl) -/* set sequences bounds and counts on mapped PSL */ -{ -int lastBlk = mappedPsl->blockCount-1; - -/* set start/end of sequences */ -mappedPsl->qStart = mappedPsl->qStarts[0]; -mappedPsl->qEnd = mappedPsl->qStarts[lastBlk] + mappedPsl->blockSizes[lastBlk]; -if (pslQStrand(mappedPsl) == '-') - reverseIntRange(&mappedPsl->qStart, &mappedPsl->qEnd, mappedPsl->qSize); - -mappedPsl->tStart = mappedPsl->tStarts[0]; -mappedPsl->tEnd = mappedPsl->tStarts[lastBlk] + mappedPsl->blockSizes[lastBlk]; -if (pslTStrand(mappedPsl) == '-') - reverseIntRange(&mappedPsl->tStart, &mappedPsl->tEnd, mappedPsl->tSize); - -for (int iBlk = 0; iBlk < mappedPsl->blockCount; iBlk++) - mappedPsl->match += mappedPsl->blockSizes[iBlk]; -pslComputeInsertCounts(mappedPsl); -} - static void adjustOrientation(unsigned opts, struct psl *inPsl, char *inPslOrigStrand, struct psl* mappedPsl) /* adjust strand, possibly reverse complementing, based on * pslTransMapKeepTrans option and input psls. */ { if (!(opts&pslTransMapKeepTrans) || ((inPslOrigStrand[1] == '\0') && (mappedPsl->strand[1] == '\0'))) { /* make untranslated */ if (pslTStrand(mappedPsl) == '-') pslRc(mappedPsl); mappedPsl->strand[1] = '\0'; /* implied target strand */ } } static struct block getBeforeBlockMapping(unsigned mqStart, unsigned mqEnd, @@ -207,31 +231,30 @@ { *iMapBlkPtr = iBlk; return getOverBlockMapping(mqStart, mqEnd, mapPsl->tStarts[iBlk], align1Blk); } } /* reached the end of the mRNA->genome alignment, finish off the * rest of the the protein as an insert */ struct block mappedBlk = ZERO_BLOCK; mappedBlk.qStart = align1Blk->qStart; mappedBlk.qEnd = align1Blk->qEnd; *iMapBlkPtr = iBlk; return mappedBlk; } - static void trimOverlapping(struct psl *mappedPsl, struct block *mappedBlk) /* if this block overlaps the previous block, trim accordingly. Overlaps * can be created with protein to DNA PSLs */ { assertBlockAligned(mappedBlk); assert(mappedPsl->blockCount > 0); // use int so we can go negative int prevQEnd = pslQEnd(mappedPsl, mappedPsl->blockCount - 1); int prevTEnd = pslTEnd(mappedPsl, mappedPsl->blockCount - 1); // trim, possibly setting to zero-length int overAmt = max((prevQEnd - (int)mappedBlk->qStart), (prevTEnd - (int)mappedBlk->tStart)); if (overAmt < 0) overAmt = 0; @@ -287,57 +310,96 @@ struct psl* mappedPsl = createMappedPsl(inPsl, mapPsl, mappedPslMax); /* Fill in ungapped blocks. */ for (int iBlock = 0; iBlock < inPsl->blockCount; iBlock++) { struct block align1Blk = blockFromPslBlock(inPsl, iBlock); while (mapBlock(inPsl, mapPsl, &iMapBlk, &align1Blk, mappedPsl, &mappedPslMax)) continue; } assert(mappedPsl->blockCount <= mappedPslMax); return mappedPsl; } -struct psl* pslTransMap(unsigned opts, struct psl *inPsl, struct psl *mapPsl) -/* map a psl via a mapping psl, a single psl is returned, or NULL if it - * couldn't be mapped. */ +static enum pslType determinePslType(struct psl *psl, enum pslType pslType, char *errDesc) +/* check the psl type if specified, set if unspecified */ { -boolean rcInPsl = (pslTStrand(inPsl) != pslQStrand(mapPsl)); -boolean cnvIn = (pslIsProtein(inPsl) && !pslIsProtein(mapPsl)); -boolean cnvMap = (pslIsProtein(mapPsl) && !pslIsProtein(inPsl)); +boolean isProtNa = pslIsProtein(psl); +if (pslType == pslTypeUnspecified) + return isProtNa ? pslTypeProtNa : pslTypeNaNa; // default + +// check specified type +if (isProtNa && (pslType != pslTypeProtNa)) + errAbort("%s PSL %s to %s is a protein to NA alignment, however %s specified", + errDesc, psl->qName, psl->tName, pslTypeDesc[pslType]); + +if ((!isProtNa) && (pslType == pslTypeProtNa)) + errAbort("%s PSL %s to %s is not a protein to NA alignment, however %s was specified", + errDesc, psl->qName, psl->tName, pslTypeDesc[pslType]); +return pslType; +} -/* sanity check size, but allow names to vary to allow ids to have +static void checkInMapCompat(struct psl *inPsl, enum pslType inPslType, + struct psl *mapPsl, enum pslType mapPslType) +/* validate input and mapping alignments are compatible types */ +{ +if (pslTypeTargetIsProtein(inPslType) != pslTypeQueryIsProtein(mapPslType)) + errAbort("input PSL %s to %s type %s is not compatible with mapping PSL %s to %s type %s ", + inPsl->qName, inPsl->tName, pslTypeDesc[inPslType], + mapPsl->qName, mapPsl->tName, pslTypeDesc[mapPslType]); +} + +struct psl* pslTransMap(unsigned opts, struct psl *inPsl, enum pslType inPslType, + struct psl *mapPsl, enum pslType mapPslType) +/* map a psl via a mapping psl, a single psl is returned, or NULL if it + * couldn't be mapped. psl types are normally set as pslTypeUnspecified, + * and assumed to be NA->NA. */ +{ +/* sanity check size but not name, so names to vary to allow ids to have * unique-ifying suffixes. */ if (inPsl->tSize != mapPsl->qSize) errAbort("Error: inPsl %s tSize (%d) != mapPsl %s qSize (%d)", inPsl->tName, inPsl->tSize, mapPsl->qName, mapPsl->qSize); +inPslType = determinePslType(inPsl, inPslType, "input PSL"); +mapPslType = determinePslType(mapPsl, mapPslType, "mapping PSL"); +checkInMapCompat(inPsl, inPslType, mapPsl, mapPslType); + +// check if need to swap strands +boolean rcInPsl = (pslTStrand(inPsl) != pslQStrand(mapPsl)); + +// need to convert all sides of both alignments to a common coordinate space +// all protein or all NA require no conversions. +boolean cnvIn = (inPslType == pslTypeProtNa) || + ((inPslType == pslTypeProtProt) && (mapPslType != pslTypeProtProt)); +boolean cnvMap = (mapPslType == pslTypeProtNa) || (cnvIn && (mapPslType == pslTypeProtProt)); + /* ensure common sequence is in same orientation and convert protein PSLs */ char inPslOrigStrand[3]; safef(inPslOrigStrand, sizeof(inPslOrigStrand), "%s", inPsl->strand); if (cnvIn || rcInPsl) inPsl = pslClone(inPsl); if (cnvIn) - pslProtToNA(inPsl); + pslProtToNAConvert(inPsl); if (rcInPsl) pslRc(inPsl); if (cnvMap) { mapPsl = pslClone(mapPsl); - pslProtToNA(mapPsl); + pslProtToNAConvert(mapPsl); } struct psl* mappedPsl = doMapping(inPsl, mapPsl); /* finish up psl, or free if no blocks were added */ if (mappedPsl->blockCount == 0) pslFree(&mappedPsl); /* nothing made it */ else { setPslBoundsCounts(mappedPsl); adjustOrientation(opts, inPsl, inPslOrigStrand, mappedPsl); } if (cnvIn || rcInPsl) pslFree(&inPsl);