a97f8a0fffb31a2b3521e573a03f454fb60275d3
markd
  Tue May 30 21:30:12 2023 -0700
Add mechnaism to fix problem with mapping prot-prot via prot-na alignment by explictly specifying the type of alignments

diff --git src/lib/pslTransMap.c src/lib/pslTransMap.c
index 76f8cbb..3661d0f 100644
--- src/lib/pslTransMap.c
+++ src/lib/pslTransMap.c
@@ -10,30 +10,50 @@
 /*
  * Notes:
  *  - This code is used with both large and small mapping psls.  Large
  *    psls used for doing cross-species mappings and small psl are used for
  *    doing protein to mRNA mappings.  This introduces some speed issues.  For
  *    chain mapping, a large amount of time is spent in getBlockMapping()
  *    searching for the starting point of a mapping.  However an optimization
  *    to find the starting point, such as a binKeeper, could be inefficient
  *    for a large number of small psls.  Implementing such an optimzation
  *    would have to be dependent on the type of mapping.  The code was made
  *    16x faster for genome mappings by remembering the current location in
  *    the mapping psl between blocks (iMapBlkPtr arg).  This will do for a
  *    while.
  */
 
+static char *pslTypeDesc[] =
+/* description of pslType */
+{
+    "unspecified",      // pslTypeUnspecified
+    "protein-protein",  // pslTypeProtProt
+    "protein-NA",       // pslTypeProtNa
+    "NA-NA"             // pslTypeNaNa
+};
+
+static boolean pslTypeQueryIsProtein(enum pslType pslType)
+/* is the pslType indicate the query is a protein? */
+{
+return (pslType == pslTypeProtProt) || (pslType == pslTypeProtNa);
+}
+
+static boolean pslTypeTargetIsProtein(enum pslType pslType)
+/* is the pslType indicate the target is a protein? */
+{
+return (pslType == pslTypeProtProt);
+}
 
 struct block
 /* Coordinates of a block, which might be aligned or a gap on one side */
 {
     int qStart;          /* Query start position. */
     int qEnd;            /* Query end position. */
     int tStart;          /* Target start position. */
     int tEnd;            /* Target end position. */
 };
 
 static struct block ZERO_BLOCK = {0, 0, 0, 0};
 
 static void assertBlockAligned(struct block *blk)
 /* Make sure both sides are same size and not negative length.  */
 {
@@ -43,43 +63,68 @@
 }
 
 static int blockIsAligned(struct block *blk)
 /* check that both the query and target side have alignment */
 {
 return (blk->qEnd != 0) && (blk->tEnd != 0); // can start at zero
 }
 
 static int blockLength(struct block *blk)
 /* get length of an aligned block  */
 {
 assertBlockAligned(blk);
 return (blk->qEnd - blk->qStart);
 }
 
-static void pslProtToNA(struct psl *psl)
-/* convert a protein/NA alignment to a NA/NA alignment */
+static void setPslBoundsCounts(struct psl* psl)
+/* set sequences bounds and counts from blocks on a PSL */
 {
+int lastBlk = psl->blockCount-1;
+
+/* set start/end of sequences */
+psl->qStart = psl->qStarts[0];
+psl->qEnd = psl->qStarts[lastBlk] + psl->blockSizes[lastBlk];
+if (pslQStrand(psl) == '-')
+    reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
+
+psl->tStart = psl->tStarts[0];
+psl->tEnd = psl->tStarts[lastBlk] + psl->blockSizes[lastBlk];
+if (pslTStrand(psl) == '-')
+    reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
+
+for (int iBlk = 0; iBlk < psl->blockCount; iBlk++)
+    psl->match += psl->blockSizes[iBlk];
+pslComputeInsertCounts(psl);
+}
+
+static void pslProtToNAConvert(struct psl *psl)
+/* convert a protein/NA or protein/protein alignment to a NA/NA alignment */
+{
+boolean isProtNa = pslIsProtein(psl);
 int iBlk;
 
 psl->qStart *= 3;
 psl->qEnd *= 3;
 psl->qSize *= 3;
 for (iBlk = 0; iBlk < psl->blockCount; iBlk++)
     {
     psl->blockSizes[iBlk] *= 3;
     psl->qStarts[iBlk] *= 3;
+    if (!isProtNa)
+        psl->tStarts[iBlk] *= 3;
     }
+setPslBoundsCounts(psl);
 }
 
 static struct psl* createMappedPsl(struct psl* inPsl, struct psl *mapPsl,
                                    int mappedPslMax)
 /* setup a PSL for the output alignment */
 {
 char strand[3];
 assert(pslTStrand(inPsl) == pslQStrand(mapPsl));
 
 /* strand can be taken from both alignments, since common sequence is in same
  * orientation. */
 strand[0] = pslQStrand(inPsl);
 strand[1] = pslTStrand(mapPsl);
 strand[2] = '\0';
 
@@ -102,51 +147,30 @@
 static void addPslBlock(struct psl* psl, struct block* blk, int* pslMax)
 /* add a block to a psl */
 {
 unsigned newIBlk = psl->blockCount;
 assertBlockAligned(blk);
 
 assert((blk->qEnd - blk->qStart) == (blk->tEnd - blk->tStart));
 if (newIBlk >= *pslMax)
     pslGrow(psl, pslMax);
 psl->qStarts[newIBlk] = blk->qStart;
 psl->tStarts[newIBlk] = blk->tStart;
 psl->blockSizes[newIBlk] = blk->qEnd - blk->qStart;
 psl->blockCount++;
 }
 
-static void setPslBoundsCounts(struct psl* mappedPsl)
-/* set sequences bounds and counts on mapped PSL */
-{
-int lastBlk = mappedPsl->blockCount-1;
-
-/* set start/end of sequences */
-mappedPsl->qStart = mappedPsl->qStarts[0];
-mappedPsl->qEnd = mappedPsl->qStarts[lastBlk] + mappedPsl->blockSizes[lastBlk];
-if (pslQStrand(mappedPsl) == '-')
-    reverseIntRange(&mappedPsl->qStart, &mappedPsl->qEnd, mappedPsl->qSize);
-
-mappedPsl->tStart = mappedPsl->tStarts[0];
-mappedPsl->tEnd = mappedPsl->tStarts[lastBlk] + mappedPsl->blockSizes[lastBlk];
-if (pslTStrand(mappedPsl) == '-')
-    reverseIntRange(&mappedPsl->tStart, &mappedPsl->tEnd, mappedPsl->tSize);
-
-for (int iBlk = 0; iBlk < mappedPsl->blockCount; iBlk++)
-    mappedPsl->match += mappedPsl->blockSizes[iBlk];
-pslComputeInsertCounts(mappedPsl);
-}
-
 static void adjustOrientation(unsigned opts, struct psl *inPsl, char *inPslOrigStrand,
                               struct psl* mappedPsl)
 /* adjust strand, possibly reverse complementing, based on
  * pslTransMapKeepTrans option and input psls. */
 {
 if (!(opts&pslTransMapKeepTrans) || ((inPslOrigStrand[1] == '\0') && (mappedPsl->strand[1] == '\0')))
     {
     /* make untranslated */
     if (pslTStrand(mappedPsl) == '-')
         pslRc(mappedPsl);
     mappedPsl->strand[1] = '\0';  /* implied target strand */
     }
 }
 
 static struct block getBeforeBlockMapping(unsigned mqStart, unsigned mqEnd,
@@ -207,31 +231,30 @@
         {
         *iMapBlkPtr = iBlk;
         return getOverBlockMapping(mqStart, mqEnd, mapPsl->tStarts[iBlk], align1Blk);
         }
     }
 
 /* reached the end of the mRNA->genome alignment, finish off the 
  * rest of the the protein as an insert */
 struct block mappedBlk = ZERO_BLOCK;
 mappedBlk.qStart = align1Blk->qStart;
 mappedBlk.qEnd = align1Blk->qEnd;
 *iMapBlkPtr = iBlk;
 return mappedBlk;
 }
 
-
 static void trimOverlapping(struct psl *mappedPsl, struct block *mappedBlk)
 /* if this block overlaps the previous block, trim accordingly.  Overlaps
  * can be created with protein to DNA PSLs */
 {
 assertBlockAligned(mappedBlk);
 assert(mappedPsl->blockCount > 0);
 
 // use int so we can go negative
 int prevQEnd = pslQEnd(mappedPsl, mappedPsl->blockCount - 1);
 int prevTEnd = pslTEnd(mappedPsl, mappedPsl->blockCount - 1);
 
 // trim, possibly setting to zero-length
 int overAmt = max((prevQEnd - (int)mappedBlk->qStart), (prevTEnd - (int)mappedBlk->tStart));
 if (overAmt < 0)
     overAmt = 0;
@@ -287,57 +310,96 @@
 struct psl* mappedPsl = createMappedPsl(inPsl, mapPsl, mappedPslMax);
 
 /* Fill in ungapped blocks.  */
 for (int iBlock = 0; iBlock < inPsl->blockCount; iBlock++)
     {
     struct block align1Blk = blockFromPslBlock(inPsl, iBlock);
     while (mapBlock(inPsl, mapPsl, &iMapBlk, &align1Blk, mappedPsl,
                     &mappedPslMax))
         continue;
     }
 assert(mappedPsl->blockCount <= mappedPslMax);
 
 return mappedPsl;
 }
 
-struct psl* pslTransMap(unsigned opts, struct psl *inPsl, struct psl *mapPsl)
-/* map a psl via a mapping psl, a single psl is returned, or NULL if it
- * couldn't be mapped. */
+static enum pslType determinePslType(struct psl *psl, enum pslType pslType, char *errDesc)
+/* check the psl type if specified, set if unspecified */
 {
-boolean rcInPsl = (pslTStrand(inPsl) != pslQStrand(mapPsl));
-boolean cnvIn = (pslIsProtein(inPsl) && !pslIsProtein(mapPsl));
-boolean cnvMap = (pslIsProtein(mapPsl) && !pslIsProtein(inPsl));
+boolean isProtNa = pslIsProtein(psl);
+if (pslType == pslTypeUnspecified)
+    return  isProtNa ? pslTypeProtNa : pslTypeNaNa;  // default
+
+// check specified type
+if (isProtNa &&  (pslType != pslTypeProtNa))
+    errAbort("%s PSL %s to %s is a protein to NA alignment, however %s specified",
+             errDesc, psl->qName, psl->tName, pslTypeDesc[pslType]);
+    
+if ((!isProtNa) &&  (pslType == pslTypeProtNa))
+    errAbort("%s PSL %s to %s is not a protein to NA alignment, however %s was specified",
+             errDesc, psl->qName, psl->tName, pslTypeDesc[pslType]);
+return pslType;
+}
 
-/* sanity check size, but allow names to vary to allow ids to have
+static void checkInMapCompat(struct psl *inPsl, enum pslType inPslType,
+                             struct psl *mapPsl, enum pslType mapPslType)
+/* validate input and mapping alignments are compatible types */
+{
+if (pslTypeTargetIsProtein(inPslType) != pslTypeQueryIsProtein(mapPslType))
+    errAbort("input PSL %s to %s type %s is not compatible with mapping PSL %s to %s type %s ",
+             inPsl->qName, inPsl->tName, pslTypeDesc[inPslType],
+             mapPsl->qName, mapPsl->tName, pslTypeDesc[mapPslType]);
+}
+    
+struct psl* pslTransMap(unsigned opts, struct psl *inPsl, enum pslType inPslType,
+                        struct psl *mapPsl, enum pslType mapPslType)
+/* map a psl via a mapping psl, a single psl is returned, or NULL if it
+ * couldn't be mapped. psl types are normally set as pslTypeUnspecified,
+ * and assumed to be NA->NA. */
+{
+/* sanity check size but not name, so names to vary to allow ids to have
  * unique-ifying suffixes. */
 if (inPsl->tSize != mapPsl->qSize)
     errAbort("Error: inPsl %s tSize (%d) != mapPsl %s qSize (%d)",
             inPsl->tName, inPsl->tSize, mapPsl->qName, mapPsl->qSize);
 
+inPslType = determinePslType(inPsl, inPslType, "input PSL");
+mapPslType = determinePslType(mapPsl, mapPslType, "mapping PSL");
+checkInMapCompat(inPsl, inPslType, mapPsl, mapPslType);
+
+// check if need to swap strands
+boolean rcInPsl = (pslTStrand(inPsl) != pslQStrand(mapPsl));
+
+// need to convert all sides of both alignments to a common coordinate space
+// all protein or all NA require no conversions.
+boolean cnvIn = (inPslType == pslTypeProtNa) ||
+    ((inPslType == pslTypeProtProt) && (mapPslType != pslTypeProtProt));
+boolean cnvMap = (mapPslType == pslTypeProtNa) || (cnvIn && (mapPslType == pslTypeProtProt));
+
 /* ensure common sequence is in same orientation and convert protein PSLs */
 char inPslOrigStrand[3];
 safef(inPslOrigStrand, sizeof(inPslOrigStrand), "%s", inPsl->strand);
 if (cnvIn || rcInPsl)
     inPsl = pslClone(inPsl);
 if (cnvIn)
-    pslProtToNA(inPsl);
+    pslProtToNAConvert(inPsl);
 if (rcInPsl)
     pslRc(inPsl);
 if (cnvMap)
     {
     mapPsl = pslClone(mapPsl);
-    pslProtToNA(mapPsl);
+    pslProtToNAConvert(mapPsl);
     }
 
 struct psl* mappedPsl = doMapping(inPsl, mapPsl);
 
 /* finish up psl, or free if no blocks were added */
 if (mappedPsl->blockCount == 0)
     pslFree(&mappedPsl);  /* nothing made it */
 else
     {
     setPslBoundsCounts(mappedPsl);
     adjustOrientation(opts, inPsl, inPslOrigStrand, mappedPsl);
     }
 
 if (cnvIn || rcInPsl)
     pslFree(&inPsl);