442e76e6f50e9c79f3dde2964d6f754511908e44
markd
  Sun May 14 17:32:02 2023 -0700
trim overlapping blocks as can happen when mapping protein to DNA alignments

diff --git src/lib/pslTransMap.c src/lib/pslTransMap.c
index eebd4f3..555140e 100644
--- src/lib/pslTransMap.c
+++ src/lib/pslTransMap.c
@@ -12,38 +12,61 @@
  *  - This code is used with both large and small mapping psls.  Large
  *    psls used for doing cross-species mappings and small psl are used for
  *    doing protein to mRNA mappings.  This introduces some speed issues.  For
  *    chain mapping, a large amount of time is spent in getBlockMapping()
  *    searching for the starting point of a mapping.  However an optimization
  *    to find the starting point, such as a binKeeper, could be inefficient
  *    for a large number of small psls.  Implementing such an optimzation
  *    would have to be dependent on the type of mapping.  The code was made
  *    16x faster for genome mappings by remembering the current location in
  *    the mapping psl between blocks (iMapBlkPtr arg).  This will do for a
  *    while.
  */
 
 
 struct block
-/* coordinates of a block */
+/* Coordinates of a block, which might be aligned or a gap on one side */
 {
     int qStart;          /* Query start position. */
     int qEnd;            /* Query end position. */
     int tStart;          /* Target start position. */
     int tEnd;            /* Target end position. */
 };
 
+static struct block ZERO_BLOCK = {0, 0, 0, 0};
+
+static void assertBlockAligned(struct block *blk)
+/* Make sure both sides are same size and not negative length.  */
+{
+assert(blk->qStart <= blk->qEnd);
+assert(blk->tStart <= blk->tEnd);
+assert((blk->qEnd - blk->qStart) == (blk->tEnd - blk->tStart));
+}
+
+static int blockIsAligned(struct block *blk)
+/* check that both the query and target side have alignment */
+{
+return (blk->qEnd != 0) && (blk->tEnd != 0); // can start at zero
+}
+
+static int blockLength(struct block *blk)
+/* get length of an aligned block  */
+{
+assertBlockAligned(blk);
+return (blk->qEnd - blk->qStart);
+}
+
 static void pslProtToNA(struct psl *psl)
 /* convert a protein/NA alignment to a NA/NA alignment */
 {
 int iBlk;
 
 psl->qStart *= 3;
 psl->qEnd *= 3;
 psl->qSize *= 3;
 for (iBlk = 0; iBlk < psl->blockCount; iBlk++)
     {
     psl->blockSizes[iBlk] *= 3;
     psl->qStarts[iBlk] *= 3;
     }
 }
 
@@ -83,244 +106,260 @@
 static struct block blockFromPslBlock(struct psl* psl, int iBlock)
 /* fill in a block object from a psl block */
 {
 struct block block;
 block.qStart = psl->qStarts[iBlock];
 block.qEnd = psl->qStarts[iBlock] + psl->blockSizes[iBlock];
 block.tStart = psl->tStarts[iBlock];
 block.tEnd = psl->tStarts[iBlock] + psl->blockSizes[iBlock];
 return block;
 }
 
 static void addPslBlock(struct psl* psl, struct block* blk, int* pslMax)
 /* add a block to a psl */
 {
 unsigned newIBlk = psl->blockCount;
+assertBlockAligned(blk);
 
 assert((blk->qEnd - blk->qStart) == (blk->tEnd - blk->tStart));
 if (newIBlk >= *pslMax)
     pslGrow(psl, pslMax);
 psl->qStarts[newIBlk] = blk->qStart;
 psl->tStarts[newIBlk] = blk->tStart;
 psl->blockSizes[newIBlk] = blk->qEnd - blk->qStart;
-/* lie about match counts. */
-psl->match += psl->blockSizes[newIBlk];
-/* count gaps */
-if (newIBlk > 0)
-    {
-    if (psl->qStarts[newIBlk] > pslQEnd(psl, newIBlk-1))
-        {
-        psl->qNumInsert++;
-        psl->qBaseInsert += psl->qStarts[newIBlk] - pslQEnd(psl, newIBlk-1);
-        }
-    if (psl->tStarts[newIBlk] > pslTEnd(psl, newIBlk-1))
-        {
-        psl->tNumInsert++;
-        psl->tBaseInsert += psl->tStarts[newIBlk] - pslTEnd(psl, newIBlk-1);
-        }
-    }
 psl->blockCount++;
 }
 
-static void setPslBounds(struct psl* mappedPsl)
-/* set sequences bounds on mapped PSL */
+static void setPslBoundsCounts(struct psl* mappedPsl)
+/* set sequences bounds and counts on mapped PSL */
 {
 int lastBlk = mappedPsl->blockCount-1;
 
 /* set start/end of sequences */
 mappedPsl->qStart = mappedPsl->qStarts[0];
 mappedPsl->qEnd = mappedPsl->qStarts[lastBlk] + mappedPsl->blockSizes[lastBlk];
 if (pslQStrand(mappedPsl) == '-')
     reverseIntRange(&mappedPsl->qStart, &mappedPsl->qEnd, mappedPsl->qSize);
 
 mappedPsl->tStart = mappedPsl->tStarts[0];
 mappedPsl->tEnd = mappedPsl->tStarts[lastBlk] + mappedPsl->blockSizes[lastBlk];
 if (pslTStrand(mappedPsl) == '-')
     reverseIntRange(&mappedPsl->tStart, &mappedPsl->tEnd, mappedPsl->tSize);
+
+for (int iBlk = 0; iBlk < mappedPsl->blockCount; iBlk++)
+    mappedPsl->match += mappedPsl->blockSizes[iBlk];
+pslComputeInsertCounts(mappedPsl);
 }
 
 static void adjustOrientation(unsigned opts, struct psl *inPsl, char *inPslOrigStrand,
                               struct psl* mappedPsl)
 /* adjust strand, possibly reverse complementing, based on
  * pslTransMapKeepTrans option and input psls. */
 {
 if (!(opts&pslTransMapKeepTrans) || ((inPslOrigStrand[1] == '\0') && (mappedPsl->strand[1] == '\0')))
     {
     /* make untranslated */
     if (pslTStrand(mappedPsl) == '-')
         pslRc(mappedPsl);
     mappedPsl->strand[1] = '\0';  /* implied target strand */
     }
 }
 
 static struct block getBeforeBlockMapping(unsigned mqStart, unsigned mqEnd,
                                           struct block* align1Blk)
 /* map part of an ungapped psl block that occurs before a mapPsl block */
 {
-struct block mappedBlk;
-
 /* mRNA start in genomic gap before this block, this will
  * be an inPsl insert */
 unsigned size = (align1Blk->tEnd < mqStart)
     ? (align1Blk->tEnd - align1Blk->tStart)
     : (mqStart - align1Blk->tStart);
-ZeroVar(&mappedBlk);
+struct block mappedBlk = ZERO_BLOCK;
 mappedBlk.qStart = align1Blk->qStart;
 mappedBlk.qEnd = align1Blk->qStart + size;
 return mappedBlk;
 }
 
 static struct block getOverBlockMapping(unsigned mqStart, unsigned mqEnd,
                                         unsigned mtStart, struct block* align1Blk)
 /* map part of an ungapped psl block that overlapps a mapPsl block. */
 {
-struct block mappedBlk;
-
 /* common sequence start contained in this block, this handles aligned
  * and genomic inserts */
 unsigned off = align1Blk->tStart - mqStart;
 unsigned size = (align1Blk->tEnd > mqEnd)
     ? (mqEnd - align1Blk->tStart)
     : (align1Blk->tEnd - align1Blk->tStart);
-ZeroVar(&mappedBlk);
+struct block mappedBlk = ZERO_BLOCK;
 mappedBlk.qStart = align1Blk->qStart;
 mappedBlk.qEnd = align1Blk->qStart + size;
 mappedBlk.tStart = mtStart + off;
 mappedBlk.tEnd = mtStart + off + size;
 return mappedBlk;
 }
 
 static struct block getBlockMapping(struct psl* inPsl, struct psl *mapPsl,
                                     int *iMapBlkPtr, struct block* align1Blk)
 /* Map part or all of a ungapped psl block to the genome.  This returns the
  * coordinates of the sub-block starting at the beginning of the align1Blk.
  * If this is a gap, either the target or query coords are zero.  This works
  * in genomic strand space.  The search starts at the specified map block,
  * which is updated to prevent rescanning large psls.
  */
 {
-int iBlk;
-struct block mappedBlk;
 
 /* search for block or gap containing start of mrna block */
-for (iBlk = *iMapBlkPtr; iBlk < mapPsl->blockCount; iBlk++)
+int iBlk = *iMapBlkPtr;
+for (; iBlk < mapPsl->blockCount; iBlk++)
     {
     unsigned mqStart = mapPsl->qStarts[iBlk];
     unsigned mqEnd = mapPsl->qStarts[iBlk]+mapPsl->blockSizes[iBlk];
     if (align1Blk->tStart < mqStart)
         {
         *iMapBlkPtr = iBlk;
         return getBeforeBlockMapping(mqStart, mqEnd, align1Blk);
         }
     if ((align1Blk->tStart >= mqStart) && (align1Blk->tStart < mqEnd))
         {
         *iMapBlkPtr = iBlk;
         return getOverBlockMapping(mqStart, mqEnd, mapPsl->tStarts[iBlk], align1Blk);
         }
     }
 
 /* reached the end of the mRNA->genome alignment, finish off the 
  * rest of the the protein as an insert */
-ZeroVar(&mappedBlk);
+struct block mappedBlk = ZERO_BLOCK;
 mappedBlk.qStart = align1Blk->qStart;
 mappedBlk.qEnd = align1Blk->qEnd;
 *iMapBlkPtr = iBlk;
 return mappedBlk;
 }
 
+
+static void trimOverlapping(struct psl *mappedPsl, struct block *mappedBlk)
+/* if this block overlaps the previous block, trim accordingly.  Overlaps
+ * can be created with protein to DNA PSLs */
+{
+assertBlockAligned(mappedBlk);
+assert(mappedPsl->blockCount > 0);
+
+// use int so we can go negative
+int prevQEnd = pslQEnd(mappedPsl, mappedPsl->blockCount - 1);
+int prevTEnd = pslTEnd(mappedPsl, mappedPsl->blockCount - 1);
+
+// trim, possibly setting to zero-length
+int overAmt = max((prevQEnd - mappedBlk->qStart), (prevTEnd - mappedBlk->tStart));
+if (overAmt < 0)
+    overAmt = 0;
+else if (overAmt > blockLength(mappedBlk))
+    overAmt = blockLength(mappedBlk);
+
+mappedBlk->qStart += overAmt;
+mappedBlk->tStart += overAmt;
+}
+   
 static boolean mapBlock(struct psl *inPsl, struct psl *mapPsl, int *iMapBlkPtr,
                         struct block *align1Blk, struct psl* mappedPsl,
                         int* mappedPslMax)
 /* Add a PSL block from a ungapped portion of an alignment, mapping it to the
  * genome.  If the started of the inPsl block maps to a part of the mapPsl
  * that is aligned, it is added as a PSL block, otherwise the gap is skipped.
  * Block starts are adjusted for next call.  Return FALSE if the end of the
  * alignment is reached. */
 {
-struct block mappedBlk;
-unsigned size;
 assert(align1Blk->qStart <= align1Blk->qEnd);
 assert(align1Blk->tStart <= align1Blk->tEnd);
 assert((align1Blk->qEnd - align1Blk->qStart) == (align1Blk->tEnd - align1Blk->tStart));
 
 if ((align1Blk->qStart >= align1Blk->qEnd) || (align1Blk->tStart >= align1Blk->tEnd))
     return FALSE;  /* end of ungapped block. */
 
 /* find block or gap with start coordinates of mrna */
-mappedBlk = getBlockMapping(inPsl, mapPsl, iMapBlkPtr, align1Blk);
-
-if ((mappedBlk.qEnd != 0) && (mappedBlk.tEnd != 0))
-    addPslBlock(mappedPsl, &mappedBlk, mappedPslMax);
+struct block mappedBlk = getBlockMapping(inPsl, mapPsl, iMapBlkPtr, align1Blk);
 
-/* advance past block or gap */
-size = (mappedBlk.qEnd != 0)
+/* Need to compute how much of input was consumed before trimming overlap */
+unsigned consumed = (mappedBlk.qEnd != 0)
     ? (mappedBlk.qEnd - mappedBlk.qStart)
     : (mappedBlk.tEnd - mappedBlk.tStart);
-align1Blk->qStart += size;
-align1Blk->tStart += size;
 
+/* remove overlap, which can happen with protein -> NA alignments */
+if (blockIsAligned(&mappedBlk) && (mappedPsl->blockCount > 0))
+    trimOverlapping(mappedPsl, &mappedBlk);
+
+if (blockIsAligned(&mappedBlk))
+    addPslBlock(mappedPsl, &mappedBlk, mappedPslMax);
+
+/* advance past block or gap */
+align1Blk->qStart += consumed;
+align1Blk->tStart += consumed;
 return TRUE;
 }
 
+struct psl* doMapping(struct psl *inPsl, struct psl *mapPsl)
+/* do the mapping after adjustments made to input */
+{
+int mappedPslMax = 8; /* allocated space in output psl */
+int iMapBlk = 0;
+struct psl* mappedPsl = createMappedPsl(inPsl, mapPsl, mappedPslMax);
+
+/* Fill in ungapped blocks.  */
+for (int iBlock = 0; iBlock < inPsl->blockCount; iBlock++)
+    {
+    struct block align1Blk = blockFromPslBlock(inPsl, iBlock);
+    while (mapBlock(inPsl, mapPsl, &iMapBlk, &align1Blk, mappedPsl,
+                    &mappedPslMax))
+        continue;
+    }
+assert(mappedPsl->blockCount <= mappedPslMax);
+
+return mappedPsl;
+}
+
 struct psl* pslTransMap(unsigned opts, struct psl *inPsl, struct psl *mapPsl)
 /* map a psl via a mapping psl, a single psl is returned, or NULL if it
  * couldn't be mapped. */
 {
-int mappedPslMax = 8; /* allocated space in output psl */
-int iMapBlk = 0;
 char inPslOrigStrand[3];
 boolean rcInPsl = (pslTStrand(inPsl) != pslQStrand(mapPsl));
 boolean cnv1 = (pslIsProtein(inPsl) && !pslIsProtein(mapPsl));
 boolean cnv2 = (pslIsProtein(mapPsl) && !pslIsProtein(inPsl));
-int iBlock;
-struct psl* mappedPsl;
 
 /* sanity check size, but allow names to vary to allow ids to have
  * unique-ifying suffixes. */
 if (inPsl->tSize != mapPsl->qSize)
     errAbort("Error: inPsl %s tSize (%d) != mapPsl %s qSize (%d)",
             inPsl->tName, inPsl->tSize, mapPsl->qName, mapPsl->qSize);
 
 /* convert protein PSLs */
 if (cnv1)
     pslProtToNA(inPsl);
 if (cnv2)
     pslProtToNA(mapPsl);
 
 /* need to ensure common sequence is in same orientation, save strand for later */
 safef(inPslOrigStrand, sizeof(inPslOrigStrand), "%s", inPsl->strand);
 if (rcInPsl)
     pslRc(inPsl);
 
-mappedPsl = createMappedPsl(inPsl, mapPsl, mappedPslMax);
-
-/* Fill in ungapped blocks.  */
-for (iBlock = 0; iBlock < inPsl->blockCount; iBlock++)
-    {
-    struct block align1Blk = blockFromPslBlock(inPsl, iBlock);
-    while (mapBlock(inPsl, mapPsl, &iMapBlk, &align1Blk, mappedPsl,
-                    &mappedPslMax))
-        continue;
-    }
+struct psl* mappedPsl = doMapping(inPsl, mapPsl);
 
 /* finish up psl, or free if no blocks were added */
-assert(mappedPsl->blockCount <= mappedPslMax);
 if (mappedPsl->blockCount == 0)
     pslFree(&mappedPsl);  /* nothing made it */
 else
     {
-    setPslBounds(mappedPsl);
+    setPslBoundsCounts(mappedPsl);
     adjustOrientation(opts, inPsl, inPslOrigStrand, mappedPsl);
     }
 
 /* restore input */
 if (rcInPsl)
     {
     pslRc(inPsl);
     strcpy(inPsl->strand, inPslOrigStrand);
     }
 if (cnv1)
     pslNAToProt(inPsl);
 if (cnv2)
     pslNAToProt(mapPsl);
 
 return mappedPsl;