e813f6c7cfd2f8f110266c0824062964322d62a3
markd
  Wed Sep 20 00:01:21 2023 -0700
fixed problems handling overlapping PSL blocks caused by protein to NA alignemtns. Now handles internal coversions create overlapoing blocks

diff --git src/lib/pslTransMap.c src/lib/pslTransMap.c
index 8c3ab0e..4efb5b2 100644
--- src/lib/pslTransMap.c
+++ src/lib/pslTransMap.c
@@ -43,102 +43,181 @@
 return !pslTypeQueryIsProtein(pslType);
 }
 
 static boolean pslTypeTargetIsProtein(enum pslType pslType)
 /* is the pslType indicate the target is a protein? */
 {
 return (pslType == pslTypeProtProt);
 }
 
 static boolean pslTypeTargetIsNa(enum pslType pslType)
 /* is the pslType indicate the target is a NA? */
 {
 return !pslTypeTargetIsProtein(pslType);
 }
 
+static void setPslBoundsCounts(struct psl* psl)
+/* set sequences bounds and counts from blocks on a PSL */
+{
+int lastBlk = psl->blockCount-1;
+
+/* set start/end of sequences */
+psl->qStart = psl->qStarts[0];
+psl->qEnd = psl->qStarts[lastBlk] + psl->blockSizes[lastBlk];
+if (pslQStrand(psl) == '-')
+    reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
+
+psl->tStart = psl->tStarts[0];
+psl->tEnd = psl->tStarts[lastBlk] + psl->blockSizes[lastBlk];
+if (pslTStrand(psl) == '-')
+    reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
+
+psl->match = 0;
+for (int iBlk = 0; iBlk < psl->blockCount; iBlk++)
+    psl->match += psl->blockSizes[iBlk];
+pslComputeInsertCounts(psl);
+}
+
+static unsigned int roundUpToMultipleOf3(unsigned n) {
+  return ((n + 2) / 3) * 3;
+}
+
+static unsigned blockOverlapAmt3(struct psl *psl,
+                                 int iBlk)
+/* How much overlap is there with the next block.  This is the max of query
+ * and target, rounded up to a multiple of three, since we are dealing with
+ * protein -> NA.
+ */
+{
+// care taken in subtraction because of unsigneds
+int tOver = (pslTStart(psl, iBlk + 1) < pslTEnd(psl, iBlk)) ?
+    (pslTEnd(psl, iBlk) - pslTStart(psl, iBlk + 1)) : 0;
+int qOver = (pslQStart(psl, iBlk + 1) < pslQEnd(psl, iBlk)) ?
+    (pslQEnd(psl, iBlk)- pslQStart(psl, iBlk + 1)) : 0;
+return roundUpToMultipleOf3(max(tOver, qOver));
+}   
+
+static void trimBlockOverlap(struct psl *psl, int jBlk,
+                             unsigned overlapAmt3)
+/* adjust bounds of a block by the specified amount */
+{
+psl->qStarts[jBlk] += overlapAmt3;
+psl->tStarts[jBlk] += overlapAmt3;
+psl->blockSizes[jBlk] -= overlapAmt3;
+}
+
+static void arrayDelete(unsigned *array, int iArray, unsigned arrayLen)
+/* remove one element in array by shifting down */
+{
+memmove(&array[iArray], &array[iArray + 1], (arrayLen - iArray - 1) * sizeof(unsigned));
+}
+
+static void removeBlock(struct psl *psl, int jBlk)
+/* remove the specified block */
+{
+arrayDelete(psl->blockSizes, jBlk, psl->blockCount);
+arrayDelete(psl->qStarts, jBlk, psl->blockCount);
+arrayDelete(psl->tStarts, jBlk, psl->blockCount);
+psl->blockCount--;
+}
+    
+static void editBlockOverlap(struct psl *psl, int iBlk,
+                             unsigned overlapAmt3)
+/* remove overlap between two blocks.  If multiple blocks are covered,
+ * then shift remove the block */
+{
+while ((overlapAmt3 > 0) && (iBlk < psl->blockCount - 1))
+    {
+    int jBlk = iBlk + 1;
+    if (overlapAmt3 < psl->blockSizes[jBlk])
+        {
+        trimBlockOverlap(psl, jBlk, overlapAmt3);
+        overlapAmt3 = 0;
+        }
+    else
+        {
+        overlapAmt3 -= psl->blockSizes[jBlk];
+        removeBlock(psl, jBlk);
+        }
+    }
+}
+
+static void removeOverlappingBlock(struct psl *psl)
+/* Remove overlapping blocks, which BLAT can create with protein to NA
+ * alignments.  These are exposed when convert prot-NA alignments to NA-NA
+ * alignment.
+ */
+{
+for (int iBlk = 0; iBlk < psl->blockCount - 1; iBlk++)
+    {
+    unsigned overlapAmt3 = blockOverlapAmt3(psl, iBlk);
+    if (overlapAmt3 > 0)
+        editBlockOverlap(psl, iBlk, overlapAmt3);
+    }
+setPslBoundsCounts(psl);
+}
+
+
 struct block
 /* Coordinates of a block, which might be aligned or a gap on one side */
 {
     int qStart;          /* Query start position. */
     int qEnd;            /* Query end position. */
     int tStart;          /* Target start position. */
     int tEnd;            /* Target end position. */
 };
 
 static struct block ZERO_BLOCK = {0, 0, 0, 0};
 
 static void assertBlockAligned(struct block *blk)
 /* Make sure both sides are same size and not negative length.  */
 {
 assert(blk->qStart <= blk->qEnd);
 assert(blk->tStart <= blk->tEnd);
 assert((blk->qEnd - blk->qStart) == (blk->tEnd - blk->tStart));
 }
 
 static int blockIsAligned(struct block *blk)
 /* check that both the query and target side have alignment */
 {
 return (blk->qEnd != 0) && (blk->tEnd != 0); // can start at zero
 }
 
-static int blockLength(struct block *blk)
-/* get length of an aligned block  */
-{
-assertBlockAligned(blk);
-return (blk->qEnd - blk->qStart);
-}
-
-static void setPslBoundsCounts(struct psl* psl)
-/* set sequences bounds and counts from blocks on a PSL */
-{
-int lastBlk = psl->blockCount-1;
-
-/* set start/end of sequences */
-psl->qStart = psl->qStarts[0];
-psl->qEnd = psl->qStarts[lastBlk] + psl->blockSizes[lastBlk];
-if (pslQStrand(psl) == '-')
-    reverseIntRange(&psl->qStart, &psl->qEnd, psl->qSize);
-
-psl->tStart = psl->tStarts[0];
-psl->tEnd = psl->tStarts[lastBlk] + psl->blockSizes[lastBlk];
-if (pslTStrand(psl) == '-')
-    reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
-
-for (int iBlk = 0; iBlk < psl->blockCount; iBlk++)
-    psl->match += psl->blockSizes[iBlk];
-pslComputeInsertCounts(psl);
-}
-
 static void pslProtToNAConvert(struct psl *psl)
 /* convert a protein/NA or protein/protein alignment to a NA/NA alignment */
 {
 boolean isProtNa = pslIsProtein(psl);
 int iBlk;
 
 psl->qStart *= 3;
 psl->qEnd *= 3;
 psl->qSize *= 3;
 if (!isProtNa)
     psl->tSize *= 3;
 for (iBlk = 0; iBlk < psl->blockCount; iBlk++)
     {
     psl->blockSizes[iBlk] *= 3;
     psl->qStarts[iBlk] *= 3;
     if (!isProtNa)
         psl->tStarts[iBlk] *= 3;
     }
-setPslBoundsCounts(psl);
+removeOverlappingBlock(psl);
+if (pslCheck("converted to NA", stderr, psl) > 0)
+    {
+    pslTabOut(psl, stderr);
+    errAbort("BUG: converting an AA to NA alignment produced an invalid PSL");
+    }
 }
 
 static struct psl* createMappedPsl(struct psl* inPsl, struct psl *mapPsl,
                                    int mappedPslMax)
 /* setup a PSL for the output alignment */
 {
 char strand[3];
 assert(pslTStrand(inPsl) == pslQStrand(mapPsl));
 
 /* strand can be taken from both alignments, since common sequence is in same
  * orientation. */
 strand[0] = pslQStrand(inPsl);
 strand[1] = pslTStrand(mapPsl);
 strand[2] = '\0';
 
@@ -245,106 +324,80 @@
         {
         *iMapBlkPtr = iBlk;
         return getOverBlockMapping(mqStart, mqEnd, mapPsl->tStarts[iBlk], align1Blk);
         }
     }
 
 /* reached the end of the mRNA->genome alignment, finish off the 
  * rest of the the protein as an insert */
 struct block mappedBlk = ZERO_BLOCK;
 mappedBlk.qStart = align1Blk->qStart;
 mappedBlk.qEnd = align1Blk->qEnd;
 *iMapBlkPtr = iBlk;
 return mappedBlk;
 }
 
-static void trimOverlapping(struct psl *mappedPsl, struct block *mappedBlk)
-/* if this block overlaps the previous block, trim accordingly.  Overlaps
- * can be created with protein to DNA PSLs */
-{
-assertBlockAligned(mappedBlk);
-assert(mappedPsl->blockCount > 0);
-
-// use int so we can go negative
-int prevQEnd = pslQEnd(mappedPsl, mappedPsl->blockCount - 1);
-int prevTEnd = pslTEnd(mappedPsl, mappedPsl->blockCount - 1);
-
-// trim, possibly setting to zero-length
-int overAmt = max((prevQEnd - (int)mappedBlk->qStart), (prevTEnd - (int)mappedBlk->tStart));
-if (overAmt < 0)
-    overAmt = 0;
-else if (overAmt > blockLength(mappedBlk))
-    overAmt = blockLength(mappedBlk);
-
-mappedBlk->qStart += overAmt;
-mappedBlk->tStart += overAmt;
-}
-   
 static boolean mapBlock(struct psl *inPsl, struct psl *mapPsl, int *iMapBlkPtr,
                         struct block *align1Blk, struct psl* mappedPsl,
                         int* mappedPslMax)
 /* Add a PSL block from a ungapped portion of an alignment, mapping it to the
  * genome.  If the started of the inPsl block maps to a part of the mapPsl
  * that is aligned, it is added as a PSL block, otherwise the gap is skipped.
  * Block starts are adjusted for next call.  Return FALSE if the end of the
  * alignment is reached. */
 {
 assert(align1Blk->qStart <= align1Blk->qEnd);
 assert(align1Blk->tStart <= align1Blk->tEnd);
 assert((align1Blk->qEnd - align1Blk->qStart) == (align1Blk->tEnd - align1Blk->tStart));
 
 if ((align1Blk->qStart >= align1Blk->qEnd) || (align1Blk->tStart >= align1Blk->tEnd))
     return FALSE;  /* end of ungapped block. */
 
 /* find block or gap with start coordinates of mrna */
 struct block mappedBlk = getBlockMapping(inPsl, mapPsl, iMapBlkPtr, align1Blk);
 
-/* Need to compute how much of input was consumed before trimming overlap */
+/* Compute how much of input was consumed */
 unsigned consumed = (mappedBlk.qEnd != 0)
     ? (mappedBlk.qEnd - mappedBlk.qStart)
     : (mappedBlk.tEnd - mappedBlk.tStart);
 
-/* remove overlap, which can happen with protein -> NA alignments */
-if (blockIsAligned(&mappedBlk) && (mappedPsl->blockCount > 0))
-    trimOverlapping(mappedPsl, &mappedBlk);
-
 if (blockIsAligned(&mappedBlk))
     addPslBlock(mappedPsl, &mappedBlk, mappedPslMax);
 
 /* advance past block or gap */
 align1Blk->qStart += consumed;
 align1Blk->tStart += consumed;
 return TRUE;
 }
 
 struct psl* doMapping(struct psl *inPsl, struct psl *mapPsl)
 /* do the mapping after adjustments made to input */
 {
 int mappedPslMax = 8; /* allocated space in output psl */
 int iMapBlk = 0;
 struct psl* mappedPsl = createMappedPsl(inPsl, mapPsl, mappedPslMax);
 
 /* Fill in ungapped blocks.  */
 for (int iBlock = 0; iBlock < inPsl->blockCount; iBlock++)
     {
     struct block align1Blk = blockFromPslBlock(inPsl, iBlock);
     while (mapBlock(inPsl, mapPsl, &iMapBlk, &align1Blk, mappedPsl,
                     &mappedPslMax))
         continue;
     }
 assert(mappedPsl->blockCount <= mappedPslMax);
-
+removeOverlappingBlock(mappedPsl);
 return mappedPsl;
 }
 
 static enum pslType determinePslType(struct psl *psl, enum pslType pslType, char *errDesc)
 /* check the psl type if specified, set if unspecified */
 {
 boolean isProtNa = pslIsProtein(psl);
 if (pslType == pslTypeUnspecified)
     return  isProtNa ? pslTypeProtNa : pslTypeNaNa;  // default
 
 // check specified type
 if (isProtNa &&  (pslType != pslTypeProtNa))
     errAbort("%s PSL %s to %s is a protein to NA alignment, however %s specified",
              errDesc, psl->qName, psl->tName, pslTypeDesc[pslType]);
     
@@ -396,30 +449,31 @@
 /* ensure common sequence is in same orientation and convert protein PSLs */
 char inPslOrigStrand[3];
 safef(inPslOrigStrand, sizeof(inPslOrigStrand), "%s", inPsl->strand);
 if (cnvIn || rcInPsl)
     inPsl = pslClone(inPsl);
 if (cnvIn)
     pslProtToNAConvert(inPsl);
 if (rcInPsl)
     pslRc(inPsl);
 if (cnvMap)
     {
     mapPsl = pslClone(mapPsl);
     pslProtToNAConvert(mapPsl);
     }
     
+
 /* sanity check sizes after conversion Don't check name, so names to vary to
  * allow ids to have unique-ifying suffixes. */
 if (inPsl->tSize != mapPsl->qSize)
     errAbort("Error: inPsl %s tSize (%d) != mapPsl %s qSize (%d)",
             inPsl->tName, inPsl->tSize, mapPsl->qName, mapPsl->qSize);
 
 
 struct psl* mappedPsl = doMapping(inPsl, mapPsl);
 
 /* finish up psl, or free if no blocks were added */
 if (mappedPsl->blockCount == 0)
     pslFree(&mappedPsl);  /* nothing made it */
 else
     {
     setPslBoundsCounts(mappedPsl);