8c802cc665358f0d69c1243f77d7931001b78841
markd
  Mon Aug 3 12:40:42 2015 -0700
Fixed with psl to genePred frame correction working when there
is target deletion.  A lot of test cases changed as they
were constructed from transMap and so had a lot of indels. (#15803)

diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c
index 189196b..373fa8a 100644
--- src/hg/lib/genePred.c
+++ src/hg/lib/genePred.c
@@ -1114,55 +1114,56 @@
 
 if (start < end)
     {
     /* compute from end if it is complete in mRNA */
     if (cds->endComplete)
         {
         int fr = (cds->end-start) % 3;
         frame = (fr == 2) ? 1 : ((fr == 1) ? 2 : 0);
         }
     else
         frame = (start-cds->start) % 3;
     }
 return frame;
 }
 
-static boolean shouldMergeBlocks(struct genePred *gene, int iExon,
-                                 unsigned tStart, unsigned options,
+static boolean shouldMergeBlocks(struct genePred *gene, 
+                                 unsigned tStart, unsigned prevTEnd,
+                                 unsigned qStart, unsigned prevQEnd,
+                                 unsigned options,
                                  int cdsMergeSize, int utrMergeSize)
 /* determine if a new block starting at tStart whould be merged with
- * the preceeding exon, indicated by iExon.
+ * the preceeding exon.
  */
 {
-/* nothing to check if no exons have been added */
-if (iExon >= 0)
-    {
 boolean inCds = (gene->cdsStart <= tStart) && (tStart < gene->cdsEnd);
-    int gapSize = (tStart - gene->exonEnds[iExon]);
+int tGapSize = (tStart - prevTEnd);
+int qGapSize = (qStart - prevQEnd);
 if (inCds)
     {
-        if ((options & genePredPslCdsMod3) && ((gapSize % 3) != 0))
+    if ((options & genePredPslCdsMod3)
+        && (((tGapSize % 3) != 0) || ((qGapSize % 3) != 0)))
         return FALSE;  /* not a multiple of three */
-        if ((cdsMergeSize >= 0) && (gapSize <= cdsMergeSize))
+    if ((cdsMergeSize >= 0) && ((tGapSize <= cdsMergeSize) && (qGapSize <= cdsMergeSize)))
         return TRUE;
     }
 else 
     {
-        if ((utrMergeSize >= 0) && (gapSize <= utrMergeSize))
+    // qGapSize only matters for CDS where we are trying to keep frame sane
+    if ((utrMergeSize >= 0) && (tGapSize <= utrMergeSize))
         return TRUE;
     }
-    }
 return FALSE; /* don't merge */
 }
 
 static void pslToExons(struct psl *psl, struct genePred *gene,
                        struct genbankCds* cds, unsigned options,
                        int cdsMergeSize, int utrMergeSize)
 /* Convert psl alignment blocks to genePred exons, merging together blocks by
  * the specified paraemeters.  Optionally add frame information. */
 {
 int iBlk, iExon;
 int startIdx, stopIdx, idxIncr;
 
 /* this is bigger than needed if blocks merge */
 gene->exonStarts = needMem(psl->blockCount*sizeof(unsigned));
 gene->exonEnds = needMem(psl->blockCount*sizeof(unsigned));
@@ -1177,55 +1178,63 @@
 /* traverse psl in postive target order */
 if (psl->strand[1] == '-')
     {
     startIdx = psl->blockCount-1;
     stopIdx = -1;
     idxIncr = -1;
     }
 else
     {
     startIdx = 0;
     stopIdx = psl->blockCount;
     idxIncr = 1;
     }
 
 iExon = -1;  /* indicate none have been added */
+unsigned prevQEnd = 0;
+unsigned prevTEnd = 0;
 for (iBlk = startIdx; iBlk != stopIdx; iBlk += idxIncr)
     {
+    int qStart = psl->qStarts[iBlk];
+    int qEnd = qStart + psl->blockSizes[iBlk];
+    if (psl->strand[0] == '-')
+        reverseIntRange(&qStart, &qEnd, psl->qSize);
     int tStart = psl->tStarts[iBlk];
     int tEnd = tStart + psl->blockSizes[iBlk];
     if (psl->strand[1] == '-')
         reverseIntRange(&tStart, &tEnd, psl->tSize);
-    if (!shouldMergeBlocks(gene, iExon, tStart, options,
+    if ((iExon == 0) || !shouldMergeBlocks(gene, tStart, prevTEnd, qStart, prevQEnd, options,
                                            cdsMergeSize, utrMergeSize))
         {
         /* new exon */
         iExon++;
         gene->exonStarts[iExon] = tStart;
 	}
     gene->exonEnds[iExon] = tEnd;
     /* Set the frame. We have to deal with multiple blocks being merged.  For
      * positive strand, the first block frame is used, for negative strand,
      * the last is used. */
     if ((gene->optFields & genePredExonFramesFld) && (cds != NULL)
         && (((psl->strand[0] == '+') && (gene->exonFrames[iExon] < 0))
             || (psl->strand[0] == '-')))
 	{
         int fr = getFrame(psl, psl->qStarts[iBlk], psl->qStarts[iBlk]+psl->blockSizes[iBlk], cds);
         if (fr >= 0)
 	    gene->exonFrames[iExon] = fr;
 	}
+    prevTEnd = tEnd;
+    prevQEnd = qEnd;
     }
 gene->exonCount = iExon+1;
 assert(gene->exonCount <= psl->blockCount);
 }
 
 struct genePred *genePredFromPsl3(struct psl *psl,  struct genbankCds* cds, 
                                   unsigned optFields, unsigned options,
                                   int cdsMergeSize, int utrMergeSize)
 /* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS
  * specification string to genomic coordinates. Small genomic inserts are
  * merged based on the mergeSize parameters.  Gaps no larger than the
  * specified merge sizes result in the adjacent blocks being merged into a
  * single exon.  Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize.  If
  * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged
  * if a multiple of three.  A negative merge sizes disables merging of blocks.