b1a9de0fd2ebab6d3caf4197eecc2e75eb74162d
markd
  Mon May 24 13:53:35 2021 -0700
Added better handling of GTFs without correct frames on CDS. This also simplified the handling of GTF stop codons

diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c
index 0e0ee50..0dba3b6 100644
--- src/hg/lib/genePred.c
+++ src/hg/lib/genePred.c
@@ -558,105 +558,30 @@
 }
 
 static int findPosExon(struct genePred *gp, int pos)
 /* find exon containing the specified position */
 {
 int i;
 for (i = 0; i < gp->exonCount; i++)
     {
     if ((gp->exonStarts[i] <= pos) && (pos < gp->exonEnds[i]))
         return i;
     }
 errAbort("bug: position %d not found in exon of %s genePred", pos, gp->name);
 return 0;
 }
 
-static int findLastFramedExon(struct genePred *gp)
-/* locate the last exon with frame */
-{
-int iStart = (gp->strand[0] == '+') ? 0 : gp->exonCount-1;
-int iStop = (gp->strand[0] == '+') ? gp->exonCount : -1;
-int dir =  (gp->strand[0] == '+') ? +1 : -1;
-int iPrevExon = -1, iExon;
-
-// find beginning of frames
-for (iExon = iStart; (iExon != iStop) && (gp->exonFrames[iExon] == -1); iExon += dir)
-    continue;
-// find last with frame
-for (; (iExon != iStop) && (gp->exonFrames[iExon] != -1); iExon += dir)
-    iPrevExon = iExon;
-if (iPrevExon < 0)
-    errAbort("bug: findLastFramedExon should have found an exon with frame");
-return iPrevExon;
-}
-
-static int exonCdsStart(struct genePred *gp, int iExon)
-/* get the CDS starting position in specified exon */
-{
-return (gp->cdsStart < gp->exonStarts[iExon]) ? gp->exonStarts[iExon] : gp->cdsStart;
-}
-
-static int exonCdsEnd(struct genePred *gp, int iExon)
-/* get the CDS ending position in specified exon */
-{
-return (gp->cdsEnd > gp->exonEnds[iExon]) ? gp->exonEnds[iExon] : gp->cdsEnd;
-}
-
-static void extendFramePos(struct genePred *gp)
-/* extend frame missing from last exon(s) for positive strand genes, normally
- * caused by GTF stop_codon */
-{
-int iExon = findLastFramedExon(gp);
-int frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon)));
-for (iExon++; (iExon < gp->exonCount) && (gp->exonStarts[iExon] < gp->cdsEnd); iExon++)
-    {
-    if (!((gp->exonFrames[iExon] < 0) || (gp->exonFrames[iExon] == frame)))
-        errAbort("conflicting frame for %s exon index %d, was %d, trying to assign %d", gp->name, iExon, gp->exonFrames[iExon], frame);
-    gp->exonFrames[iExon] = frame;
-    frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon)));
-    }
-}
-
-static void extendFrameNeg(struct genePred *gp)
-/* extend frame missing from last exon(s) for negative strand genes, normally
- * caused by GTF stop_codon */
-{
-int iExon = findLastFramedExon(gp);
-int frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon)));
-for (iExon--; (iExon >= 0) && (gp->exonEnds[iExon] > gp->cdsStart); iExon--)
-    {
-    if (!((gp->exonFrames[iExon] < 0) || (gp->exonFrames[iExon] == frame)))
-        errAbort("conflicting frame for %s exon index %d, was %d, trying to assign %d", gp->name, iExon, gp->exonFrames[iExon], frame);
-    gp->exonFrames[iExon] = frame;
-    frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon)));
-    }
-}
-
-static void fixStopFrame(struct genePred *gp)
-/* Handle nasty corner case: GTF with the all or part of the stop codon as the
- * only codon in an exon, we must set the frame on these exons.  Some ENSEMBL
- * GTFs have single base `exons' with just a base of the stop codon, so must
- * check every base.  While less than ideal, we just extend the frame past the
- * last exon with frame, so a conflicting frame on the stop codon will be
- * missed.  Just switching to GFF3 fixes this mess. */
-{
-if (gp->strand[0] == '+')
-    extendFramePos(gp);
-else
-    extendFrameNeg(gp);
-}
-
 static boolean adjImpliedStopPos(struct genePred *gp)
 /* implied stop codon adjustment for positive strand gene */
 {
 int iExon = findPosExon(gp, gp->cdsEnd-1);
 unsigned space = (gp->exonEnds[iExon]-gp->cdsEnd);
 if (space >= 3)
     {
     // room in current exon
     gp->cdsEnd += 3;
     return TRUE;
     }
 else if ((iExon < gp->exonCount)
          && ((gp->exonEnds[iExon+1]-gp->exonStarts[iExon+1])+space) >= 3)
     {
     // room including next exon
@@ -696,59 +621,30 @@
         {
         if (gp->optFields & genePredCdsStatFld)
             gp->cdsEndStat = cdsComplete;
         }
     }
 else
     {
     if (adjImpliedStopNeg(gp))
         {
         if (gp->optFields & genePredCdsStatFld)
             gp->cdsStartStat = cdsComplete;
         }
     }
 }
 
-static void checkForNoFrames(struct genePred *gp)
-/* check for requesting frame from a file without frames. */
-{
-int i;
-/* Complain if we have a CDS but exonFrames are all -1: */
-if (gp->cdsStart < gp->cdsEnd)
-    {
-    boolean foundReal = FALSE;
-    for (i = 0;  i < gp->exonCount;  i++)
-        {
-        if (gp->exonFrames[i] >= 0 && gp->exonFrames[i] <= 2)
-            {
-            foundReal = TRUE;
-            break;
-            }
-        }
-    if (! foundReal)
-        {
-        errAbort("Error: exonFrames field is being added, but I found a "
-                 "gene (%s) with CDS but no valid frames.  "
-                 "This can happen if program is invoked with -genePredExt "
-                 "but no valid frames are given in the file.  If the 8th "
-                 "field of GFF/GTF file is always a placeholder, then don't use "
-                 "-genePredExt.",
-                 gp->name);
-        }
-    }
-}
-
 static boolean isGtfFeature(char *feat)
 /* check if a feature is one of the recognized features.  All otherse
  * should be ignored. http://mblab.wustl.edu/GTF22.html */
 {
 static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR",
                            "inter", "inter_CNS", "intron_CNS", "exon", NULL};
 int i;
 for (i = 0; gtfFeats[i] != NULL; i++)
     {
     if (sameString(feat, gtfFeats[i]))
         return TRUE;
     }
 return FALSE;
 }
 
@@ -790,50 +686,59 @@
 }
 /* set frame from any overlapping records with frame. */
 while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0))
     {
     if (allowedFrameFeature(gl, isGtf))
         {
         int frame = phaseToFrame(gl->frame);
         if (frame >= 0)
             *framePtr = frame;
         }
     gl = gl->next;
     }
 return gl;
 }
 
-static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp)
-/* Assign frame from GFF after genePred has been built.*/
+static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp, unsigned options)
+/* Assign frame from GFF/GTF after genePred has been built.*/
 {
+
 struct gffLine *gl = group->lineList;
 int start, end, i;
-boolean haveFrame = FALSE;
+int numCdsExons = 0;
+int numFrameExons = 0;
 for (i = 0; i < gp->exonCount; i++)
     {
     if (genePredCdsExon(gp, i, &start, &end))
         {
+        numCdsExons += 1;
         gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf);
         if (gp->exonFrames[i] >= 0)
-            haveFrame = TRUE;
+            numFrameExons += 1;
         }
     }
 
-/* make sure stop has frame if some exons in the gene had frame */
-if (haveFrame)
-    fixStopFrame(gp);
-checkForNoFrames(gp);
+/* Assign frames for exons without frames, either due to it incorrectly not
+ * being on cases.  This also Handle the nasty corner caseof GTF with the all
+ * or part of the stop codon as the only codon in an exon, we must set the
+ * frame on these exons.  Some ENSEMBL GTFs have single base `exons' (due to
+ * low quality assemblies) with just a base of the stop codon, so must check
+ * every base.  While less than ideal, we just extend the frame past the last
+ * exon with frame, so a conflicting frame on the stop codon will be missed.
+ * Just switching to GFF3 fixes this mess.  Return the count of exons that
+ * actually had frame set. */
+genePredFixExonFrames(gp);
 }
 
 static struct genePred *mkFromGroupedGxf(struct gffFile *gff, struct gffGroup *group, char *name,
                                          boolean isGtf, char *exonSelectWord, unsigned optFields,
                                          unsigned options)
 /* common function to create genePreds from GFFs or GTFs.  This is a little
  * ugly with to many check of isGtf, however the was way to much identical
  * code the other way. Options are from genePredFromGxfOpts */
 {
 struct genePred *gp;
 long stopCodonStart = -1, stopCodonEnd = -1;
 long cdsStart = -1, cdsEnd = -1;
 int exonCount = 0;
 boolean haveStartCodon = FALSE, haveStopCodon = FALSE;
 struct gffLine *gl;
@@ -1021,31 +926,31 @@
             assert(i < exonCount);
             assert(gl->start >= eStarts[i]);
             if (gl->end > eEnds[i])
                 eEnds[i] = gl->end;
             }
         }
     }
 gp->exonCount = i+1;
 
 /* adjust for flybase type of GFFs, with stop codon implied and outside of
  * CDS. */
 if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon)
     adjImpliedStopAfterCds(gp);
 
 if (optFields & genePredExonFramesFld)
-    assignFrame(isGtf, group, gp);
+    assignFrame(isGtf, group, gp, options);
 
 return gp;
 }
 
 struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name,
                                         char *exonSelectWord, unsigned optFields, unsigned options)
 /* Convert gff->groupList to genePred list. See genePred.h for details. */
 {
 struct gffLine *gl;
 boolean anyExon = FALSE;
 
 /* Look to see if any exons.  If not allow CDS to be used instead. */
 if (exonSelectWord)
     {
     for (gl = group->lineList; gl != NULL; gl = gl->next)
@@ -1760,31 +1665,31 @@
     blockSize = gp->exonEnds[gp->exonCount-1] - gp->exonStarts[gp->exonCount-1];
     return endDist < (gpSize - blockSize - 50); 
     }
 else if(sameString(gp->strand, "-"))
     {
     blockSize = gp->exonEnds[0] - gp->exonStarts[0];
     return startDist > (blockSize + 50);
     }
 else 
     errAbort("genePredNmdsTarget() - Don't recognize strand: %s\n", gp->strand);
 return FALSE;
 }
 
 void genePredAddExonFrames(struct genePred *gp)
 /* Add exonFrames array to a genePred that doesn't have it. Frame is assumed
- * to be contiguous. */
+ * to be contiguous.  NOTE: suggest using genePredFixExonFrames for new code. */
 {
 int iExon, start, end, iBase = 0;
 int iStart, iEnd, iDir;
 
 /* initial array to all -1, for no frame */
 AllocArray(gp->exonFrames, gp->exonCount);
 gp->optFields |= genePredExonFramesFld;
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     gp->exonFrames[iExon] = -1;
 
 if (sameString(gp->strand, "+"))
     {
     iStart = 0;
     iEnd = gp->exonCount;
     iDir = 1;
@@ -1793,30 +1698,80 @@
     {
     iStart = gp->exonCount-1;
     iEnd = -1;
     iDir = -1;
     }
 for (iExon = iStart; iExon != iEnd; iExon += iDir)
     {
     if (genePredCdsExon(gp, iExon, &start, &end))
         {
         gp->exonFrames[iExon] = iBase % 3;
         iBase += (end - start);
         }
     }
 }
 
+void genePredFixExonFrames(struct genePred *gp)
+/* Add exonFrames array to a genePred that has frame on only some or no
+ * features. Frame is assumed to be contiguous when an existing frame is not
+ * present. */
+{
+int iStart, iEnd, iDir;
+
+if (gp->exonFrames == NULL)
+    {
+    /* doesn't currently have frames */
+    AllocArray(gp->exonFrames, gp->exonCount);
+    gp->optFields |= genePredExonFramesFld;
+    for (int iExon = 0; iExon < gp->exonCount; iExon++)
+        gp->exonFrames[iExon] = -1;
+    }
+
+if (sameString(gp->strand, "+"))
+    {
+    iStart = 0;
+    iEnd = gp->exonCount;
+    iDir = 1;
+    }
+else
+    {
+    iStart = gp->exonCount - 1;
+    iEnd = -1;
+    iDir = -1;
+    }
+int nextFrame = -1;
+for (int iExon = iStart; iExon != iEnd; iExon += iDir)
+    {
+    int start, end;
+    if (genePredCdsExon(gp, iExon, &start, &end))
+        {
+        if (gp->exonFrames[iExon] < 0)
+            {
+            // no existing frame
+            if (nextFrame < 0)
+                nextFrame = 0;  // first frame
+            gp->exonFrames[iExon] = nextFrame;
+            }
+        nextFrame = incrFrame(gp->exonFrames[iExon], end - start);
+        }
+    else
+        {
+        gp->exonFrames[iExon] = -1;
+        }
+    }
+}
+
 void genePredRc(struct genePred *gp, int chromSize)
 /* Reverse complement a genePred (project it to the opposite strand).  Useful
  * when doing analysis that is simplified by having things on the same strand.
  */
 {
 int iExon;
 gp->strand[0] = (gp->strand[0] == '+') ? '-' : '+';
 reverseUnsignedRange(&gp->txStart, &gp->txEnd, chromSize);
 reverseUnsignedRange(&gp->cdsStart, &gp->cdsEnd, chromSize);
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     reverseUnsignedRange(&gp->exonStarts[iExon], &gp->exonEnds[iExon], chromSize);
 reverseUnsigned(gp->exonStarts, gp->exonCount);
 reverseUnsigned(gp->exonEnds, gp->exonCount);
 if (gp->optFields & genePredCdsStatFld)
     {