eeb6e13f3c9d1f3e9883b76498f5cb1a80463531
markd
  Mon Aug 16 23:41:36 2010 -0700
fix bug where insertions between multiple bases of stop codon generated incorrect frame.  Provide explicit control over use of gene_id or gene_name for name2 column
diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c
index e42c08a..76c475b 100644
--- src/hg/lib/genePred.c
+++ src/hg/lib/genePred.c
@@ -447,36 +447,6 @@
     }
 }
 
-static void fixStopFrame(struct genePred *gp)
-/* Nasty corner case: GTF with the all or part of the stop codon as the only
- * codon in an exon, we must set the frame on this exon.  */
-{
-int stopPos = (gp->strand[0] == '+') ? gp->cdsEnd-1 : gp->cdsStart;
-int iStop = -1, i;
-
-/* find position containing last base */
-for (i = 0; (i < gp->exonCount) && (iStop < 0); i++)
-    {
-    if ((gp->exonStarts[i] <= stopPos) && (stopPos < gp->exonEnds[i]))
-        iStop = i;
-    }
-if ((iStop >= 0) && (gp->exonFrames[iStop] < 0))
-    {
-    if (gp->strand[0] == '+')
-        {
-        /* pos, compute from previous exon */
-        assert(iStop > 0);
-        gp->exonFrames[iStop] = (gp->exonFrames[iStop-1]+(gp->exonEnds[iStop-1]-gp->exonStarts[iStop-1])) % 3;
-        }
-    else
-        {
-        /* neg, compute from next exon */
-        assert(iStop < gp->exonCount-1);
-        gp->exonFrames[iStop] = (gp->exonFrames[iStop+1]+(gp->exonEnds[iStop+1]-gp->exonStarts[iStop+1])) % 3;
-        }
-    }
-}
-
 static int phaseToFrame(char phase)
 /* convert GTF/GFF phase to frame */
 {
@@ -492,6 +462,37 @@
 return -1;
 }
 
+#ifdef NOT_CURRENTLY_USED
+static char frameToPhase(int frame)
+/* convert GTF/GFF frame to phase */
+{
+switch (frame)
+    {
+    case 0:
+        return '0';
+    case 1:
+        return '2';
+    case 2:
+        return '1';
+    }
+return -1;
+}
+#endif
+
+static int incrFrame(int frame, int amt)
+/* increment frame, no-op if frame is -1 */
+{
+if (frame < 0)
+    return frame;
+else if (amt >= 0)
+    return (frame+amt) % 3;
+else
+    {
+    int amt3 = (-amt) % 3;
+    return (frame - (amt - amt3)) % 3;
+    }
+}
+
 static int findPosExon(struct genePred *gp, int pos)
 /* find exon containing the specified position */
 {
@@ -505,6 +506,67 @@
 return 0;
 }
 
+static int findLastFramedExon(struct genePred *gp)
+/* locate the last exon with frame */
+{
+int iStart = (gp->strand[0] == '+') ? 0 : gp->exonCount-1;
+int iStop = (gp->strand[0] == '+') ? gp->exonCount : -1;
+int dir =  (gp->strand[0] == '+') ? +1 : -1;
+int iPrevExon = -1, iExon;
+
+// find beginning of frames
+for (iExon = iStart; (iExon != iStop) && (gp->exonFrames[iExon] == -1); iExon += dir)
+    continue;
+// find last with frame
+for (; (iExon != iStop) && (gp->exonFrames[iExon] != -1); iExon += dir)
+    iPrevExon = iExon;
+if (iPrevExon < 0)
+    errAbort("bug: findLastFramedExon should have found an exon with frame");
+return iPrevExon;
+}
+
+static void extendFramePos(struct genePred *gp)
+/* extend frame missing from last exon(s) for positive strand genes, normally
+ * caused by GTF stop_codon */
+{
+int iExon = findLastFramedExon(gp);
+int frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon]));
+for (iExon++; (iExon < gp->exonCount) && (gp->exonStarts[iExon] < gp->cdsEnd); iExon++)
+    {
+    assert(gp->exonFrames[iExon] < 0);
+    gp->exonFrames[iExon] = frame;
+    frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon]));
+    }
+}
+
+static void extendFrameNeg(struct genePred *gp)
+/* extend frame missing from last exon(s) for negative strand genes, normally
+ * caused by GTF stop_codon */
+{
+int iExon = findLastFramedExon(gp);
+int frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon]));
+for (iExon--; (iExon >= 0) && (gp->exonEnds[iExon] > gp->cdsStart); iExon--)
+    {
+    assert(gp->exonFrames[iExon] < 0);
+    gp->exonFrames[iExon] = frame;
+    frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon]));
+    }
+}
+
+static void fixStopFrame(struct genePred *gp)
+/* Handle nasty corner case: GTF with the all or part of the stop codon as the
+ * only codon in an exon, we must set the frame on these exons.  Some ENSEMBL
+ * GTFs have single base `exons' with just a base of the stop codon, so must
+ * check every base.  While less than ideal, we just extend the frame past the
+ * last exon with frame, so a conflicting frame on the stop codon will be
+ * missed.  Just switching to GFF3 fixes this mess. */
+{
+if (gp->strand[0] == '+')
+    extendFramePos(gp);
+else
+    extendFrameNeg(gp);
+}
+
 static boolean adjImpliedStopPos(struct genePred *gp)
 /* implied stop codon adjustment for positive strand gene */
 {
@@ -638,7 +700,8 @@
 
 static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr,
                                              struct gffLine *gl, boolean isGtf)
-/* set the frame for an exon, advancing gl past end of this exon */
+/* set frame from any overlapping records with frame. Don't include
+ * start/stop_codon, as it isn't a full exon. */
 {
 /* skip lines preceeding this exon */
 while (gl != NULL)
@@ -647,8 +710,7 @@
         break;
     gl = gl->next;
 }
-/* set frame from any overlapping records with frame. Don't include
- * start/stop_codon, as it isn't a full exon. */
+/* set frame from any overlapping records with frame. */
 while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0))
     {
     if (allowedFrameFeature(gl, isGtf))
@@ -678,7 +740,7 @@
         }
     }
 
-/* make sure stop has face if some exons in the gene had frame */
+/* make sure stop has frame if some exons in the gene had frame */
 if (haveFrame)
     fixStopFrame(gp);
 checkForNoFrames(gp);
@@ -776,9 +838,17 @@
 
 if (optFields & genePredName2Fld)
     {
+    if (options & genePredGxfGeneNameAsName2)
+        {
+        if (group->lineList->geneName != NULL)
+            gp->name2 = cloneString(group->lineList->geneName);
+        }
+    else
+        {
     if (group->lineList->geneId != NULL)
         gp->name2 = cloneString(group->lineList->geneId);
-    else
+        }
+    if (gp->name2 == NULL)
         gp->name2 = cloneString("");
     }
 if (optFields & genePredCdsStatFld)