src/hg/lib/genePred.c 1.102

1.102 2009/02/13 01:02:16 markd
ignore non-GTF features, as specified by the spec
Index: src/hg/lib/genePred.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/lib/genePred.c,v
retrieving revision 1.101
retrieving revision 1.102
diff -b -B -U 4 -r1.101 -r1.102
--- src/hg/lib/genePred.c	13 Sep 2008 00:46:54 -0000	1.101
+++ src/hg/lib/genePred.c	13 Feb 2009 01:02:16 -0000	1.102
@@ -587,29 +587,72 @@
     if (! foundReal)
         {
         errAbort("Error: exonFrames field is being added, but I found a "
                  "gene (%s) with CDS but no valid frames.  "
-                 "This can happen if ldHgGene is invoked with -genePredExt "
+                 "This can happen if program is invoked with -genePredExt "
                  "but no valid frames are given in the file.  If the 8th "
                  "field of GFF/GTF file is always a placeholder, then don't use "
                  "-genePredExt.",
                  gp->name);
         }
     }
 }
 
+static boolean isGtfFeature(char *feat)
+/* check if a feature is one of the recognized features.  All otherse
+ * should be ignored. http://mblab.wustl.edu/GTF22.html */
+{
+static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR",
+                           "inter", "inter_CNS", "intron_CNS", "exon", NULL};
+int i;
+for (i = 0; gtfFeats[i] != NULL; i++)
+    {
+    if (sameString(feat, gtfFeats[i]))
+        return TRUE;
+    }
+return FALSE;
+}
+
+static boolean ignoreGxfLine(struct gffLine *gl, boolean isGtf)
+/* should the specified line be skipped? */
+{
+if (isGtf)
+    return !isGtfFeature(gl->feature);
+else
+    return FALSE;
+}
+
+static boolean allowedFrameFeature(struct gffLine *gl, boolean isGtf)
+/* should this feature be used to assign frame? */
+{
+if (isStartStopCodon(gl->feature) || ignoreGxfLine(gl, isGtf))
+    return FALSE;
+
+// for GFF, any features with frame can be used to set frame,
+// with GTF, there are some bogus files that set frame on UTR features,
+// so only use CDS
+if (isGtf)
+    return sameString(gl->feature, "CDS");
+else
+    return (gl->frame != '.');
+}
+
 static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr,
-                                             struct gffLine *gl)
+                                             struct gffLine *gl, boolean isGtf)
 /* set the frame for an exon, advancing gl past end of this exon */
 {
 /* skip lines preceeding this exon */
-while ((gl != NULL) && (gl->end <= start))
+while (gl != NULL)
+    {
+    if (allowedFrameFeature(gl, isGtf) && (gl->end >= start))
+        break;
     gl = gl->next;
+}
 /* set frame from any overlapping records with frame. Don't include
  * start/stop_codon, as it isn't a full exon. */
 while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0))
     {
-    if (!isStartStopCodon(gl->feature))
+    if (allowedFrameFeature(gl, isGtf))
         {
         int frame = phaseToFrame(gl->frame);
         if (frame >= 0)
             *framePtr = frame;
@@ -618,9 +661,9 @@
     }
 return gl;
 }
 
-static void assignFrame(struct gffGroup *group, struct genePred *gp)
+static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp)
 /* Assign frame from GFF after genePred has been built.*/
 {
 struct gffLine *gl = group->lineList;
 int start, end, i;
@@ -628,9 +671,9 @@
 for (i = 0; i < gp->exonCount; i++)
     {
     if (genePredCdsExon(gp, i, &start, &end))
         {
-        gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl);
+        gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf);
         if (gp->exonFrames[i] >= 0)
             haveFrame = TRUE;
         }
     }
@@ -663,21 +705,22 @@
 
 /* Count up exons and figure out cdsStart and cdsEnd. */
 for (gl = group->lineList; gl != NULL; gl = gl->next)
     {
-    char *feat = gl->feature;
-    if (isExon(feat, isGtf, exonSelectWord))
+    if (ignoreGxfLine(gl, isGtf))
+        continue;
+    if (isExon(gl->feature, isGtf, exonSelectWord))
 	++exonCount;
     if (isCds(gl->feature))
         {
 	if (gl->start < cdsStart)
             cdsStart = gl->start;
 	if (gl->end > cdsEnd)
             cdsEnd = gl->end;
 	}
-    if (sameWord(feat, "start_codon"))
+    if (sameWord(gl->feature, "start_codon"))
         haveStartCodon = TRUE;
-    if (sameWord(feat, "stop_codon"))
+    if (sameWord(gl->feature, "stop_codon"))
         {
         /* stop_codon can be split, need bounds for adjusting CDS below */
         if ((stopCodonStart < 0) || (gl->start < stopCodonStart))
             stopCodonStart = gl->start;
@@ -782,8 +825,10 @@
 i = -1; /* before first exon */
 /* fill in exons, merging overlaping and adjacent exons */
 for (gl = group->lineList; gl != NULL; gl = gl->next)
     {
+    if (ignoreGxfLine(gl, isGtf))
+        continue;
     if (isExon(gl->feature, isGtf, exonSelectWord) || (isGtf && isCds(gl->feature)))
         {
         chkGroupLine(group, gl, gp);
         if ((i < 0) || (gl->start > eEnds[i]))
@@ -811,9 +856,9 @@
 if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon)
     adjImpliedStopAfterCds(gp);
 
 if (optFields & genePredExonFramesFld)
-    assignFrame(group, gp);
+    assignFrame(isGtf, group, gp);
 
 return gp;
 }