src/hg/lib/genePred.c 1.102
1.102 2009/02/13 01:02:16 markd
ignore non-GTF features, as specified by the spec
Index: src/hg/lib/genePred.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/lib/genePred.c,v
retrieving revision 1.101
retrieving revision 1.102
diff -b -B -U 4 -r1.101 -r1.102
--- src/hg/lib/genePred.c 13 Sep 2008 00:46:54 -0000 1.101
+++ src/hg/lib/genePred.c 13 Feb 2009 01:02:16 -0000 1.102
@@ -587,29 +587,72 @@
if (! foundReal)
{
errAbort("Error: exonFrames field is being added, but I found a "
"gene (%s) with CDS but no valid frames. "
- "This can happen if ldHgGene is invoked with -genePredExt "
+ "This can happen if program is invoked with -genePredExt "
"but no valid frames are given in the file. If the 8th "
"field of GFF/GTF file is always a placeholder, then don't use "
"-genePredExt.",
gp->name);
}
}
}
+static boolean isGtfFeature(char *feat)
+/* check if a feature is one of the recognized features. All otherse
+ * should be ignored. http://mblab.wustl.edu/GTF22.html */
+{
+static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR",
+ "inter", "inter_CNS", "intron_CNS", "exon", NULL};
+int i;
+for (i = 0; gtfFeats[i] != NULL; i++)
+ {
+ if (sameString(feat, gtfFeats[i]))
+ return TRUE;
+ }
+return FALSE;
+}
+
+static boolean ignoreGxfLine(struct gffLine *gl, boolean isGtf)
+/* should the specified line be skipped? */
+{
+if (isGtf)
+ return !isGtfFeature(gl->feature);
+else
+ return FALSE;
+}
+
+static boolean allowedFrameFeature(struct gffLine *gl, boolean isGtf)
+/* should this feature be used to assign frame? */
+{
+if (isStartStopCodon(gl->feature) || ignoreGxfLine(gl, isGtf))
+ return FALSE;
+
+// for GFF, any features with frame can be used to set frame,
+// with GTF, there are some bogus files that set frame on UTR features,
+// so only use CDS
+if (isGtf)
+ return sameString(gl->feature, "CDS");
+else
+ return (gl->frame != '.');
+}
+
static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr,
- struct gffLine *gl)
+ struct gffLine *gl, boolean isGtf)
/* set the frame for an exon, advancing gl past end of this exon */
{
/* skip lines preceeding this exon */
-while ((gl != NULL) && (gl->end <= start))
+while (gl != NULL)
+ {
+ if (allowedFrameFeature(gl, isGtf) && (gl->end >= start))
+ break;
gl = gl->next;
+}
/* set frame from any overlapping records with frame. Don't include
* start/stop_codon, as it isn't a full exon. */
while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0))
{
- if (!isStartStopCodon(gl->feature))
+ if (allowedFrameFeature(gl, isGtf))
{
int frame = phaseToFrame(gl->frame);
if (frame >= 0)
*framePtr = frame;
@@ -618,9 +661,9 @@
}
return gl;
}
-static void assignFrame(struct gffGroup *group, struct genePred *gp)
+static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp)
/* Assign frame from GFF after genePred has been built.*/
{
struct gffLine *gl = group->lineList;
int start, end, i;
@@ -628,9 +671,9 @@
for (i = 0; i < gp->exonCount; i++)
{
if (genePredCdsExon(gp, i, &start, &end))
{
- gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl);
+ gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf);
if (gp->exonFrames[i] >= 0)
haveFrame = TRUE;
}
}
@@ -663,21 +705,22 @@
/* Count up exons and figure out cdsStart and cdsEnd. */
for (gl = group->lineList; gl != NULL; gl = gl->next)
{
- char *feat = gl->feature;
- if (isExon(feat, isGtf, exonSelectWord))
+ if (ignoreGxfLine(gl, isGtf))
+ continue;
+ if (isExon(gl->feature, isGtf, exonSelectWord))
++exonCount;
if (isCds(gl->feature))
{
if (gl->start < cdsStart)
cdsStart = gl->start;
if (gl->end > cdsEnd)
cdsEnd = gl->end;
}
- if (sameWord(feat, "start_codon"))
+ if (sameWord(gl->feature, "start_codon"))
haveStartCodon = TRUE;
- if (sameWord(feat, "stop_codon"))
+ if (sameWord(gl->feature, "stop_codon"))
{
/* stop_codon can be split, need bounds for adjusting CDS below */
if ((stopCodonStart < 0) || (gl->start < stopCodonStart))
stopCodonStart = gl->start;
@@ -782,8 +825,10 @@
i = -1; /* before first exon */
/* fill in exons, merging overlaping and adjacent exons */
for (gl = group->lineList; gl != NULL; gl = gl->next)
{
+ if (ignoreGxfLine(gl, isGtf))
+ continue;
if (isExon(gl->feature, isGtf, exonSelectWord) || (isGtf && isCds(gl->feature)))
{
chkGroupLine(group, gl, gp);
if ((i < 0) || (gl->start > eEnds[i]))
@@ -811,9 +856,9 @@
if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon)
adjImpliedStopAfterCds(gp);
if (optFields & genePredExonFramesFld)
- assignFrame(group, gp);
+ assignFrame(isGtf, group, gp);
return gp;
}