src/hg/utils/gff3ToGenePred/gff3ToGenePred.c 1.2
1.2 2010/03/19 02:24:35 markd
report multiple conversion errors before failing
Index: src/hg/utils/gff3ToGenePred/gff3ToGenePred.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/gff3ToGenePred/gff3ToGenePred.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/utils/gff3ToGenePred/gff3ToGenePred.c 12 Aug 2009 07:48:06 -0000 1.1
+++ src/hg/utils/gff3ToGenePred/gff3ToGenePred.c 19 Mar 2010 02:24:35 -0000 1.2
@@ -22,8 +22,11 @@
" - top-level gene records with mRNA records\n"
" - top-level mRNA records\n"
" - mRNA records can contain exon and CDS, or only CDS, or only\n"
" exon for non--coding.\n"
+ "The first step is to parse GFF3 file, up to 50 errors are reported before\n"
+ "aborting. If the GFF3 files is successfully parse, it is converted to gene,\n"
+ "annotation. Up to 50 conversion errors are reported before aborting.\n"
);
}
static struct optionSpec options[] = {
@@ -31,15 +34,29 @@
{NULL, 0},
};
static boolean honorStartStopCodons = FALSE;
static int maxParseErrs = 50; // maximum number of errors during parse
+static int maxConvertErrs = 50; // conversion loops stop once convertErrCnt exceeds this limit
+static int convertErrCnt = 0; // number of conversion errors reported so far via cnvError
+
+
+static void cnvError(char *format, ...)
+/* Print a printf-style conversion error message to stderr, followed by a newline, and increment convertErrCnt. Does not abort. */
+{
+va_list args;
+va_start(args, format);
+vfprintf(stderr, format, args);
+va_end(args);
+fputc('\n', stderr); // callers in this file pass messages without a trailing newline
+convertErrCnt++; // file-level tally of conversion errors
+}
static struct gff3File *loadGff3(char *inGff3File)
/* load GFF3 into memory */
{
struct gff3File *gff3File = gff3FileOpen(inGff3File, maxParseErrs, NULL);
if (gff3File->errCnt > 0)
- errAbort("errors parsing GFF3 file: %s", inGff3File);
+ errAbort("%d errors parsing GFF3 file: %s", gff3File->errCnt, inGff3File); // abort message now includes the parse error count
return gff3File;
}
static boolean haveChildFeature(struct gff3Ann *parent, char *featName)
@@ -67,20 +84,26 @@
return feats;
}
static struct genePred *makeGenePred(struct gff3Ann *gene, struct gff3Ann *mrna, struct gff3AnnRef *exons, struct gff3AnnRef *cdsBlks)
-/* construct the empty genePred */
+/* construct the empty genePred, return NULL on a failure. */
{
if (exons == NULL)
- errAbort("no exons defined for mRNA %s", mrna->id);
+ {
+ cnvError("no exons defined for mRNA %s", mrna->id);
+ return NULL;
+ }
int txStart = exons->ann->start;
int txEnd = ((struct gff3AnnRef*)slLastEl(exons))->ann->end;
int cdsStart = (cdsBlks == NULL) ? txEnd : cdsBlks->ann->start;
int cdsEnd = (cdsBlks == NULL) ? txEnd : ((struct gff3AnnRef*)slLastEl(cdsBlks))->ann->end;
if ((mrna->strand == NULL) || (mrna->strand[0] == '?'))
- errAbort("invalid strand for mRNA %s", mrna->id);
+ {
+ cnvError("invalid strand for mRNA %s", mrna->id);
+ return NULL;
+ }
struct genePred *gp = genePredNew(mrna->id, mrna->seqid, mrna->strand[0],
txStart, txEnd, cdsStart, cdsEnd,
genePredAllFlds, slCount(exons));
@@ -121,64 +144,79 @@
}
}
static int findCdsExon(struct genePred *gp, struct gff3Ann *cds, int iExon)
-/* search for the exon containing the CDS, starting with iExon+1 */
+/* Search for the exon containing the CDS, starting with iExon+1. Returns the exon index; if no exon contains the CDS, reports a conversion error and returns -1. */
{
for (iExon++; iExon < gp->exonCount; iExon++)
{
if ((gp->exonStarts[iExon] <= cds->start) && (cds->end <= gp->exonEnds[iExon]))
return iExon;
}
-errAbort("no exon in %s contains CDS %d-%d", gp->name, cds->start, cds->end);
+cnvError("no exon in %s contains CDS %d-%d", gp->name, cds->start, cds->end); // counted as a conversion error instead of aborting
return -1;
}
-static void addCdsFrame(struct genePred *gp, struct gff3AnnRef *cdsBlks)
-/* assign frame based on CDS regions */
+static boolean addCdsFrame(struct genePred *gp, struct gff3AnnRef *cdsBlks)
+/* Assign exon frames based on the CDS regions. Returns TRUE on success, or FALSE if findCdsExon could not place a CDS block in an exon. */
{
struct gff3AnnRef *cds;
int iExon = -1; // caches current position
for (cds = cdsBlks; cds != NULL; cds = cds->next)
{
iExon = findCdsExon(gp, cds->ann, iExon);
+ if (iExon < 0)
+ return FALSE; // error already reported by findCdsExon
gp->exonFrames[iExon] = gff3PhaseToFrame(cds->ann->phase);
}
+return TRUE;
}
static void processMRna(FILE *gpFh, struct gff3Ann *gene, struct gff3Ann *mrna, struct hash *processed)
-/* process a mRNA node in the tree; gene can be NULL */
+/* Process an mRNA node in the tree; gene can be NULL. On a conversion error the error count is incremented (via cnvError) and processing of this mRNA stops. */
{
hashStore(processed, mrna->id);
// allow for only having CDS children
struct gff3AnnRef *exons = getChildFeatures(mrna, gff3FeatExon);
struct gff3AnnRef *cdsBlks = getChildFeatures(mrna, gff3FeatCDS);
struct gff3AnnRef *useExons = (exons != NULL) ? exons : cdsBlks;
struct genePred *gp = makeGenePred((gene != NULL) ? gene : mrna, mrna, useExons, cdsBlks);
+if (gp == NULL)
+ return; // error reported by makeGenePred; NOTE(review): exons and cdsBlks are not freed on this path
+
addExons(gp, useExons);
-addCdsFrame(gp, cdsBlks);
+if (!addCdsFrame(gp, cdsBlks))
+ return; // error; NOTE(review): gp, exons and cdsBlks are not freed on this path
// output before checking so it can be examined
genePredTabOut(gp, gpFh);
if (genePredCheck("GFF3 converted to genePred", stderr, -1, gp) != 0)
- errAbort("conversion failed");
+ {
+ cnvError("conversion failed");
+ genePredFree(&gp);
+ return; // error; NOTE(review): exons and cdsBlks are not freed on this path
+ }
genePredFree(&gp);
slFreeList(&exons);
slFreeList(&cdsBlks);
}
static void processGene(FILE *gpFh, struct gff3Ann *gene, struct hash *processed)
-/* process a gene node in the tree */
+/* Process a gene node in the tree, converting each of its not-yet-processed mRNA children. Stops early once convertErrCnt exceeds maxConvertErrs. */
{
hashStore(processed, gene->id);
struct gff3AnnRef *child;
for (child = gene->children; child != NULL; child = child->next)
{
if (sameString(child->ann->type, gff3FeatMRna) && (hashLookup(processed, child->ann->id) == NULL))
+ {
processMRna(gpFh, gene, child->ann, processed);
+ if (convertErrCnt > maxConvertErrs) // NOTE(review): '>' allows maxConvertErrs+1 errors before stopping; confirm whether '>=' was intended
+ break;
+ }
}
}
static void processRoot(FILE *gpFh, struct gff3Ann *node, struct hash *processed)
@@ -202,14 +240,19 @@
struct gff3AnnRef *root;
for (root = gff3File->roots; root != NULL; root = root->next)
{
if (hashLookup(processed, root->ann->id) == NULL)
+ {
processRoot(gpFh, root->ann, processed);
+ if (convertErrCnt > maxConvertErrs)
+ break;
+ }
}
-
carefulClose(&gpFh);
+if (convertErrCnt > 0)
+ errAbort("%d errors converting GFF3 file: %s", convertErrCnt, inGff3File);
-#if 1 // free memory for leak debugging if 1
+#if 0 // free memory for leak debugging if 1
gff3FileFree(&gff3File);
hashFree(&processed);
#endif
}