src/inc/gff3.h 1.1

1.1 2009/04/22 16:56:48 markd
added initial implementation of gff3 parser. Still has some rough edges and problems due to ambiguities in the GFF3 specification
Index: src/inc/gff3.h
===================================================================
RCS file: src/inc/gff3.h
diff -N src/inc/gff3.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/inc/gff3.h	22 Apr 2009 16:56:48 -0000	1.1
@@ -0,0 +1,211 @@
+/*
+ * Object for accessing GFF3 files
+ * See GFF3 specification for details of file format:
+ *   http://www.sequenceontology.org/gff3.shtml
+ */
+#ifndef gff3_h
+#define gff3_h
+
+struct gff3Ann
+/* Annotation record from a GFF3 file.  Attributes defined in the spec (those
+ * starting with upper case letters) are parsed into fields of this
+ * object. User defined attributes (starting with lower-case characters) are
+ * stored in a list, along with a copy of the string versions of the spec
+ * attributes. All strings stored in the object have been un-escaped.
+ * All storage for the object is allocated by the gff3File object. */
+{
+    struct gff3Ann *next; /* links all gff3Ann objects */
+    char *seqid;   /* The ID of the landmark used to establish the coordinate
+                    * system for the current feature. IDs may contain any
+                    * characters. */
+    char *source;  /* The source is a free text qualifier intended to describe
+                    * the algorithm or operating procedure that generated this
+                    * feature.  Typically this is the name of a piece of
+                    * software, such as "Genescan" or a database name, such as
+                    * "Genbank."  In effect, the source is used to extend the
+                    * feature ontology by adding a qualifier to the type
+                    * creating a new composite type that is a subclass of the
+                    * type in the type column. */
+
+    char *type; /* The type of the feature (previously called the "method").
+                 * This is constrained to be either: (a) a term from the
+                 * "lite" sequence ontology, SOFA; or (b) a SOFA accession
+                 * number.  The latter alternative is distinguished using the
+                 * syntax SO:000000. */
+
+    int start; /* The start and end of the feature, in 0-based, half open
+                * integer coordinates, relative to the landmark given in
+                * seqid.  Start is always less than or equal to end.  For
+                * zero-length features, such as insertion sites, start equals
+                * end and the implied site is to the right of the indicated
+                * base in the direction of the landmark.*/
+    int end;
+    float score; /* The score of the feature, a floating point number.  As in
+                    earlier versions of the format, the semantics of the score
+                    are ill-defined.  It is strongly recommended that E-values
+                    be used for sequence similarity features, and that
+                    P-values be used for ab initio gene prediction features. */
+    boolean haveScore;  /* was score specified? */
+
+    char *strand; /* The strand of the feature.  '+' for positive strand
+                   * (relative to the landmark), '-' for minus strand, and
+                   * NULL for features that are not stranded.  In addition,
+                   * '?' can be used for features whose strandedness is
+                   * relevant, but unknown. */
+
+    int phase; /* For features of type "CDS", the phase indicates where the
+                * feature begins with reference to the reading frame.  The
+                * phase is one of the integers 0, 1, or 2, indicating the
+                * number of bases that should be removed from the beginning of
+                * this feature to reach the first base of the next codon. In
+                * other words, a phase of 0 indicates that the next codon
+                * begins at the first base of the region described by the
+                * current line, a phase of 1 indicates that the next codon
+                * begins at the second base of this region, and a phase of 2
+                * indicates that the codon begins at the third base of this
+                * region. This is NOT to be confused with the frame, which is
+                * simply start modulo 3.  For forward strand features, phase
+                * is counted from the start field. For reverse strand
+                * features, phase is counted from the end field. The phase is
+                * REQUIRED for all CDS features, and is -1 for other features. */
+
+    /* The remaining fields are the attributes.  Attributes defined by the
+     * GFF3 spec (starting with an upper-case letter) are stored in the fields
+     * below.  Application-specific attributes (starting with a lower-case
+     * letter) are stored in the attrs list.  */
+
+    char *id;  /* Indicates the name of the feature.  IDs must be unique
+                * within the scope of the GFF file.*/
+    char *name;  /* Display name for the feature.  This is the name to be
+                  * displayed to the user.  Unlike IDs, there is no requirement
+                  * that the Name be unique within the file. */
+
+    struct slName *aliases; /* Secondary names for the feature.  It is
+                             * suggested that this tag be used whenever a
+                             * secondary identifier for the feature is needed,
+                             * such as locus names and accession numbers.
+                             * Unlike ID, there is no requirement that Alias
+                             * be unique within the file. */
+    
+    struct slName *parentIds; /* Indicates the parent of the feature.  A parent
+                               * ID can be used to group exons into transcripts,
+                               * transcripts into genes, and so forth.  A feature
+                               * may have multiple parents.  Parent can *only* be
+                               * used to indicate a partof relationship.  Held as
+                               * slName entries (presumably a list; confirm). */
+    struct gff3AnnRef *parents; /* Parent objects for parentIds */
+                                      
+
+    char *targetId; /* Indicates the target of a nucleotide-to-nucleotide or
+                       protein-to-nucleotide alignment.  NULL if not specified. */
+    int targetStart; /* target start/end, in 0-based, half open coordinates */
+    int targetEnd;
+    char *targetStrand; /* optional target strand, or NULL if none. */
+
+    char *gap; /* The alignment of the feature to the target if the two are
+                * not collinear (e.g. contain gaps).  The alignment format is
+                * taken from the CIGAR format.  See "THE GAP ATTRIBUTE"
+                * section of GFF3 specification for a description of this
+                * format.*/
+
+    char *derivesFromId; /* Used to disambiguate the relationship between one
+                          * feature and another when the relationship is a
+                          * temporal one rather than a purely structural "part
+                          * of" one.  This is needed for polycistronic
+                          * genes. */
+    struct gff3Ann *derivesFrom; /* Object for derivesFromId */
+
+    struct slName *notes;  /* free text notes. */
+
+    struct slName *dbxrefs; /* database cross references. */
+
+    struct slName *ontologyTerms; /* cross reference to ontology terms. */
+
+    struct gff3AttrVals *attrs;  /* attributes, both user-defined and spec-defined,
+                                  * parsed into one or more values */
+
+    struct gff3AnnRef *children;  /* child nodes */
+
+    struct gff3SeqRegion *seqRegion;  /* start/end of sequence region, taken
+                                       * from ##sequence-region records, or
+                                       * NULL if not specified.*/
+
+    struct gff3File *file;  /* file this record is associated with */
+    int lineNum;            /* line number of record in file, or -1
+                             * if not known */
+};
+
+struct gff3AnnRef
+/* List node referencing a gff3Ann; used by the parents, children and roots lists */
+{
+    struct gff3AnnRef *next;   /* next link in the chain */
+    struct gff3Ann *ann;       /* reference to object */
+};
+
+struct gff3AttrVals
+/* An attribute name and its string values */
+{
+    struct gff3AttrVals *next;  /* next attribute in the list */
+    char *attr;                 /* name of attribute */
+    struct slName *vals;        /* values for the attribute */
+};
+
+struct gff3SeqRegion
+/* start/end of a sequence region, taken from a ##sequence-region record. */
+{
+    struct gff3SeqRegion *next;     /* next region */
+    char *seqid;    /* sequence id of region */
+    int start;      /* bounds of region */
+    int end;
+};
+
+struct gff3File
+/* Object representing a GFF3 file. Manages all memory for related objects. */
+{
+    char *fileName;       /* path of file that was parsed */
+    struct hash *byId;    /* index of gff3Ann object by id */
+    struct gff3Ann *anns; /* all records in the file */
+    struct gff3AnnRef *roots;  /* all records without parents */
+    struct hash *pool;         /* used to allocate string values that tend to
+                                * be repeated in the files.  localMem is also
+                                * used to allocate memory for all other objects. */
+    struct gff3SeqRegion *seqRegions;  /* list of gff3SeqRegion objects. */
+    struct hash *seqRegionMap;  /* map of seqId to gff3SeqRegion objects. NULL
+                                 * if none are specified */
+
+    struct slName *featureOntologies;    /* feature ontology URIs */
+    struct slName *attributeOntologies;  /* attribute ontology URIs */
+    struct slName *sourceOntologies;     /* source ontology URIs */
+    struct slName *species;              /* Species, usually NCBI Taxonomy
+                                          * URI */
+    char *genomeBuildSource;             /* source of genome build */
+    char *genomeBuildName;               /* name or version of genome build */
+    struct dnaSeq *seqs;                 /* list of sequences */
+    struct hash *seqMap;                 /* map of sequence ids to sequence
+                                          * string from ##FASTA section or
+                                          * NULL if none specified */
+    struct lineFile *lf;                 /* only set while parsing */
+    FILE *errFh;            /* write errors to this file */
+    int maxErr;             /* maximum number of errors before aborting */
+    int errCnt;             /* error count */
+};
+
+struct gff3File *gff3FileOpen(char *fileName, int maxErr, FILE *errFh);
+/* Parse a GFF3 file into a gff3File object.  If maxErr is not zero, then
+ * continue to parse until this number of errors has been reached.  A maxErr
+ * less than zero never stops and reports all errors.  Errors are written to
+ * errFh, or to stderr if errFh is NULL. */
+
+void gff3FileFree(struct gff3File **g3fPtr);
+/* Free a gff3File object */
+
+struct gff3Ann *gff3FileFindAnn(struct gff3File *g3f, char *id);
+/* find an annotation record by id, or NULL if not found. */
+
+struct gff3AttrVals *gff3AnnFindAttr(struct gff3Ann *g3a, char *attr);
+/* find a user attribute, or NULL */
+
+void gff3FileWrite(struct gff3File *g3f, char *fileName);
+/* write contents of a gff3File object to a file */
+
+#endif