src/inc/gff3.h 1.1
1.1 2009/04/22 16:56:48 markd
added initial implementation of gff3 parser. Still has some rough edges and problems due to ambiguities in the GFF3 specification
Index: src/inc/gff3.h
===================================================================
RCS file: src/inc/gff3.h
diff -N src/inc/gff3.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/inc/gff3.h 22 Apr 2009 16:56:48 -0000 1.1
@@ -0,0 +1,211 @@
+/*
+ * Object for accessing GFF3 files
+ * See GFF3 specification for details of file format:
+ * http://www.sequenceontology.org/gff3.shtml
+ */
+#ifndef gff3_h
+#define gff3_h
+
+struct gff3Ann
+/* Annotation record from a GFF3 file. Attributes defined in the spec (those
+ * starting with upper case letters) are parsed into fields of this
+ * object. User defined attributes (starting with lower-case characters) are
+ * stored in a list, along with a copy of the string versions of the spec
+ * attributes. All strings stored in the object have been un-escaped.
+ * All storage for the object is allocated by the gff3File object. */
+{
+ struct gff3Ann *next; /* links all gff3Ann objects */
+ char *seqid; /* The ID of the landmark used to establish the coordinate
+ * system for the current feature. IDs may contain any
+ * characters. */
+ char *source; /* The source is a free text qualifier intended to describe
+ * the algorithm or operating procedure that generated this
+ * feature. Typically this is the name of a piece of
+ * software, such as "Genescan" or a database name, such as
+ * "Genbank." In effect, the source is used to extend the
+ * feature ontology by adding a qualifier to the type
+ * creating a new composite type that is a subclass of the
+ * type in the type column. */
+
+ char *type; /* The type of the feature (previously called the "method").
+ * This is constrained to be either: (a) a term from the
+ * "lite" sequence ontology, SOFA; or (b) a SOFA accession
+ * number. The latter alternative is distinguished using the
+ * syntax SO:000000. */
+
+ int start; /* The start and end of the feature, in 0-based, half open
+ * integer coordinates, relative to the landmark given in
+ * seqid. Start is always less than or equal to end. For
+ * zero-length features, such as insertion sites, start equals
+ * end and the implied site is to the right of the indicated
+ * base in the direction of the landmark.*/
+ int end; /* end of the feature; see start for the coordinate system */
+ float score; /* The score of the feature, a floating point number. As in
+ * earlier versions of the format, the semantics of the score
+ * are ill-defined. It is strongly recommended that E-values
+ * be used for sequence similarity features, and that
+ * P-values be used for ab initio gene prediction features. */
+ boolean haveScore; /* was score specified? */
+
+ char *strand; /* The strand of the feature. '+' for positive strand
+ * (relative to the landmark), '-' for minus strand, and
+ * NULL for features that are not stranded. In addition,
+ * '?' can be used for features whose strandedness is
+ * relevant, but unknown. */
+
+ int phase; /* For features of type "CDS", the phase indicates where the
+ * feature begins with reference to the reading frame. The
+ * phase is one of the integers 0, 1, or 2, indicating the
+ * number of bases that should be removed from the beginning of
+ * this feature to reach the first base of the next codon. In
+ * other words, a phase of 0 indicates that the next codon
+ * begins at the first base of the region described by the
+ * current line, a phase of 1 indicates that the next codon
+ * begins at the second base of this region, and a phase of 2
+ * indicates that the codon begins at the third base of this
+ * region. This is NOT to be confused with the frame, which is
+ * simply start modulo 3. For forward strand features, phase
+ * is counted from the start field. For reverse strand
+ * features, phase is counted from the end field. The phase is
+ * REQUIRED for all CDS features, and is -1 for other features. */
+
+ /* The remaining fields are the attributes. Attributes defined by the
+ * GFF3 spec (starting with an upper-case letter) are stored in the fields
+ * below. Application-specific attributes (starting with a lower-case
+ * letter) are stored in the attrs list. */
+
+ char *id; /* Indicates the name of the feature. IDs must be unique
+ * within the scope of the GFF file.*/
+ char *name; /* Display name for the feature. This is the name to be
+ * displayed to the user. Unlike IDs, there is no requirement
+ * that the Name be unique within the file. */
+
+ struct slName *aliases; /* Secondary names for the feature. It is
+ * suggested that this tag be used whenever a
+ * secondary identifier for the feature is needed,
+ * such as locus names and accession numbers.
+ * Unlike ID, there is no requirement that Alias
+ * be unique within the file. */
+
+ struct slName *parentIds; /* Indicates the parent of the feature. A parent
+ * ID can be used to group exons into transcripts,
+ * transcripts into genes, and so forth. A feature
+ * may have multiple parents. Parent can *only* be
+ * used to indicate a partof relationship. (Was doc'd
+ * as a null terminated array; type is slName.) */
+ struct gff3AnnRef *parents; /* Parent objects for parentIds */
+
+
+ char *targetId; /* Indicates the target of a nucleotide-to-nucleotide or
+ * protein-to-nucleotide alignment. NULL if not specified. */
+ int targetStart; /* target start/end, in 0-based, half open coordinates */
+ int targetEnd; /* see targetStart */
+ char *targetStrand; /* optional target strand, or NULL if none. */
+
+ char *gap; /* The alignment of the feature to the target if the two are
+ * not collinear (e.g. contain gaps). The alignment format is
+ * taken from the CIGAR format. See "THE GAP ATTRIBUTE"
+ * section of GFF3 specification for a description of this
+ * format.*/
+
+ char *derivesFromId; /* Used to disambiguate the relationship between one
+ * feature and another when the relationship is a
+ * temporal one rather than a purely structural "part
+ * of" one. This is needed for polycistronic
+ * genes. */
+ struct gff3Ann *derivesFrom; /* Object for derivesFromId */
+
+ struct slName *notes; /* free text notes. */
+
+ struct slName *dbxrefs; /* database cross references. */
+
+ struct slName *ontologyTerms; /* cross reference to ontology terms. */
+
+ struct gff3AttrVals *attrs; /* attributes, both user-defined and spec-defined,
+ * parsed into one or more values */
+
+ struct gff3AnnRef *children; /* child nodes */
+
+ struct gff3SeqRegion *seqRegion; /* start/end of sequence region, taken
+ * from ##sequence-region records, or
+ * NULL if not specified.*/
+
+ struct gff3File *file; /* file this record is associated with */
+ int lineNum; /* line number of record in file, or -1
+ * if not known */
+};
+
+struct gff3AnnRef
+/* A reference to a gff3Ann object; used for the parents, children and roots lists */
+{
+ struct gff3AnnRef *next; /* next link in the chain */
+ struct gff3Ann *ann; /* the referenced gff3Ann object */
+};
+
+struct gff3AttrVals
+/* An attribute and its string values */
+{
+ struct gff3AttrVals *next; /* next attribute in the list */
+ char *attr; /* name of attribute */
+ struct slName *vals; /* one or more values for the attribute */
+};
+
+struct gff3SeqRegion
+/* Start/end of a sequence region, taken from a ##sequence-region record. */
+{
+ struct gff3SeqRegion *next; /* next region */
+ char *seqid; /* sequence id of region */
+ int start; /* bounds of region */
+ int end; /* see start */
+};
+
+struct gff3File
+/* Object representing a GFF3 file. Manages all memory for related objects. */
+{
+ char *fileName; /* path of file that was parsed */
+ struct hash *byId; /* index of gff3Ann object by id */
+ struct gff3Ann *anns; /* all records in the file */
+ struct gff3AnnRef *roots; /* all records without parents */
+ struct hash *pool; /* used to allocate string values that tend to
+ * be repeated in the files. localMem is also
+ * used to allocate memory for all other objects. */
+ struct gff3SeqRegion *seqRegions; /* list of gff3SeqRegion objects. */
+ struct hash *seqRegionMap; /* map of seqId to gff3SeqRegion objects. NULL
+ * if none are specified */
+
+ struct slName *featureOntologies; /* feature ontology URIs */
+ struct slName *attributeOntologies; /* attribute ontology URIs */
+ struct slName *sourceOntologies; /* source ontology URIs */
+ struct slName *species; /* Species, usually NCBI Taxonomy
+ * URI */
+ char *genomeBuildSource; /* source of genome build */
+ char *genomeBuildName; /* name or version of genome build */
+ struct dnaSeq *seqs; /* list of sequences */
+ struct hash *seqMap; /* map of sequence ids to sequence
+ * string from ##FASTA section or
+ * NULL if none specified */
+ struct lineFile *lf; /* only set while parsing */
+ FILE *errFh; /* write errors to this file */
+ int maxErr; /* maximum number of errors before aborting */
+ int errCnt; /* error count */
+};
+
+struct gff3File *gff3FileOpen(char *fileName, int maxErr, FILE *errFh);
+/* Parse a GFF3 file into a gff3File object. If maxErr is not zero, then
+ * continue to parse until this number of errors has been reached. A maxErr
+ * less than zero does not stop; all errors are reported. Write errors to
+ * errFh; if NULL, use stderr. */
+
+void gff3FileFree(struct gff3File **g3fPtr);
+/* Free a gff3File object; per struct gff3File, it manages all memory for related objects. */
+
+struct gff3Ann *gff3FileFindAnn(struct gff3File *g3f, char *id);
+/* Find an annotation record by id, or NULL if not found. (Presumably uses g3f->byId - confirm.) */
+
+struct gff3AttrVals *gff3AnnFindAttr(struct gff3Ann *g3a, char *attr);
+/* Find a user attribute, or NULL. NOTE(review): attrs is documented as also holding spec-defined attributes - confirm scope. */
+
+void gff3FileWrite(struct gff3File *g3f, char *fileName);
+/* Write contents of a gff3File object to a file */
+
+#endif