src/inc/gff3.h 1.3
1.3 2010/05/25 00:14:45 markd
Made C attribute related names more consistent with GFF3 specification.
Fixed bug where bogus quotes were not detected.
Fixed bug with empty attribute values.
Index: src/inc/gff3.h
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/inc/gff3.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/inc/gff3.h 12 Aug 2009 07:48:05 -0000 1.2
+++ src/inc/gff3.h 25 May 2010 00:14:45 -0000 1.3
@@ -1,261 +1,261 @@
/*
* Object for accessing GFF3 files
* See GFF3 specification for details of file format:
* http://www.sequenceontology.org/gff3.shtml
*/
#ifndef gff3_h
#define gff3_h
struct gff3Ann
/* Annotation record from a GFF3 file. Attributes defined in the spec (those
 * starting with upper case letters) are parsed into fields of this
 * object. User defined attributes (starting with lower-case characters) are
 * stored in a list, along with a copy of the string versions of the spec
 * attributes. All strings stored in the object have been un-escaped.
 * All storage for the object is allocated by the gff3File object. */
{
struct gff3Ann *next; /* links all gff3Ann objects */
char *seqid; /* The ID of the landmark used to establish the coordinate
 * system for the current feature. IDs may contain any
 * characters. */
char *source; /* The source is a free text qualifier intended to describe
 * the algorithm or operating procedure that generated this
 * feature. Typically this is the name of a piece of
 * software, such as "Genescan" or a database name, such as
 * "Genbank." In effect, the source is used to extend the
 * feature ontology by adding a qualifier to the type
 * creating a new composite type that is a subclass of the
 * type in the type column. */
char *type; /* The type of the feature (previously called the "method").
 * This is constrained to be either: (a) a term from the
 * "lite" sequence ontology, SOFA; or (b) a SOFA accession
 * number. The latter alternative is distinguished using the
 * syntax SO:000000. */
int start; /* The start and end of the feature, in 0-based, half open
 * integer coordinates, relative to the landmark given in
 * seqid. Start is always less than or equal to end. For
 * zero-length features, such as insertion sites, start equals
 * end and the implied site is to the right of the indicated
 * base in the direction of the landmark. */
int end; /* end of the feature; see the description of start */
float score; /* The score of the feature, a floating point number. As in
 * earlier versions of the format, the semantics of the score
 * are ill-defined. It is strongly recommended that E-values
 * be used for sequence similarity features, and that
 * P-values be used for ab initio gene prediction features. */
boolean haveScore; /* was score specified? */
char *strand; /* The strand of the feature. '+' for positive strand
 * (relative to the landmark), '-' for minus strand, and
 * NULL for features that are not stranded. In addition,
 * '?' can be used for features whose strandedness is
 * relevant, but unknown. */
int phase; /* For features of type "CDS", the phase indicates where the
 * feature begins with reference to the reading frame. The
 * phase is one of the integers 0, 1, or 2, indicating the
 * number of bases that should be removed from the beginning of
 * this feature to reach the first base of the next codon. In
 * other words, a phase of 0 indicates that the next codon
 * begins at the first base of the region described by the
 * current line, a phase of 1 indicates that the next codon
 * begins at the second base of this region, and a phase of 2
 * indicates that the codon begins at the third base of this
 * region. This is NOT to be confused with the frame, which is
 * simply start modulo 3. For forward strand features, phase
 * is counted from the start field. For reverse strand
 * features, phase is counted from the end field. The phase is
 * REQUIRED for all CDS features, and is -1 for other features. */
/* The remaining fields are the attributes. Attributes defined by the
 * GFF3 spec (starting with an upper-case letter) are stored in the fields
 * below. Application-specific attributes (starting with a lower-case
 * letter) are stored in the attrs list. */
char *id; /* Indicates the name of the feature. IDs must be unique
 * within the scope of the GFF file. */
char *name; /* Display name for the feature. This is the name to be
 * displayed to the user. Unlike IDs, there is no requirement
 * that the Name be unique within the file. */
struct slName *aliases; /* Secondary names for the feature. It is
 * suggested that this tag be used whenever a
 * secondary identifier for the feature is needed,
 * such as locus names and accession numbers.
 * Unlike ID, there is no requirement that Alias
 * be unique within the file. */
struct slName *parentIds; /* Indicates the parent of the feature. A parent
 * ID can be used to group exons into transcripts,
 * transcripts into genes, and so forth. A feature
 * may have multiple parents. Parent can *only* be
 * used to indicate a partof relationship. */
struct gff3AnnRef *parents; /* Parent objects for parentIds */
char *targetId; /* Indicates the target of a nucleotide-to-nucleotide or
 * protein-to-nucleotide alignment. NULL if not specified. */
int targetStart; /* target start/end, in 0-based, half open coordinates */
int targetEnd; /* target end; see targetStart */
char *targetStrand; /* optional target strand, or NULL if none. */
char *gap; /* The alignment of the feature to the target if the two are
 * not collinear (e.g. contain gaps). The alignment format is
 * taken from the CIGAR format. See "THE GAP ATTRIBUTE"
 * section of GFF3 specification for a description of this
 * format. */
char *derivesFromId; /* Used to disambiguate the relationship between one
 * feature and another when the relationship is a
 * temporal one rather than a purely structural "part
 * of" one. This is needed for polycistronic
 * genes. */
struct gff3Ann *derivesFrom; /* Object for derivesFromId */
struct slName *notes; /* free text notes. */
struct slName *dbxrefs; /* database cross references. */
struct slName *ontologyTerms; /* cross reference to ontology terms. */
- struct gff3AttrVals *attrs; /* attributes, both user-define and spec-defined,
+ struct gff3Attr *attrs; /* attributes, both user-define and spec-defined,
* parsed into one or more values */
struct gff3AnnRef *children; /* child nodes */
struct gff3SeqRegion *seqRegion; /* start/end of sequence region, taken
 * from ##sequence-region records, or
 * NULL if not specified. */
struct gff3File *file; /* file this record is associated with */
int lineNum; /* line number of record in file, or -1
 * if not known */
};
struct gff3AnnRef
/* A reference to a gff3Ann object. References are chained through next to
 * form lists; this header uses such lists for the parents and children of a
 * gff3Ann and for the roots of a gff3File. */
{
struct gff3AnnRef *next; /* next link in the chain */
struct gff3Ann *ann; /* reference to object */
};
-struct gff3AttrVals
+struct gff3Attr
/* A single attribute together with its string values. Entries are linked
 * through next to form the attribute list of an annotation record. */
{
- struct gff3AttrVals *next; /* next attribute in the list */
- char *attr; /* name of attribute */
- struct slName *vals; /* value for the attribute */
+ struct gff3Attr *next; /* next attribute in the list */
+ char *tag; /* name of attribute */
+ struct slName *vals; /* values for the attribute */
};
struct gff3SeqRegion
/* Start/end of a sequence region, taken from a ##sequence-region record. */
{
struct gff3SeqRegion *next; /* next region */
char *seqid; /* sequence id of region */
int start; /* bounds of region */
int end; /* end bound of region; see start */
};
struct gff3File
/* Object representing a GFF3 file. Manages all memory for related objects. */
{
char *fileName; /* path of file that was parsed */
struct hash *byId; /* index of gff3Ann object by id */
struct gff3Ann *anns; /* all records in the file */
struct gff3AnnRef *roots; /* all records without parents */
struct hash *pool; /* used to allocate string values that tend to
 * be repeated in the files. localMem is also
 * used to allocate memory for all other objects. */
struct gff3SeqRegion *seqRegions; /* list of gff3SeqRegion objects. */
struct hash *seqRegionMap; /* map of seqId to gff3SeqRegion objects. NULL
 * if none are specified */
struct slName *featureOntologies; /* feature ontology URIs */
struct slName *attributeOntologies; /* attribute ontology URIs */
struct slName *sourceOntologies; /* source ontology URIs */
struct slName *species; /* Species, usually NCBI Taxonomy
 * URI */
char *genomeBuildSource; /* source of genome build */
char *genomeBuildName; /* name or version of genome build */
struct dnaSeq *seqs; /* list of sequences */
struct hash *seqMap; /* map of sequence ids to sequence
 * string from ##FASTA section or
 * NULL if none specified */
struct lineFile *lf; /* only set while parsing */
FILE *errFh; /* write errors to this file */
int maxErr; /* maximum number of errors before aborting */
int errCnt; /* error count */
};
-/* standard attribute names */
+/* standard attribute tags */
extern char *gff3AttrID;
extern char *gff3AttrName;
extern char *gff3AttrAlias;
extern char *gff3AttrParent;
extern char *gff3AttrTarget;
extern char *gff3AttrGap;
extern char *gff3AttrDerivesFrom;
extern char *gff3AttrNote;
extern char *gff3AttrDbxref;
extern char *gff3AttrOntologyTerm;
/* commonly used feature names */
extern char *gff3FeatGene;
extern char *gff3FeatMRna;
extern char *gff3FeatExon;
extern char *gff3FeatCDS;
extern char *gff3FeatThreePrimeUTR;
extern char *gff3FeatFivePrimeUTR;
extern char *gff3FeatStartCodon;
extern char *gff3FeatStopCodon;
struct gff3File *gff3FileOpen(char *fileName, int maxErr, FILE *errFh);
/* Parse a GFF3 file into a gff3File object. If maxErr is not zero, then
* continue to parse until this number of errors has been reached. A maxErr
* less than zero does not stop and reports all errors. Write errors to errFh;
* if NULL, use stderr. */
void gff3FileFree(struct gff3File **g3fPtr);
/* Free a gff3File object */
struct gff3Ann *gff3FileFindAnn(struct gff3File *g3f, char *id);
/* find an annotation record by id, or NULL if not found. */
-struct gff3AttrVals *gff3AnnFindAttr(struct gff3Ann *g3a, char *attr);
+struct gff3Attr *gff3AnnFindAttr(struct gff3Ann *g3a, char *tag);
/* find a user attribute, or NULL */
void gff3FileWrite(struct gff3File *g3f, char *fileName);
/* write contents of a gff3File object to a file */
INLINE struct gff3AnnRef *gff3AnnRefNew(struct gff3Ann *g3a)
/* Create a heap-allocated gff3AnnRef that refers to g3a and return it.  The
 * parsing code does not call this, since all of its data is contained in
 * localMem objects. */
{
struct gff3AnnRef *annRef;
AllocVar(annRef);
annRef->ann = g3a;  /* point the new reference at the supplied annotation */
return annRef;
}
int gff3AnnRefLocCmp(const void *va, const void *vb);
/* sort compare function for location of two gff3AnnRef objects */
INLINE int gff3PhaseToFrame(int phase)
/* Convert a phase (0, 1 or 2) to the corresponding frame (0, 2 or 1
 * respectively).  Any other phase value yields -1. */
{
/* frame indexed by phase */
static const int frameForPhase[3] = {0, 2, 1};
if (phase < 0 || phase > 2)
    return -1;
return frameForPhase[phase];
}
#endif