7d6d14047849f1ddcf0aadf5491f2830996df7b5
braney
  Mon Jul 28 15:37:04 2014 -0700
allow underbars in gff3.  Grok Is_circular.  #13673
diff --git src/inc/gff3.h src/inc/gff3.h
index dd28f6c..a047f14 100644
--- src/inc/gff3.h
+++ src/inc/gff3.h
@@ -1,266 +1,268 @@
 /*
  * Object for accessing GFF3 files
  * See GFF3 specification for details of file format:
  *   http://www.sequenceontology.org/gff3.shtml
  */
 #ifndef gff3_h
 #define gff3_h
 
 struct gff3Ann
 /* Annotation record from a GFF3 file.  Attributes defined in the spec (those
  * starting with upper case letters) are parsed into fields of this
  * object. User defined attributes (starting with lower-case characters) are
  * stored in a list, along with a copy of the string versions of the spec
  * attributes. All strings stored in the object have been un-escaped.
  * All storage for the object is allocated by the gff3File object.
  * For discontinuous features, there are multiple gff3Ann objects.
  * These objects are stored in a double-linked list, and all references
  * point to the first one in ascending start order.*/
 {
     struct gff3Ann *prevPart; /* Discontinuous features have linked annotations */
     struct gff3Ann *nextPart; /* field name "next" is not used, to avoid confusion */
     char *seqid;   /* The ID of the landmark used to establish the coordinate
                     * system for the current feature. IDs may contain any
                     * characters. */
     char *source;  /* The source is a free text qualifier intended to describe
                     * the algorithm or operating procedure that generated this
                     * feature.  Typically this is the name of a piece of
                     * software, such as "Genescan" or a database name, such as
                     * "Genbank."  In effect, the source is used to extend the
                     * feature ontology by adding a qualifier to the type
                     * creating a new composite type that is a subclass of the
                     * type in the type column. */
 
     char *type; /* The type of the feature (previously called the "method").
                  * This is constrained to be either: (a) a term from the
                  * "lite" sequence ontology, SOFA; or (b) a SOFA accession
                  * number.  The latter alternative is distinguished using the
                  * syntax SO:000000. */
 
     int start; /* The start and end of the feature, in 0-based, half open
                 * integer coordinates, relative to the landmark given in
                 * seqid.  Start is always less than or equal to end.  For
                 * zero-length features, such as insertion sites, start equals
                 * end and the implied site is to the right of the indicated
                 * base in the direction of the landmark.*/
     int end;   /* end of the feature; see start for the coordinate system */
     float score; /* The score of the feature, a floating point number.  As in
                   * earlier versions of the format, the semantics of the score
                   * are ill-defined.  It is strongly recommended that E-values
                   * be used for sequence similarity features, and that
                   * P-values be used for ab initio gene prediction features. */
     boolean haveScore;  /* was score specified? */
 
     char *strand; /* The strand of the feature.  '+' for positive strand
                    * (relative to the landmark), '-' for minus strand, and
                    * NULL for features that are not stranded.  In addition,
                    * '?' can be used for features whose strandedness is
                    * relevant, but unknown. */
 
     int phase; /* For features of type "CDS", the phase indicates where the
                 * feature begins with reference to the reading frame.  The
                 * phase is one of the integers 0, 1, or 2, indicating the
                 * number of bases that should be removed from the beginning of
                 * this feature to reach the first base of the next codon. In
                 * other words, a phase of 0 indicates that the next codon
                 * begins at the first base of the region described by the
                 * current line, a phase of 1 indicates that the next codon
                 * begins at the second base of this region, and a phase of 2
                 * indicates that the codon begins at the third base of this
                 * region. This is NOT to be confused with the frame, which is
                 * simply start modulo 3.  For forward strand features, phase
                 * is counted from the start field. For reverse strand
                 * features, phase is counted from the end field. The phase is
                 * REQUIRED for all CDS features, and is -1 for other features. */
 
     /* The remaining fields are the attributes.  Attributes defined by the
      * GFF3 spec (starting with an upper-case letter) are stored in the fields
      * below.  Application-specific attributes (starting with a lower-case
      * letter) are stored in the attrs list.  */
 
     char *id;  /* Indicates the name of the feature.  IDs must be unique
                 * within the scope of the GFF file.*/
     char *name;  /* Display name for the feature.  This is the name to be
                   * displayed to the user.  Unlike IDs, there is no requirement
                   * that the Name be unique within the file. */
 
     struct slName *aliases; /* Secondary names for the feature.  It is
                              * suggested that this tag be used whenever a
                              * secondary identifier for the feature is needed,
                              * such as locus names and accession numbers.
                              * Unlike ID, there is no requirement that Alias
                              * be unique within the file. */
     
     struct slName *parentIds; /* Indicates the parent of the feature.  A parent
                                * ID can be used to group exons into transcripts,
                                * transcripts into genes, and so forth.  A feature
                                * may have multiple parents.  Parent can *only* be
                                * used to indicate a partof relationship. */
     struct gff3AnnRef *parents; /* Parent objects for parentIds */
                                       
 
     char *targetId; /* Indicates the target of a nucleotide-to-nucleotide or
                      * protein-to-nucleotide alignment.  NULL if not specified. */
     int targetStart; /* target start/end, in 0-based, half open coordinates */
     int targetEnd;   /* target end; see targetStart */
     char *targetStrand; /* optional target strand, or NULL if none. */
 
     char *gap; /* The alignment of the feature to the target if the two are
                 * not collinear (e.g. contain gaps).  The alignment format is
                 * taken from the CIGAR format.  See "THE GAP ATTRIBUTE"
                 * section of GFF3 specification for a description of this
                 * format.*/
 
     char *derivesFromId; /* Used to disambiguate the relationship between one
                           * feature and another when the relationship is a
                           * temporal one rather than a purely structural "part
                           * of" one.  This is needed for polycistronic
                           * genes. */
     struct gff3Ann *derivesFrom; /* Object for derivesFromId */
 
     struct slName *notes;  /* free text notes. */
 
+    boolean isCircular;  /* is this item circular (Is_circular attribute) */
+
     struct slName *dbxrefs; /* database cross references. */
 
     struct slName *ontologyTerms; /* cross reference to ontology terms. */
 
     struct gff3Attr *attrs;  /* attributes, both user-defined and spec-defined,
                               * parsed into one or more values */
 
     struct gff3AnnRef *children;  /* child nodes */
 
     struct gff3SeqRegion *seqRegion;  /* start/end of sequence region, taken
                                        * from ##sequence-region records, or
                                        * NULL if not specified.*/
 
     struct gff3File *file;  /* file this record is associated with */
     int lineNum;            /* line number of record in file, or -1
                              * if not known */
 };
 
 struct gff3AnnRef
 /* Singly-linked list node holding a reference to a gff3Ann object */
 {
     struct gff3AnnRef *next;   /* next reference in the list */
     struct gff3Ann *ann;       /* the annotation being referenced */
 };
 
 struct gff3Attr
 /* An attribute tag together with its string values */
 {
     struct gff3Attr *next;     /* next attribute in the list */
     char *tag;                 /* name of the attribute */
    struct slName *vals;       /* list of values for the attribute */
 };
 
 struct gff3SeqRegion
 /* start/end of a sequence region, taken from a ##sequence-region record. */
 {
     struct gff3SeqRegion *next;     /* next region */
     char *seqid;    /* sequence id of region */
     int start;      /* bounds of region: start */
     int end;        /* bounds of region: end */
 };
 
 struct gff3File
 /* Object representing a GFF3 file. Manages all memory for related objects. */
 {
     char *fileName;       /* path of file that was parsed */
     struct hash *byId;    /* index of gff3Ann objects by id.  Links to the first object of linked discontinuous features */
     struct gff3AnnRef *anns;   /* all records in the file. Includes all parts of discontinuous features */
     struct gff3AnnRef *roots;  /* all records without parents. */
     struct hash *pool;         /* used to allocate string values that tend to
                                 * be repeated in the files.  localMem is also 
                                 * used to allocate memory for all other objects. */
     struct gff3SeqRegion *seqRegions;  /* list of gff3SeqRegion objects. */
     struct hash *seqRegionMap;  /* map of seqId to gff3SeqRegion objects. NULL
                                  * if none are specified */
 
     struct slName *featureOntologies;    /* feature ontology URIs */
     struct slName *attributeOntologies;  /* attribute ontology URIs */
     struct slName *sourceOntologies;     /* source ontology URIs */
     struct slName *species;              /* Species, usually NCBI Taxonomy
                                           * URI */
     char *genomeBuildSource;             /* source of genome build */
     char *genomeBuildName;               /* name or version of genome build */
     struct dnaSeq *seqs;                 /* list of sequences */
     struct hash *seqMap;                 /* map of sequence ids to sequence
                                           * string from ##FASTA section or
                                           * NULL if none specified */
     struct lineFile *lf;                 /* only set while parsing */
     FILE *errFh;            /* errors are written to this file */
     int maxErr;             /* maximum number of errors before aborting */
     int errCnt;             /* number of errors seen so far */
 };
 
 
 /* tags of the standard (spec-defined) attributes */
 extern char *gff3AttrID;
 extern char *gff3AttrName;
 extern char *gff3AttrAlias;
 extern char *gff3AttrParent;
 extern char *gff3AttrTarget;
 extern char *gff3AttrGap;
 extern char *gff3AttrDerivesFrom;
 extern char *gff3AttrNote;
 extern char *gff3AttrDbxref;
 extern char *gff3AttrOntologyTerm;
 
 /* commonly used feature names */
 extern char *gff3FeatGene;
 extern char *gff3FeatMRna;
 extern char *gff3FeatExon;
 extern char *gff3FeatCDS;
 extern char *gff3FeatThreePrimeUTR;
 extern char *gff3FeatFivePrimeUTR;
 extern char *gff3FeatStartCodon;
 extern char *gff3FeatStopCodon;
 extern char *gff3FeatTranscript;
 
 struct gff3File *gff3FileOpen(char *fileName, int maxErr, FILE *errFh);
 /* Parse a GFF3 file into a gff3File object.  If maxErr is not zero, then
  * continue to parse until this number of errors has been reached.  A maxErr
  * less than zero never stops parsing, so all errors are reported.  Errors
  * are written to errFh, or to stderr if errFh is NULL. */
 
 void gff3FileFree(struct gff3File **g3fPtr);
 /* Free a gff3File object */
 
 struct gff3Ann *gff3FileFindAnn(struct gff3File *g3f, char *id);
 /* Find an annotation record by id; returns NULL if not found. */
 
 struct gff3Attr *gff3AnnFindAttr(struct gff3Ann *g3a, char *tag);
 /* Find a user attribute by tag; returns NULL if not found. */
 
 void gff3FileWrite(struct gff3File *g3f, char *fileName);
 /* Write the contents of a gff3File object to a file */
 
 INLINE struct gff3AnnRef *gff3AnnRefNew(struct gff3Ann *g3a)
 /* Create a heap-allocated gff3AnnRef that refers to g3a.  The parsing code
  * does not use this, since all of its data lives in localMem objects. */
 {
 struct gff3AnnRef *annRef;
 AllocVar(annRef);
 annRef->ann = g3a;
 return annRef;
 }
 
 int gff3AnnRefLocCmp(const void *va, const void *vb);
 /* sort comparison function ordering two gff3AnnRef objects by location */
 
 INLINE int gff3PhaseToFrame(int phase)
 /* Convert a phase to a frame.  Phases 0, 1 and 2 map to frames 0, 2 and 1
  * respectively, i.e. (3 - phase) % 3.  Any other phase value, such as the
  * -1 used for features without a phase, maps to -1. */
 {
 int frame = -1;
 if (phase == 0)
     frame = 0;
 else if (phase == 1)
     frame = 2;
 else if (phase == 2)
     frame = 1;
 return frame;
 }
 
 #endif