src/lib/gff3.c 1.5

1.5 2010/05/25 00:14:54 markd
Made C attribute-related names more consistent with GFF3 specification. Fixed bug where bogus quotes were not detected. Fixed bug with empty attribute values.
Index: src/lib/gff3.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/gff3.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 4 -r1.4 -r1.5
--- src/lib/gff3.c	20 Mar 2010 15:32:13 -0000	1.4
+++ src/lib/gff3.c	25 May 2010 00:14:54 -0000	1.5
@@ -151,16 +151,16 @@
     *numWordsRet = numWords;
 return words;
 }
 
-struct gff3AttrVals *gff3AnnFindAttr(struct gff3Ann *g3a, char *attr)
+struct gff3Attr *gff3AnnFindAttr(struct gff3Ann *g3a, char *tag)
 /* find a user attribute, or NULL */
 {
-struct gff3AttrVals *av;
-for (av = g3a->attrs; av != NULL; av = av->next)
+struct gff3Attr *attr;
+for (attr = g3a->attrs; attr != NULL; attr = attr->next)
     {
-    if (sameString(av->attr, attr))
-        return av;
+    if (sameString(attr->tag, tag))
+        return attr;
     }
 return NULL;
 }
 
@@ -321,115 +321,131 @@
 #endif
     }
 }
 
-static struct slName *parseAttrVals(struct gff3Ann *g3a, char *attr, char *valsStr)
+/* check that an attribute tag name is valid. */
+static boolean checkAttrTag(struct gff3Ann *g3a, char *tag)
+{
+// FIXME: spec is not clear on what is a valid tag.
+char *tc = tag;
+boolean isOk = isalpha(*tc);
+for (tc++; isOk && (*tc != '\0'); tc++)
+    isOk = (*tc == '_') || isalnum(*tc);
+if (!isOk)
+    gff3AnnErr(g3a, "invalid attribute tag, must start with an alphabetic character and be composed of alphanumeric or underscore characters: %s", tag);
+return isOk;
+}
+
+static struct slName *parseAttrVals(struct gff3Ann *g3a, char *tag, char *valsStr)
 /* parse an attribute into its values */
 {
 int i, numVals = chopString(valsStr, ",", NULL, 0);
 char **vals = needMem((numVals+1)*sizeof(char**)); // +1 allows for no values
 chopString(valsStr, ",", vals, numVals);
-struct slName *escVals = NULL;
+struct slName *unescVals = NULL;
 for (i = 0; i < numVals; i++)
-    slSafeAddHead(&escVals, unescapeSlName(g3a, vals[i]));
+    slAddHead(&unescVals, unescapeSlName(g3a, vals[i]));
+if (unescVals == NULL)
+    slAddHead(&unescVals, slNameNew(""));  // empty value
 freeMem(vals);
-slReverse(&escVals);
-return escVals;
+slReverse(&unescVals);
+return unescVals;
 }
 
-static void addAttrVals(struct gff3Ann *g3a, char *attr, char *valStr)
+static void addAttr(struct gff3Ann *g3a, char *tag, char *valStr)
 /* Add an attribute to the list of attributes.  If attribute has already been
  * specified, values are merged.  Attribute name must already be unescaped,
  * attribute values will be split and then unescaped. */
 {
-struct gff3AttrVals *attrVals = gff3AnnFindAttr(g3a, attr);
-if (attrVals == NULL)
+struct gff3Attr *attr = gff3AnnFindAttr(g3a, tag);
+if (attr == NULL)
     {
-    attrVals = gff3FileAlloc(g3a->file, sizeof(struct gff3AttrVals));
-    attrVals->attr = gff3FileCloneStr(g3a->file, attr);
-    slAddHead(&g3a->attrs, attrVals);
+    attr = gff3FileAlloc(g3a->file, sizeof(struct gff3Attr));
+    attr->tag = gff3FileCloneStr(g3a->file, tag);
+    slAddHead(&g3a->attrs, attr);
     }
-attrVals->vals = slCat(attrVals->vals, parseAttrVals(g3a, attr, valStr));
+attr->vals = slCat(attr->vals, parseAttrVals(g3a, tag, valStr));
 }
 
-static void parseAttrVal(struct gff3Ann *g3a, char *attrValsStr)
-/* parse one attribute and value from an annotation record */
+static void parseAttr(struct gff3Ann *g3a, char *attrStr)
+/* parse one attribute from an annotation record */
 {
-char *eq = strchr(attrValsStr, '=');
-if ((eq == NULL) || (eq == attrValsStr))
-    gff3AnnErr(g3a, "expected name=value: %s", attrValsStr);
+char *eq = strchr(attrStr, '=');
+if ((eq == NULL) || (eq == attrStr))
+    gff3AnnErr(g3a, "expected name=value: %s", attrStr);
 else
     {
-    char *attr = attrValsStr;
+    char *tag = attrStr;
     char *vals = eq+1;
     *eq = '\0';
-    unescapeStr(g3a, attr, attr);
-    addAttrVals(g3a, attr, vals);
+    unescapeStr(g3a, tag, tag);
+    if (checkAttrTag(g3a, tag))
+        addAttr(g3a, tag, vals);
     }
 }
 
 static void parseAttrs(struct gff3Ann *g3a, char *attrsCol)
 /* parse the attribute column in an annotation record */
 {
 int i, numAttrs = chopString(attrsCol, ";", NULL, 0);
-char **attrVals = needMem(numAttrs*sizeof(char**));
-chopString(attrsCol, ";", attrVals, numAttrs);
+char **attrStrs = needMem(numAttrs*sizeof(char**));
+chopString(attrsCol, ";", attrStrs, numAttrs);
 for (i = 0; i < numAttrs; i++)
     {
-    char *av = trimSpaces(attrVals[i]);
-    if (strlen(av) > 0)
-        parseAttrVal(g3a, av);
+    char *attrStr = trimSpaces(attrStrs[i]);
+    if (strlen(attrStr) > 0)
+        parseAttr(g3a, attrStr);
     }
-freeMem(attrVals);
+freeMem(attrStrs);
 slReverse(&g3a->attrs);
 }
 
-static void checkSingleValAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void checkSingleValAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* validate that an attribute has only one value */
 {
-if (attrVals->vals->next != NULL)
-    gff3AnnErr(g3a, "attribute %s must have a single value, found multiple comma-separated values", attrVals->attr);
+if (attr->vals->next != NULL)
+    gff3AnnErr(g3a, "attribute %s must have a single value, found multiple comma-separated values", attr->tag);
 }
 
-static void parseIDAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseIDAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the ID attribute */
 {
-checkSingleValAttr(g3a, attrVals);
-char *id = attrVals->vals->name;
+checkSingleValAttr(g3a, attr);
+char *id = attr->vals->name;
 struct hashEl *hel = hashStore(g3a->file->byId, id);
 if (hel->val != NULL)
     gff3AnnErr(g3a, "duplicate annotation record with ID: %s", id);
 hel->val = g3a;
 g3a->id = id;
 }
 
-static void parseNameAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseNameAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Name attribute */
 {
-checkSingleValAttr(g3a, attrVals);
-g3a->name = attrVals->vals->name;
+checkSingleValAttr(g3a, attr);
+g3a->name = attr->vals->name;
 }
 
-static void parseAliasAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseAliasAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Alias attribute */
 {
-g3a->aliases = attrVals->vals;
+g3a->aliases = attr->vals;
 }
 
-static void parseParentAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseParentAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Parent attribute */
 {
-g3a->parentIds = attrVals->vals;
+g3a->parentIds = attr->vals;
 }
 
-static void parseTargetAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseTargetAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Target attribute */
 {
-checkSingleValAttr(g3a, attrVals);
+checkSingleValAttr(g3a, attr);
 
 // target_id start end [strand]
 int numWords;
-char **words = dynChopStringWhite(g3a->file, attrVals->vals->name, 3, 4, &numWords,
+char **words = dynChopStringWhite(g3a->file, attr->vals->name, 3, 4, &numWords,
                                   "Target attribute in the form \"target_id start end [strand]\"");
 if (words == NULL)
     return;  // got an error
 g3a->targetId = gff3FileCloneStr(g3a->file, words[0]);
@@ -439,78 +455,78 @@
     g3a->targetStrand = parseStrand(g3a, words[3]);
 freeMem(words);
 }
 
-static void parseGapAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseGapAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Gap attribute */
 {
-checkSingleValAttr(g3a, attrVals);
-g3a->gap = attrVals->vals->name;
+checkSingleValAttr(g3a, attr);
+g3a->gap = attr->vals->name;
 }
 
-static void parseDerivesFromAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseDerivesFromAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Derives_from attribute */
 {
-g3a->derivesFromId = attrVals->vals->name;
+g3a->derivesFromId = attr->vals->name;
 }
 
-static void parseNoteAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseNoteAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Note attribute */
 {
-g3a->notes = attrVals->vals;
+g3a->notes = attr->vals;
 }
 
-static void parseDbxrefAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseDbxrefAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Dbxref attribute */
 {
-g3a->dbxrefs = attrVals->vals;
+g3a->dbxrefs = attr->vals;
 }
 
-static void parseOntologyTermAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseOntologyTermAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Ontology_term attribute */
 {
-g3a->ontologyTerms = attrVals->vals;
+g3a->ontologyTerms = attr->vals;
 }
 
-static void parseStdAttr(struct gff3Ann *g3a, struct gff3AttrVals *attrVals)
+static void parseStdAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* Parse one of the standard specified attributes (those starting with upper
  * case) into fields. Multiple specifications of an attribute should have been
  * merged before calling this function. */
 {
-if (sameString(attrVals->attr, gff3AttrID))
-    parseIDAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrName))
-    parseNameAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrAlias))
-    parseAliasAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrParent))
-    parseParentAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrTarget))
-    parseTargetAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrGap))
-    parseGapAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrDerivesFrom))
-    parseDerivesFromAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrNote))
-    parseNoteAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrDbxref))
-    parseDbxrefAttr(g3a, attrVals);
-else if (sameString(attrVals->attr, gff3AttrOntologyTerm))
-    parseOntologyTermAttr(g3a, attrVals);
+if (sameString(attr->tag, gff3AttrID))
+    parseIDAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrName))
+    parseNameAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrAlias))
+    parseAliasAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrParent))
+    parseParentAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrTarget))
+    parseTargetAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrGap))
+    parseGapAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrDerivesFrom))
+    parseDerivesFromAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrNote))
+    parseNoteAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrDbxref))
+    parseDbxrefAttr(g3a, attr);
+else if (sameString(attr->tag, gff3AttrOntologyTerm))
+    parseOntologyTermAttr(g3a, attr);
 else
-    gff3AnnErr(g3a, "unknown standard attribute, user defined attributes must start with a lower-case letter: %s", attrVals->attr);
+    gff3AnnErr(g3a, "unknown standard attribute, user defined attributes must start with a lower-case letter: %s", attr->tag);
 }
 
 static void parseStdAttrs(struct gff3Ann *g3a)
 /* parse standard attributes (starting with upper case) into attributes
  * have been parsed into attribute list, which would have  merged multiply
  * specified attributes. */
 {
-struct gff3AttrVals *av;
-for (av = g3a->attrs; av != NULL; av = av->next)
+struct gff3Attr *attr;
+for (attr = g3a->attrs; attr != NULL; attr = attr->next)
     {
-    if (isupper(av->attr[0]))
-        parseStdAttr(g3a, av);
+    if (isupper(attr->tag[0]))
+        parseStdAttr(g3a, attr);
     }
 }
 
 static void parseAnn(struct gff3File *g3f, char *line)
@@ -530,32 +546,31 @@
 parseStdAttrs(g3a);
 slAddHead(&g3f->anns, g3a);
 }
 
-
-static void writeAttrVals(struct gff3AttrVals *av, FILE *fh)
+static void writeAttr(struct gff3Attr *attr, FILE *fh)
 /* write one attribute and it's values */
 {
-writeEscaped(av->attr, fh);
+writeEscaped(attr->tag, fh);
 fputc('=', fh);
 struct slName *val;
-for (val = av->vals; val != NULL; val = val->next)
+for (val = attr->vals; val != NULL; val = val->next)
     {
-    if (val != av->vals)
+    if (val != attr->vals)
         fputc(',', fh);
     writeEscaped(val->name, fh);
     }
 }
 
 static void writeAttrs(struct gff3Ann *g3a, FILE *fh)
 /* write annotation record attributes */
 {
-struct gff3AttrVals *av;
-for (av = g3a->attrs; av != NULL; av = av->next)
+struct gff3Attr *attr;
+for (attr = g3a->attrs; attr != NULL; attr = attr->next)
     {
-    if (av != g3a->attrs)
+    if (attr != g3a->attrs)
         fputc(';', fh);
-    writeAttrVals(av, fh);
+    writeAttr(attr, fh);
     }
 }
 
 static void writeFields(struct gff3Ann *g3a, FILE *fh)