88d10fbd0db11b5602e39f14809ea5fee2df08db
markd
  Sat Oct 30 12:42:41 2010 -0700
added support for GFF3 discontinuous features
diff --git src/lib/gff3.c src/lib/gff3.c
index d801372..2aab05f 100644
--- src/lib/gff3.c
+++ src/lib/gff3.c
@@ -2,79 +2,92 @@
  * Object for accessing GFF3 files
  * See GFF3 specification for details of file format:
  *   http://www.sequenceontology.org/gff3.shtml
  */
 #include "common.h"
 #include "gff3.h"
 #include <limits.h>
 #include "errabort.h"
 #include "localmem.h"
 #include "hash.h"
 #include "linefile.h"
 #include "dystring.h"
 #include "fa.h"
 
 // FIXME: spec unclear if attributes can be specified multiple times
-// FIXME: spec unclear if start/end can be `.'
+// FIXME: spec unclear on attribute of discontinuous features.
 // FIXME: should spaces be striped from attributes?
-// FIXME: chop functions might cause grief by skipping initial separators
+
+/*
+ * Notes:
+ *   - a separate feature object that linked discontinuous feature annotations
+ *     was not used because it create more complexity with the linking of parents
+ *     and the fact that the restriction on discontinguous features attributes is
+ *     not clearly defined.
+ */
 
 static const int gffNumCols = 9;
 
 /* standard attribute names */
 char *gff3AttrID = "ID";
 char *gff3AttrName = "Name";
 char *gff3AttrAlias = "Alias";
 char *gff3AttrParent = "Parent";
 char *gff3AttrTarget = "Target";
 char *gff3AttrGap = "Gap";
 char *gff3AttrDerivesFrom = "Derives_from";
 char *gff3AttrNote = "Note";
 char *gff3AttrDbxref = "Dbxref";
 char *gff3AttrOntologyTerm = "Ontology_term";
 
 /* commonly used features names */
 char *gff3FeatGene = "gene";
 char *gff3FeatMRna = "mRNA";
 char *gff3FeatExon = "exon";
 char *gff3FeatCDS = "CDS";
 char *gff3FeatThreePrimeUTR = "three_prime_UTR";
 char *gff3FeatFivePrimeUTR = "five_prime_UTR";
 char *gff3FeatStartCodon = "start_codon";
 char *gff3FeatStopCodon = "stop_codon";
 
+static bool gff3FileStopDueToErrors(struct gff3File *g3f)
+/* determine if we should stop due to the number of errors */
+{
+return g3f->errCnt > g3f->maxErr;
+}
+
 static void gff3FileErr(struct gff3File *g3f, char *format, ...)
 #if defined(__GNUC__)
 __attribute__((format(printf, 2, 3)))
 #endif
 ;
 
 static void gff3AnnErr(struct gff3Ann *g3a, char *format, ...)
 #if defined(__GNUC__)
 __attribute__((format(printf, 2, 3)))
 #endif
 ;
 
 static void vaGff3FileErr(struct gff3File *g3f, char *format, va_list args)
 /* Print error message to error file, abort if max errors have been reached */
 {
 if (g3f->lf != NULL)
     fprintf(g3f->errFh, "%s:%d: ", g3f->lf->fileName, g3f->lf->lineIx);
 vfprintf(g3f->errFh, format, args);
 fprintf(g3f->errFh, "\n");
 g3f->errCnt++;
-if (g3f->errCnt > g3f->maxErr)
+if (gff3FileStopDueToErrors(g3f))
     errAbort("GFF3: %d parser errors", g3f->errCnt);
 }
 
 static void gff3FileErr(struct gff3File *g3f, char *format, ...)
 /* Print error message and abort */
 {
 va_list args;
 va_start(args, format);
 vaGff3FileErr(g3f, format, args);
 va_end(args);
 }
 
 static void gff3AnnErr(struct gff3Ann *g3a, char *format, ...)
 /* Print error message abort */
 {
@@ -398,36 +411,38 @@
 freeMem(attrStrs);
 slReverse(&g3a->attrs);
 }
 
 static void checkSingleValAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* validate that an attribute has only one value */
 {
 if (attr->vals->next != NULL)
     gff3AnnErr(g3a, "attribute %s must have a single value, found multiple comma-separated values", attr->tag);
 }
 
 static void parseIDAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the ID attribute */
 {
 checkSingleValAttr(g3a, attr);
-char *id = attr->vals->name;
-struct hashEl *hel = hashStore(g3a->file->byId, id);
-if (hel->val != NULL)
-    gff3AnnErr(g3a, "duplicate annotation record with ID: %s", id);
+g3a->id = attr->vals->name;
+// link into other parts of feature if discontinuous
+struct hashEl *hel = hashStore(g3a->file->byId, g3a->id);
+struct gff3Ann *head = hel->val;
+if (head != NULL)
+    head->prevPart = g3a;
+g3a->nextPart = head;
 hel->val = g3a;
-g3a->id = id;
 }
 
 static void parseNameAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Name attribute */
 {
 checkSingleValAttr(g3a, attr);
 g3a->name = attr->vals->name;
 }
 
 static void parseAliasAttr(struct gff3Ann *g3a, struct gff3Attr *attr)
 /* parse the Alias attribute */
 {
 g3a->aliases = attr->vals;
 }
 
@@ -532,31 +547,31 @@
 static void parseAnn(struct gff3File *g3f, char *line)
 /* parse an annotation line */
 {
 // extra column to check for too many
 char *words[gffNumCols+1];
 int numWords = chopString(line, "\t", words, gffNumCols+1);
 if (numWords != gffNumCols)
     gff3FileErr(g3f, "expected %d tab-separated columns: %s", gffNumCols, line);
 
 struct gff3Ann *g3a = gff3FileAlloc(g3f, sizeof(struct gff3Ann));
 g3a->file = g3f;
 g3a->lineNum = g3f->lf->lineIx;
 parseFields(g3a, words);
 parseAttrs(g3a, words[8]);
 parseStdAttrs(g3a);
-slAddHead(&g3f->anns, g3a);
+slAddHead(&g3f->anns, gff3AnnRefNew(g3a));
 }
 
 static void writeAttr(struct gff3Attr *attr, FILE *fh)
 /* write one attribute and it's values */
 {
 writeEscaped(attr->tag, fh);
 fputc('=', fh);
 struct slName *val;
 for (val = attr->vals; val != NULL; val = val->next)
     {
     if (val != attr->vals)
         fputc(',', fh);
     writeEscaped(val->name, fh);
     }
 }
@@ -817,37 +832,119 @@
     ver = trimSpaces(ver);
     }
 if (!(sameString(line, "##gff-version") && sameString(ver, "3")))
     gff3FileErr(g3f, "invalid GFF3 header");
 }
 
 static void parseFile(struct gff3File *g3f)
 /* do parsing phase of reading a GFF3 file */
 {
 g3f->lf = lineFileOpen(g3f->fileName, TRUE);
 parseHeader(g3f);
 char *line;
 while (lineFileNext(g3f->lf, &line, NULL))
     {
     parseLine(g3f, line);
-    if (g3f->errCnt >= g3f->maxErr)
+    if (gff3FileStopDueToErrors(g3f))
         break;
     }
 lineFileClose(&g3f->lf);
 slReverse(&g3f->anns);
 }
 
+static int gff3AnnCount(struct gff3Ann *g3a)
+/* count the number of gff3Ann objects linked together in a feature */
+{
+int cnt = 0;
+for (; g3a != NULL; g3a = g3a->nextPart)
+    cnt++;
+return cnt;
+}
+
+static void discontinFeatureCheck(struct gff3Ann *g3a)
+/* sanity check linked gff3Ann discontinuous features */
+{
+struct gff3Ann *g3a2;
+for (g3a2 = g3a->nextPart; (g3a2 != NULL) && !gff3FileStopDueToErrors(g3a->file); g3a2 = g3a2->nextPart)
+    {
+    if (!sameString(g3a->type, g3a2->type))
+        gff3AnnErr(g3a, "Annotation records for discontinuous features with ID=\"%s\" do not have the same type, found \"%s\" and \"%s\"", g3a->id, g3a->type, g3a2->type);
+    }
+}
+
+static void discontinFeatureFillArray(struct gff3Ann *g3a, int numAnns, struct gff3Ann *featAnns[])
+/* convert list to array for sorting */
+{
+int i = 0;
+for (; g3a != NULL; g3a = g3a->nextPart)
+    featAnns[i++] = g3a;
+}
+
+static struct gff3Ann *discontinFeatureArrayLink(int numAnns, struct gff3Ann *featAnns[])
+/* convert sorted array to a list */
+{
+struct gff3Ann *g3aHead = NULL, *g3aPrev = NULL;
+int i;
+for (i = 0; i < numAnns; i++)
+    {
+    if (g3aHead == NULL)
+        g3aHead = featAnns[i];
+    if (g3aPrev != NULL)
+        g3aPrev->nextPart = featAnns[i];
+    featAnns[i]->prevPart = g3aPrev;
+    }
+return g3aHead;
+}
+
+static int discontigFeatureSortCmp(const void *p1, const void *p2)
+/* compare function for discontigFeatureSort */
+{
+struct gff3Ann *g3a1 = *((struct gff3Ann **)p1);
+struct gff3Ann *g3a2 = *((struct gff3Ann **)p2);
+int diff = g3a1->start - g3a2->start;
+if (diff == 0)
+    diff = g3a1->end - g3a2->end;
+return diff;
+}
+
+static struct gff3Ann *discontigFeatureSort(struct gff3Ann *g3a)
+/* sort a list of gff3Ann object representing discontinuous */
+{
+int numAnns = gff3AnnCount(g3a);
+struct gff3Ann *featAnns[numAnns];
+discontinFeatureFillArray(g3a, numAnns, featAnns);
+qsort(featAnns, numAnns, sizeof(struct gff3Ann*), discontigFeatureSortCmp);
+return discontinFeatureArrayLink(numAnns, featAnns);
+}
+
+static void discontigFeatureFinish(struct gff3File *g3f)
+/* finish up discontinuous features, sorting them into ascending order */
+{
+// only both sorting if more than one annotation
+struct hashCookie cookie = hashFirst(g3f->byId);
+struct hashEl *hel;
+while (((hel = hashNext(&cookie)) != NULL) && !gff3FileStopDueToErrors(g3f))
+    {
+    struct gff3Ann *g3a = hel->val;
+    if (g3a->nextPart != NULL)
+        {
+        discontinFeatureCheck(g3a);
+        hel->val = discontigFeatureSort(g3a);
+        }
+    }
+}
+
 static struct gff3Ann *resolveRef(struct gff3Ann *g3a, char *id, char *attr)
 /* resolve a link for an attribute */
 {
 struct gff3Ann *ann = gff3FileFindAnn(g3a->file, id);
 if (ann == NULL)
     gff3AnnErr(g3a, "Can't find annotation record \"%s\" referenced by \"%s\" %s attribute", id, g3a->id, attr);
 return ann;
 }
 
 static struct gff3AnnRef *resolveRefs(struct gff3Ann *g3a, struct slName *ids, char *attr)
 /* resolve links for an attribute */
 {
 struct gff3AnnRef *refs = NULL;
 struct slName *id;
 for (id = ids; id != NULL; id = id->next)
@@ -863,71 +960,75 @@
 /* resolve links for an gff3Ann */
 {
 g3a->parents = resolveRefs(g3a, g3a->parentIds, gff3AttrParent);
 if (g3a->parents == NULL)
     slSafeAddHead(&g3a->file->roots, gff3AnnRefAlloc(g3a));
 else
     {
     struct gff3AnnRef *par;
     for (par = g3a->parents; par != NULL; par = par->next)
         slSafeAddHead(&par->ann->children, gff3AnnRefAlloc(g3a));
     }
 if (g3a->derivesFromId != NULL)
     g3a->derivesFrom = resolveRef(g3a, g3a->derivesFromId, gff3AttrDerivesFrom);
 }
 
+static void resolveAnns(struct gff3File *g3f)
+/* resolve links */
+{
+struct gff3AnnRef *g3aRef;
+for (g3aRef = g3f->anns; (g3aRef != NULL) && !gff3FileStopDueToErrors(g3f); g3aRef = g3aRef->next)
+    resolveAnn(g3aRef->ann);
+}
+
 static void resolveFile(struct gff3File *g3f)
 /* do resolution phase of reading a GFF3 file */
 {
-struct gff3Ann *g3a;
-for (g3a = g3f->anns; g3a != NULL; g3a = g3a->next)
-    {
-    resolveAnn(g3a);
-    if (g3f->errCnt >= g3f->maxErr)
-        break;
-    }
+// must sort first, as links point to the first feature
+discontigFeatureFinish(g3f);
+resolveAnns(g3f);
 // reorder just for test reproducibility
 slReverse(&g3f->seqRegions);
 slReverse(&g3f->featureOntologies);
 slReverse(&g3f->attributeOntologies);
 slReverse(&g3f->sourceOntologies);
 slReverse(&g3f->species);
 slReverse(&g3f->seqs);
 }
 
 static struct gff3File *gff3FileNew()
 /* construct a new, empty gff3File object */
 {
 struct gff3File *g3f;
 AllocVar(g3f);
 g3f->byId = hashNew(0);
 g3f->pool = hashNew(0);
 return g3f;
 }
 
 struct gff3File *gff3FileOpen(char *fileName, int maxErr, FILE *errFh)
 /* Parse a GFF3 file into a gff3File object.  If maxErr not zero, then
  * continue to parse until this number of error have been reached.  A maxErr
  * less than zero does not stop reports all errors. Write errors to errFh,
  * if NULL, use stderr. */
 {
 struct gff3File *g3f = gff3FileNew();
 g3f->fileName = gff3FileCloneStr(g3f, fileName);
 g3f->errFh = (errFh != NULL) ? errFh : stderr;
 g3f->maxErr = (maxErr < 0) ? INT_MAX : maxErr;
 parseFile(g3f);
-if (g3f->errCnt < g3f->maxErr)
+if (!gff3FileStopDueToErrors(g3f))
     resolveFile(g3f);
 if (g3f->errCnt > 0)
     errAbort("GFF3: %d parser errors", g3f->errCnt);
 return g3f;
 }
 
 void gff3FileFree(struct gff3File **g3fPtr)
 /* Free a gff3File object */
 {
 struct gff3File *g3f = *g3fPtr;
 if (g3f != NULL)
     {
     hashFree(&g3f->byId);
     hashFree(&g3f->pool);
     hashFree(&g3f->seqRegionMap);
@@ -947,33 +1048,33 @@
 {
 fputs("##gff-version 3\n", fh);
 writeSequenceRegions(g3f, fh);
 writeFeatureOntologies(g3f, fh);
 writeAttributeOntologies(g3f, fh);
 writeSourceOntologies(g3f, fh);
 writeSpecies(g3f, fh);
 writeGenomeBuild(g3f, fh);
 }
 
 void gff3FileWrite(struct gff3File *g3f, char *fileName)
 /* write contents of an GFF3File object to a file */
 {
 FILE *fh = mustOpen(fileName, "w");
 writeMeta(g3f, fh);
-struct gff3Ann *g3a;
-for (g3a = g3f->anns; g3a != NULL; g3a = g3a->next)
-    writeAnn(g3a, fh);
+struct gff3AnnRef *g3aRef;
+for (g3aRef = g3f->anns; g3aRef != NULL; g3aRef = g3aRef->next)
+    writeAnn(g3aRef->ann, fh);
 writeFastas(g3f, fh);
 carefulClose(&fh);
 }
 
 int gff3AnnRefLocCmp(const void *va, const void *vb)
 /* sort compare function for two gff3AnnRef objects */
 {
 const struct gff3Ann *a = (*((struct gff3AnnRef **)va))->ann;
 const struct gff3Ann *b = (*((struct gff3AnnRef **)vb))->ann;
 int diff = strcmp(a->seqid, b->seqid);
 if ((diff == 0) && (a->strand != b->strand))
     {
     // allow for various types of strand fields. above tests handles both null
     if (a->strand == NULL)
         diff = 1;