88d10fbd0db11b5602e39f14809ea5fee2df08db markd Sat Oct 30 12:42:41 2010 -0700 added support for GFF3 discontinuous features diff --git src/lib/gff3.c src/lib/gff3.c index d801372..2aab05f 100644 --- src/lib/gff3.c +++ src/lib/gff3.c @@ -2,79 +2,92 @@ * Object for accessing GFF3 files * See GFF3 specification for details of file format: * http://www.sequenceontology.org/gff3.shtml */ #include "common.h" #include "gff3.h" #include #include "errabort.h" #include "localmem.h" #include "hash.h" #include "linefile.h" #include "dystring.h" #include "fa.h" // FIXME: spec unclear if attributes can be specified multiple times -// FIXME: spec unclear if start/end can be `.' +// FIXME: spec unclear on attribute of discontinuous features. // FIXME: should spaces be striped from attributes? -// FIXME: chop functions might cause grief by skipping initial separators + +/* + * Notes: + * - a separate feature object that linked discontinuous feature annotations + * was not used because it create more complexity with the linking of parents + * and the fact that the restriction on discontinguous features attributes is + * not clearly defined. + */ static const int gffNumCols = 9; /* standard attribute names */ char *gff3AttrID = "ID"; char *gff3AttrName = "Name"; char *gff3AttrAlias = "Alias"; char *gff3AttrParent = "Parent"; char *gff3AttrTarget = "Target"; char *gff3AttrGap = "Gap"; char *gff3AttrDerivesFrom = "Derives_from"; char *gff3AttrNote = "Note"; char *gff3AttrDbxref = "Dbxref"; char *gff3AttrOntologyTerm = "Ontology_term"; /* commonly used features names */ char *gff3FeatGene = "gene"; char *gff3FeatMRna = "mRNA"; char *gff3FeatExon = "exon"; char *gff3FeatCDS = "CDS"; char *gff3FeatThreePrimeUTR = "three_prime_UTR"; char *gff3FeatFivePrimeUTR = "five_prime_UTR"; char *gff3FeatStartCodon = "start_codon"; char *gff3FeatStopCodon = "stop_codon"; +static bool gff3FileStopDueToErrors(struct gff3File *g3f) +/* determine if we should stop due to the number of errors */ +{ +return g3f->errCnt > g3f->maxErr; +} + static void gff3FileErr(struct gff3File *g3f, char *format, ...) #if defined(__GNUC__) __attribute__((format(printf, 2, 3))) #endif ; static void gff3AnnErr(struct gff3Ann *g3a, char *format, ...) #if defined(__GNUC__) __attribute__((format(printf, 2, 3))) #endif ; static void vaGff3FileErr(struct gff3File *g3f, char *format, va_list args) /* Print error message to error file, abort if max errors have been reached */ { if (g3f->lf != NULL) fprintf(g3f->errFh, "%s:%d: ", g3f->lf->fileName, g3f->lf->lineIx); vfprintf(g3f->errFh, format, args); fprintf(g3f->errFh, "\n"); g3f->errCnt++; -if (g3f->errCnt > g3f->maxErr) +if (gff3FileStopDueToErrors(g3f)) errAbort("GFF3: %d parser errors", g3f->errCnt); } static void gff3FileErr(struct gff3File *g3f, char *format, ...) /* Print error message and abort */ { va_list args; va_start(args, format); vaGff3FileErr(g3f, format, args); va_end(args); } static void gff3AnnErr(struct gff3Ann *g3a, char *format, ...) /* Print error message abort */ { @@ -398,36 +411,38 @@ freeMem(attrStrs); slReverse(&g3a->attrs); } static void checkSingleValAttr(struct gff3Ann *g3a, struct gff3Attr *attr) /* validate that an attribute has only one value */ { if (attr->vals->next != NULL) gff3AnnErr(g3a, "attribute %s must have a single value, found multiple comma-separated values", attr->tag); } static void parseIDAttr(struct gff3Ann *g3a, struct gff3Attr *attr) /* parse the ID attribute */ { checkSingleValAttr(g3a, attr); -char *id = attr->vals->name; -struct hashEl *hel = hashStore(g3a->file->byId, id); -if (hel->val != NULL) - gff3AnnErr(g3a, "duplicate annotation record with ID: %s", id); +g3a->id = attr->vals->name; +// link into other parts of feature if discontinuous +struct hashEl *hel = hashStore(g3a->file->byId, g3a->id); +struct gff3Ann *head = hel->val; +if (head != NULL) + head->prevPart = g3a; +g3a->nextPart = head; hel->val = g3a; -g3a->id = id; } static void parseNameAttr(struct gff3Ann *g3a, struct gff3Attr *attr) /* parse the Name attribute */ { checkSingleValAttr(g3a, attr); g3a->name = attr->vals->name; } static void parseAliasAttr(struct gff3Ann *g3a, struct gff3Attr *attr) /* parse the Alias attribute */ { g3a->aliases = attr->vals; } @@ -532,31 +547,31 @@ static void parseAnn(struct gff3File *g3f, char *line) /* parse an annotation line */ { // extra column to check for too many char *words[gffNumCols+1]; int numWords = chopString(line, "\t", words, gffNumCols+1); if (numWords != gffNumCols) gff3FileErr(g3f, "expected %d tab-separated columns: %s", gffNumCols, line); struct gff3Ann *g3a = gff3FileAlloc(g3f, sizeof(struct gff3Ann)); g3a->file = g3f; g3a->lineNum = g3f->lf->lineIx; parseFields(g3a, words); parseAttrs(g3a, words[8]); parseStdAttrs(g3a); -slAddHead(&g3f->anns, g3a); +slAddHead(&g3f->anns, gff3AnnRefNew(g3a)); } static void writeAttr(struct gff3Attr *attr, FILE *fh) /* write one attribute and it's values */ { writeEscaped(attr->tag, fh); fputc('=', fh); struct slName *val; for (val = attr->vals; val != NULL; val = val->next) { if (val != attr->vals) fputc(',', fh); writeEscaped(val->name, fh); } } @@ -817,37 +832,119 @@ ver = trimSpaces(ver); } if (!(sameString(line, "##gff-version") && sameString(ver, "3"))) gff3FileErr(g3f, "invalid GFF3 header"); } static void parseFile(struct gff3File *g3f) /* do parsing phase of reading a GFF3 file */ { g3f->lf = lineFileOpen(g3f->fileName, TRUE); parseHeader(g3f); char *line; while (lineFileNext(g3f->lf, &line, NULL)) { parseLine(g3f, line); - if (g3f->errCnt >= g3f->maxErr) + if (gff3FileStopDueToErrors(g3f)) break; } lineFileClose(&g3f->lf); slReverse(&g3f->anns); } +static int gff3AnnCount(struct gff3Ann *g3a) +/* count the number of gff3Ann objects linked together in a feature */ +{ +int cnt = 0; +for (; g3a != NULL; g3a = g3a->nextPart) + cnt++; +return cnt; +} + +static void discontinFeatureCheck(struct gff3Ann *g3a) +/* sanity check linked gff3Ann discontinuous features */ +{ +struct gff3Ann *g3a2; +for (g3a2 = g3a->nextPart; (g3a2 != NULL) && !gff3FileStopDueToErrors(g3a->file); g3a2 = g3a2->nextPart) + { + if (!sameString(g3a->type, g3a2->type)) + gff3AnnErr(g3a, "Annotation records for discontinuous features with ID=\"%s\" do not have the same type, found \"%s\" and \"%s\"", g3a->id, g3a->type, g3a2->type); + } +} + +static void discontinFeatureFillArray(struct gff3Ann *g3a, int numAnns, struct gff3Ann *featAnns[]) +/* convert list to array for sorting */ +{ +int i = 0; +for (; g3a != NULL; g3a = g3a->nextPart) + featAnns[i++] = g3a; +} + +static struct gff3Ann *discontinFeatureArrayLink(int numAnns, struct gff3Ann *featAnns[]) +/* convert sorted array to a list */ +{ +struct gff3Ann *g3aHead = NULL, *g3aPrev = NULL; +int i; +for (i = 0; i < numAnns; i++) + { + if (g3aHead == NULL) + g3aHead = featAnns[i]; + if (g3aPrev != NULL) + g3aPrev->nextPart = featAnns[i]; + featAnns[i]->prevPart = g3aPrev; + } +return g3aHead; +} + +static int discontigFeatureSortCmp(const void *p1, const void *p2) +/* compare function for discontigFeatureSort */ +{ +struct gff3Ann *g3a1 = *((struct gff3Ann **)p1); +struct gff3Ann *g3a2 = *((struct gff3Ann **)p2); +int diff = g3a1->start - g3a2->start; +if (diff == 0) + diff = g3a1->end - g3a2->end; +return diff; +} + +static struct gff3Ann *discontigFeatureSort(struct gff3Ann *g3a) +/* sort a list of gff3Ann object representing discontinuous */ +{ +int numAnns = gff3AnnCount(g3a); +struct gff3Ann *featAnns[numAnns]; +discontinFeatureFillArray(g3a, numAnns, featAnns); +qsort(featAnns, numAnns, sizeof(struct gff3Ann*), discontigFeatureSortCmp); +return discontinFeatureArrayLink(numAnns, featAnns); +} + +static void discontigFeatureFinish(struct gff3File *g3f) +/* finish up discontinuous features, sorting them into ascending order */ +{ +// only both sorting if more than one annotation +struct hashCookie cookie = hashFirst(g3f->byId); +struct hashEl *hel; +while (((hel = hashNext(&cookie)) != NULL) && !gff3FileStopDueToErrors(g3f)) + { + struct gff3Ann *g3a = hel->val; + if (g3a->nextPart != NULL) + { + discontinFeatureCheck(g3a); + hel->val = discontigFeatureSort(g3a); + } + } +} + static struct gff3Ann *resolveRef(struct gff3Ann *g3a, char *id, char *attr) /* resolve a link for an attribute */ { struct gff3Ann *ann = gff3FileFindAnn(g3a->file, id); if (ann == NULL) gff3AnnErr(g3a, "Can't find annotation record \"%s\" referenced by \"%s\" %s attribute", id, g3a->id, attr); return ann; } static struct gff3AnnRef *resolveRefs(struct gff3Ann *g3a, struct slName *ids, char *attr) /* resolve links for an attribute */ { struct gff3AnnRef *refs = NULL; struct slName *id; for (id = ids; id != NULL; id = id->next) @@ -863,71 +960,75 @@ /* resolve links for an gff3Ann */ { g3a->parents = resolveRefs(g3a, g3a->parentIds, gff3AttrParent); if (g3a->parents == NULL) slSafeAddHead(&g3a->file->roots, gff3AnnRefAlloc(g3a)); else { struct gff3AnnRef *par; for (par = g3a->parents; par != NULL; par = par->next) slSafeAddHead(&par->ann->children, gff3AnnRefAlloc(g3a)); } if (g3a->derivesFromId != NULL) g3a->derivesFrom = resolveRef(g3a, g3a->derivesFromId, gff3AttrDerivesFrom); } +static void resolveAnns(struct gff3File *g3f) +/* resolve links */ +{ +struct gff3AnnRef *g3aRef; +for (g3aRef = g3f->anns; (g3aRef != NULL) && !gff3FileStopDueToErrors(g3f); g3aRef = g3aRef->next) + resolveAnn(g3aRef->ann); +} + static void resolveFile(struct gff3File *g3f) /* do resolution phase of reading a GFF3 file */ { -struct gff3Ann *g3a; -for (g3a = g3f->anns; g3a != NULL; g3a = g3a->next) - { - resolveAnn(g3a); - if (g3f->errCnt >= g3f->maxErr) - break; - } +// must sort first, as links point to the first feature +discontigFeatureFinish(g3f); +resolveAnns(g3f); // reorder just for test reproducibility slReverse(&g3f->seqRegions); slReverse(&g3f->featureOntologies); slReverse(&g3f->attributeOntologies); slReverse(&g3f->sourceOntologies); slReverse(&g3f->species); slReverse(&g3f->seqs); } static struct gff3File *gff3FileNew() /* construct a new, empty gff3File object */ { struct gff3File *g3f; AllocVar(g3f); g3f->byId = hashNew(0); g3f->pool = hashNew(0); return g3f; } struct gff3File *gff3FileOpen(char *fileName, int maxErr, FILE *errFh) /* Parse a GFF3 file into a gff3File object. If maxErr not zero, then * continue to parse until this number of error have been reached. A maxErr * less than zero does not stop reports all errors. Write errors to errFh, * if NULL, use stderr. */ { struct gff3File *g3f = gff3FileNew(); g3f->fileName = gff3FileCloneStr(g3f, fileName); g3f->errFh = (errFh != NULL) ? errFh : stderr; g3f->maxErr = (maxErr < 0) ? INT_MAX : maxErr; parseFile(g3f); -if (g3f->errCnt < g3f->maxErr) +if (!gff3FileStopDueToErrors(g3f)) resolveFile(g3f); if (g3f->errCnt > 0) errAbort("GFF3: %d parser errors", g3f->errCnt); return g3f; } void gff3FileFree(struct gff3File **g3fPtr) /* Free a gff3File object */ { struct gff3File *g3f = *g3fPtr; if (g3f != NULL) { hashFree(&g3f->byId); hashFree(&g3f->pool); hashFree(&g3f->seqRegionMap); @@ -947,33 +1048,33 @@ { fputs("##gff-version 3\n", fh); writeSequenceRegions(g3f, fh); writeFeatureOntologies(g3f, fh); writeAttributeOntologies(g3f, fh); writeSourceOntologies(g3f, fh); writeSpecies(g3f, fh); writeGenomeBuild(g3f, fh); } void gff3FileWrite(struct gff3File *g3f, char *fileName) /* write contents of an GFF3File object to a file */ { FILE *fh = mustOpen(fileName, "w"); writeMeta(g3f, fh); -struct gff3Ann *g3a; -for (g3a = g3f->anns; g3a != NULL; g3a = g3a->next) - writeAnn(g3a, fh); +struct gff3AnnRef *g3aRef; +for (g3aRef = g3f->anns; g3aRef != NULL; g3aRef = g3aRef->next) + writeAnn(g3aRef->ann, fh); writeFastas(g3f, fh); carefulClose(&fh); } int gff3AnnRefLocCmp(const void *va, const void *vb) /* sort compare function for two gff3AnnRef objects */ { const struct gff3Ann *a = (*((struct gff3AnnRef **)va))->ann; const struct gff3Ann *b = (*((struct gff3AnnRef **)vb))->ann; int diff = strcmp(a->seqid, b->seqid); if ((diff == 0) && (a->strand != b->strand)) { // allow for various types of strand fields. above tests handles both null if (a->strand == NULL) diff = 1;