5652d415b7ad388d28eebebdca8e448da4a3e0f3 markd Sat May 26 22:54:39 2018 -0700 Have gff3ToGenePred -warnAndContinue discard more bogus cases, such as Maker files that have exons and CDS that don't have gene or mRNA records. This way Max doesn't think I am a pathetic loser ;-) diff --git src/lib/gff3.c src/lib/gff3.c index 714e878..6250739 100644 --- src/lib/gff3.c +++ src/lib/gff3.c @@ -57,89 +57,95 @@ char *gff3FeatStartCodon = "start_codon"; char *gff3FeatStopCodon = "stop_codon"; char *gff3FeatTranscript = "transcript"; char *gff3FeatPrimaryTranscript = "primary_transcript"; char *gff3FeatCGeneSegment = "C_gene_segment"; char *gff3FeatDGeneSegment = "D_gene_segment"; char *gff3FeatJGeneSegment = "J_gene_segment"; char *gff3FeatVGeneSegment = "V_gene_segment"; static bool gff3FileStopDueToErrors(struct gff3File *g3f) /* determine if we should stop due to the number of errors */ { return g3f->errCnt > g3f->maxErr; } -static void gff3FileErr(struct gff3File *g3f, char *format, ...) +static void gff3FileErr(struct gff3File *g3f, bool canWarn, char *format, ...) #if defined(__GNUC__) -__attribute__((format(printf, 2, 3))) +__attribute__((format(printf, 3, 4))) #endif ; -static void gff3AnnErr(struct gff3Ann *g3a, char *format, ...) +static void gff3AnnErr(struct gff3Ann *g3a, bool canWarn, char *format, ...) #if defined(__GNUC__) -__attribute__((format(printf, 2, 3))) +__attribute__((format(printf, 3, 4))) #endif ; -static void vaGff3FileErr(struct gff3File *g3f, char *format, va_list args) +static void vaGff3FileErr(struct gff3File *g3f, bool canWarn, char *format, va_list args) /* Print error message to error file, abort if max errors have been reached */ { +bool isWarning = canWarn && (g3f->flags & GFF3_WARN_WHEN_POSSIBLE); if (g3f->lf != NULL) fprintf(g3f->errFh, "%s:%d: ", g3f->lf->fileName, g3f->lf->lineIx); +if (isWarning) + fprintf(g3f->errFh, "WARNING: "); vfprintf(g3f->errFh, format, args); fprintf(g3f->errFh, "\n"); +if (!isWarning) + { g3f->errCnt++; if (gff3FileStopDueToErrors(g3f)) errAbort("GFF3: %d parser errors", g3f->errCnt); } +} -static void gff3FileErr(struct gff3File *g3f, char *format, ...) +static void gff3FileErr(struct gff3File *g3f, bool canWarn, char *format, ...) /* Print error message and abort */ { va_list args; va_start(args, format); -vaGff3FileErr(g3f, format, args); +vaGff3FileErr(g3f, canWarn, format, args); va_end(args); } -static void gff3AnnErr(struct gff3Ann *g3a, char *format, ...) +static void gff3AnnErr(struct gff3Ann *g3a, bool canWarn, char *format, ...) /* Print error message abort */ { va_list args; va_start(args, format); -vaGff3FileErr(g3a->file, format, args); +vaGff3FileErr(g3a->file, canWarn, format, args); va_end(args); } static int gff3FileStrToInt(struct gff3File *g3f, char *str) /* convert a string to an integer */ { char *end; long val = strtol(str, &end, 0); if ((end == str) || (*end != '\0')) - gff3FileErr(g3f, "invalid integer: %s", str); + gff3FileErr(g3f, FALSE, "invalid integer: %s", str); return (int)val; } static float gff3FileStrToFloat(struct gff3File *g3f, char *str) /* convert a string to a float */ { char *end; double val = strtod(str, &end); if ((end == str) || (*end != '\0')) - gff3FileErr(g3f, "invalid float: %s", str); + gff3FileErr(g3f, FALSE, "invalid float: %s", str); return (float)val; } static void *gff3FileAlloc(struct gff3File *g3f, size_t size) /* allocate memory from the memory pool */ { return lmAlloc(g3f->pool->lm, size); } static char *gff3FileCloneStr(struct gff3File *g3f, char *str) /* allocate memory for a string and copy if */ { return lmCloneString(g3f->pool->lm, str); } @@ -152,31 +158,31 @@ static struct slName *gff3FileSlNameNew(struct gff3File *g3f, char *name) /* allocate slName from the memory pool */ { return lmSlName(g3f->pool->lm, name); } static char **dynChopStringWhite(struct gff3File *g3f, char *str, int minWords, int maxWords, int *numWordsRet, char *desc) /* dynamic chop string without corrupting it, generate error if expected * number of words not found. Free return when done. Returns NULL on * error. */ { int numWords = chopByWhite(str, NULL, 0); if ((numWords < minWords) || (numWords > maxWords)) { - gff3FileErr(g3f, "expected %s, got \"%s\"", desc, str); + gff3FileErr(g3f, FALSE, "expected %s, got \"%s\"", desc, str); return NULL; } // allocate buffer for both array and string int wordsBytes = ((numWords+1)*sizeof(char**)); char **words = needMem(wordsBytes+strlen(str)+1); char *strcp = ((char*)words)+wordsBytes; strcpy(strcp, str); chopByWhite(strcp, words, numWords); words[numWords] = NULL; if (numWordsRet != NULL) *numWordsRet = numWords; return words; } struct gff3Attr *gff3AnnFindAttr(struct gff3Ann *g3a, char *tag) @@ -200,45 +206,51 @@ } struct gff3AnnRef *gff3RefFind(struct gff3AnnRef *refList, struct gff3Ann *ann) /* Return ref if ann is already on list, otherwise NULL. */ { struct gff3AnnRef *ref; for (ref = refList; ref != NULL; ref = ref->next) if (ref->ann == ann) return ref; return NULL; } static void raiseInvalidEscape(struct gff3Ann *g3a, char *str) /* raise an error about an invalid escape in a string */ { -gff3AnnErr(g3a, "invalid GFF escape sequence in string: %s", str); +gff3AnnErr(g3a, TRUE, "invalid GFF escape sequence in string: %s", str); } static char convertEscape(struct gff3Ann *g3a, char *esc, char *src) /* convert character at esc, which should start with a `%' and be a string * in the form `%09' */ { if (!(isxdigit(esc[1]) && isxdigit(esc[2]))) + { raiseInvalidEscape(g3a, src); + return '_'; // something + } char num[3], *end; strncpy(num, esc+1, 2); num[2] = '\0'; long val = strtol(num, &end, 16); if ((end == num) || (*end != '\0')) + { raiseInvalidEscape(g3a, src); + return '_'; + } return (char)val; } static void unescapeStr(struct gff3Ann *g3a, char *dest, char *src) /* remove URL-style escapes from a string. dest need only have enough * memory to hold src, as unescaping will not grow the string */ { char *s = src, *d = dest; while (*s != '\0') { if (*s == '%') { *d++ = convertEscape(g3a, s, src); s += 3; } @@ -302,88 +314,98 @@ } } static char *parseStrand(struct gff3Ann *g3a, char *strand) /* parse strand into static string, validating it */ { if (sameString(strand, ".")) return NULL; else if (sameString(strand, "+")) return "+"; else if (sameString(strand, "-")) return "-"; else if (sameString(strand, "?")) return "?"; else - gff3AnnErr(g3a, "invalid strand: '%s'", strand); + { + gff3AnnErr(g3a, TRUE, "invalid strand: '%s'", strand); + return NULL; + } return NULL; } static int parsePhase(struct gff3Ann *g3a, char *str) /* parse phase into a number, validating it */ { if (sameString(str, ".")) return -1; int phase = gff3FileStrToInt(g3a->file, str); if ((phase < 0) || (phase > 2)) - gff3AnnErr(g3a, "invalid phase: %d", phase); + { + gff3AnnErr(g3a, TRUE, "invalid phase: %d", phase); + return -1; + } return phase; } static void parseFields(struct gff3Ann *g3a, char **words) /* parse the field in an annotation record */ { g3a->seqid = unescapeStrPooled(g3a, words[0]); g3a->source = unescapeStrPooled(g3a, words[1]); g3a->type = unescapeStrPooled(g3a, words[2]); g3a->start = gff3FileStrToInt(g3a->file, words[3])-1; g3a->end = gff3FileStrToInt(g3a->file, words[4]); if (!sameString(words[5], ".")) { g3a->score = gff3FileStrToFloat(g3a->file, words[5]); g3a->haveScore = TRUE; } g3a->strand = parseStrand(g3a, words[6]); g3a->phase = parsePhase(g3a, words[7]); if (sameString(g3a->type, "CDS")) { if (g3a->phase < 0) - gff3AnnErr(g3a, "CDS feature must have phase"); + gff3AnnErr(g3a, TRUE, "CDS feature must have phase"); } else { #if 0 // spec unclear; bug report filed // spec currently doesn't restrict phase, unclear if it's allowed on start/stop codon features if (g3a->phase >= 0) - gff3AnnErr(g3a, "phase only allowed on CDS features"); + gff3AnnErr(g3a, TRUE, "phase only allowed on CDS features"); #endif } } /* check that an attribute tag name is valid. */ static boolean checkAttrTag(struct gff3Ann *g3a, char *tag) { // FIXME: spec is not clear on what is a valid tag. char *tc = tag; boolean isOk = (isalnum(*tc) || (*tc == '-') || (*tc == '_')); for (tc++; isOk && (*tc != '\0'); tc++) { if (!((*tc == '-') || (*tc == '_') || isalnum(*tc))) isOk = FALSE; } if (!isOk) - gff3AnnErr(g3a, "invalid attribute tag, must be only alphanumeric, dash or underscore: %s", tag); + { + gff3AnnErr(g3a, TRUE, "invalid attribute tag, must be only alphanumeric, dash or underscore: %s", tag); + if (g3a->file->flags & GFF3_WARN_WHEN_POSSIBLE) + return TRUE; + } return isOk; } static struct slName *parseAttrVals(struct gff3Ann *g3a, char *tag, char *valsStr) /* parse an attribute into its values */ { int i, numVals = chopString(valsStr, ",", NULL, 0); char **vals = needMem((numVals+1)*sizeof(char**)); // +1 allows for no values chopString(valsStr, ",", vals, numVals); struct slName *unescVals = NULL; for (i = 0; i < numVals; i++) slAddHead(&unescVals, unescapeSlName(g3a, vals[i])); if (unescVals == NULL) slAddHead(&unescVals, slNameNew("")); // empty value freeMem(vals); @@ -399,31 +421,31 @@ struct gff3Attr *attr = gff3AnnFindAttr(g3a, tag); if (attr == NULL) { attr = gff3FileAlloc(g3a->file, sizeof(struct gff3Attr)); attr->tag = gff3FileCloneStr(g3a->file, tag); slAddHead(&g3a->attrs, attr); } attr->vals = slCat(attr->vals, parseAttrVals(g3a, tag, valStr)); } static void parseAttr(struct gff3Ann *g3a, char *attrStr) /* parse one attribute from an annotation record */ { char *eq = strchr(attrStr, '='); if ((eq == NULL) || (eq == attrStr)) - gff3AnnErr(g3a, "expected name=value: %s", attrStr); + gff3AnnErr(g3a, FALSE, "expected name=value: %s", attrStr); else { char *tag = attrStr; char *vals = eq+1; *eq = '\0'; unescapeStr(g3a, tag, tag); if (checkAttrTag(g3a, tag)) addAttr(g3a, tag, vals); } } static void parseAttrs(struct gff3Ann *g3a, char *attrsCol) /* parse the attribute column in an annotation record */ { int i, numAttrs = chopString(attrsCol, ";", NULL, 0); @@ -431,31 +453,31 @@ chopString(attrsCol, ";", attrStrs, numAttrs); for (i = 0; i < numAttrs; i++) { char *attrStr = trimSpaces(attrStrs[i]); if (strlen(attrStr) > 0) parseAttr(g3a, attrStr); } freeMem(attrStrs); slReverse(&g3a->attrs); } static void checkSingleValAttr(struct gff3Ann *g3a, struct gff3Attr *attr) /* validate that an attribute has only one value */ { if (attr->vals->next != NULL) - gff3AnnErr(g3a, "attribute %s must have a single value, found multiple comma-separated values", attr->tag); + gff3AnnErr(g3a, FALSE, "attribute %s must have a single value, found multiple comma-separated values", attr->tag); } static void parseIDAttr(struct gff3Ann *g3a, struct gff3Attr *attr) /* parse the ID attribute */ { checkSingleValAttr(g3a, attr); g3a->id = attr->vals->name; // link into other parts of feature if discontinuous struct hashEl *hel = hashStore(g3a->file->byId, g3a->id); struct gff3Ann *head = hel->val; if (head != NULL) head->prevPart = g3a; g3a->nextPart = head; hel->val = g3a; } @@ -552,54 +574,54 @@ else if (sameString(attr->tag, gff3AttrTarget)) parseTargetAttr(g3a, attr); else if (sameString(attr->tag, gff3AttrGap)) parseGapAttr(g3a, attr); else if (sameString(attr->tag, gff3AttrDerivesFrom)) parseDerivesFromAttr(g3a, attr); else if (sameString(attr->tag, gff3AttrIsCircular)) parseIsCircular(g3a, attr); else if (sameString(attr->tag, gff3AttrNote)) parseNoteAttr(g3a, attr); else if (sameString(attr->tag, gff3AttrDbxref)) parseDbxrefAttr(g3a, attr); else if (sameString(attr->tag, gff3AttrOntologyTerm)) parseOntologyTermAttr(g3a, attr); else - gff3AnnErr(g3a, "unknown standard attribute, user defined attributes must start with a lower-case letter: %s", attr->tag); + gff3AnnErr(g3a, TRUE, "unknown standard attribute, user defined attributes must start with a lower-case letter: %s", attr->tag); } static void parseStdAttrs(struct gff3Ann *g3a) /* parse standard attributes (starting with upper case) into attributes * have been parsed into attribute list, which would have merged multiply * specified attributes. */ { struct gff3Attr *attr; for (attr = g3a->attrs; attr != NULL; attr = attr->next) { if (isupper(attr->tag[0])) parseStdAttr(g3a, attr); } } static void parseAnn(struct gff3File *g3f, char *line) /* parse an annotation line */ { // extra column to check for too many char *words[gffNumCols+1]; int numWords = chopString(line, "\t", words, gffNumCols+1); if (numWords != gffNumCols) - gff3FileErr(g3f, "expected %d tab-separated columns: %s", gffNumCols, line); + gff3FileErr(g3f, FALSE, "expected %d tab-separated columns: %s", gffNumCols, line); else { struct gff3Ann *g3a = gff3FileAlloc(g3f, sizeof(struct gff3Ann)); g3a->file = g3f; g3a->lineNum = g3f->lf->lineIx; parseFields(g3a, words); parseAttrs(g3a, words[8]); parseStdAttrs(g3a); slAddHead(&g3f->anns, gff3AnnRefNew(g3a)); } } static void writeAttr(struct gff3Attr *attr, FILE *fh) /* write one attribute and it's values */ { @@ -694,31 +716,31 @@ static void parseSequenceRegion(struct gff3File *g3f, char *line) /* parse ##sequence-region seqid start end */ { char **words = dynChopStringWhite(g3f, line, 4, 4, NULL, "\"##sequence-region seqid start end\""); if (words == NULL) return; // got an error struct gff3SeqRegion *sr = gff3FileAlloc(g3f, sizeof(struct gff3SeqRegion)); sr->seqid = gff3FileCloneStr(g3f, words[1]); sr->start = gff3FileStrToInt(g3f, words[2])-1; sr->end = gff3FileStrToInt(g3f, words[3]); if (g3f->seqRegionMap == NULL) g3f->seqRegionMap = hashNew(0); struct hashEl *hel = hashStore(g3f->seqRegionMap, sr->seqid); if (hel->val != NULL) - gff3FileErr(g3f, "duplicate ##sequence-region for %s", sr->seqid); + gff3FileErr(g3f, TRUE, "duplicate ##sequence-region for %s", sr->seqid); else { hel->val = sr; slAddHead(&g3f->seqRegions, sr); } freeMem(words); } static void writeSequenceRegions(struct gff3File *g3f, FILE *fh) /* parse ##sequence-region metadata */ { struct gff3SeqRegion *sr; for (sr = g3f->seqRegions; sr != NULL; sr = sr->next) fprintf(fh, "##sequence-region %s %d %d\n", sr->seqid, sr->start, sr->end); } @@ -791,31 +813,34 @@ return; // got an error slSafeAddHead(&g3f->species, gff3FileSlNameNew(g3f, words[1])); freeMem(words); } static void writeSpecies(struct gff3File *g3f, FILE *fh) /* write ##species NCBI_Taxonomy_URI */ { writeSlNameMetas("##species", g3f->species, fh); } static void parseGenomeBuild(struct gff3File *g3f, char *line) /* parse ##genome-build source buildName */ { if (g3f->genomeBuildSource != NULL) - gff3FileErr(g3f, "multiple ##genome-build records"); + { + gff3FileErr(g3f, TRUE, "multiple ##genome-build records"); + return; // skip + } char **words = dynChopStringWhite(g3f, line, 3, 3, NULL, "\"##genome-build source buildName\""); if (words == NULL) return; // got an error g3f->genomeBuildSource = gff3FileCloneStr(g3f, words[1]); g3f->genomeBuildName = gff3FileCloneStr(g3f, words[2]); freeMem(words); } static void writeGenomeBuild(struct gff3File *g3f, FILE *fh) /* parse ##genome-build source buildName */ { if (g3f->genomeBuildSource != NULL) fprintf(fh, "##genome-build %s %s\n", g3f->genomeBuildSource, g3f->genomeBuildName); } @@ -835,56 +860,56 @@ else if (startsWithWord("##attribute-ontology", line)) parseAttributeOntology(g3f, line); else if (startsWithWord("##source-ontology", line)) parseSourceOntology(g3f, line); else if (startsWithWord("##species", line)) parseSpecies(g3f, line); else if (startsWithWord("##genome-build", line)) parseGenomeBuild(g3f, line); else if (startsWithWord("##gff-spec-version", line) || startsWithWord("##source-version", line) || startsWithWord("##source", line) || startsWithWord("##date", line) || startsWithWord("##Type", line)) ; /* FIXME: silently ignore these. Mark says. */ else - gff3FileErr(g3f, "invalid meta line: %s", line); + gff3FileErr(g3f, TRUE, "invalid meta line: %s", line); } static void parseLine(struct gff3File *g3f, char *line) /* parse one line of a gff3 file */ { if (startsWith("##", line)) parseMeta(g3f, line); else if (!startsWith("#", line) && (strlen(line) > 0)) parseAnn(g3f, line); } static void parseHeader(struct gff3File *g3f) /* parse and validate a GFF3 header */ { char *line; if (!lineFileNext(g3f->lf, &line, NULL)) - gff3FileErr(g3f, "empty GFF file, must have header"); + gff3FileErr(g3f, FALSE, "empty GFF file, must have header"); char *ver = skipToSpaces(line); if (*ver != '\0') { *ver++ = '\0'; ver = trimSpaces(ver); } if (!(sameString(line, "##gff-version") && sameString(ver, "3"))) - gff3FileErr(g3f, "invalid GFF3 header"); + gff3FileErr(g3f, TRUE, "invalid GFF3 header"); } static void parseFile(struct gff3File *g3f) /* do parsing phase of reading a GFF3 file */ { g3f->lf = lineFileOpen(g3f->fileName, TRUE); parseHeader(g3f); char *line; while (lineFileNext(g3f->lf, &line, NULL)) { parseLine(g3f, line); if (gff3FileStopDueToErrors(g3f)) break; } lineFileClose(&g3f->lf); @@ -895,31 +920,31 @@ /* count the number of gff3Ann objects linked together in a feature */ { int cnt = 0; for (; g3a != NULL; g3a = g3a->nextPart) cnt++; return cnt; } static void discontinFeatureCheck(struct gff3Ann *g3a) /* sanity check linked gff3Ann discontinuous features */ { struct gff3Ann *g3a2; for (g3a2 = g3a->nextPart; (g3a2 != NULL) && !gff3FileStopDueToErrors(g3a->file); g3a2 = g3a2->nextPart) { if (!sameString(g3a->type, g3a2->type)) - gff3AnnErr(g3a, "Annotation records for discontinuous features with ID=\"%s\" do not have the same type, found \"%s\" and \"%s\"", g3a->id, g3a->type, g3a2->type); + gff3AnnErr(g3a, FALSE, "Annotation records for discontinuous features with ID=\"%s\" do not have the same type, found \"%s\" and \"%s\"", g3a->id, g3a->type, g3a2->type); } } static void discontinFeatureFillArray(struct gff3Ann *g3a, int numAnns, struct gff3Ann *featAnns[]) /* convert list to array for sorting */ { int i = 0; for (; g3a != NULL; g3a = g3a->nextPart) featAnns[i++] = g3a; } static struct gff3Ann *discontinFeatureArrayLink(int numAnns, struct gff3Ann *featAnns[]) /* convert sorted array to a list */ { struct gff3Ann *g3aHead = NULL, *g3aPrev = NULL; @@ -966,54 +991,58 @@ { struct gff3Ann *g3a = hel->val; if (g3a->nextPart != NULL) { discontinFeatureCheck(g3a); hel->val = discontigFeatureSort(g3a); } } } static struct gff3Ann *resolveRef(struct gff3Ann *g3a, char *id, char *attr) /* resolve a link for an attribute */ { struct gff3Ann *ann = gff3FileFindAnn(g3a->file, id); if (ann == NULL) - gff3AnnErr(g3a, "Can't find annotation record \"%s\" referenced by \"%s\" %s attribute", id, g3a->id, attr); + gff3AnnErr(g3a, TRUE, "Can't find annotation record \"%s\" referenced by \"%s\" %s attribute", id, g3a->id, attr); return ann; } static struct gff3AnnRef *resolveRefs(struct gff3Ann *g3a, struct slName *ids, char *attr) /* resolve links for an attribute */ { struct gff3AnnRef *refs = NULL; struct slName *id; for (id = ids; id != NULL; id = id->next) { struct gff3Ann *ann = resolveRef(g3a, id->name, attr); if (ann != NULL) slSafeAddHead(&refs, gff3AnnRefAlloc(ann)); } return refs; } static void resolveAnn(struct gff3Ann *g3a) /* resolve links for an gff3Ann */ { g3a->parents = resolveRefs(g3a, g3a->parentIds, gff3AttrParent); if (g3a->parents == NULL) + { + // is droppped if there here ignored errors resolving reference + if (g3a->parentIds == NULL) slSafeAddHead(&g3a->file->roots, gff3AnnRefAlloc(g3a)); + } else { struct gff3AnnRef *par; for (par = g3a->parents; par != NULL; par = par->next) slSafeAddHead(&par->ann->children, gff3AnnRefAlloc(g3a)); } if (g3a->derivesFromId != NULL) g3a->derivesFrom = resolveRef(g3a, g3a->derivesFromId, gff3AttrDerivesFrom); } static void resolveAnns(struct gff3File *g3f) /* resolve links */ { struct gff3AnnRef *g3aRef; for (g3aRef = g3f->anns; (g3aRef != NULL) && !gff3FileStopDueToErrors(g3f); g3aRef = g3aRef->next) @@ -1033,38 +1062,39 @@ slReverse(&g3f->sourceOntologies); slReverse(&g3f->species); slReverse(&g3f->seqs); } static struct gff3File *gff3FileNew() /* construct a new, empty gff3File object */ { struct gff3File *g3f; AllocVar(g3f); g3f->byId = hashNew(0); g3f->pool = hashNew(0); return g3f; } -struct gff3File *gff3FileOpen(char *fileName, int maxErr, FILE *errFh) +struct gff3File *gff3FileOpen(char *fileName, int maxErr, unsigned flags, FILE *errFh) /* Parse a GFF3 file into a gff3File object. If maxErr not zero, then * continue to parse until this number of error have been reached. A maxErr * less than zero does not stop reports all errors. Write errors to errFh, * if NULL, use stderr. */ { struct gff3File *g3f = gff3FileNew(); g3f->fileName = gff3FileCloneStr(g3f, fileName); +g3f->flags = flags; g3f->errFh = (errFh != NULL) ? errFh : stderr; g3f->maxErr = (maxErr < 0) ? INT_MAX : maxErr; parseFile(g3f); if (!gff3FileStopDueToErrors(g3f)) resolveFile(g3f); if (g3f->errCnt > 0) errAbort("GFF3: %d parser errors", g3f->errCnt); return g3f; } void gff3FileFree(struct gff3File **g3fPtr) /* Free a gff3File object */ { struct gff3File *g3f = *g3fPtr; if (g3f != NULL)