0c93cc06f5419af125bdeb09ab888b2bf4123ae7 kent Tue Mar 19 17:09:25 2013 -0700 Making a rather general gffToBed program. diff --git src/lib/gff.c src/lib/gff.c index 14309e4..f3a93ea 100644 --- src/lib/gff.c +++ src/lib/gff.c @@ -96,61 +96,82 @@ if (nameBuf == NULL) nameBuf = dyStringNew(0); dyStringClear(nameBuf); if (startsWith("gene-", groupName)) groupName += 5; if (startsWith("cc_", groupName)) groupName += 3; dyStringAppend(nameBuf, groupName); return nameBuf->string; } static boolean isGtfGroup(char *group) /* Return TRUE if group field looks like GTF */ { -if (strstr(group, "gene_id") == NULL) +if (findWordByDelimiter(group, ' ', "gene_id") == NULL) return FALSE; if (countChars(group, '"') >= 2) return TRUE; -if (strstr(group, "transcript_id") != NULL) +if (findWordByDelimiter(group, ' ', "transcript_id") != NULL) return TRUE; return FALSE; } boolean gffHasGtfGroup(char *line) /* Return TRUE if line has a GTF group field */ { char *words[10]; char *dupe = cloneString(line); int wordCt = chopTabs(dupe, words); boolean isGtf = FALSE; if (wordCt >= 9) if (isGtfGroup(words[8])) isGtf = TRUE; freeMem(dupe); return isGtf; } static void readQuotedString(char *fileName, int lineIx, char *in, char *out, char **retNext) /* Parse quoted string and abort on error. */ { if (!parseQuotedString(in, out, retNext)) errAbort("Line %d of %s\n", lineIx, fileName); } -static void parseGtfEnd(char *s, struct gffFile *gff, struct gffLine *gl, +void addGroup(struct gffFile *gff, struct gffLine *gl, char *group) +/* Add group to gff if it's not there already, and attach it to gl. */ +{ +struct gffGroup *gg; +struct hashEl *hel; +if ((hel = hashLookup(gff->groupHash, group)) == NULL) + { + AllocVar(gg); + hel = hashAdd(gff->groupHash, group, gg); + gg->name = hel->name; + gg->seq = gl->seq; + gg->source = gl->source; + slAddHead(&gff->groupList, gg); + } +else + { + gg = hel->val; + } +gl->group = gg->name; +} + +static void parseGff2End(char *s, struct gffFile *gff, struct gffLine *gl, char *fileName, int lineIx) /* Read the semi-colon separated end bits of a GTF line into gl and * hashes. */ { char *type, *val; struct hashEl *hel; bool gotSemi; for (;;) { gotSemi = FALSE; if ((type = nextWord(&s)) == NULL) break; s = skipLeadingSpaces(s); if (NULL == s || s[0] == 0) @@ -187,68 +208,56 @@ if ((hel = hashLookup(gff->geneIdHash, val)) == NULL) { AllocVar(gg); hel = hashAdd(gff->geneIdHash, val, gg); gg->name = hel->name; slAddHead(&gff->geneIdList, gg); } else { gg = hel->val; } gl->geneId = gg->name; } else if (sameString("transcript_id", type) && (gl->group == NULL)) { - struct gffGroup *gg; - if ((hel = hashLookup(gff->groupHash, val)) == NULL) - { - AllocVar(gg); - hel = hashAdd(gff->groupHash, val, gg); - gg->name = hel->name; - gg->seq = gl->seq; - gg->source = gl->source; - slAddHead(&gff->groupList, gg); - } - else - { - gg = hel->val; - } - gl->group = gg->name; + addGroup(gff, gl, val); } else if (sameString("exon_id", type)) gl->exonId = gffFileGetStr(gff, val); else if (sameString("exon_number", type)) { if (!isdigit(val[0])) errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName); gl->exonNumber = atoi(val); } else if (sameString("intron_id", type)) gl->intronId = gffFileGetStr(gff, val); else if (sameString("intron_status", type)) gl->intronStatus = gffFileGetStr(gff, val); else if (sameString("protein_id", type)) gl->proteinId = gffFileGetStr(gff, val); else if (sameString("gene_name", type)) gl->geneName = gffFileGetStr(gff, val); else if (sameString("transcript_name", type)) gl->transcriptName = gffFileGetStr(gff, val); } if (gl->group == NULL) { - if (gl->geneId == NULL) + if (gl->geneId != NULL) + addGroup(gff, gl, gl->geneId); + else warn("No gene_id or transcript_id line %d of %s", lineIx, fileName); } } void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, char *fileName, int lineIx) /* Process one row of GFF file (a non-comment line parsed by tabs normally). */ { struct hashEl *hel; struct gffLine *gl; if (wordCount < 8) gffSyntaxError(fileName, lineIx, "Word count less than 8 "); AllocVar(gl); @@ -268,67 +277,80 @@ AllocVar(el); hel = hashAdd(gff->sourceHash, words[1], el); el->name = hel->name; slAddHead(&gff->sourceList, el); } gl->source = hel->name; if ((hel = hashLookup(gff->featureHash, words[2])) == NULL) { struct gffFeature *el; AllocVar(el); hel = hashAdd(gff->featureHash, words[2], el); el->name = hel->name; slAddHead(&gff->featureList, el); } +struct gffFeature *feature = hel->val; +feature->count += 1; + gl->feature = hel->name; if (!isdigit(words[3][0]) || !isdigit(words[4][0])) gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number "); gl->start = atoi(words[3])-1 + baseOffset; gl->end = atoi(words[4]) + baseOffset; gl->score = atof(words[5]); gl->strand = words[6][0]; gl->frame = words[7][0]; if (wordCount >= 9) { + char *groupField = words[8]; if (!gff->typeKnown) { gff->typeKnown = TRUE; - gff->isGtf = isGtfGroup(words[8]); + gff->isGtf = isGtfGroup(groupField); } if (gff->isGtf) { - parseGtfEnd(words[8], gff, gl, fileName, lineIx); + parseGff2End(groupField, gff, gl, fileName, lineIx); + } + else + { + if (strchr(groupField, ';')) + { + char *dupeGroup = cloneString(groupField); + parseGff2End(dupeGroup, gff, gl, fileName, lineIx); + freeMem(dupeGroup); } else { char *tnName = gffTnName(gl->seq, trimSpaces(words[8])); if ((hel = hashLookup(gff->groupHash, tnName)) == NULL) { struct gffGroup *group; AllocVar(group); hel = hashAdd(gff->groupHash, tnName, group); group->name = hel->name; group->seq = gl->seq; group->source = gl->source; slAddHead(&gff->groupList, group); } gl->group = hel->name; } } + } slAddHead(&gff->lineList, gl); } void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset) /* Create a gffFile structure from a GFF file. */ { /* Open file and do basic allocations. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *words[9]; int lineSize, wordCount; while (lineFileNext(lf, &line, &lineSize)) { if (line[0] != '#')