135eb5b13252cdb26d6e2390108a87348d225781 kent Tue Mar 26 09:34:47 2013 -0700 Unrolling gff changes that broke some of Mark's tests. diff --git src/lib/gff.c src/lib/gff.c index ccbc43c..14309e4 100644 --- src/lib/gff.c +++ src/lib/gff.c @@ -96,82 +96,61 @@ if (nameBuf == NULL) nameBuf = dyStringNew(0); dyStringClear(nameBuf); if (startsWith("gene-", groupName)) groupName += 5; if (startsWith("cc_", groupName)) groupName += 3; dyStringAppend(nameBuf, groupName); return nameBuf->string; } static boolean isGtfGroup(char *group) /* Return TRUE if group field looks like GTF */ { -if (findWordByDelimiter(group, ' ', "gene_id") == NULL) +if (strstr(group, "gene_id") == NULL) return FALSE; if (countChars(group, '"') >= 2) return TRUE; -if (findWordByDelimiter(group, ' ', "transcript_id") != NULL) +if (strstr(group, "transcript_id") != NULL) return TRUE; return FALSE; } boolean gffHasGtfGroup(char *line) /* Return TRUE if line has a GTF group field */ { char *words[10]; char *dupe = cloneString(line); int wordCt = chopTabs(dupe, words); boolean isGtf = FALSE; if (wordCt >= 9) if (isGtfGroup(words[8])) isGtf = TRUE; freeMem(dupe); return isGtf; } static void readQuotedString(char *fileName, int lineIx, char *in, char *out, char **retNext) /* Parse quoted string and abort on error. */ { if (!parseQuotedString(in, out, retNext)) errAbort("Line %d of %s\n", lineIx, fileName); } -void addGroup(struct gffFile *gff, struct gffLine *gl, char *group) -/* Add group to gff if it's not there already, and attach it to gl. */ -{ -struct gffGroup *gg; -struct hashEl *hel; -if ((hel = hashLookup(gff->groupHash, group)) == NULL) - { - AllocVar(gg); - hel = hashAdd(gff->groupHash, group, gg); - gg->name = hel->name; - gg->seq = gl->seq; - gg->source = gl->source; - slAddHead(&gff->groupList, gg); - } -else - { - gg = hel->val; - } -gl->group = gg->name; -} - -static void parseGff2End(char *s, struct gffFile *gff, struct gffLine *gl, +static void parseGtfEnd(char *s, struct gffFile *gff, struct gffLine *gl, char *fileName, int lineIx) /* Read the semi-colon separated end bits of a GTF line into gl and * hashes. */ { char *type, *val; struct hashEl *hel; bool gotSemi; for (;;) { gotSemi = FALSE; if ((type = nextWord(&s)) == NULL) break; s = skipLeadingSpaces(s); if (NULL == s || s[0] == 0) @@ -208,57 +187,69 @@ if ((hel = hashLookup(gff->geneIdHash, val)) == NULL) { AllocVar(gg); hel = hashAdd(gff->geneIdHash, val, gg); gg->name = hel->name; slAddHead(&gff->geneIdList, gg); } else { gg = hel->val; } gl->geneId = gg->name; } else if (sameString("transcript_id", type) && (gl->group == NULL)) { - addGroup(gff, gl, val); + struct gffGroup *gg; + if ((hel = hashLookup(gff->groupHash, val)) == NULL) + { + AllocVar(gg); + hel = hashAdd(gff->groupHash, val, gg); + gg->name = hel->name; + gg->seq = gl->seq; + gg->source = gl->source; + slAddHead(&gff->groupList, gg); + } + else + { + gg = hel->val; + } + gl->group = gg->name; } else if (sameString("exon_id", type)) gl->exonId = gffFileGetStr(gff, val); else if (sameString("exon_number", type)) { if (!isdigit(val[0])) errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName); gl->exonNumber = atoi(val); } else if (sameString("intron_id", type)) gl->intronId = gffFileGetStr(gff, val); else if (sameString("intron_status", type)) gl->intronStatus = gffFileGetStr(gff, val); else if (sameString("protein_id", type)) gl->proteinId = gffFileGetStr(gff, val); else if (sameString("gene_name", type)) gl->geneName = gffFileGetStr(gff, val); else if (sameString("transcript_name", type)) gl->transcriptName = gffFileGetStr(gff, val); } if (gl->group == NULL) { - if (gl->geneId != NULL) - addGroup(gff, gl, gl->geneId); - else - verbose(2, "No gene_id or transcript_id line %d of %s", lineIx, fileName); + if (gl->geneId == NULL) + warn("No gene_id or transcript_id line %d of %s", lineIx, fileName); } } void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, char *fileName, int lineIx) /* Process one row of GFF file (a non-comment line parsed by tabs normally). */ { struct hashEl *hel; struct gffLine *gl; if (wordCount < 8) gffSyntaxError(fileName, lineIx, "Word count less than 8 "); AllocVar(gl); if ((hel = hashLookup(gff->seqHash, words[0])) == NULL) @@ -277,80 +268,67 @@ AllocVar(el); hel = hashAdd(gff->sourceHash, words[1], el); el->name = hel->name; slAddHead(&gff->sourceList, el); } gl->source = hel->name; if ((hel = hashLookup(gff->featureHash, words[2])) == NULL) { struct gffFeature *el; AllocVar(el); hel = hashAdd(gff->featureHash, words[2], el); el->name = hel->name; slAddHead(&gff->featureList, el); } -struct gffFeature *feature = hel->val; -feature->count += 1; - gl->feature = hel->name; if (!isdigit(words[3][0]) || !isdigit(words[4][0])) gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number "); gl->start = atoi(words[3])-1 + baseOffset; gl->end = atoi(words[4]) + baseOffset; gl->score = atof(words[5]); gl->strand = words[6][0]; gl->frame = words[7][0]; if (wordCount >= 9) { - char *groupField = words[8]; if (!gff->typeKnown) { gff->typeKnown = TRUE; - gff->isGtf = isGtfGroup(groupField); + gff->isGtf = isGtfGroup(words[8]); } if (gff->isGtf) { - parseGff2End(groupField, gff, gl, fileName, lineIx); - } - else - { - if (strchr(groupField, ';')) - { - char *dupeGroup = cloneString(groupField); - parseGff2End(dupeGroup, gff, gl, fileName, lineIx); - freeMem(dupeGroup); + parseGtfEnd(words[8], gff, gl, fileName, lineIx); } else { char *tnName = gffTnName(gl->seq, trimSpaces(words[8])); if ((hel = hashLookup(gff->groupHash, tnName)) == NULL) { struct gffGroup *group; AllocVar(group); hel = hashAdd(gff->groupHash, tnName, group); group->name = hel->name; group->seq = gl->seq; group->source = gl->source; slAddHead(&gff->groupList, group); } gl->group = hel->name; } } - } slAddHead(&gff->lineList, gl); } void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset) /* Create a gffFile structure from a GFF file. */ { /* Open file and do basic allocations. */ struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *words[9]; int lineSize, wordCount; while (lineFileNext(lf, &line, &lineSize)) { if (line[0] != '#') @@ -399,134 +377,66 @@ int start = 0x3fffffff; int end = -start; line = group->lineList; group->strand = line->strand; for (; line != NULL; line = line->next) { if (start > line->start) start = line->start; if (end < line->end) end = line->end; } group->start = start; group->end = end; } -#ifdef UNUSED -static boolean allSameSeq(struct gffGroup *group) -/* Return TRUE if all lines of group are for same chrom */ -{ -if (group->lineList == NULL || group->lineList->next == NULL) - return TRUE; -char *seq = group->lineList->seq; -struct gffLine *line; -for (line = group->lineList->next; line != NULL; line = line->next) - if (!sameString(line->seq, seq)) - return FALSE; -return TRUE; -} -#endif /* UNUSED */ - -static struct gffGroup *breakGroupBySeq(struct gffGroup *group) -/* Break up a group that has multiple sequences. Assumes lineList is sorted. */ -{ -char *curSeq = group->lineList->seq; -struct gffLine *line, *next; -struct gffGroup *brokenList = NULL; -for (line = group->lineList; line != NULL; line = next) - { - next = line->next; - if (next != NULL && !sameString(next->seq, curSeq)) - { - curSeq = next->seq; - struct gffGroup *newGroup; - AllocVar(newGroup); - newGroup->name = group->name; - newGroup->seq = curSeq; - newGroup->source = group->source; - line->next = NULL; - newGroup->lineList = next; - slAddHead(&brokenList, group); - group = newGroup; - } - } -slAddHead(&brokenList, group); -slReverse(&brokenList); -return brokenList; -} - -static struct gffGroup *breakMultiSeqGroups(struct gffGroup *oldList) -/* Break up any groups that span multiple chromosomes into one per group. - * Return reworked list. */ -{ -struct gffGroup *newList = NULL, *group, *next; - -for (group = oldList; group != NULL; group = next) - { - next = group->next; - struct gffGroup *groupList = breakGroupBySeq(group); - struct gffGroup *newGroup, *newNext; - for (newGroup = groupList; newGroup != NULL; newGroup = newNext) - { - newNext = newGroup->next; - slAddHead(&newList, newGroup); - } - } -slReverse(&newList); -return newList; -} - - void gffGroupLines(struct gffFile *gff) /* Group lines of gff file together, in process mofing * gff->lineList to gffGroup->lineList. */ { struct gffLine *line, *nextLine; struct hash *groupHash = gff->groupHash; char *groupName; struct gffGroup *group; struct gffLine *ungroupedLines = NULL; for (line = gff->lineList; line != NULL; line = nextLine) { nextLine = line->next; if ((groupName = line->group) != NULL) { struct hashEl *hel = hashLookup(groupHash, groupName); group = hel->val; slAddHead(&group->lineList, line); } else { slAddHead(&ungroupedLines, line); } } /* Restore ungrouped lines to gff->lineList. */ slReverse(&ungroupedLines); gff->lineList = ungroupedLines; -/* Restore order of grouped lines. */ +/* Restore order of grouped lines and fill in start and end. */ for (group = gff->groupList; group != NULL; group = group->next) + { slSort(&group->lineList, gffLineCmp); - -/* Look for groups that traverse multiple chromosomes. Break them apart. */ -gff->groupList = breakMultiSeqGroups(gff->groupList); - -for (group = gff->groupList; group != NULL; group = group->next) getGroupBoundaries(group); } +} void gffOutput(struct gffLine *el, FILE *f, char sep, char lastSep) /* Print out GTF. Separate fields with sep. Follow last field with lastSep. */ { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->seq); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->source); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->feature); if (sep == ',') fputc('"',f);