135eb5b13252cdb26d6e2390108a87348d225781
kent
  Tue Mar 26 09:34:47 2013 -0700
Unrolling gff changes that broke some of Mark's tests.
diff --git src/lib/gff.c src/lib/gff.c
index ccbc43c..14309e4 100644
--- src/lib/gff.c
+++ src/lib/gff.c
@@ -96,82 +96,61 @@
 if (nameBuf == NULL)
     nameBuf = dyStringNew(0);
 dyStringClear(nameBuf);
 if (startsWith("gene-", groupName))
     groupName += 5;
 if (startsWith("cc_", groupName))
     groupName += 3;
 dyStringAppend(nameBuf, groupName);
 
 return nameBuf->string;
 }
 
 static boolean isGtfGroup(char *group)
 /* Return TRUE if group field looks like GTF */
 {
-if (findWordByDelimiter(group, ' ', "gene_id") == NULL)
+if (strstr(group, "gene_id") == NULL)
     return FALSE;
 if (countChars(group, '"') >= 2)
     return TRUE;
-if (findWordByDelimiter(group, ' ', "transcript_id") != NULL)
+if (strstr(group, "transcript_id") != NULL)
     return TRUE;
 return FALSE;
 }
 
 boolean gffHasGtfGroup(char *line)
 /* Return TRUE if line has a GTF group field */
 {
 char *words[10];
 char *dupe = cloneString(line);
 int wordCt = chopTabs(dupe, words);
 boolean isGtf = FALSE;
 if (wordCt >= 9) 
     if (isGtfGroup(words[8]))
         isGtf = TRUE;
 freeMem(dupe);
 return isGtf;
 }
 
 static void readQuotedString(char *fileName, int lineIx, char *in, char *out, char **retNext)
 /* Parse quoted string and abort on error. */
 {
 if (!parseQuotedString(in, out, retNext))
     errAbort("Line %d of %s\n", lineIx, fileName);
 }
 
-void addGroup(struct gffFile *gff, struct gffLine *gl, char *group)
-/* Add group to gff if it's not there already, and attach it to gl. */
-{
-struct gffGroup *gg;
-struct hashEl *hel;
-if ((hel = hashLookup(gff->groupHash, group)) == NULL)
-   {
-   AllocVar(gg);
-   hel = hashAdd(gff->groupHash, group, gg);
-   gg->name = hel->name;
-   gg->seq = gl->seq;
-   gg->source = gl->source;
-   slAddHead(&gff->groupList, gg);
-   }
-else
-   {
-   gg = hel->val;
-   }
-gl->group = gg->name;
-}
-
-static void parseGff2End(char *s, struct gffFile *gff, struct gffLine *gl, 
+static void parseGtfEnd(char *s, struct gffFile *gff, struct gffLine *gl, 
     char *fileName, int lineIx)
 /* Read the semi-colon separated end bits of a GTF line into gl and
  * hashes. */
 {
 char *type, *val;
 struct hashEl *hel;
 bool gotSemi;
 
 for (;;)
    {
    gotSemi = FALSE;
    if ((type = nextWord(&s)) == NULL)
        break;
    s = skipLeadingSpaces(s);
    if (NULL == s || s[0] == 0)
@@ -208,57 +187,69 @@
        if ((hel = hashLookup(gff->geneIdHash, val)) == NULL)
 	   {
 	   AllocVar(gg);
            hel = hashAdd(gff->geneIdHash, val, gg);
 	   gg->name = hel->name;
 	   slAddHead(&gff->geneIdList, gg);
 	   }
 	else
 	   {
 	   gg = hel->val;
 	   }
        gl->geneId = gg->name;
        }
    else if (sameString("transcript_id", type) && (gl->group == NULL))
        {
-       addGroup(gff, gl, val);
+       struct gffGroup *gg;
+       if ((hel = hashLookup(gff->groupHash, val)) == NULL)
+	   {
+	   AllocVar(gg);
+           hel = hashAdd(gff->groupHash, val, gg);
+	   gg->name = hel->name;
+	   gg->seq = gl->seq;
+	   gg->source = gl->source;
+	   slAddHead(&gff->groupList, gg);
+	   }
+	else
+	   {
+	   gg = hel->val;
+	   }
+       gl->group = gg->name;
        }
    else if (sameString("exon_id", type))
        gl->exonId = gffFileGetStr(gff, val);
    else if (sameString("exon_number", type))
        {
        if (!isdigit(val[0]))
            errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName);
        gl->exonNumber = atoi(val);
        }
    else if (sameString("intron_id", type))
        gl->intronId = gffFileGetStr(gff, val);
    else if (sameString("intron_status", type))
        gl->intronStatus = gffFileGetStr(gff, val);
    else if (sameString("protein_id", type))
        gl->proteinId = gffFileGetStr(gff, val);
    else if (sameString("gene_name", type))
        gl->geneName = gffFileGetStr(gff, val);
    else if (sameString("transcript_name", type))
        gl->transcriptName = gffFileGetStr(gff, val);
    }
 if (gl->group == NULL)
     {
-    if (gl->geneId != NULL)
-        addGroup(gff, gl, gl->geneId);
-    else
-        verbose(2, "No gene_id or transcript_id line %d of %s", lineIx, fileName);
+    if (gl->geneId == NULL)
+        warn("No gene_id or transcript_id line %d of %s", lineIx, fileName);
     }
 }
 
 void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, 
     char *fileName, int lineIx)
 /* Process one row of GFF file (a non-comment line parsed by tabs normally). */
 {
 struct hashEl *hel;
 struct gffLine *gl;
 
 if (wordCount < 8)
     gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
 AllocVar(gl);
 
 if ((hel = hashLookup(gff->seqHash, words[0])) == NULL)
@@ -277,80 +268,67 @@
     AllocVar(el);
     hel = hashAdd(gff->sourceHash, words[1], el);
     el->name = hel->name;
     slAddHead(&gff->sourceList, el);
     }
 gl->source = hel->name;
 
 if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
     {
     struct gffFeature *el;
     AllocVar(el);
     hel = hashAdd(gff->featureHash, words[2], el);
     el->name = hel->name;
     slAddHead(&gff->featureList, el);
     }
-struct gffFeature *feature = hel->val;
-feature->count += 1;
-
 gl->feature = hel->name;
 
 if (!isdigit(words[3][0]) || !isdigit(words[4][0]))
    gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");	
 gl->start = atoi(words[3])-1 + baseOffset;
 gl->end = atoi(words[4]) + baseOffset;
 gl->score = atof(words[5]);
 gl->strand = words[6][0];
 gl->frame = words[7][0];
 
 if (wordCount >= 9)
     {
-    char *groupField = words[8];
     if (!gff->typeKnown)
 	{
 	gff->typeKnown = TRUE;
-	gff->isGtf = isGtfGroup(groupField);
+	gff->isGtf = isGtfGroup(words[8]);
 	}
     if (gff->isGtf)
 	{
-	parseGff2End(groupField, gff, gl, fileName, lineIx);
-	}
-    else
-	{
-	if (strchr(groupField, ';'))
-	    {
-	    char *dupeGroup = cloneString(groupField);
-	    parseGff2End(dupeGroup, gff, gl, fileName, lineIx);
-	    freeMem(dupeGroup);
+	parseGtfEnd(words[8], gff, gl, fileName, lineIx);
 	    }
 	else
 	    {
 	    char *tnName = gffTnName(gl->seq, trimSpaces(words[8]));
 	    if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
 		{
 		struct gffGroup *group;
 		AllocVar(group);
 		hel = hashAdd(gff->groupHash, tnName, group);
 		group->name = hel->name;
 		group->seq = gl->seq;
 		group->source = gl->source;
 		slAddHead(&gff->groupList, group);
 		}
 	    gl->group = hel->name;
 	    }
 	}
-    }
 slAddHead(&gff->lineList, gl);
 }
 
 
 void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset)
 /* Create a gffFile structure from a GFF file. */
 {
 /* Open file and do basic allocations. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *words[9];
 int lineSize, wordCount;
 
 while (lineFileNext(lf, &line, &lineSize))
     {
     if (line[0] != '#')
@@ -399,134 +377,66 @@
 int start = 0x3fffffff;
 int end = -start;
 line = group->lineList;
 group->strand = line->strand;
 for (; line != NULL; line = line->next)
     {
     if (start > line->start)
 	start = line->start;
     if (end < line->end)
 	end = line->end;
     }
 group->start = start;
 group->end = end;
 }
 
-#ifdef UNUSED
-static boolean allSameSeq(struct gffGroup *group)
-/* Return TRUE if all lines of group are for same chrom */
-{
-if (group->lineList == NULL || group->lineList->next == NULL)
-    return TRUE;
-char *seq = group->lineList->seq;
-struct gffLine *line;
-for (line = group->lineList->next; line != NULL; line = line->next)
-    if (!sameString(line->seq, seq))
-        return FALSE;
-return TRUE;
-}
-#endif /* UNUSED */
-
-static struct gffGroup *breakGroupBySeq(struct gffGroup *group)
-/* Break up a group that has multiple sequences.  Assumes lineList is sorted. */
-{
-char *curSeq = group->lineList->seq;
-struct gffLine *line, *next;
-struct gffGroup *brokenList = NULL;
-for (line = group->lineList; line != NULL; line = next)
-    {
-    next = line->next;
-    if (next != NULL && !sameString(next->seq, curSeq))
-        {
-	curSeq = next->seq;
-	struct gffGroup *newGroup;
-	AllocVar(newGroup);
-	newGroup->name = group->name;
-	newGroup->seq = curSeq;
-	newGroup->source = group->source;
-	line->next = NULL;
-	newGroup->lineList = next;
-	slAddHead(&brokenList, group);
-	group = newGroup;
-	}
-    }
-slAddHead(&brokenList, group);
-slReverse(&brokenList);
-return brokenList;
-}
-
-static struct gffGroup *breakMultiSeqGroups(struct gffGroup *oldList)
-/* Break up any groups that span multiple chromosomes into one per group.
- * Return reworked list. */
-{
-struct gffGroup *newList = NULL, *group, *next;
-
-for (group = oldList; group != NULL; group = next)
-    {
-    next = group->next;
-    struct gffGroup *groupList = breakGroupBySeq(group);
-    struct gffGroup *newGroup, *newNext;
-    for (newGroup = groupList; newGroup != NULL; newGroup = newNext)
-	{
-	newNext = newGroup->next;
-	slAddHead(&newList, newGroup);
-	}
-    }
-slReverse(&newList);
-return newList;
-}
-
-
 void gffGroupLines(struct gffFile *gff)
 /* Group lines of gff file together, in process mofing
  * gff->lineList to gffGroup->lineList. */
 {
 struct gffLine *line, *nextLine;
 struct hash *groupHash = gff->groupHash;
 char *groupName;
 struct gffGroup *group;
 struct gffLine *ungroupedLines = NULL;
 
 for (line = gff->lineList; line != NULL; line = nextLine)
     {
     nextLine = line->next;
     if ((groupName = line->group) != NULL)
 	{
 	struct hashEl *hel = hashLookup(groupHash, groupName);
 	group = hel->val;
 	slAddHead(&group->lineList, line);
 	}
     else
 	{
 	slAddHead(&ungroupedLines, line);
 	}
     }
 
 /* Restore ungrouped lines to gff->lineList. */
 slReverse(&ungroupedLines);
 gff->lineList = ungroupedLines;
 
-/* Restore order of grouped lines. */
+/* Restore order of grouped lines and fill in start and end. */
 for (group = gff->groupList; group != NULL; group = group->next)
+    {
     slSort(&group->lineList, gffLineCmp);
-
-/* Look for groups that traverse multiple chromosomes.  Break them apart. */
-gff->groupList = breakMultiSeqGroups(gff->groupList);
-
-for (group = gff->groupList; group != NULL; group = group->next)
     getGroupBoundaries(group);
 }
+}
 
 void gffOutput(struct gffLine *el, FILE *f, char sep, char lastSep) 
 /* Print out GTF.  Separate fields with sep. Follow last field with lastSep. */
 {
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->seq);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->source);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->feature);
 if (sep == ',') fputc('"',f);