0c93cc06f5419af125bdeb09ab888b2bf4123ae7
kent
  Tue Mar 19 17:09:25 2013 -0700
Making a rather general gffToBed program.
diff --git src/lib/gff.c src/lib/gff.c
index 14309e4..f3a93ea 100644
--- src/lib/gff.c
+++ src/lib/gff.c
@@ -96,61 +96,82 @@
 if (nameBuf == NULL)
     nameBuf = dyStringNew(0);
 dyStringClear(nameBuf);
 if (startsWith("gene-", groupName))
     groupName += 5;
 if (startsWith("cc_", groupName))
     groupName += 3;
 dyStringAppend(nameBuf, groupName);
 
 return nameBuf->string;
 }
 
 static boolean isGtfGroup(char *group)
 /* Return TRUE if group field looks like GTF */
 {
-if (strstr(group, "gene_id") == NULL)
+if (findWordByDelimiter(group, ' ', "gene_id") == NULL)
     return FALSE;
 if (countChars(group, '"') >= 2)
     return TRUE;
-if (strstr(group, "transcript_id") != NULL)
+if (findWordByDelimiter(group, ' ', "transcript_id") != NULL)
     return TRUE;
 return FALSE;
 }
 
 boolean gffHasGtfGroup(char *line)
 /* Return TRUE if line has a GTF group field */
 {
 char *words[10];
 char *dupe = cloneString(line);
 int wordCt = chopTabs(dupe, words);
 boolean isGtf = FALSE;
 if (wordCt >= 9) 
     if (isGtfGroup(words[8]))
         isGtf = TRUE;
 freeMem(dupe);
 return isGtf;
 }
 
 static void readQuotedString(char *fileName, int lineIx, char *in, char *out, char **retNext)
 /* Parse quoted string and abort on error. */
 {
 if (!parseQuotedString(in, out, retNext))
     errAbort("Line %d of %s\n", lineIx, fileName);
 }
 
-static void parseGtfEnd(char *s, struct gffFile *gff, struct gffLine *gl, 
+void addGroup(struct gffFile *gff, struct gffLine *gl, char *group)
+/* Add group to gff if it's not there already, and attach it to gl. */
+{
+struct gffGroup *gg;
+struct hashEl *hel;
+if ((hel = hashLookup(gff->groupHash, group)) == NULL)
+   {
+   AllocVar(gg);
+   hel = hashAdd(gff->groupHash, group, gg);
+   gg->name = hel->name;
+   gg->seq = gl->seq;
+   gg->source = gl->source;
+   slAddHead(&gff->groupList, gg);
+   }
+else
+   {
+   gg = hel->val;
+   }
+gl->group = gg->name;
+}
+
+static void parseGff2End(char *s, struct gffFile *gff, struct gffLine *gl, 
     char *fileName, int lineIx)
 /* Read the semi-colon separated end bits of a GTF line into gl and
  * hashes. */
 {
 char *type, *val;
 struct hashEl *hel;
 bool gotSemi;
 
 for (;;)
    {
    gotSemi = FALSE;
    if ((type = nextWord(&s)) == NULL)
        break;
    s = skipLeadingSpaces(s);
    if (NULL == s || s[0] == 0)
@@ -187,68 +208,56 @@
        if ((hel = hashLookup(gff->geneIdHash, val)) == NULL)
 	   {
 	   AllocVar(gg);
            hel = hashAdd(gff->geneIdHash, val, gg);
 	   gg->name = hel->name;
 	   slAddHead(&gff->geneIdList, gg);
 	   }
 	else
 	   {
 	   gg = hel->val;
 	   }
        gl->geneId = gg->name;
        }
    else if (sameString("transcript_id", type) && (gl->group == NULL))
        {
-       struct gffGroup *gg;
-       if ((hel = hashLookup(gff->groupHash, val)) == NULL)
-	   {
-	   AllocVar(gg);
-           hel = hashAdd(gff->groupHash, val, gg);
-	   gg->name = hel->name;
-	   gg->seq = gl->seq;
-	   gg->source = gl->source;
-	   slAddHead(&gff->groupList, gg);
-	   }
-	else
-	   {
-	   gg = hel->val;
-	   }
-       gl->group = gg->name;
+       addGroup(gff, gl, val);
        }
    else if (sameString("exon_id", type))
        gl->exonId = gffFileGetStr(gff, val);
    else if (sameString("exon_number", type))
        {
        if (!isdigit(val[0]))
            errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName);
        gl->exonNumber = atoi(val);
        }
    else if (sameString("intron_id", type))
        gl->intronId = gffFileGetStr(gff, val);
    else if (sameString("intron_status", type))
        gl->intronStatus = gffFileGetStr(gff, val);
    else if (sameString("protein_id", type))
        gl->proteinId = gffFileGetStr(gff, val);
    else if (sameString("gene_name", type))
        gl->geneName = gffFileGetStr(gff, val);
    else if (sameString("transcript_name", type))
        gl->transcriptName = gffFileGetStr(gff, val);
    }
 if (gl->group == NULL)
     {
-    if (gl->geneId == NULL)
+    if (gl->geneId != NULL)
+        addGroup(gff, gl, gl->geneId);
+    else
         warn("No gene_id or transcript_id line %d of %s", lineIx, fileName);
     }
 }
 
 void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, 
     char *fileName, int lineIx)
 /* Process one row of GFF file (a non-comment line parsed by tabs normally). */
 {
 struct hashEl *hel;
 struct gffLine *gl;
 
 if (wordCount < 8)
     gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
 AllocVar(gl);
 
@@ -268,67 +277,80 @@
     AllocVar(el);
     hel = hashAdd(gff->sourceHash, words[1], el);
     el->name = hel->name;
     slAddHead(&gff->sourceList, el);
     }
 gl->source = hel->name;
 
 if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
     {
     struct gffFeature *el;
     AllocVar(el);
     hel = hashAdd(gff->featureHash, words[2], el);
     el->name = hel->name;
     slAddHead(&gff->featureList, el);
     }
+struct gffFeature *feature = hel->val;
+feature->count += 1;
+
 gl->feature = hel->name;
 
 if (!isdigit(words[3][0]) || !isdigit(words[4][0]))
    gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");	
 gl->start = atoi(words[3])-1 + baseOffset;
 gl->end = atoi(words[4]) + baseOffset;
 gl->score = atof(words[5]);
 gl->strand = words[6][0];
 gl->frame = words[7][0];
 
 if (wordCount >= 9)
     {
+    char *groupField = words[8];
     if (!gff->typeKnown)
 	{
 	gff->typeKnown = TRUE;
-	gff->isGtf = isGtfGroup(words[8]);
+	gff->isGtf = isGtfGroup(groupField);
 	}
     if (gff->isGtf)
 	{
-	parseGtfEnd(words[8], gff, gl, fileName, lineIx);
+	parseGff2End(groupField, gff, gl, fileName, lineIx);
+	}
+    else
+	{
+	if (strchr(groupField, ';'))
+	    {
+	    char *dupeGroup = cloneString(groupField);
+	    parseGff2End(dupeGroup, gff, gl, fileName, lineIx);
+	    freeMem(dupeGroup);
 	}
     else
 	{
 	char *tnName = gffTnName(gl->seq, trimSpaces(words[8]));
 	if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
 	    {
 	    struct gffGroup *group;
 	    AllocVar(group);
 	    hel = hashAdd(gff->groupHash, tnName, group);
 	    group->name = hel->name;
 	    group->seq = gl->seq;
 	    group->source = gl->source;
 	    slAddHead(&gff->groupList, group);
 	    }
 	gl->group = hel->name;
 	}
     }
+    }
 slAddHead(&gff->lineList, gl);
 }
 
 
 void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset)
 /* Create a gffFile structure from a GFF file. */
 {
 /* Open file and do basic allocations. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *words[9];
 int lineSize, wordCount;
 
 while (lineFileNext(lf, &line, &lineSize))
     {
     if (line[0] != '#')