src/lib/gff.c db98805df7428701b205e398607447fa72acac9b

db98805df7428701b205e398607447fa72acac9b
kent
  Tue Mar 19 18:24:48 2013 -0700
Making gff reader routines work on a wider variety of gffs, including those that reuse the same gene names on different chromosomes.  This comes up in pseudo-autosomal parts of X and Y in ensembl gene sets.
diff --git src/lib/gff.c src/lib/gff.c
index f3a93ea..01baa4b 100644
--- src/lib/gff.c
+++ src/lib/gff.c
@@ -399,66 +399,134 @@
 int start = 0x3fffffff;
 int end = -start;
 line = group->lineList;
 group->strand = line->strand;
 for (; line != NULL; line = line->next)
     {
     if (start > line->start)
 	start = line->start;
     if (end < line->end)
 	end = line->end;
     }
 group->start = start;
 group->end = end;
 }
 
+#ifdef UNUSED
+static boolean allSameSeq(struct gffGroup *group)
+/* Return TRUE if all lines of group are for same chrom, i.e. every line's seq matches the first line's seq under sameString.  Also TRUE for a group with zero or one lines.  Only compiled when UNUSED is defined. */
+{
+if (group->lineList == NULL || group->lineList->next == NULL)
+    return TRUE; /* Zero or one line: nothing to compare against. */
+char *seq = group->lineList->seq; /* Reference seq taken from the first line. */
+struct gffLine *line;
+for (line = group->lineList->next; line != NULL; line = line->next)
+    if (!sameString(line->seq, seq))
+        return FALSE; /* First mismatch settles the answer. */
+return TRUE;
+}
+#endif /* UNUSED */
+
+static struct gffGroup *breakGroupBySeq(struct gffGroup *group)
+/* Break up a group that has multiple sequences.  Assumes lineList is sorted; a split is made at every change of seq between adjacent lines, so lines sharing a seq must be adjacent to stay in one group.  Returns a list holding the original group (its lineList cut at the first seq change) plus one newly allocated group per later run; new groups share the original's name and source pointers.  NOTE(review): group->lineList is dereferenced without a NULL check, and the original group's own seq field is not updated here - confirm both are safe given how groups are created and how gffLineCmp orders lines. */
+{
+char *curSeq = group->lineList->seq; /* Seq of the run currently being walked. */
+struct gffLine *line, *next;
+struct gffGroup *brokenList = NULL; /* Finished groups, pushed with slAddHead and reversed at the end. */
+for (line = group->lineList; line != NULL; line = next)
+    {
+    next = line->next; /* Saved up front because line->next may be cut below. */
+    if (next != NULL && !sameString(next->seq, curSeq))
+        {
+	curSeq = next->seq;
+	struct gffGroup *newGroup;
+	AllocVar(newGroup);
+	newGroup->name = group->name; /* Pointer shared with the current group, not copied. */
+	newGroup->seq = curSeq;
+	newGroup->source = group->source;
+	line->next = NULL; /* End the current group's line list at the seq boundary. */
+	newGroup->lineList = next;
+	slAddHead(&brokenList, group);
+	group = newGroup; /* Remaining lines now belong to the new group. */
+	}
+    }
+slAddHead(&brokenList, group); /* The last (or only) group has not been added yet. */
+slReverse(&brokenList);
+return brokenList;
+}
+
+static struct gffGroup *breakMultiSeqGroups(struct gffGroup *oldList)
+/* Break up any groups that span multiple chromosomes into one per group.  Each group on oldList is handed to breakGroupBySeq and every group it returns is moved onto a new list; oldList's own links are not preserved.
+ * Return reworked list. */
+{
+struct gffGroup *newList = NULL, *group, *next;
+
+for (group = oldList; group != NULL; group = next)
+    {
+    next = group->next; /* Save before group gets relinked onto other lists. */
+    struct gffGroup *groupList = breakGroupBySeq(group);
+    struct gffGroup *newGroup, *newNext;
+    for (newGroup = groupList; newGroup != NULL; newGroup = newNext)
+	{
+	newNext = newGroup->next; /* Save before slAddHead relinks newGroup. */
+	slAddHead(&newList, newGroup);
+	}
+    }
+slReverse(&newList); /* Groups were pushed onto the head, so reverse the accumulated list before returning. */
+return newList;
+}
+
+
 void gffGroupLines(struct gffFile *gff)
 /* Group lines of gff file together, in process mofing
  * gff->lineList to gffGroup->lineList. */
 {
 struct gffLine *line, *nextLine;
 struct hash *groupHash = gff->groupHash;
 char *groupName;
 struct gffGroup *group;
 struct gffLine *ungroupedLines = NULL;
 
 for (line = gff->lineList; line != NULL; line = nextLine)
     {
     nextLine = line->next;
     if ((groupName = line->group) != NULL)
 	{
 	struct hashEl *hel = hashLookup(groupHash, groupName);
 	group = hel->val;
 	slAddHead(&group->lineList, line);
 	}
     else
 	{
 	slAddHead(&ungroupedLines, line);
 	}
     }
 
 /* Restore ungrouped lines to gff->lineList. */
 slReverse(&ungroupedLines);
 gff->lineList = ungroupedLines;
 
-/* Restore order of grouped lines and fill in start and end. */
+/* Restore order of grouped lines by sorting each group's lineList with gffLineCmp. */
 for (group = gff->groupList; group != NULL; group = group->next)
-    {
     slSort(&group->lineList, gffLineCmp);
+
+/* Look for groups that traverse multiple chromosomes.  Break them apart, replacing gff->groupList with the reworked list.  This relies on the per-group sort above. */
+gff->groupList = breakMultiSeqGroups(gff->groupList);
+
+for (group = gff->groupList; group != NULL; group = group->next) /* Runs after the split so groups created by it are covered too. */
     getGroupBoundaries(group);
     }
-}
 
 void gffOutput(struct gffLine *el, FILE *f, char sep, char lastSep) 
 /* Print out GTF.  Separate fields with sep. Follow last field with lastSep. */
 {
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->seq);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->source);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->feature);
 if (sep == ',') fputc('"',f);