dae4e0aa5ea2ca7d8322580278a017d31230138b
markd
  Wed Feb 2 23:33:05 2011 -0800
fixed memory corruption on very long GFF attribute columns (#2694)
diff --git src/lib/gff.c src/lib/gff.c
index 6fa5e41..ee2c75f 100644
--- src/lib/gff.c
+++ src/lib/gff.c
@@ -1,473 +1,477 @@
 /* gff - routines to read many types of gff and gtf files
  * and turn them into a relatively easy to deal with form
  * in memory.
  *
  * This file is copyright 2002 Jim Kent, but license is hereby
  * granted for all use - public, private or commercial. */
 #include "common.h"
 #include "hash.h"
 #include "linefile.h"
 #include "gff.h"
 #include "obscure.h"
+#include "dystring.h"
 
 static char const rcsid[] = "$Id: gff.c,v 1.24 2009/02/05 20:53:24 hiram Exp $";
 
 void gffGroupFree(struct gffGroup **pGroup)
 /* Free up a gffGroup including lineList. */
 {
 struct gffGroup *group;
 if ((group = *pGroup) != NULL)
     {
     slFreeList(&group->lineList);
     freez(pGroup);
     }
 }
 
 void gffGroupFreeList(struct gffGroup **pList)
 /* Free up a list of gffGroups. */
 {
 struct gffGroup *el, *next;
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     gffGroupFree(&el);
     }
 *pList = NULL;
 }
 
 
 void gffFileFree(struct gffFile **pGff)
 /* Free up a gff file. */
 {
 struct gffFile *gff;
 if ((gff = *pGff) != NULL)
     {
     freeMem(gff->fileName);
     freeHash(&gff->seqHash);
     freeHash(&gff->sourceHash);
     freeHash(&gff->featureHash);
     freeHash(&gff->groupHash);
     freeHash(&gff->geneIdHash);
     freeHash(&gff->strPool);
     slFreeList(&gff->lineList);
     slFreeList(&gff->seqList);
     slFreeList(&gff->sourceList);
     slFreeList(&gff->featureList);
     slFreeList(&gff->geneIdList);
     gffGroupFreeList(&gff->groupList);
     freez(pGff);
     }
 }
 
 static char *gffFileGetStr(struct gffFile *gff, char *str)
 /* get a string from the string pool */
 {
 return hashStore(gff->strPool,  str)->name;
 }
 
 int gffLineCmp(const void *va, const void *vb)
 /* Compare two gffLines. */
 {
 const struct gffLine *a = *((struct gffLine **)va);
 const struct gffLine *b = *((struct gffLine **)vb);
 int diff;
 
 /* for overlaping starts, sort by end, genePredFromGroupedGtf() depends on
  * this */
 diff = strcmp(a->seq, b->seq);
 if (diff == 0)
     diff = a->start - b->start;
 if (diff == 0)
     diff = a->end - b->end;
 return diff;
 }
 
 
 static void gffSyntaxError(char *fileName, int line, char *msg)
 /* Complain about syntax error in GFF file. */
 {
 errAbort("%s Bad line %d of %s:\n", msg, line, fileName);
 }
 
 static char *gffTnName(char *seqName, char *groupName)
 /* Make name that encorperates seq and group names.... */
 {
-static char nameBuf[512];
+static struct dyString *nameBuf = NULL;
+if (nameBuf == NULL)
+    nameBuf = dyStringNew(0);
+dyStringClear(nameBuf);
 if (startsWith("gene-", groupName))
     groupName += 5;
 if (startsWith("cc_", groupName))
     groupName += 3;
-strcpy(nameBuf, groupName);
+dyStringAppend(nameBuf, groupName);
 
-return nameBuf;
+return nameBuf->string;
 }
 
 static boolean isGtfGroup(char *group)
 /* Return TRUE if group field looks like GTF */
 {
 if (strstr(group, "gene_id") == NULL)
     return FALSE;
 if (countChars(group, '"') >= 2)
     return TRUE;
 if (strstr(group, "transcript_id") != NULL)
     return TRUE;
 return FALSE;
 }
 
 boolean gffHasGtfGroup(char *line)
 /* Return TRUE if line has a GTF group field */
 {
 char *words[10];
 char *dupe = cloneString(line);
 int wordCt = chopTabs(dupe, words);
 boolean isGtf = FALSE;
 if (wordCt >= 9) 
     if (isGtfGroup(words[8]))
         isGtf = TRUE;
 freeMem(dupe);
 return isGtf;
 }
 
 static void readQuotedString(char *fileName, int lineIx, char *in, char *out, char **retNext)
 /* Parse quoted string and abort on error. */
 {
 if (!parseQuotedString(in, out, retNext))
     errAbort("Line %d of %s\n", lineIx, fileName);
 }
 
 static void parseGtfEnd(char *s, struct gffFile *gff, struct gffLine *gl, 
     char *fileName, int lineIx)
 /* Read the semi-colon separated end bits of a GTF line into gl and
  * hashes. */
 {
 char *type, *val;
 struct hashEl *hel;
 bool gotSemi;
 
 for (;;)
    {
    gotSemi = FALSE;
    if ((type = nextWord(&s)) == NULL)
        break;
    s = skipLeadingSpaces(s);
    if (NULL == s || s[0] == 0)
        errAbort("Unpaired type(%s)/val on end of gtf line %d of %s", type, lineIx, fileName);
    if (s[0] == '"' || s[0] == '\'')
        {
        val = s;
        readQuotedString(fileName, lineIx, s, val, &s);
        }
    else
        {
        int len;
        val = nextWord(&s);
        len = strlen(val) - 1;
        if (val[len] == ';')
 	   {
 	   val[len] = 0;
 	   len -= 1;
            gotSemi = TRUE;
 	   }
        if (len < 0)
            errAbort("Empty value for %s line %d of %s", type, lineIx, fileName);
        }
    if (s != NULL && !gotSemi)
       {
       s = strchr(s, ';');
       if (s != NULL)
          ++s;
       }
    /* only use the first occurance of gene_id and transcript_id */
    if (sameString("gene_id", type) && (gl->geneId == NULL))
        {
        struct gffGeneId *gg;
        if ((hel = hashLookup(gff->geneIdHash, val)) == NULL)
 	   {
 	   AllocVar(gg);
            hel = hashAdd(gff->geneIdHash, val, gg);
 	   gg->name = hel->name;
 	   slAddHead(&gff->geneIdList, gg);
 	   }
 	else
 	   {
 	   gg = hel->val;
 	   }
        gl->geneId = gg->name;
        }
    else if (sameString("transcript_id", type) && (gl->group == NULL))
        {
        struct gffGroup *gg;
        if ((hel = hashLookup(gff->groupHash, val)) == NULL)
 	   {
 	   AllocVar(gg);
            hel = hashAdd(gff->groupHash, val, gg);
 	   gg->name = hel->name;
 	   gg->seq = gl->seq;
 	   gg->source = gl->source;
 	   slAddHead(&gff->groupList, gg);
 	   }
 	else
 	   {
 	   gg = hel->val;
 	   }
        gl->group = gg->name;
        }
    else if (sameString("exon_id", type))
        gl->exonId = gffFileGetStr(gff, val);
    else if (sameString("exon_number", type))
        {
        if (!isdigit(val[0]))
            errAbort("Expecting number after exon_number, got %s line %d of %s", val, lineIx, fileName);
        gl->exonNumber = atoi(val);
        }
    else if (sameString("intron_id", type))
        gl->intronId = gffFileGetStr(gff, val);
    else if (sameString("intron_status", type))
        gl->intronStatus = gffFileGetStr(gff, val);
    else if (sameString("protein_id", type))
        gl->proteinId = gffFileGetStr(gff, val);
    else if (sameString("gene_name", type))
        gl->geneName = gffFileGetStr(gff, val);
    else if (sameString("transcript_name", type))
        gl->transcriptName = gffFileGetStr(gff, val);
    }
 if (gl->group == NULL)
     {
     if (gl->geneId == NULL)
         warn("No gene_id or transcript_id line %d of %s", lineIx, fileName);
     }
 }
 
 void gffFileAddRow(struct gffFile *gff, int baseOffset, char *words[], int wordCount, 
     char *fileName, int lineIx)
 /* Process one row of GFF file (a non-comment line parsed by tabs normally). */
 {
 struct hashEl *hel;
 struct gffLine *gl;
 
 if (wordCount < 8)
     gffSyntaxError(fileName, lineIx, "Word count less than 8 ");
 AllocVar(gl);
 
 if ((hel = hashLookup(gff->seqHash, words[0])) == NULL)
     {
     struct gffSeqName *el;
     AllocVar(el);
     hel = hashAdd(gff->seqHash, words[0], el);
     el->name = hel->name;
     slAddHead(&gff->seqList, el);
     }
 gl->seq = hel->name;
 
 if ((hel = hashLookup(gff->sourceHash, words[1])) == NULL)
     {
     struct gffSource *el;
     AllocVar(el);
     hel = hashAdd(gff->sourceHash, words[1], el);
     el->name = hel->name;
     slAddHead(&gff->sourceList, el);
     }
 gl->source = hel->name;
 
 if ((hel = hashLookup(gff->featureHash, words[2])) == NULL)
     {
     struct gffFeature *el;
     AllocVar(el);
     hel = hashAdd(gff->featureHash, words[2], el);
     el->name = hel->name;
     slAddHead(&gff->featureList, el);
     }
 gl->feature = hel->name;
 
 if (!isdigit(words[3][0]) || !isdigit(words[4][0]))
    gffSyntaxError(fileName, lineIx, "col 3 or 4 not a number ");	
 gl->start = atoi(words[3])-1 + baseOffset;
 gl->end = atoi(words[4]) + baseOffset;
 gl->score = atof(words[5]);
 gl->strand = words[6][0];
 gl->frame = words[7][0];
 
 if (wordCount >= 9)
     {
     if (!gff->typeKnown)
 	{
 	gff->typeKnown = TRUE;
 	gff->isGtf = isGtfGroup(words[8]);
 	}
     if (gff->isGtf)
 	{
 	parseGtfEnd(words[8], gff, gl, fileName, lineIx);
 	}
     else
 	{
 	char *tnName = gffTnName(gl->seq, trimSpaces(words[8]));
 	if ((hel = hashLookup(gff->groupHash, tnName)) == NULL)
 	    {
 	    struct gffGroup *group;
 	    AllocVar(group);
 	    hel = hashAdd(gff->groupHash, tnName, group);
 	    group->name = hel->name;
 	    group->seq = gl->seq;
 	    group->source = gl->source;
 	    slAddHead(&gff->groupList, group);
 	    }
 	gl->group = hel->name;
 	}
     }
 slAddHead(&gff->lineList, gl);
 }
 
 
 void gffFileAdd(struct gffFile *gff, char *fileName, int baseOffset)
 /* Create a gffFile structure from a GFF file. */
 {
 /* Open file and do basic allocations. */
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line, *words[9];
 int lineSize, wordCount;
 
 while (lineFileNext(lf, &line, &lineSize))
     {
     if (line[0] != '#')
 	{
 	wordCount = chopTabs(line, words);
         if (wordCount > 0)
             gffFileAddRow(gff, baseOffset, words, wordCount, lf->fileName, lf->lineIx);
 	}
     }
 slReverse(&gff->lineList);
 slReverse(&gff->seqList);
 slReverse(&gff->sourceList);
 slReverse(&gff->featureList);
 slReverse(&gff->groupList);
 slReverse(&gff->geneIdList);
 lineFileClose(&lf);
 }
 
 struct gffFile *gffFileNew(char *fileName)
 /* Create a new gffFile structure. */
 {
 struct gffFile *gff;
 AllocVar(gff);
 gff->fileName = cloneString(fileName);
 gff->seqHash = newHash(18);
 gff->sourceHash = newHash(6);
 gff->featureHash = newHash(6);
 gff->groupHash = newHash(16);
 gff->geneIdHash = newHash(16);
 gff->strPool = newHash(20);
 return gff;
 }
 
 struct gffFile *gffRead(char *fileName)
 /* Create a gffFile structure from a GFF file. */
 {
 struct gffFile *gff = gffFileNew(fileName);
 gffFileAdd(gff, fileName, 0);
 return gff;
 }
 
 static void getGroupBoundaries(struct gffGroup *group)
 /* Fill in start, end, strand of group from lines. */
 {
 struct gffLine *line;
 int start = 0x3fffffff;
 int end = -start;
 line = group->lineList;
 group->strand = line->strand;
 for (; line != NULL; line = line->next)
     {
     if (start > line->start)
 	start = line->start;
     if (end < line->end)
 	end = line->end;
     }
 group->start = start;
 group->end = end;
 }
 
 void gffGroupLines(struct gffFile *gff)
 /* Group lines of gff file together, in process mofing
  * gff->lineList to gffGroup->lineList. */
 {
 struct gffLine *line, *nextLine;
 struct hash *groupHash = gff->groupHash;
 char *groupName;
 struct gffGroup *group;
 struct gffLine *ungroupedLines = NULL;
 
 for (line = gff->lineList; line != NULL; line = nextLine)
     {
     nextLine = line->next;
     if ((groupName = line->group) != NULL)
 	{
 	struct hashEl *hel = hashLookup(groupHash, groupName);
 	group = hel->val;
 	slAddHead(&group->lineList, line);
 	}
     else
 	{
 	slAddHead(&ungroupedLines, line);
 	}
     }
 
 /* Restore ungrouped lines to gff->lineList. */
 slReverse(&ungroupedLines);
 gff->lineList = ungroupedLines;
 
 /* Restore order of grouped lines and fill in start and end. */
 for (group = gff->groupList; group != NULL; group = group->next)
     {
     slSort(&group->lineList, gffLineCmp);
     getGroupBoundaries(group);
     }
 }
 
 void gffOutput(struct gffLine *el, FILE *f, char sep, char lastSep) 
 /* Print out GTF.  Separate fields with sep. Follow last field with lastSep. */
 {
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->seq);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->source);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->feature);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 fprintf(f, "%u", el->start+1);
 fputc(sep,f);
 fprintf(f, "%u", el->end);
 fputc(sep,f);
 fprintf(f, "%f", el->score);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%c", el->strand);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%c", el->frame);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 if (el->geneId != NULL)
     fprintf(f, "gene_id %s\"%s%s\"; ",
 	    (sep == ',') ? "\\" : "",
 	    el->geneId,
 	    (sep == ',') ? "\\" : "");
 fprintf(f, "transcript_id %s\"%s%s\"; ",
 	(sep == ',') ? "\\" : "",
 	el->group,
 	(sep == ',') ? "\\" : "");
 if (el->exonId != NULL)
     fprintf(f, "exon_id %s\"%s%s\"; ",
 	    (sep == ',') ? "\\" : "",
 	    el->exonId,
 	    (sep == ',') ? "\\" : "");
 if (sep == ',') fputc('"',f);
 fputc(lastSep,f);
 }