src/lib/bigBed.c 1.14

1.14 2009/04/17 23:23:23 kent
Improving error detection of bedToBigBed.
Index: src/lib/bigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bigBed.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 4 -r1.13 -r1.14
--- src/lib/bigBed.c	16 Mar 2009 18:34:46 -0000	1.13
+++ src/lib/bigBed.c	17 Apr 2009 23:23:23 -0000	1.14
@@ -9,8 +9,9 @@
 #include "dystring.h"
 #include "rangeTree.h"
 #include "cirTree.h"
 #include "bPlusTree.h"
+#include "basicBed.h"
 #include "asParse.h"
 #include "sig.h"
 #include "udc.h"
 #include "bbiFile.h"
@@ -68,45 +69,93 @@
 }
 
 
 static struct ppBed *ppBedLoadAll(char *fileName, struct hash *chromHash, struct lm *lm, 
-	bits64 *retDiskSize, bits16 *retFieldCount)
+	struct asObject *as, int definedFieldCount, bits64 *retDiskSize, bits16 *retFieldCount)
 /* Read bed file and return it as list of ppBeds. The whole thing will
  * be allocated in the passed in lm - don't ppBedFreeList or slFree
  * list! */
 {
 struct ppBed *pbList = NULL, *pb;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line;
 bits64 diskSize = 0;
-int fieldCount = 0;
+int fieldCount = 0, fieldAlloc=0;
+char **row = NULL;
 while (lineFileNextReal(lf, &line))
     {
+    /* First time through figure out the field count, and if not set, the defined field count. */
     if (fieldCount == 0)
+	{
+	if (as == NULL)
+	    {
         fieldCount = chopByWhite(line, NULL, 0);
-    int i;
-    char *words[3];
-    for (i=0; i<3; ++i)
-	words[i] = nextWord(&line);
+	    if (definedFieldCount == 0)
+		definedFieldCount = fieldCount;
+	    if (as == NULL)
+		{
+		char *asText = bedAsDef(definedFieldCount, fieldCount);
+		as = asParseText(asText);
+		freeMem(asText);
+		}
+	    }
+	else
+	    {
+	    fieldCount = slCount(as->columnList);
+	    }
+	fieldAlloc = fieldCount+1;
+	AllocArray(row, fieldAlloc);
+	}
+
+    /* Chop up line and make sure the word count is right. */
+    int wordCount = chopByWhite(line, row, fieldAlloc);
+    lineFileExpectWords(lf, fieldCount, wordCount);
+
+    /* Allocate variable and fill in first three fields. */
     lmAllocVar(lm, pb);
-    char *chrom = words[0];
+    char *chrom = row[0];
     struct hashEl *hel = hashLookup(chromHash, chrom);
     if (hel == NULL)
         errAbort("%s is not in chrom.sizes line %d of %s", chrom, lf->lineIx, lf->fileName);
     pb->chrom = hel->name;
-    pb->start = lineFileNeedNum(lf, words, 1);
-    pb->end = lineFileNeedNum(lf, words, 2);
-    int len = 0;
-    line = skipLeadingSpaces(line);
-    if (line != NULL)
+    pb->start = lineFileNeedNum(lf, row, 1);
+    pb->end = lineFileNeedNum(lf, row, 2);
+    int i;
+
+    /* Check remaining fields are formatted right, and concatenate them into "rest" string. */
+    if (fieldCount > 3)
 	{
-        len = strlen(line);
-	pb->rest = lmCloneString(lm, skipLeadingSpaces(line));
+	/* Count up string sizes and allocate something big enough. */
+	int textSize = 0;
+	for (i=3; i<fieldCount; ++i)
+	    textSize += strlen(row[i]) + 1;
+	char *s = pb->rest = lmAlloc(lm, textSize);
+
+	/* Go through and check that numerical strings really are numerical. */
+	struct asColumn *asCol = slElementFromIx(as->columnList, 3);
+	for (i=3; i<fieldCount; ++i)
+	    {
+	    enum asTypes type = asCol->lowType->type;
+	    if (asTypesIsInt(type))
+	        lineFileNeedFullNum(lf, row, i);
+	    else if (asTypesIsFloating(type))
+	        lineFileNeedDouble(lf, row, i);
+	    int len = strlen(row[i]);
+	    memcpy(s, row[i], len);
+	    s[len] = '\t';
+	    s += len+1;
+	    asCol = asCol->next;
+	    }
+	/* Convert final tab to a zero. */
+	pb->rest[textSize-1] = 0;
+	diskSize += textSize + 3*sizeof(bits32);
 	}
-    diskSize += len + 1 + 3*sizeof(bits32);
+    else
+        diskSize += 3*sizeof(bits32) + 1;  /* Still will write terminal 0 */
     slAddHead(&pbList, pb);
     }
 slReverse(&pbList);
+freeMem(row);
 *retDiskSize = diskSize;
 *retFieldCount = fieldCount;
 return pbList;
 }
@@ -236,15 +285,26 @@
 bits64 reductionDataOffsets[10];
 bits64 reductionIndexOffsets[10];
 bits16 fieldCount;
 
+/* Load up as object if defined in file. */
+struct asObject *as = NULL;
+if (asFileName != NULL)
+    {
+    /* Parse it and do sanity check. */
+    as = asParseFile(asFileName);
+    if (as->next != NULL)
+        errAbort("Can only handle .as files containing a single object.");
+    }
+
 /* Load in chromosome sizes. */
 struct hash *chromHash = bbiChromSizesFromFile(chromSizes);
 verbose(1, "Read %d chromosomes and sizes from %s\n",  chromHash->elCount, chromSizes);
 
 /* Load and sort input file. */
 bits64 fullSize;
-struct ppBed *pb, *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, &fullSize, &fieldCount);
+struct ppBed *pb, *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, as, 
+	definedFieldCount, &fullSize, &fieldCount);
 if (definedFieldCount == 0)
     definedFieldCount = fieldCount;
 bits64 pbCount = slCount(pbList);
 verbose(1, "Read %llu items from %s\n", pbCount, inName);
@@ -335,23 +395,20 @@
     writeOne(f, reserved64);	// Fill in with data offset later
     writeOne(f, reserved64);	// Fill in with index offset later
     }
 
-/* Optionally write out as file. */
+/* Optionally write out .as file. */
 if (asFileName != NULL)
     {
-    /* Parse it and do sanity check. */
-    struct asObject *as = asParseFile(asFileName);
-    if (as->next != NULL)
-        errAbort("Can only handle .as files containing a single object.");
     int colCount = slCount(as->columnList);
     if (colCount != fieldCount)
         errAbort("%d columns in %s, %d columns in %s. These must match!",
 		colCount, asFileName, fieldCount, inName);
     asOffset = ftell(f);
     FILE *asFile = mustOpen(asFileName, "r");
     copyOpenFile(asFile, f);
     fputc(0, f);
+    carefulClose(&asFile);
     }
 
 /* Write chromosome bPlusTree */
 chromTreeOffset = ftell(f);