src/lib/bigBed.c 1.17

1.17 2009/04/29 17:59:34 mikep
splitting bigBedFileCreate() logic to load bed records from a file separately from calculating summary and writing files
Index: src/lib/bigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bigBed.c,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 4 -r1.16 -r1.17
--- src/lib/bigBed.c	20 Apr 2009 23:21:04 -0000	1.16
+++ src/lib/bigBed.c	29 Apr 2009 17:59:34 -0000	1.17
@@ -22,19 +22,8 @@
 {
 return bbiFileOpen(fileName, bigBedSig, "big bed");
 }
 
-struct ppBed
-/* A partially parsed out bed record plus some extra fields. */
-    {
-    struct ppBed *next;	/* Next in list. */
-    char *chrom;		/* Chromosome name (not allocated here) */
-    bits32 start, end;		/* Range inside chromosome - half open zero based. */
-    char *rest;			/* The rest of the bed. */
-    bits64 fileOffset;		/* File offset. */
-    bits32 chromId;		/* Chromosome ID. */
-    };
-
 static int ppBedCmp(const void *va, const void *vb)
 /* Compare to sort based on chrom,start,end. */
 {
 const struct ppBed *a = *((struct ppBed **)va);
@@ -67,21 +56,87 @@
 const struct ppBed *a = *((struct ppBed **)va);
 return a->fileOffset;
 }
 
+struct ppBed *ppBedLoadOne(char **row, int fieldCount, struct lineFile *lf, struct hash *chromHash, struct lm *lm, struct asObject *as, bits64 *diskSize)
+/* Return a ppBed record from a line of bed file in lf.
+   Return the disk size it would occupy in *diskSize.
+   row is a preallocated array of pointers to the individual fields in this row to load.
+   fieldCount is the number of fields.
+   lf is the lineFile the row is coming from, used for error messages and parsing fields.
+   chromHash is a hash of the chromosome sizes.
+   lm is localMem to allocate ppBed memory from - don't ppBedFreeList or slFree
+   list!
+   as is the autoSql object describing this bed file or NULL if standard bed.
+   */
+{
+struct ppBed *pb;
+
+/* Allocate variable and fill in first three fields. */
+lmAllocVar(lm, pb);
+char *chrom = row[0];
+struct hashEl *hel = hashLookup(chromHash, chrom);
+if (hel == NULL)
+    errAbort("%s is not in chrom.sizes line %d of %s", chrom, lf->lineIx, lf->fileName);
+pb->chrom = hel->name;
+pb->start = lineFileNeedNum(lf, row, 1);
+pb->end = lineFileNeedNum(lf, row, 2);
+int i;
+
+/* Check remaining fields are formatted right, and concatenate them into "rest" string. */
+if (fieldCount > 3)
+    {
+    /* Count up string sizes and allocate something big enough. */
+    int textSize = 0;
+    for (i=3; i<fieldCount; ++i)
+	textSize += strlen(row[i]) + 1;
+    char *s = pb->rest = lmAlloc(lm, textSize);
+
+    /* Go through and check that numerical strings really are numerical. */
+    struct asColumn *asCol = slElementFromIx(as->columnList, 3);
+    for (i=3; i<fieldCount; ++i)
+	{
+	enum asTypes type = asCol->lowType->type;
+	if (asTypesIsInt(type))
+	    lineFileNeedFullNum(lf, row, i);
+	else if (asTypesIsFloating(type))
+	    lineFileNeedDouble(lf, row, i);
+	int len = strlen(row[i]);
+	memcpy(s, row[i], len);
+	s[len] = '\t';
+	s += len+1;
+	asCol = asCol->next;
+	}
+    /* Convert final tab to a zero. */
+    pb->rest[textSize-1] = 0;
+    *diskSize = textSize + 3*sizeof(bits32);
+    }
+else
+    {
+    *diskSize = 3*sizeof(bits32) + 1;  /* Still will write terminal 0 */
+    }
+return pb;
+}
 
 static struct ppBed *ppBedLoadAll(char *fileName, struct hash *chromHash, struct lm *lm, 
-	struct asObject *as, int definedFieldCount, bits64 *retDiskSize, bits16 *retFieldCount)
+	struct asObject *as, int definedFieldCount, bits64 *retDiskSize, bits16 *retFieldCount, boolean *isSorted, bits64 *count, double *avgSize)
 /* Read bed file and return it as list of ppBeds. The whole thing will
  * be allocated in the passed in lm - don't ppBedFreeList or slFree
- * list! */
+ * list! 
+ * Returns TRUE in isSorted if the input file is already sorted,
+ * Returns the count of records and the average size of records */
 {
 struct ppBed *pbList = NULL, *pb;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line;
-bits64 diskSize = 0;
 int fieldCount = 0, fieldAlloc=0;
 char **row = NULL;
+char *prevChrom = NULL; // set after first time through loop
+bits32 prevStart = 0;   // ditto
+double totalSize = 0.0;
+long n = 0;
+*retDiskSize = 0;
+*isSorted = TRUE;
 while (lineFileNextReal(lf, &line))
     {
     /* First time through figure out the field count, and if not set, the defined field count. */
     if (fieldCount == 0)
@@ -104,76 +159,45 @@
 	    }
 	fieldAlloc = fieldCount+1;
 	AllocArray(row, fieldAlloc);
 	}
-
     /* Chop up line and make sure the word count is right. */
     int wordCount = chopByWhite(line, row, fieldAlloc);
     lineFileExpectWords(lf, fieldCount, wordCount);
-
-    /* Allocate variable and fill in first three fields. */
-    lmAllocVar(lm, pb);
-    char *chrom = row[0];
-    struct hashEl *hel = hashLookup(chromHash, chrom);
-    if (hel == NULL)
-        errAbort("%s is not in chrom.sizes line %d of %s", chrom, lf->lineIx, lf->fileName);
-    pb->chrom = hel->name;
-    pb->start = lineFileNeedNum(lf, row, 1);
-    pb->end = lineFileNeedNum(lf, row, 2);
-    int i;
-
-    /* Check remaining fields are formatted right, and concatenate them into "rest" string. */
-    if (fieldCount > 3)
-        {
-	/* Count up string sizes and allocate something big enough. */
-	int textSize = 0;
-	for (i=3; i<fieldCount; ++i)
-	    textSize += strlen(row[i]) + 1;
-	char *s = pb->rest = lmAlloc(lm, textSize);
-
-	/* Go through and check that numerical strings really are numerical. */
-	struct asColumn *asCol = slElementFromIx(as->columnList, 3);
-	for (i=3; i<fieldCount; ++i)
-	    {
-	    enum asTypes type = asCol->lowType->type;
-	    if (asTypesIsInt(type))
-	        lineFileNeedFullNum(lf, row, i);
-	    else if (asTypesIsFloating(type))
-	        lineFileNeedDouble(lf, row, i);
-	    int len = strlen(row[i]);
-	    memcpy(s, row[i], len);
-	    s[len] = '\t';
-	    s += len+1;
-	    asCol = asCol->next;
-	    }
-	/* Convert final tab to a zero. */
-	pb->rest[textSize-1] = 0;
-	diskSize += textSize + 3*sizeof(bits32);
-	}
-    else
-        diskSize += 3*sizeof(bits32) + 1;  /* Still will write terminal 0 */
+    bits64 diskSize = 0;
+    pb = ppBedLoadOne(row, fieldCount, lf, chromHash, lm, as, &diskSize);
+    if (*isSorted && prevChrom && prevChrom == pb->chrom && pb->start < prevStart)
+	*isSorted = FALSE; // first time through the loop prevChrom is NULL so test fails
+    prevChrom = pb->chrom;
+    prevStart = pb->start;
+    ++n;
+    totalSize += pb->end - pb->start;
+    *retDiskSize += diskSize;
     slAddHead(&pbList, pb);
     }
+if (n > 0)
+    *avgSize = totalSize/n;
+*count = n;
 slReverse(&pbList);
 freeMem(row);
-*retDiskSize = diskSize;
 *retFieldCount = fieldCount;
 return pbList;
 }
 
-static double ppBedAverageSize(struct ppBed *pbList)
-/* Return size of average bed in list. */
-{
-struct ppBed *pb;
-double total = 0.0;
-long count = 0;
-for (pb = pbList; pb != NULL; pb = pb->next)
-    {
-    total += pb->end - pb->start;
-    count += 1;
-    }
-return total/count;
-}
+// Not used now, compiler gives a 'unused code' warning if its left in
+// static double ppBedAverageSize(struct ppBed *pbList)
+// /* Return size of average bed in list. */
+// {
+// struct ppBed *pb;
+// double total = 0.0;
+// long count = 0;
+// for (pb = pbList; pb != NULL; pb = pb->next)
+//     {
+//     total += pb->end - pb->start;
+//     count += 1;
+//     }
+// return total/count;
+// }
 
 static void makeChromInfo(struct ppBed *pbList, struct hash *chromSizeHash,
 	int *retChromCount, struct bbiChromInfo **retChromArray,
 	int *retMaxChromNameSize)
@@ -262,22 +286,79 @@
 	char *asFileName, /* If non-null points to a .as file that describes fields. */
 	char *outName)    /* BigBed output file name. */
 /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
 {
-bigBedFileCreateDetailed(inName, FALSE, chromSizes, blockSize, itemsPerSlot, definedFieldCount, asFileName, outName);
+bits16 fieldCount;
+bits64 fullSize;
+bits64 count;
+double averageSize;
+struct asObject *as = NULL;
+struct hash *chromHash = NULL;
+struct ppBed *pbList = NULL;
+bigBedFileCreateReadInfile(inName, chromSizes, blockSize, itemsPerSlot, definedFieldCount, asFileName, 
+    outName, &pbList, &count, &averageSize, &chromHash, &fieldCount, &as, &fullSize);
+bigBedFileCreateDetailed(pbList, count, averageSize, inName, chromHash, blockSize, itemsPerSlot, 
+    definedFieldCount, fieldCount, asFileName, as, fullSize, outName);
 }
 
-void bigBedFileCreateDetailed(
+void bigBedFileCreateReadInfile(
 	char *inName, 	  /* Input file in a tabular bed format <chrom><start><end> + whatever. */
-	boolean sorted,	  /* Input is already sorted */
 	char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
 	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
 	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
 	bits16 definedFieldCount,  /* Number of defined bed fields - 3-16 or so.  0 means all fields
 				    * are the defined bed ones. */
 	char *asFileName, /* If non-null points to a .as file that describes fields. */
+	char *outName,    /* BigBed output file name. */
+	struct ppBed **ppbList,   /* Input bed data, will be sorted. */
+	bits64 *count,            /* size of input pbList */
+	double *averageSize,      /* average size of elements in pbList */
+	struct hash **pChromHash,  /* Hash containing sizes of all chroms. */
+	bits16 *fieldCount,       /* actual field count from input data. */
+	struct asObject **pAs,    /* If non-null contains as object that describes fields. */
+	bits64 *fullSize)         /* full size of ppBed on disk */
+/* Load data to prepare bigBed. */
+{
+/* Load up as object if defined in file. */
+struct asObject *as = NULL;
+boolean sorted;
+if (asFileName != NULL)
+    {
+    /* Parse it and do sanity check. */
+    as = asParseFile(asFileName);
+    if (as->next != NULL)
+        errAbort("Can only handle .as files containing a single object.");
+    }
+/* Load in chromosome sizes. */
+struct hash *chromHash = bbiChromSizesFromFile(chromSizes);
+verbose(1, "Read %d chromosomes and sizes from %s\n",  chromHash->elCount, chromSizes);
+/* Load and sort input file. */
+struct ppBed *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, as, 
+	definedFieldCount, fullSize, fieldCount, &sorted, count, averageSize);
+verbose(1, "Read %llu items from %s\n", *count, inName);
+if (!sorted)
+    slSort(&pbList, ppBedCmp);
+*ppbList = pbList;
+*pChromHash = chromHash;
+*pAs = as;
+}
+
+void bigBedFileCreateDetailed(
+	struct ppBed *pbList, 	  /* Input bed data. Must be sorted. */
+	bits64 pbCount,           /* size of input pbList */
+	double pbAverageSize,     /* average size of elements in pbList */
+	char *inName,             /* Input file name (for error message reporting) */
+	struct hash *chromHash,   /* Hash containing sizes of all chroms. */
+	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
+	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
+	bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so.  0 means all fields
+				    * are the defined bed ones. */
+	bits16 fieldCount,        /* actual field count from input data. */
+	char *asFileName,         /* If non-null points to a .as file that describes fields. */
+	struct asObject *as,      /* If non-null contains as object that describes fields. */
+	bits64 fullSize,          /* full size of ppBed on disk */
 	char *outName)    /* BigBed output file name. */
-/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
+/* create zoomed bigBed version from ppBed list. */
 {
 /* This code needs to agree with code in two other places currently - bigWigFileCreate,
  * and bbiFileOpen.  I'm thinking of refactoring to share at least between
  * bigBedFileCreate and bigWigFileCreate.  It'd be great so it could be structured
@@ -298,38 +379,12 @@
 bits32 reductionAmounts[10];
 bits64 reductionDataOffsetPos[10];
 bits64 reductionDataOffsets[10];
 bits64 reductionIndexOffsets[10];
-bits16 fieldCount;
-
-uglyTime(NULL); // initialize timing
-
-/* Load up as object if defined in file. */
-struct asObject *as = NULL;
-if (asFileName != NULL)
-    {
-    /* Parse it and do sanity check. */
-    as = asParseFile(asFileName);
-    if (as->next != NULL)
-        errAbort("Can only handle .as files containing a single object.");
-    }
-
-/* Load in chromosome sizes. */
-struct hash *chromHash = bbiChromSizesFromFile(chromSizes);
-verbose(1, "Read %d chromosomes and sizes from %s\n",  chromHash->elCount, chromSizes);
+struct ppBed *pb;
 
-/* Load and sort input file. */
-bits64 fullSize;
-struct ppBed *pb, *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, as, 
-	definedFieldCount, &fullSize, &fieldCount);
 if (definedFieldCount == 0)
     definedFieldCount = fieldCount;
-bits64 pbCount = slCount(pbList);
-verbose(1, "Read %llu items from %s\n", pbCount, inName);
-if (!sorted)
-    {
-    slSort(&pbList, ppBedCmp);
-    }
 
 /* Make chromosomes. */
 int chromCount, maxChromNameSize;
 struct bbiChromInfo *chromInfoArray;
@@ -339,9 +394,10 @@
 
 /* Calculate first meaningful reduction size and make first reduction. */
 bits16 summaryCount = 0;
 bits16 version = 1;
-int initialReduction = ppBedAverageSize(pbList)*10;
+int initialReduction = pbAverageSize*10;
+
 verbose(2, "averageBedSize=%d\n", initialReduction/10);
 if (initialReduction > 10000)
     initialReduction = 10000;
 bits64 lastSummarySize = 0, summarySize;