src/inc/bigBed.h 1.13

1.13 2009/04/29 17:59:33 mikep
splitting bigBedFileCreate() logic to load bed records from a file separately from calculating summary and writing files
Index: src/inc/bigBed.h
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/inc/bigBed.h,v
retrieving revision 1.12
retrieving revision 1.13
diff -b -B -U 4 -r1.12 -r1.13
--- src/inc/bigBed.h	20 Apr 2009 23:16:18 -0000	1.12
+++ src/inc/bigBed.h	29 Apr 2009 17:59:33 -0000	1.13
@@ -17,8 +17,19 @@
     bits32 start, end;		/* Range inside chromosome - half open zero based. */
     char *rest;			/* Rest of line. May be NULL*/
     };
 
+struct ppBed
+/* A partially parsed out bed record plus some extra fields (file offset and chromosome ID). */
+    {
+    struct ppBed *next;	/* Next in list. */
+    char *chrom;		/* Chromosome name (not allocated here). */
+    bits32 start, end;		/* Range inside chromosome - half open, zero based. */
+    char *rest;			/* The rest of the bed (the part not parsed out above). */
+    bits64 fileOffset;		/* File offset.  NOTE(review): which file this is an offset into is not stated here - confirm. */
+    bits32 chromId;		/* Chromosome ID. */
+    };
+
 struct bbiFile *bigBedFileOpen(char *fileName);
 /* Open up big bed file.   Free this up with bbiFileFree */
 
 struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom, 
@@ -62,18 +73,53 @@
 	char *asFileName, /* If non-null points to a .as file that describes fields. */
 	char *outName);   /* BigBed output file name. */
 /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
 
-void bigBedFileCreateDetailed(
+void bigBedFileCreateReadInfile(
 	char *inName, 	  /* Input file in a tabular bed format <chrom><start><end> + whatever. */
-	boolean sorted,	  /* Input is already sorted */
 	char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
 	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
 	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
 	bits16 definedFieldCount,  /* Number of defined bed fields - 3-16 or so.  0 means all fields
 				    * are the defined bed ones. */
 	char *asFileName, /* If non-null points to a .as file that describes fields. */
+	char *outName,    /* BigBed output file name. */
+	struct ppBed **ppbList,   /* Bed data from the input, will be sorted. */
+	bits64 *count,            /* Size of ppbList. */
+	double *averageSize,      /* Average size of elements in ppbList. */
+	struct hash **pChromHash,  /* Hash containing sizes of all chroms. */
+	bits16 *fieldCount,       /* Actual field count from input data. */
+	struct asObject **pAs,    /* If non-null contains as object that describes fields. */
+	bits64 *fullSize);         /* Full size of ppBed on disk. */
+/* Load data to prepare bigBed.  NOTE(review): the pointer parameters from ppbList onward look like outputs filled in by this call - confirm against the implementation. */
+
+void bigBedFileCreateDetailed(
+	struct ppBed *pbList, 	  /* Input bed data.  Must already be sorted. */
+	bits64 pbCount,           /* Size of input pbList. */
+	double pbAverageSize,     /* Average size of elements in pbList. */
+	char *inName,             /* Input file name (for error message reporting only). */
+	struct hash *chromHash,   /* Hash containing sizes of all chroms. */
+	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
+	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
+	bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so.  0 means all fields
+				    * are the defined bed ones. */
+	bits16 fieldCount,        /* Actual field count from input data. */
+	char *asFileName,         /* If non-null points to a .as file that describes fields. */
+	struct asObject *as,      /* If non-null contains as object that describes fields. */
+	bits64 fullSize,          /* Full size of ppBed on disk. */
 	char *outName);   /* BigBed output file name. */
-/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
+/* Create zoomed bigBed version from an already sorted ppBed list, writing it to outName. */
+
+struct ppBed *ppBedLoadOne(char **row, int fieldCount, struct lineFile *lf, struct hash *chromHash, struct lm *lm, struct asObject *as, bits64 *diskSize);
+/* Return a ppBed record built from one line of the bed file in lf.
+   The disk size the record would occupy is returned in *diskSize.
+   row is a preallocated array of pointers to the individual fields of the row to load.
+   fieldCount is the number of fields.
+   lf is the lineFile the row comes from, used for error messages and parsing fields.
+   chromHash is a hash of the chromosome sizes.
+   lm is the localMem that the ppBed memory is allocated from, so do not call
+   ppBedFreeList or slFree on the resulting list!
+   as is the autoSql object describing this bed file, or NULL if standard bed.
+   */
 
 #endif /* BIGBED_H */