src/lib/bigBed.c 1.18
1.18 2009/05/13 01:13:57 kent
Adding clip parameter to bigBed creation. Making a few functions static that don't seem to need to be exposed at the moment.
Index: src/lib/bigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bigBed.c,v
retrieving revision 1.17
retrieving revision 1.18
diff -b -B -U 4 -r1.17 -r1.18
--- src/lib/bigBed.c 29 Apr 2009 17:59:34 -0000 1.17
+++ src/lib/bigBed.c 13 May 2009 01:13:57 -0000 1.18
@@ -56,9 +56,10 @@
const struct ppBed *a = *((struct ppBed **)va);
return a->fileOffset;
}
-struct ppBed *ppBedLoadOne(char **row, int fieldCount, struct lineFile *lf, struct hash *chromHash, struct lm *lm, struct asObject *as, bits64 *diskSize)
+static struct ppBed *ppBedLoadOne(char **row, int fieldCount, struct lineFile *lf,
+ struct hash *chromHash, boolean clip, struct lm *lm, struct asObject *as, bits64 *diskSize)
/* Return a ppBed record from a line of bed file in lf.
Return the disk size it would occupy in *diskSize.
row is a preallocated array of pointers to the individual fields in this row to load.
fieldCount is the number of fields.
@@ -68,19 +69,47 @@
list!
as is the autoSql object describing this bed file or NULL if standard bed.
*/
{
-struct ppBed *pb;
-
-/* Allocate variable and fill in first three fields. */
-lmAllocVar(lm, pb);
char *chrom = row[0];
+int start = lineFileNeedNum(lf, row, 1);
+int end = lineFileNeedNum(lf, row, 2);
+
struct hashEl *hel = hashLookup(chromHash, chrom);
if (hel == NULL)
errAbort("%s is not in chrom.sizes line %d of %s", chrom, lf->lineIx, lf->fileName);
+int chromSize = ptToInt(hel->val);
+
+if (start < 0)
+ {
+ if (clip)
+ start = 0;
+ else
+ errAbort("Start coordinate %d is negative line %d of %s", start, lf->lineIx, lf->fileName);
+ }
+if (end > chromSize)
+ {
+ if (clip)
+ end = chromSize;
+ else
+ errAbort("End coordinate %d is bigger than %s, which just has %d bases. Line %d of %s",
+ end, chrom, chromSize, lf->lineIx, lf->fileName);
+ }
+if (start > end)
+ {
+ if (clip)
+ return NULL;
+ else
+ errAbort("Start coordinate %d after end coordinate %d line %d of %s",
+ start, end, lf->lineIx, lf->fileName);
+ }
+
+/* Allocate variable and fill in first three fields. */
+struct ppBed *pb;
+lmAllocVar(lm, pb);
pb->chrom = hel->name;
-pb->start = lineFileNeedNum(lf, row, 1);
-pb->end = lineFileNeedNum(lf, row, 2);
+pb->start = start;
+pb->end = end;
int i;
/* Check remaining fields are formatted right, and concatenate them into "rest" string. */
if (fieldCount > 3)
@@ -117,9 +146,11 @@
return pb;
}
static struct ppBed *ppBedLoadAll(char *fileName, struct hash *chromHash, struct lm *lm,
- struct asObject *as, int definedFieldCount, bits64 *retDiskSize, bits16 *retFieldCount, boolean *isSorted, bits64 *count, double *avgSize)
+ struct asObject *as, int definedFieldCount, boolean clip,
+ bits64 *retDiskSize, bits16 *retFieldCount, boolean *isSorted,
+ bits64 *count, double *avgSize)
/* Read bed file and return it as list of ppBeds. The whole thing will
* be allocated in the passed in lm - don't ppBedFreeList or slFree
* list!
* Returns TRUE in isSorted if the input file is already sorted,
@@ -163,9 +194,11 @@
/* Chop up line and make sure the word count is right. */
int wordCount = chopByWhite(line, row, fieldAlloc);
lineFileExpectWords(lf, fieldCount, wordCount);
bits64 diskSize = 0;
- pb = ppBedLoadOne(row, fieldCount, lf, chromHash, lm, as, &diskSize);
+ pb = ppBedLoadOne(row, fieldCount, lf, chromHash, clip, lm, as, &diskSize);
+ if (pb == NULL)
+ continue;
if (*isSorted && prevChrom && prevChrom == pb->chrom && pb->start < prevStart)
*isSorted = FALSE; // first time through the loop prevChrom is NULL so test fails
prevChrom = pb->chrom;
prevStart = pb->start;
@@ -275,40 +308,17 @@
slReverse(&outList);
return outList;
}
-void bigBedFileCreate(
- char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. */
- char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
- int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */
- int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */
- bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so. 0 means all fields
- * are the defined bed ones. */
- char *asFileName, /* If non-null points to a .as file that describes fields. */
- char *outName) /* BigBed output file name. */
-/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
-{
-bits16 fieldCount;
-bits64 fullSize;
-bits64 count;
-double averageSize;
-struct asObject *as = NULL;
-struct hash *chromHash = NULL;
-struct ppBed *pbList = NULL;
-bigBedFileCreateReadInfile(inName, chromSizes, blockSize, itemsPerSlot, definedFieldCount, asFileName,
- outName, &pbList, &count, &averageSize, &chromHash, &fieldCount, &as, &fullSize);
-bigBedFileCreateDetailed(pbList, count, averageSize, inName, chromHash, blockSize, itemsPerSlot,
- definedFieldCount, fieldCount, asFileName, as, fullSize, outName);
-}
-
-void bigBedFileCreateReadInfile(
+static void bigBedFileCreateReadInfile(
char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. */
char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */
int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */
bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so. 0 means all fields
* are the defined bed ones. */
char *asFileName, /* If non-null points to a .as file that describes fields. */
+ boolean clip, /* If set silently clip out of bound coordinates. */
char *outName, /* BigBed output file name. */
struct ppBed **ppbList, /* Input bed data, will be sorted. */
bits64 *count, /* size of input pbList */
double *averageSize, /* average size of elements in pbList */
@@ -332,9 +342,9 @@
struct hash *chromHash = bbiChromSizesFromFile(chromSizes);
verbose(1, "Read %d chromosomes and sizes from %s\n", chromHash->elCount, chromSizes);
/* Load and sort input file. */
struct ppBed *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, as,
- definedFieldCount, fullSize, fieldCount, &sorted, count, averageSize);
+ definedFieldCount, clip, fullSize, fieldCount, &sorted, count, averageSize);
verbose(1, "Read %llu items from %s\n", *count, inName);
if (!sorted)
slSort(&pbList, ppBedCmp);
*ppbList = pbList;
@@ -555,8 +565,34 @@
carefulClose(&f);
freez(&chromInfoArray);
}
+void bigBedFileCreate(
+ char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. */
+ char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
+ int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */
+ int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */
+ bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so. 0 means all fields
+ * are the defined bed ones. */
+ char *asFileName, /* If non-null points to a .as file that describes fields. */
+ boolean clip, /* If set silently clip out of bound coordinates. */
+ char *outName) /* BigBed output file name. */
+/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
+{
+bits16 fieldCount;
+bits64 fullSize;
+bits64 count;
+double averageSize = 0;
+struct asObject *as = NULL;
+struct hash *chromHash = NULL;
+struct ppBed *pbList = NULL;
+bigBedFileCreateReadInfile(inName, chromSizes, blockSize, itemsPerSlot, definedFieldCount,
+ asFileName, clip, outName, &pbList, &count, &averageSize, &chromHash, &fieldCount, &as,
+ &fullSize);
+bigBedFileCreateDetailed(pbList, count, averageSize, inName, chromHash, blockSize, itemsPerSlot,
+ definedFieldCount, fieldCount, asFileName, as, fullSize, outName);
+}
+
struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom,
bits32 start, bits32 end, int maxItems, struct lm *lm)
/* Get data for interval. Return list allocated out of lm. Set maxItems to maximum
* number of items to return, or to 0 for all items. */