72d5004d94ad220eecb2868ee1d58d6257ab051d
kent
  Mon Jan 20 12:05:23 2014 -0800
Improving handling of empty data sets by bedToBigBed and bigBedInfo.
diff --git src/lib/cirTree.c src/lib/cirTree.c
index 84f3687..afc1419 100644
--- src/lib/cirTree.c
+++ src/lib/cirTree.c
@@ -1,626 +1,632 @@
 /* cirTree chromosome id r tree.  Part of a system to index chromosome ranges - things of
  * form chrN:start-end.  Generally you'll be using the crTree module - which
  * makes use of this module and the bPlusTree module - rather than this module directly.
  * This module works with chromosomes mapped to small integers rather than chromosomes
  * as strings, saving space and speeding things up in the process, but requiring the
  * separate bPlusTree to map the names to IDs. 
  *   This module implements a one dimensional R-tree index treating the chromosome ID
  * as the most significant part of a two-part key, and the base position as the least 
  * significant part of the key. */
 
 #include "common.h"
 #include "localmem.h"
 #include "sig.h"
 #include "udc.h"
 #include "cirTree.h"
 
 struct rTree
 /* Recursive range structure. */
     {
     struct rTree *next;	/* Next on same level. */
     struct rTree *children;	/* Child list. */
     struct rTree *parent;	/* Our parent if any. */
     bits32 startChromIx;	/* Starting chromosome. */
     bits32 startBase;		/* Starting base position. */
     bits32 endChromIx;		/* Ending chromosome. */
     bits32 endBase;		/* Ending base. */
     bits64 startFileOffset;	/* Start offset in file for leaves. */
     bits64 endFileOffset;	/* End file offset for leaves. */
     };
 
 #define fileHeaderSize (48)	/* Size of file header. */
 #define indexSlotSize (24)	/* Size of startChrom,startBase,endChrom,endBase,offset */
 #define leafSlotSize (32)       /* Size of startChrom,startBase,endChrom,endBase,offset,size */
 #define nodeHeaderSize (4)	/* Size of rTree node header. isLeaf,reserved,childCount. */
 
 int indexNodeSize(int blockSize)
 /* Return size of an index node. */
 {
 return nodeHeaderSize + indexSlotSize * blockSize;
 }
 
 int leafNodeSize(int blockSize)
 /* Return size of a leaf node. */
 {
 return nodeHeaderSize + leafSlotSize * blockSize;
 }
 
 
 static bits64 rWriteIndexLevel(bits16 blockSize, int childNodeSize,
 	struct rTree *tree, int curLevel, int destLevel,
 	bits64 offsetOfFirstChild, FILE *f)
 /* Recursively write an index level, skipping levels below destLevel,
  * writing out destLevel. */
 {
 struct rTree *el;
 bits64 offset = offsetOfFirstChild;
 if (curLevel == destLevel)
     {
     /* We've reached the right level, write out a node header */
     UBYTE reserved = 0;
     UBYTE isLeaf = FALSE;
     bits16 countOne = slCount(tree->children);
     writeOne(f, isLeaf);
     writeOne(f, reserved);
     writeOne(f, countOne);
 
     /* Write out elements of this node. */
     for (el = tree->children; el != NULL; el = el->next)
 	{
 	writeOne(f, el->startChromIx);
 	writeOne(f, el->startBase);
 	writeOne(f, el->endChromIx);
 	writeOne(f, el->endBase);
 	writeOne(f, offset);
 	offset += childNodeSize;
 	}
 
     /* Write out zeroes for empty slots in node. */
     int i;
     for (i=countOne; i<blockSize; ++i)
 	repeatCharOut(f, 0, indexSlotSize);
     }
 else 
     {
     /* Otherwise recurse on children. */
     for (el = tree->children; el != NULL; el = el->next)
 	offset = rWriteIndexLevel(blockSize, childNodeSize, el, curLevel+1, destLevel,
 		offset, f);
     }
 return offset;
 }
 
 static void writeIndexLevel(int blockSize, int childNodeSize, 
 	struct rTree *tree, bits64 offsetOfFirstChild, int level, FILE *f)
 /* Write out a non-leaf level nodes at given level. */
 {
 rWriteIndexLevel(blockSize, childNodeSize, tree, 0, level, offsetOfFirstChild, f);
 }
 
 static void rWriteLeaves(int itemsPerSlot, int lNodeSize, struct rTree *tree, int curLevel,
 	int leafLevel, FILE *f)
 /* Write out leaf-level nodes. */
 {
 if (curLevel == leafLevel)
     {
     /* We've reached the right level, write out a node header. */
     UBYTE reserved = 0;
     UBYTE isLeaf = TRUE;
     bits16 countOne = slCount(tree->children);
     writeOne(f, isLeaf);
     writeOne(f, reserved);
     writeOne(f, countOne);
 
     /* Write out elements of this node. */
     struct rTree *el;
     for (el = tree->children; el != NULL; el = el->next)
 	{
 	writeOne(f, el->startChromIx);
 	writeOne(f, el->startBase);
 	writeOne(f, el->endChromIx);
 	writeOne(f, el->endBase);
 	writeOne(f, el->startFileOffset);
 	bits64 size = el->endFileOffset - el->startFileOffset;
 	writeOne(f, size);
 	}
 
     /* Write out zeroes for empty slots in node. */
     int i;
     for (i=countOne; i<itemsPerSlot; ++i)
 	repeatCharOut(f, 0, indexSlotSize);
     }
 else
     {
     /* Otherwise recurse on children. */
     struct rTree *el;
     for (el = tree->children; el != NULL; el = el->next)
 	rWriteLeaves(itemsPerSlot, lNodeSize, el, curLevel+1, leafLevel, f);
     }
 }
 
 static void writeLeaves(int itemsPerSlot, int lNodeSize, struct rTree *tree, int leafLevel, FILE *f)
 /* Write out leaf-level nodes. */
 {
 rWriteLeaves(itemsPerSlot, lNodeSize, tree, 0, leafLevel, f);
 }
 
 void calcLevelSizes(struct rTree *tree, int *levelSizes, int level, int maxLevel)
 /* Recursively count sizes of levels and add to appropriate slots of levelSizes */
 {
 struct rTree *el;
 for (el = tree; el != NULL; el = el->next)
     {
     levelSizes[level] += 1;
     if (level < maxLevel)
         calcLevelSizes(el->children, levelSizes, level+1, maxLevel);
     }
 }
 
 static struct rTree *rTreeFromChromRangeArray( struct lm *lm, int blockSize, int itemsPerSlot,
 	void *itemArray, int itemSize, bits64 itemCount,  void *context,
 	struct cirTreeRange (*fetchKey)(const void *va, void *context),
 	bits64 (*fetchOffset)(const void *va, void *context), bits64 endFileOffset,
 	int *retLevelCount)
 {
+if (itemCount == 0)
+    return NULL;
 char *items = itemArray;
 struct rTree *el, *list=NULL, *tree = NULL;
 
 /* Make first level above leaf. */
 bits64 i;
 bits64 nextOffset = (*fetchOffset)(items, context);
 int oneSize = 0;
 for (i=0; i<itemCount; i += oneSize)
     {
 
     /* Allocate element and put on list. */
     lmAllocVar(lm, el);
     slAddHead(&list, el);
 
     /* Fill out most of element from first item in element. */
     char *startItem = items + itemSize * i;
     struct cirTreeRange key = (*fetchKey)(startItem, context);
     el->startChromIx = el->endChromIx = key.chromIx;
     el->startBase = key.start;
     el->endBase = key.end;
     el->startFileOffset = nextOffset;
 
     oneSize = 1;
 
     char *endItem = startItem;
     int j;
     for (j=i+1; j<itemCount; ++j) {
 	endItem += itemSize;
 	nextOffset = (*fetchOffset)(endItem, context);
 	if (nextOffset != el->startFileOffset)
 		break;
 	else 
 		oneSize++;
     }
     if (j == itemCount) {
 	nextOffset = endFileOffset;
     }
 
     el->endFileOffset = nextOffset;
 
     /* Expand area spanned to include all items in block. */
     for (j=1; j<oneSize; ++j)
         {
 	void *item = items + itemSize*(i+j);
 	key = (*fetchKey)(item, context);
 	if (key.chromIx < el->startChromIx)
 	    {
 	    el->startChromIx = key.chromIx;
 	    el->startBase = key.start;
 	    }
 	else if (key.chromIx == el->startChromIx)
 	    {
 	    if (key.start < el->startBase)
 	        el->startBase = key.start;
 	    }
 	if (key.chromIx > el->endChromIx)
 	    {
 	    el->endChromIx = key.chromIx;
 	    el->endBase = key.end;
 	    }
 	else if (key.chromIx == el->endChromIx)
 	    {
 	    if (key.end > el->endBase)
 	        el->endBase = key.end;
 	    }
 	}
     }
 slReverse(&list);
 verbose(2, "Made %d primary index nodes out of %llu items\n", slCount(list), itemCount);
 
 /* Now iterate through making more and more condensed versions until have just one. */
 int levelCount = 1;
 tree = list;
 while (tree->next != NULL || levelCount < 2)
     {
     list = NULL;
     int slotsUsed = blockSize;
     struct rTree *parent = NULL, *next;
     for (el = tree; el != NULL; el = next)
         {
 	next = el->next;
 	if (slotsUsed >= blockSize)
 	    {
 	    slotsUsed = 1;
 	    lmAllocVar(lm, parent);
 	    parent = lmCloneMem(lm, el, sizeof(*el));
 	    parent->children = el;
 	    el->parent = parent;
 	    el->next = NULL;
 	    slAddHead(&list, parent);
 	    }
 	else
 	    {
 	    ++slotsUsed;
 	    slAddHead(&parent->children, el);
 	    el->parent = parent;
 	    if (el->startChromIx < parent->startChromIx)
 		{
 	        parent->startChromIx = el->startChromIx;
 		parent->startBase = el->startBase;
 		}
 	    else if (el->startChromIx == parent->startChromIx)
 	        {
 		if (el->startBase < parent->startBase)
 		    parent->startBase = el->startBase;
 		}
 	    if (el->endChromIx > parent->endChromIx)
 		{
 	        parent->endChromIx = el->endChromIx;
 		parent->endBase = el->endBase;
 		}
 	    else if (el->endChromIx == parent->endChromIx)
 	        {
 		if (el->endBase > parent->endBase)
 		    parent->endBase = el->endBase;
 		}
 	    }
 	}
 
     slReverse(&list);
     for (el = list; el != NULL; el = el->next)
         slReverse(&el->children);
     tree = list;
     levelCount += 1;
     }
 *retLevelCount = levelCount;
 return tree;
 }
 
 static void writeTreeToOpenFile(struct rTree *tree, int blockSize, int levelCount, FILE *f)
 /* Write out tree to a file that is open already - writing out index nodes from 
  * highest to lowest level, and then leaf nodes. */
 {
 /* Calculate sizes of each level. */
 int i;
 int levelSizes[levelCount];
 for (i=0; i<levelCount; ++i)
     levelSizes[i] = 0;
 calcLevelSizes(tree, levelSizes, 0, levelCount-1);
 
 /* Calc offsets of each level. */
 bits64 levelOffsets[levelCount];
 bits64 offset = ftell(f);
 bits64 iNodeSize = indexNodeSize(blockSize);
 bits64 lNodeSize = leafNodeSize(blockSize);
 for (i=0; i<levelCount; ++i)
     {
     levelOffsets[i] = offset;
     offset += levelSizes[i] * iNodeSize;
     verbose(2, "level %d: size %d, offset %llu\n", i, levelSizes[i], levelOffsets[i]);
     }
 
 verbose(2, "%d levels.  Level sizes are", levelCount);
 for (i=0; i<levelCount; ++i) verbose(2, " %d", levelSizes[i]);
 verbose(2, "\n");
 
 /* Write out index levels. */
 int finalLevel = levelCount-3;
 for (i=0; i<=finalLevel; ++i)
     {
     bits64 childNodeSize = (i==finalLevel ? lNodeSize : iNodeSize);
     writeIndexLevel(blockSize, childNodeSize, tree,
     	levelOffsets[i+1], i, f);
     if (ftell(f) != levelOffsets[i+1])
         errAbort("Internal error: offset mismatch (%llu vs %llu) line %d of %s\n", (bits64)ftell(f), levelOffsets[i+1], __LINE__, __FILE__);
     }
 
 /* Write out leaf level. */
 int leafLevel = levelCount - 2;
 writeLeaves(blockSize, leafNodeSize(blockSize), tree, leafLevel, f);
 }
 
 void cirTreeFileBulkIndexToOpenFile(
 	void *itemArray, int itemSize, bits64 itemCount, 
 	bits32 blockSize, bits32 itemsPerSlot,
 	void *context,
 	struct cirTreeRange (*fetchKey)(const void *va, void *context),
 	bits64 (*fetchOffset)(const void *va, void *context), 
 	bits64 endFileOffset, FILE *f)
 /* Create a r tree index from a sorted array, writing output starting at current position
  * of an already open file.  See rTreeFileCreate for explanation of parameters. */
 {
-int levelCount;
+int levelCount = 0;
 struct lm *lm = lmInit(0);
 struct rTree *tree = rTreeFromChromRangeArray(lm, blockSize, itemsPerSlot,
 	itemArray, itemSize, itemCount, context, fetchKey, fetchOffset, endFileOffset,
 	&levelCount);
+struct rTree dummyTree = {.startBase=0};
+if (tree == NULL) 
+    tree = &dummyTree;	// Work for empty files....
 bits32 magic = cirTreeSig;
 bits32 reserved = 0;
 writeOne(f, magic);
 writeOne(f, blockSize);
 writeOne(f, itemCount);
 writeOne(f, tree->startChromIx);
 writeOne(f, tree->startBase);
 writeOne(f, tree->endChromIx);
 writeOne(f, tree->endBase);
 writeOne(f, endFileOffset);
 writeOne(f, itemsPerSlot);
 writeOne(f, reserved);
+if (tree != &dummyTree)
     writeTreeToOpenFile(tree, blockSize, levelCount, f);
 lmCleanup(&lm);
 }
 
 void cirTreeFileCreate(
 	void *itemArray, 	/* Sorted array of things to index. */
 	int itemSize, 		/* Size of each element in array. */
 	bits64 itemCount, 	/* Number of elements in array. */
 	bits32 blockSize,	/* R tree block size - # of children for each node. */
 	bits32 itemsPerSlot,	/* Number of items to put in each index slot at lowest level. */
 	void *context,		/* Context pointer for use by fetch call-back functions. */
 	struct cirTreeRange (*fetchKey)(const void *va, void *context),/* Given item, return key. */
 	bits64 (*fetchOffset)(const void *va, void *context), /* Given item, return file offset */
 	bits64 endFileOffset,				 /* Last position in file we index. */
 	char *fileName)                                  /* Name of output file. */
 /* Create a r tree index file from a sorted array. */
 {
 FILE *f = mustOpen(fileName, "wb");
 cirTreeFileBulkIndexToOpenFile(itemArray, itemSize, itemCount, blockSize, itemsPerSlot,
 	context, fetchKey, fetchOffset, endFileOffset, f);
 carefulClose(&f);
 }
 
 struct cirTreeFile *cirTreeFileAttach(char *fileName, struct udcFile *udc)
 /* Open up r-tree index file on previously open file, with cirTree
  * header at current file position. */
 {
 /* Open file and allocate structure to hold info from header etc. */
 struct cirTreeFile *crt = needMem(sizeof(*crt));
 crt->fileName = fileName;
 crt->udc = udc;
 
 /* Read magic number at head of file and use it to see if we are proper file type, and
  * see if we are byte-swapped. */
 bits32 magic;
 boolean isSwapped = FALSE;
 udcMustReadOne(udc, magic);
 if (magic != cirTreeSig)
     {
     magic = byteSwap32(magic);
     isSwapped = crt->isSwapped = TRUE;
     if (magic != cirTreeSig)
        errAbort("%s is not a chromosome id r-tree index file", fileName);
     }
 
 /* Read rest of defined bits of header, byte swapping as needed. */
 crt->blockSize = udcReadBits32(udc, isSwapped);
 crt->itemCount = udcReadBits64(udc, isSwapped);
 crt->startChromIx = udcReadBits32(udc, isSwapped);
 crt->startBase = udcReadBits32(udc, isSwapped);
 crt->endChromIx = udcReadBits32(udc, isSwapped);
 crt->endBase = udcReadBits32(udc, isSwapped);
 crt->fileSize = udcReadBits64(udc, isSwapped);
 crt->itemsPerSlot = udcReadBits32(udc, isSwapped);
 
 /* Skip over reserved bits of header. */
 bits32 reserved32;
 udcMustReadOne(udc, reserved32);
 
 /* Save position of root block of r tree. */
 crt->rootOffset = udcTell(udc);
 
 return crt;
 }
 
 struct cirTreeFile *cirTreeFileOpen(char *fileName)
 /* Open up r-tree index file - reading header and verifying things. */
 {
 return cirTreeFileAttach(cloneString(fileName), udcFileOpen(fileName, udcDefaultDir()));
 }
 
 void cirTreeFileDetach(struct cirTreeFile **pCrt)
 /* Detatch and free up cirTree file opened with cirTreeFileAttach. */
 {
 freez(pCrt);
 }
 
 void cirTreeFileClose(struct cirTreeFile **pCrt)
 /* Close and free up cirTree file opened with cirTreeFileAttach. */
 {
 struct cirTreeFile *crt = *pCrt;
 if (crt != NULL)
     {
     freez(&crt->fileName);
     udcFileClose(&crt->udc);
     cirTreeFileDetach(pCrt);
     }
 }
 
 INLINE int cmpTwoBits32(bits32 aHi, bits32 aLo, bits32 bHi, bits32 bLo)
 /* Return - if b is less than a , 0 if equal, else +*/
 {
 if (aHi < bHi)
     return 1;
 else if (aHi > bHi)
     return -1;
 else
     {
     if (aLo < bLo)
        return 1;
     else if (aLo > bLo)
        return -1;
     else
        return 0;
     }
 }
 
 INLINE boolean cirTreeOverlaps(int qChrom, int qStart, int qEnd, 
 	int rStartChrom, int rStartBase, int rEndChrom, int rEndBase)
 {
 return cmpTwoBits32(qChrom, qStart, rEndChrom, rEndBase) > 0 &&
        cmpTwoBits32(qChrom, qEnd, rStartChrom, rStartBase) < 0;
 }
 
 static void rFindOverlappingBlocks(struct cirTreeFile *crt, int level, bits64 indexFileOffset,
 	bits32 chromIx, bits32 start, bits32 end, struct fileOffsetSize **retList)
 /* Recursively find blocks with data. */
 {
 struct udcFile *udc = crt->udc;
 
 /* Seek to start of block. */
 udcSeek(udc, indexFileOffset);
 
 /* Read block header. */
 UBYTE isLeaf;
 UBYTE reserved;
 bits16 i, childCount;
 udcMustReadOne(udc, isLeaf);
 udcMustReadOne(udc, reserved);
 boolean isSwapped = crt->isSwapped;
 childCount = udcReadBits16(udc, isSwapped);
 
 verbose(3, "rFindOverlappingBlocks %llu %u:%u-%u.  childCount %d. isLeaf %d\n", indexFileOffset, chromIx, start, end, (int)childCount, (int)isLeaf);
 
 if (isLeaf)
     {
     /* Loop through node adding overlapping leaves to block list. */
     for (i=0; i<childCount; ++i)
         {
 	bits32 startChromIx = udcReadBits32(udc, isSwapped);
 	bits32 startBase = udcReadBits32(udc, isSwapped);
 	bits32 endChromIx = udcReadBits32(udc, isSwapped);
 	bits32 endBase = udcReadBits32(udc, isSwapped);
 	bits64 offset = udcReadBits64(udc, isSwapped);
 	bits64 size = udcReadBits64(udc, isSwapped);
 	if (cirTreeOverlaps(chromIx, start, end, startChromIx, startBase, endChromIx, endBase))
 	    {
 	    struct fileOffsetSize *block;
 	    AllocVar(block);
 	    block->offset = offset;
 	    block->size = size;
 	    slAddHead(retList, block);
 	    }
 	}
     }
 else
     {
     /* Read node into arrays. */
     bits32 startChromIx[childCount], startBase[childCount];
     bits32 endChromIx[childCount], endBase[childCount];
     bits64 offset[childCount];
     for (i=0; i<childCount; ++i)
         {
 	startChromIx[i] = udcReadBits32(udc, isSwapped);
 	startBase[i] = udcReadBits32(udc, isSwapped);
 	endChromIx[i] = udcReadBits32(udc, isSwapped);
 	endBase[i] = udcReadBits32(udc, isSwapped);
 	offset[i] = udcReadBits64(udc, isSwapped);
 	}
 
     /* Recurse into child nodes that we overlap. */
     for (i=0; i<childCount; ++i)
 	{
 	if (cirTreeOverlaps(chromIx, start, end, startChromIx[i], startBase[i], 
 		endChromIx[i], endBase[i]))
 	    {
 	    rFindOverlappingBlocks(crt, level+1, offset[i], chromIx, start, end, retList);
 	    }
 	}
     }
 }
 
 struct fileOffsetSize *cirTreeFindOverlappingBlocks(struct cirTreeFile *crt, 
 	bits32 chromIx, bits32 start, bits32 end)
 /* Return list of file blocks that between them contain all items that overlap
  * start/end on chromIx.  Also there will be likely some non-overlapping items
  * in these blocks too. When done, use slListFree to dispose of the result. */
 {
 struct fileOffsetSize *blockList = NULL;
 rFindOverlappingBlocks(crt, 0, crt->rootOffset, chromIx, start, end, &blockList);
 slReverse(&blockList);
 return blockList;
 }
 
 static void rEnumerateBlocks(struct cirTreeFile *crt, int level, bits64 indexFileOffset,
 	struct fileOffsetSize **retList)
 /* Recursively find blocks with data. */
 {
 struct udcFile *udc = crt->udc;
 
 /* Seek to start of block. */
 udcSeek(udc, indexFileOffset);
 
 /* Read block header. */
 UBYTE isLeaf;
 UBYTE reserved;
 bits16 i, childCount;
 udcMustReadOne(udc, isLeaf);
 udcMustReadOne(udc, reserved);
 boolean isSwapped = crt->isSwapped;
 childCount = udcReadBits16(udc, isSwapped);
 
 verbose(3, "rEnumerateBlocks %llu childCount %d. isLeaf %d\n", indexFileOffset, (int)childCount, (int)isLeaf);
 
 if (isLeaf)
     {
     /* Loop through node adding overlapping leaves to block list. */
     for (i=0; i<childCount; ++i)
         {
 	udcReadBits32(udc, isSwapped);
 	udcReadBits32(udc, isSwapped);
 	udcReadBits32(udc, isSwapped);
 	udcReadBits32(udc, isSwapped);
 	bits64 offset = udcReadBits64(udc, isSwapped);
 	bits64 size = udcReadBits64(udc, isSwapped);
         struct fileOffsetSize *block;
         AllocVar(block);
         block->offset = offset;
         block->size = size;
         slAddHead(retList, block);
 	}
     }
 else
     {
     /* Read node into arrays. */
     bits64 offset[childCount];
     for (i=0; i<childCount; ++i)
         { 
 	udcReadBits32(udc, isSwapped);
 	udcReadBits32(udc, isSwapped);
 	udcReadBits32(udc, isSwapped);
 	udcReadBits32(udc, isSwapped);
 	offset[i] = udcReadBits64(udc, isSwapped);
 	}
 
     /* Recurse into child nodes that we overlap. */
     for (i=0; i<childCount; ++i)
 	{
 	rEnumerateBlocks(crt, level+1, offset[i], retList);
 	}
     }
 }
 
 struct fileOffsetSize *cirTreeEnumerateBlocks(struct cirTreeFile *crt)
 /* Return list of file blocks that between them contain all items that overlap
  * start/end on chromIx.  Also there will be likely some non-overlapping items
  * in these blocks too. When done, use slListFree to dispose of the result. */
 {
 struct fileOffsetSize *blockList = NULL;
 rEnumerateBlocks(crt, 0, crt->rootOffset, &blockList);
 slReverse(&blockList);
 return blockList;
 }