src/utils/bedToBigBed/bedToBigBed.c 7df99795b147931dfea8220ed5ab305d11fde6b4

7df99795b147931dfea8220ed5ab305d11fde6b4
braney
  Thu Mar 10 15:41:46 2022 -0800
move some chromAlias stuff around so bedToBigBed can use a chromAlias
bigBed as a chromAlias file.

diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index 2b0e45d..c6fdb84 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -5,51 +5,53 @@
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "obscure.h"
 #include "asParse.h"
 #include "basicBed.h"
 #include "memalloc.h"
 #include "sig.h"
 #include "rangeTree.h"
 #include "zlibFace.h"
 #include "sqlNum.h"
 #include "bPlusTree.h"
 #include "bigBed.h"
+#include "bbiAlias.h"
 #include "twoBit.h"
 
 char *version = "2.8";   // when changing, change in bedToBigBed, bedGraphToBigWig, and wigToBigWig
 /* Version history from 2.6 on at least -
  *   2.8 - Various changes where developer didn't increment version id
  *   2.7 - Added check for duplicate field names in asParse.c
  *   2.6 - Made it not crash on empty input.  
  *   */
 
 /* Things set directly or indirectly by command lne in main() routine. */
 int blockSize = 256;
 int itemsPerSlot = 512;
 char *extraIndex = NULL;
 int bedN = 0;   /* number of standard bed fields */
 int bedP = 0;   /* number of bed plus fields */
 char *asFile = NULL;
 char *asText = NULL;
 char *udcDir = NULL;
 static boolean doCompress = FALSE;
 static boolean tabSep = FALSE;
 static boolean sizesIs2Bit = FALSE;
+static boolean sizesIsBb = FALSE;
 static boolean allow1bpOverlap = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bedToBigBed v. %s - Convert bed file to bigBed. (bbi version: %d)\n"
   "usage:\n"
   "   bedToBigBed in.bed chrom.sizes out.bb\n"
   "Where in.bed is in one of the ascii bed formats, but not including track lines\n"
   "and chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n"
   "and out.bb is the output indexed big bed file.\n"
   "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n"
   "  http://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n"
   "or you may use the script fetchChromSizes to download the chrom.sizes file.\n"
@@ -68,45 +70,47 @@
   "                      N is between 3 and 15, \n"
   "                      optional (+) if extra \"bedPlus\" fields, \n"
   "                      optional P specifies the number of extra fields. Not required, but preferred.\n"
   "                      Examples: -type=bed6 or -type=bed6+ or -type=bed6+3 \n"
   "                      (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n"
   "   -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n"
   "                   of each field in a row in AutoSql format here.\n"
   "   -blockSize=N - Number of items to bundle in r-tree.  Default %d\n"
   "   -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
   "   -unc - If set, do not use compression.\n"
   "   -tab - If set, expect fields to be tab separated, normally\n"
   "           expects white space separator.\n"
   "   -extraIndex=fieldList - If set, make an index on each field in a comma separated list\n"
   "           extraIndex=name and extraIndex=name,id are commonly used.\n"
   "   -sizesIs2Bit  -- If set, the chrom.sizes file is assumed to be a 2bit file.\n"
+  "   -sizesIsBb  -- If set, the chrom.sizes file is assumed to be a bigBed file.\n"
   "   -udcDir=/path/to/udcCacheDir  -- sets the UDC cache dir for caching of remote files.\n"
   "   -allow1bpOverlap  -- allow exons to overlap by at most one base pair\n"
   "   -maxAlloc=N -- Set the maximum memory allocation size to N bytes\n"
   , version, bbiCurrentVersion, blockSize, itemsPerSlot
   );
 }
 
 static struct optionSpec options[] = {
    {"blockSize", OPTION_INT},
    {"itemsPerSlot", OPTION_INT},
    {"type", OPTION_STRING},
    {"as", OPTION_STRING},
    {"unc", OPTION_BOOLEAN},
    {"tab", OPTION_BOOLEAN},
    {"sizesIs2Bit", OPTION_BOOLEAN},
+   {"sizesIsBb", OPTION_BOOLEAN},
    {"extraIndex", OPTION_STRING},
    {"udcDir", OPTION_STRING},
    {"allow1bpOverlap", OPTION_BOOLEAN},
    {"maxAlloc", OPTION_LONG_LONG},
    {NULL, 0},
 };
 
 int bbNamedFileChunkCmpByName(const void *va, const void *vb)
 /* Compare two named offset object to facilitate qsorting by name. */
 {
 const struct bbNamedFileChunk *a = va, *b = vb;
 return strcmp(a->name, b->name);
 }
 
 static int maxBedNameSize;
@@ -553,70 +557,96 @@
     {
     if (eim->chunkArrayArray != NULL)
 	{
 	int i;
 	for (i=0; i < eim->indexCount; ++i)
 	    freeMem(eim->chunkArrayArray[i]);
 	}
     freeMem(eim->indexFields);
     freeMem(eim->maxFieldSize);
     freeMem(eim->chunkArrayArray);
     freeMem(eim->fileOffsets);
     freez(pEim);
     }
 }
 
+struct chromSizeClosure  // a structure that contains the data we need to get a chromosome size from a bigBed
+{ 
+    struct bbiFile *bbi;
+    struct bptIndex *bptIndex;
+    struct lm *lm;
+};
+
+static int ourChromSizeFunc(void *closure, char *chrom)
+/* A function to return the size of a given sequence. */
+{
+struct chromSizeClosure *ourClosure = (struct chromSizeClosure *)closure;
+
+return bbiAliasChromSize(ourClosure->bbi, ourClosure->bptIndex, ourClosure->lm, chrom);
+}
 
 void bbFileCreate(
 	char *inName, 	  /* Input file in a tabular bed format <chrom><start><end> + whatever. */
 	char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
 	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
 	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
 	char *asText,	  /* Field definitions in a string */
 	struct asObject *as,  /* Field definitions parsed out */
 	boolean doCompress, /* If TRUE then compress data. */
 	struct slName *extraIndexList,	/* List of extra indexes to add */
 	char *outName)    /* BigBed output file name. */
 /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
 {
 /* Set up timing measures. */
 verboseTimeInit();
 struct lineFile *lf = lineFileOpen(inName, TRUE);
 
 bits16 fieldCount = slCount(as->columnList);
 bits16 extraIndexCount = slCount(extraIndexList);
 
 struct bbExIndexMaker *eim = NULL;
 if (extraIndexList != NULL)
     eim = bbExIndexMakerNew(extraIndexList, as);
 
-/* Load in chromosome sizes. */
-struct hash *chromSizesHash = NULL;
+/* Do first pass, mostly just scanning file and counting hits per chromosome. */
+int minDiff = 0;
+double aveSize = 0;
+bits64 bedCount = 0;
+bits32 uncompressBufSize = 0;
+struct bbiChromUsage *usageList = NULL;
+if (sizesIsBb)
+    {
+    struct chromSizeClosure *ourClosure = NULL;
 
+    AllocVar(ourClosure);
+    ourClosure->bbi = bigBedFileOpen(chromSizes);
+    ourClosure->bptIndex = bbiAliasOpenExtra(ourClosure->bbi);
+    ourClosure->lm = lmInit(0);
+    usageList = bbiChromUsageFromBedFileAlias(lf, ourChromSizeFunc, ourClosure, eim, &minDiff, &aveSize, &bedCount, tabSep);
+    // should close and free the closure contents
+    }
+else
+    {
+    struct hash *chromSizesHash = NULL;
     if (sizesIs2Bit)
         chromSizesHash = twoBitChromHash(chromSizes);
     else
         chromSizesHash = bbiChromSizesFromFile(chromSizes);
     verbose(2, "Read %d chromosomes and sizes from %s\n",  chromSizesHash->elCount, chromSizes);
-
-/* Do first pass, mostly just scanning file and counting hits per chromosome. */
-int minDiff = 0;
-double aveSize = 0;
-bits64 bedCount = 0;
-bits32 uncompressBufSize = 0;
-struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim, 
-    &minDiff, &aveSize, &bedCount, tabSep);
+    usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim, &minDiff, &aveSize, &bedCount, tabSep);
+    freeHash(&chromSizesHash);
+    }
 verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
 verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize);
 
 /* Open output file and write dummy header. */
 FILE *f = mustOpen(outName, "wb");
 bbiWriteDummyHeader(f);
 bbiWriteDummyZooms(f);
 
 /* Write out autoSql string */
 bits64 asOffset = ftell(f);
 mustWrite(f, asText, strlen(asText) + 1);
 verbose(2, "as definition has %d columns\n", fieldCount);
 
 /* Write out dummy total summary. */
 struct bbiSummaryElement totalSum;
@@ -792,56 +822,56 @@
 	bits16 fieldId = eim->indexFields[i];
 	writeOne(f, fieldId);
 	repeatCharOut(f, 0, 2); // reserved
 	}
     assert(ftell(f) == extraIndexListEndOffset);
     }
 
 /* Write end signature. */
 fseek(f, 0L, SEEK_END);
 writeOne(f, sig);
 
 
 /* Clean up. */
 lineFileClose(&lf);
 carefulClose(&f);
-freeHash(&chromSizesHash);
 bbiChromUsageFreeList(&usageList);
 asObjectFreeList(&as);
 }
 
 void bedToBigBed(char *inName, char *chromSizes, char *outName)
 /* bedToBigBed - Convert bed file to bigBed.. */
 {
 struct slName *extraIndexList = slNameListFromString(extraIndex, ',');
 struct asObject *as = asParseText(asText);
 if (as == NULL)
     errAbort("AutoSql file (%s) not in legal format.", asFile);
 asCompareObjAgainstStandardBed(as, bedN, TRUE); // abort if bedN columns are not standard
 bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, asText, as, 
 	doCompress, extraIndexList, outName);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 blockSize = optionInt("blockSize", blockSize);
 itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
 asFile = optionVal("as", asFile);
 doCompress = !optionExists("unc");
 sizesIs2Bit = optionExists("sizesIs2Bit");
+sizesIsBb = optionExists("sizesIsBb");
 extraIndex = optionVal("extraIndex", NULL);
 tabSep = optionExists("tab");
 allow1bpOverlap = optionExists("allow1bpOverlap");
 udcDir = optionVal("udcDir", udcDefaultDir());
 size_t maxAlloc = optionLongLong("maxAlloc", 0);
 if (argc != 4)
     usage();
 udcSetDefaultDir(udcDir);
 if (maxAlloc > 0)
     setMaxAlloc(maxAlloc);
 
 if (optionExists("type"))
     {
     // parse type
     char *btype = cloneString(optionVal("type", ""));