d248ea47e2ffed0b22206642ad3477c1213ec5cc
braney
  Wed Mar 3 14:15:32 2021 -0800
add -countBlocks option.  Support bigBeds.

diff --git src/hg/featureBits/featureBits.c src/hg/featureBits/featureBits.c
index ca3a900..abd1e84 100644
--- src/hg/featureBits/featureBits.c
+++ src/hg/featureBits/featureBits.c
@@ -31,74 +31,77 @@
     {"not", OPTION_BOOLEAN},
     {"countGaps", OPTION_BOOLEAN},
     {"noRandom", OPTION_BOOLEAN},
     {"noHap", OPTION_BOOLEAN},
     {"primaryChroms", OPTION_BOOLEAN},
     {"dots", OPTION_INT},
     {"minFeatureSize", OPTION_INT},
     {"enrichment", OPTION_BOOLEAN},
     {"where", OPTION_STRING},
     {"bin", OPTION_STRING},
     {"chromSize", OPTION_STRING},
     {"binSize", OPTION_INT},
     {"binOverlap", OPTION_INT},
     {"bedRegionIn", OPTION_STRING},
     {"bedRegionOut", OPTION_STRING},
+    {"countBlocks", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 int minSize = 1;	/* Minimum size of feature. */
 char *clChrom = "all";	/* Which chromosome. */
 boolean orLogic = FALSE;  /* Do ors instead of ands? */
 boolean notResults = FALSE;   /* negate results? */
 char *where = NULL;		/* Extra selection info. */
 char *chromSizes = NULL;		/* read chrom sizes from file instead of database . */
 boolean countGaps = FALSE;	/* Count gaps in denominator? */
 boolean noRandom = FALSE;	/* Exclude _random chromosomes? */
 boolean noHap = FALSE;	/* Exclude _hap|_alt chromosomes? */
 boolean primaryChroms = FALSE; /* only include primary chroms */
 int dots = 0;	/* print dots every N chroms (scaffolds) processed */
 boolean calcEnrichment = FALSE;	/* Calculate coverage/enrichment? */
 int binSize = 500000;	/* Default bin size. */
 int binOverlap = 250000;	/* Default bin size. */
+boolean countBlocks = FALSE;	/* Count blocks in bed12 rather than extent. */
 
 /* to process chroms without constantly looking up in chromInfo, create
  * this list of them from the chromInfo once.
  */
 static struct chromInfo *chromInfoList = NULL;
 static struct hash *gapHash = NULL;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "featureBits - Correlate tables via bitmap projections. \n"
   "usage:\n"
   "   featureBits database table(s)\n"
   "This will return the number of bits in all the tables anded together\n"
   "Pipe warning:  output goes to stderr.\n"
   "Options:\n"
   "   -bed=output.bed   Put intersection into bed format. Can use stdout.\n"
   "   -fa=output.fa     Put sequence in intersection into .fa file\n"
   "   -faMerge          For fa output merge overlapping features.\n"
   "   -minSize=N        Minimum size to output (default 1)\n"
   "   -chrom=chrN       Restrict to one chromosome\n"
   "   -chromSize=sizefile       Read chrom sizes from file instead of database. \n"
   "                             (chromInfo three column format)\n"
   "   -or               Or tables together instead of anding them\n"
   "   -not              Output negation of resulting bit set.\n"
   "   -countGaps        Count gaps in denominator\n"
+  "   -countBlocks      Count blocks in bed12 files rather than entire extent.\n"
   "   -noRandom         Don't include _random (or Un) chromosomes\n"
   "   -noHap            Don't include _hap|_alt chromosomes\n"
   "   -primaryChroms    Primary assembly (chroms without '_' in name)\n"
   "   -dots=N           Output dot every N chroms (scaffolds) processed\n"
   "   -minFeatureSize=n Don't include bits of the track that are smaller than\n"
   "                     minFeatureSize, useful for differentiating between\n"
   "                     alignment gaps and introns.\n"
   "   -bin=output.bin   Put bin counts in output file\n"
   "   -binSize=N        Bin size for generating counts in bin file (default 500000)\n"
   "   -binOverlap=N     Bin overlap for generating counts in bin file (default 250000)\n"
   "   -bedRegionIn=input.bed    Read in a bed file for bin counts in specific regions \n"
   "                     and write to bedRegionsOut\n"
   "   -bedRegionOut=output.bed  Write a bed file of bin counts in specific regions \n"
   "                     from bedRegionIn\n"
   "   -enrichment       Calculates coverage and enrichment assuming first table\n"
@@ -347,56 +350,132 @@
 struct psl *psl;
 
 chromFileName(track, chrom, fileName);
 if (!fileExists(fileName))
     return;
 lf = pslFileOpen(fileName);
 while ((psl = pslNext(lf)) != NULL)
     {
     if (sameString(psl->tName, chrom))
 	setPslBits(lf, acc, psl, 0, chromSize);
     pslFree(&psl);
     }
 lineFileClose(&lf);
 }
 
+static int howManyFields(char *fileName)
+/* Figure out how many fields the first bed has in this file. */
+{
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *line, *row[256];
+unsigned numFields = 0;
+if (lineFileNextReal(lf, &line))
+    numFields = chopByWhite(line, row, ArraySize(row));
+
+lineFileClose(&lf);
+return numFields;
+}
+
+static struct bed *cacheBeds(char *fileName)
+/* Read in the beds from this file and keep them in a cache. */
+{
+static char *cachedFile;
+static struct bed *cachedBeds;
+
+if ((cachedFile == NULL) || differentString(cachedFile, fileName))
+    {
+    if (cachedBeds)
+        {
+        bedFreeList(&cachedBeds);
+        freez(&cachedFile);
+        }
+
+    cachedFile = cloneString(fileName);
+    unsigned numFields = howManyFields(cachedFile);
+    cachedBeds = bedLoadNAllChrom(fileName, numFields > 12 ? 12 : numFields, NULL);
+    }
+
+return cachedBeds;
+}
+
 void fbOrBed(Bits *acc, char *track, char *chrom, int chromSize)
-/* Or in bits of psl file that correspond to chrom. */
+/* Or in bits of bed file that correspond to chrom. */
 {
-struct lineFile *lf;
 char fileName[512];
-char *row[3];
-int s, e, w;
 
 chromFileName(track, chrom, fileName);
 if (!fileExists(fileName))
     return;
-lf = lineFileOpen(fileName, TRUE);
-while (lineFileRow(lf, row))
+struct bed *bed, *beds = cacheBeds(fileName);
+
+for(bed = beds; bed; bed = bed->next)
     {
-    if (sameString(row[0], chrom))
+    if (sameString(bed->chrom, chrom))
         {
-	s = lineFileNeedNum(lf, row, 1);
-	if (s < 0) outOfRange(lf, chrom, chromSize);
-	e = lineFileNeedNum(lf, row, 2);
-	if (e > chromSize) outOfRange(lf, chrom, chromSize);
-	w = e - s;
+        if (countBlocks && (bed->blockCount > 0))
+            {
+            int ii;
+            for(ii=0; ii < bed->blockCount; ii++)
+                bitSetRange(acc, bed->chromStart + bed->chromStarts[ii], bed->blockSizes[ii]);
+            }
+        else
+            {
+            int w;
+            w = bed->chromEnd - bed->chromStart;
             if (w > 0)
-	    bitSetRange(acc, s, w);
+                bitSetRange(acc, bed->chromStart, w);
             }
 	}
-lineFileClose(&lf);
+    }
+}
+
+void fbOrBigBed(Bits *acc, char *track, char *chrom, int chromSize)
+/* Or in a bigBed file. */
+{
+char fileName[512];
+chromFileName(track, chrom, fileName);
+if (!fileExists(fileName))
+    return;
+
+struct lm *lm = lmInit(0);
+struct bbiFile *bbi = bigBedFileOpen(fileName);
+unsigned fieldCount = bbi->definedFieldCount;
+char *bedRow[fieldCount];
+struct bigBedInterval *interval, *intervalList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm);
+
+for (interval = intervalList; interval != NULL; interval = interval->next)
+    {
+    if (countBlocks && (fieldCount >= 12))
+        {
+        char startBuf[16], endBuf[16];
+        bigBedIntervalToRow(interval, chrom, startBuf, endBuf, bedRow, ArraySize(bedRow));
+        struct bed *bed = bedLoadN(bedRow, fieldCount);
+        int ii;
+        for(ii=0; ii < bed->blockCount; ii++)
+            bitSetRange(acc, bed->chromStart + bed->chromStarts[ii], bed->blockSizes[ii]);
+        bedFree(&bed);
+        }
+    else
+        {
+        int w = interval->end - interval->start;
+        if (w > 0)
+            bitSetRange(acc, interval->start, w);
+        }
+    }
+    
+lmCleanup(&lm);
+bbiFileClose(&bbi);
 }
 
 void fbOrChain(Bits *acc, char *track, char *chrom, int chromSize)
 /* Or in a chain file. */
 {
 struct lineFile *lf;
 char fileName[512];
 struct chain *chain;
 struct cBlock *b;
 
 chromFileName(track, chrom, fileName);
 if (!fileExists(fileName))
     return;
 lf = lineFileOpen(fileName, TRUE);
 while ((chain = chainRead(lf)) != NULL)
@@ -429,30 +508,34 @@
 void orFile(Bits *acc, char *track, char *chrom, int chromSize)
 /* Or some sort of file into bits. */
 {
 if (isFileType(track, "psl"))
     {
     fbOrPsl(acc, track, chrom, chromSize);
     }
 else if (isFileType(track, "bed"))
     {
     fbOrBed(acc, track, chrom, chromSize);
     }
 else if (isFileType(track, "chain"))
     {
     fbOrChain(acc, track, chrom, chromSize);
     }
+else if (isFileType(track, "bb"))
+    {
+    fbOrBigBed(acc, track, chrom, chromSize);
+    }
 else  
     errAbort("can't determine file type of: %s", track);
 }
 
 void orTable(char *database, Bits *acc, char *track, char *chrom, 
 	int chromSize, struct sqlConnection *conn)
 /* Or in table if it exists.  Else do nothing. */
 {
 char t[512], *s;
 char table[HDB_MAX_TABLE_STRING];
 
 isolateTrackPartOfSpec(track, t);
 s = strrchr(t, '.');
 if (s != NULL)
     {
@@ -916,30 +999,31 @@
 hFreeConn(&conn);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 if (argc < 3)
     usage();
 clChrom = optionVal("chrom", clChrom);
 chromSizes = optionVal("chromSize", NULL);
 orLogic = optionExists("or");
 notResults = optionExists("not");
 countGaps = optionExists("countGaps");
+countBlocks = optionExists("countBlocks");
 noRandom = optionExists("noRandom");
 noHap = optionExists("noHap");
 primaryChroms = optionExists("primaryChroms");
 dots = optionInt("dots", dots);
 where = optionVal("where", NULL);
 calcEnrichment = optionExists("enrichment");
 if (calcEnrichment && argc != 4)
     errAbort("You must specify two tables with -enrichment");
 if (calcEnrichment && orLogic)
     errAbort("You can't use -or with -enrichment");
 if (calcEnrichment && notResults)
     errAbort("You can't use -not with -enrichment");
 if (notResults && optionExists("fa") && !optionExists("faMerge"))
     errAbort("Must specify -faMerge if using -not with -fa");