fc87e789d56a5ba55444baa6f20a222aebaa5cd5
markd
  Mon May 13 02:42:33 2024 -0700
added option to bigWigAverageOverBed to output TSV headers

diff --git src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c
index de08a37..59038d8 100644
--- src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c
+++ src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c
@@ -5,59 +5,62 @@
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "localmem.h"
 #include "options.h"
 #include "verbose.h"
 #include "basicBed.h"
 #include "bigWig.h"
 #include "bits.h"
 
 
 char *bedOut = NULL;
 char *statsRa = NULL;
 int sampleAroundCenter = 0;
+boolean tsv = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bigWigAverageOverBed v2 - Compute average score of big wig over each bed, which may have introns.\n"
   "usage:\n"
   "   bigWigAverageOverBed in.bw in.bed out.tab\n"
   "The output columns are:\n"
   "   name - name field from bed, which should be unique\n"
   "   size - size of bed (sum of exon sizes\n"
   "   covered - # bases within exons covered by bigWig\n"
   "   sum - sum of values over all bases covered\n"
   "   mean0 - average over bases with non-covered bases counting as zeroes\n"
   "   mean - average over just covered bases\n"
   "Options:\n"
   "   -stats=stats.ra - Output a collection of overall statistics to stat.ra file\n"
   "   -bedOut=out.bed - Make output bed that is echo of input bed but with mean column appended\n"
   "   -sampleAroundCenter=N - Take sample at region N bases wide centered around bed item, rather\n"
   "                     than the usual sample in the bed item.\n"
   "   -minMax - include two additional columns containing the min and max observed in the area.\n"
+  "   -tsv - include a TSV header for input to other tools.\n"
   );
 }
 
 static struct optionSpec options[] = {
    {"bedOut", OPTION_STRING},
    {"stats", OPTION_STRING},
    {"sampleAroundCenter", OPTION_INT},
    {"minMax", OPTION_BOOLEAN},
+   {"tsv", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 void checkUniqueNames(struct bed *bedList)
 /* Make sure all names in bedList are unique */
 {
 struct hash *hash = hashNew(16);
 struct bed *bed;
 for (bed = bedList; bed != NULL; bed = bed->next)
     {
     char *name = bed->name;
     if (hashLookup(hash, name) != NULL)
         errAbort("%s duplicated in input bed", name);
     else
         hashAdd(hash, name, NULL);
@@ -344,30 +347,37 @@
     }
 verbose(1, "\n");
 }
 
 void bigWigAverageOverBed(char *inBw, char *inBed, char *outTab)
 /* bigWigAverageOverBed - Compute average score of big wig over each bed, which may have introns. */
 {
 struct bed *bedList;
 int fieldCount;
 boolean minMax = optionExists("minMax");
 bedLoadAllReturnFieldCount(inBed, &bedList, &fieldCount);
 checkUniqueNames(bedList);
 
 struct bbiFile *bbi = bigWigFileOpen(inBw);
 FILE *f = mustOpen(outTab, "w");
+if (tsv)
+    {
+    fprintf(f, "name\tsize\tcovered\tsum\tmean0\tmean");
+    if (minMax)
+        fprintf(f, "\tmin\tmax");
+    fputc('\n', f);
+    }
 FILE *bedF = NULL;
 if (bedOut != NULL)
     bedF = mustOpen(bedOut, "w");
 
 /* Count up number of blocks in file.  It takes about 1/100th of of second to
  * look up a single block in a bigWig.  On the other hand to stream through
  * the whole file setting a array of doubles takes about 30 seconds, so we change
  * strategy at 3,000 blocks. 
  *   I (Jim) usually avoid having two paths through the code like this, and am tempted
  * to always go the ~30 second chromosome-at-a-time  way.  On the other hand the block-way
  * was developed first, and it was useful to have both ways to test against each other.
  * (This found a bug where the chromosome way wasn't handling beds in chromosomes not
  * covered by the bigWig for instance).  Since this code is not likely to change too
  * much, keeping both implementations in seems reasonable. */
 int blockCount = countBlocks(bedList, fieldCount);
@@ -381,18 +391,19 @@
     outputSums(statsRa, bbi);
 
 carefulClose(&bedF);
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 bedOut = optionVal("bedOut", bedOut);
 statsRa = optionVal("stats", statsRa);
 sampleAroundCenter = optionInt("sampleAroundCenter", sampleAroundCenter);
+tsv = optionExists("tsv");
 bigWigAverageOverBed(argv[1], argv[2], argv[3]);
 return 0;
 }