fc87e789d56a5ba55444baa6f20a222aebaa5cd5 markd Mon May 13 02:42:33 2024 -0700 added option to bigWigAverageOverBed to output TSV headers diff --git src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c index de08a37..59038d8 100644 --- src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c +++ src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c @@ -5,59 +5,62 @@ #include "common.h" #include "linefile.h" #include "hash.h" #include "localmem.h" #include "options.h" #include "verbose.h" #include "basicBed.h" #include "bigWig.h" #include "bits.h" char *bedOut = NULL; char *statsRa = NULL; int sampleAroundCenter = 0; +boolean tsv = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "bigWigAverageOverBed v2 - Compute average score of big wig over each bed, which may have introns.\n" "usage:\n" " bigWigAverageOverBed in.bw in.bed out.tab\n" "The output columns are:\n" " name - name field from bed, which should be unique\n" " size - size of bed (sum of exon sizes\n" " covered - # bases within exons covered by bigWig\n" " sum - sum of values over all bases covered\n" " mean0 - average over bases with non-covered bases counting as zeroes\n" " mean - average over just covered bases\n" "Options:\n" " -stats=stats.ra - Output a collection of overall statistics to stat.ra file\n" " -bedOut=out.bed - Make output bed that is echo of input bed but with mean column appended\n" " -sampleAroundCenter=N - Take sample at region N bases wide centered around bed item, rather\n" " than the usual sample in the bed item.\n" " -minMax - include two additional columns containing the min and max observed in the area.\n" + " -tsv - include a TSV header for input to other tools.\n" ); } static struct optionSpec options[] = { {"bedOut", OPTION_STRING}, {"stats", OPTION_STRING}, {"sampleAroundCenter", OPTION_INT}, {"minMax", OPTION_BOOLEAN}, + {"tsv", OPTION_BOOLEAN}, {NULL, 0}, }; void checkUniqueNames(struct bed *bedList) /* Make sure all names in bedList are unique */ { struct hash *hash = hashNew(16); struct bed *bed; for (bed = bedList; bed != NULL; bed = bed->next) { char *name = bed->name; if (hashLookup(hash, name) != NULL) errAbort("%s duplicated in input bed", name); else hashAdd(hash, name, NULL); @@ -344,30 +347,37 @@ } verbose(1, "\n"); } void bigWigAverageOverBed(char *inBw, char *inBed, char *outTab) /* bigWigAverageOverBed - Compute average score of big wig over each bed, which may have introns. */ { struct bed *bedList; int fieldCount; boolean minMax = optionExists("minMax"); bedLoadAllReturnFieldCount(inBed, &bedList, &fieldCount); checkUniqueNames(bedList); struct bbiFile *bbi = bigWigFileOpen(inBw); FILE *f = mustOpen(outTab, "w"); +if (tsv) + { + fprintf(f, "name\tsize\tcovered\tsum\tmean0\tmean"); + if (minMax) + fprintf(f, "\tmin\tmax"); + fputc('\n', f); + } FILE *bedF = NULL; if (bedOut != NULL) bedF = mustOpen(bedOut, "w"); /* Count up number of blocks in file. It takes about 1/100th of of second to * look up a single block in a bigWig. On the other hand to stream through * the whole file setting a array of doubles takes about 30 seconds, so we change * strategy at 3,000 blocks. * I (Jim) usually avoid having two paths through the code like this, and am tempted * to always go the ~30 second chromosome-at-a-time way. On the other hand the block-way * was developed first, and it was useful to have both ways to test against each other. * (This found a bug where the chromosome way wasn't handling beds in chromosomes not * covered by the bigWig for instance). Since this code is not likely to change too * much, keeping both implementations in seems reasonable. */ int blockCount = countBlocks(bedList, fieldCount); @@ -381,18 +391,19 @@ outputSums(statsRa, bbi); carefulClose(&bedF); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); bedOut = optionVal("bedOut", bedOut); statsRa = optionVal("stats", statsRa); sampleAroundCenter = optionInt("sampleAroundCenter", sampleAroundCenter); +tsv = optionExists("tsv"); bigWigAverageOverBed(argv[1], argv[2], argv[3]); return 0; }