fc87e789d56a5ba55444baa6f20a222aebaa5cd5 markd Mon May 13 02:42:33 2024 -0700 added option to bigWigAverageOverBed to output TSV headers diff --git src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c index de08a37..59038d8 100644 --- src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c +++ src/utils/bigWigAverageOverBed/bigWigAverageOverBed.c @@ -1,398 +1,409 @@ /* bigWigAverageOverBed - Compute average score of big wig over each bed, which may have introns. */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "localmem.h" #include "options.h" #include "verbose.h" #include "basicBed.h" #include "bigWig.h" #include "bits.h" char *bedOut = NULL; char *statsRa = NULL; int sampleAroundCenter = 0; +boolean tsv = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "bigWigAverageOverBed v2 - Compute average score of big wig over each bed, which may have introns.\n" "usage:\n" " bigWigAverageOverBed in.bw in.bed out.tab\n" "The output columns are:\n" " name - name field from bed, which should be unique\n" " size - size of bed (sum of exon sizes\n" " covered - # bases within exons covered by bigWig\n" " sum - sum of values over all bases covered\n" " mean0 - average over bases with non-covered bases counting as zeroes\n" " mean - average over just covered bases\n" "Options:\n" " -stats=stats.ra - Output a collection of overall statistics to stat.ra file\n" " -bedOut=out.bed - Make output bed that is echo of input bed but with mean column appended\n" " -sampleAroundCenter=N - Take sample at region N bases wide centered around bed item, rather\n" " than the usual sample in the bed item.\n" " -minMax - include two additional columns containing the min and max observed in the area.\n" + " -tsv - include a TSV header for input to other tools.\n" ); } static struct optionSpec options[] = { {"bedOut", OPTION_STRING}, {"stats", OPTION_STRING}, {"sampleAroundCenter", OPTION_INT}, {"minMax", OPTION_BOOLEAN}, + {"tsv", OPTION_BOOLEAN}, {NULL, 0}, }; void checkUniqueNames(struct bed *bedList) /* Make sure all names in bedList are unique */ { struct hash *hash = hashNew(16); struct bed *bed; for (bed = bedList; bed != NULL; bed = bed->next) { char *name = bed->name; if (hashLookup(hash, name) != NULL) errAbort("%s duplicated in input bed", name); else hashAdd(hash, name, NULL); } hashFree(&hash); } void addBigWigIntervalInfo(struct bbiFile *bbi, struct lm *lm, char *chrom, int start, int end, int *pSumSize, int *pSumCoverage, double *pSumVal, double *pMin, double *pMax) /* Read in interval from bigBed and add it sums. */ { struct bbiInterval *iv, *ivList = bigWigIntervalQuery(bbi, chrom, start, end, lm); *pSumSize += (end - start); double maxVal = *pMax, minVal = *pMin; for (iv = ivList; iv != NULL; iv = iv->next) { int cov1 = rangeIntersection(iv->start, iv->end, start, end); if (cov1 > 0) { *pSumCoverage += cov1; double val = iv->val; if (maxVal < val) maxVal = val; if (minVal > val) minVal = val; *pSumVal += cov1 * val; } } *pMin = minVal; *pMax = maxVal; } int countBlocks(struct bed *bedList, int fieldCount) /* Return the number of blocks in list, or if non-blocked beds, just number of beds. */ { if (fieldCount < 12) return slCount(bedList); int blockCount = 0; struct bed *bed; for (bed = bedList; bed != NULL; bed = bed->next) blockCount += bed->blockCount; return blockCount; } void optionallyPrintBedPlus(FILE *f, struct bed *bed, int fieldCount, double extra, boolean outputMinMax, double minVal, double maxVal) /* Print BED to tab separated file plus an extra double-format column. */ { if (f != NULL) { bedOutputN(bed, fieldCount, f, '\t', '\t'); fprintf(f, "%g", extra); if (outputMinMax) fprintf(f, "\t%g\t%g", minVal, maxVal); fputc('\n', f); } } double sumSum; long long sumCoverage; long long sumSize; void updateSums(double sum, int coverage, int size) /* Just add to the above three numbers. */ { sumSum += sum; sumCoverage += coverage; sumSize += size; } long long bbiTotalChromSize(struct bbiFile *bbi) /* Return sum of sizes of all chromosomes */ { struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); long long total = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) total += chrom->size; bbiChromInfoFreeList(&chromList); return total; } /* Return all chromosomes in file. Dispose of this with bbiChromInfoFreeList. */ void outputSums(char *fileName, struct bbiFile *bbi) /* Write a little .ra file with results of sums. */ { FILE *f = mustOpen(fileName, "w"); struct bbiSummaryElement sumEl = bbiTotalSummary(bbi); double totalSignal = sumEl.sumData; long long basesInGenome = bbiTotalChromSize(bbi); fprintf(f, "spotRatio %g\n", sumSum/totalSignal); fprintf(f, "enrichment %g\n", (sumSum/sumSize) / (totalSignal/basesInGenome)); fprintf(f, "maxEnrichment %g\n", (double)basesInGenome/sumSize); fprintf(f, "basesInGenome %lld\n", basesInGenome); fprintf(f, "basesInSpots %lld\n", sumSize); fprintf(f, "basesInSpotsWithSignal %lld\n", sumCoverage); fprintf(f, "sumSignal %g\n", totalSignal); fprintf(f, "spotSumSignal %g\n", sumSum); carefulClose(&f); } void averageFetchingEachBlock(struct bbiFile *bbi, struct bed *bedList, int fieldCount, int outputMinMax, FILE *f, FILE *bedF) /* Do the averaging fetching each block from bedList from bigWig. Fastest for short bedList. */ { struct lm *lm = lmInit(0); struct bed *bed; for (bed = bedList; bed != NULL; bed = bed->next) { int coverage = 0; double sum = 0.0; int size = 0; double minVal = BIGDOUBLE, maxVal = -BIGDOUBLE; if (sampleAroundCenter > 0) { int center = (bed->chromStart + bed->chromEnd)/2; int left = center - (sampleAroundCenter/2); addBigWigIntervalInfo(bbi, lm, bed->chrom, left, left+sampleAroundCenter, &size, &coverage, &sum, &minVal, &maxVal); } else { if (fieldCount < 12) addBigWigIntervalInfo(bbi, lm, bed->chrom, bed->chromStart, bed->chromEnd, &size, &coverage, &sum, &minVal, &maxVal); else { int i; for (i=0; iblockCount; ++i) { int start = bed->chromStart + bed->chromStarts[i]; int end = start + bed->blockSizes[i]; addBigWigIntervalInfo(bbi, lm, bed->chrom, start, end, &size, &coverage, &sum, &minVal, &maxVal); } } } /* Print out result, fudging mean to 0 if no coverage at all. */ double mean = 0; if (coverage > 0) mean = sum/coverage; fprintf(f, "%s\t%d\t%d\t%g\t%g\t%g", bed->name, size, coverage, sum, sum/size, mean); if (outputMinMax) { if (coverage > 0) fprintf(f, "\t%g\t%g", minVal, maxVal); else fprintf(f, "\t0\t0"); // put out zeros for min/max if no coverage } fputc('\n', f); optionallyPrintBedPlus(bedF, bed, fieldCount, mean, outputMinMax, minVal, maxVal); updateSums(sum, coverage, size); } } int bedCmpChrom(const void *va, const void *vb) /* Compare strings such as chromosome names that may have embedded numbers, * so that chr4 comes before chr14 */ { const struct bed *a = *((struct bed **)va); const struct bed *b = *((struct bed **)vb); return cmpStringsWithEmbeddedNumbers(a->chrom, b->chrom); } struct bed *nextChromInList(struct bed *bedList) /* Return first bed in list that starts with another chromosome, or NULL if none. */ { char *chrom = bedList->chrom; struct bed *bed; for (bed = bedList->next; bed != NULL; bed = bed->next) if (!sameString(bed->chrom, chrom)) break; return bed; } void addBufIntervalInfo(double *valBuf, Bits *covBuf, int start, int end, int *pSumSize, int *pSumCoverage, double *pSumVal, double *pMin, double *pMax) /* Look at interval in buffers and add result to sums. */ { int size1 = end - start; *pSumSize += size1; int cov1 = bitCountRange(covBuf, start, size1); *pSumCoverage += cov1; int i; double sum1 = 0; double maxVal = *pMax, minVal = *pMin; for (i=start; i val) minVal = val; sum1 += val; } *pSumVal += sum1; *pMin = minVal; *pMax = maxVal; } void averageFetchingEachChrom(struct bbiFile *bbi, struct bed **pBedList, int fieldCount, int outputMinMax, FILE *f, FILE *bedF) /* Do the averaging by sorting bedList by chromosome, and then processing each chromosome * at once. Faster for long bedLists. */ { /* Sort by chromosome. */ slSort(pBedList, bedCmpChrom); struct bigWigValsOnChrom *chromVals = bigWigValsOnChromNew(); struct bed *bed, *bedList, *nextChrom; verbose(1, "processing chromosomes"); for (bedList = *pBedList; bedList != NULL; bedList = nextChrom) { /* Figure out which chromosome we're working on, and the last bed using it. */ char *chrom = bedList->chrom; nextChrom = nextChromInList(bedList); verbose(2, "Processing %s\n", chrom); if (bigWigValsOnChromFetchData(chromVals, chrom, bbi)) { double *valBuf = chromVals->valBuf; Bits *covBuf = chromVals->covBuf; /* Loop through beds doing sums and outputting. */ for (bed = bedList; bed != nextChrom; bed = bed->next) { int size = 0, coverage = 0; double sum = 0.0; double minVal = BIGDOUBLE, maxVal = -BIGDOUBLE; if (sampleAroundCenter > 0) { int center = (bed->chromStart + bed->chromEnd)/2; int left = center - (sampleAroundCenter/2); addBufIntervalInfo(valBuf, covBuf, left, left+sampleAroundCenter, &size, &coverage, &sum, &minVal, &maxVal); } else { if (fieldCount < 12) { addBufIntervalInfo(valBuf, covBuf, bed->chromStart, bed->chromEnd, &size, &coverage, &sum, &minVal, &maxVal); } else { int i; for (i=0; iblockCount; ++i) { int start = bed->chromStart + bed->chromStarts[i]; int end = start + bed->blockSizes[i]; addBufIntervalInfo(valBuf, covBuf, start, end, &size, &coverage, &sum, &minVal, &maxVal); } } } /* Print out result, fudging mean to 0 if no coverage at all. */ double mean = 0; if (coverage > 0) mean = sum/coverage; fprintf(f, "%s\t%d\t%d\t%g\t%g\t%g", bed->name, size, coverage, sum, sum/size, mean); if (outputMinMax) fprintf(f, "\t%g\t%g", minVal, maxVal); fputc('\n', f); optionallyPrintBedPlus(bedF, bed, fieldCount, mean, outputMinMax, minVal, maxVal); updateSums(sum, coverage, size); } verboseDot(); } else { /* If no bigWig data on this chromosome, just output as if coverage is 0 */ for (bed = bedList; bed != nextChrom; bed = bed->next) { fprintf(f, "%s\t%d\t0\t0\t0\t0", bed->name, bedTotalBlockSize(bed)); if (outputMinMax) fprintf(f, "\t0\t0"); fputc('\n', f); optionallyPrintBedPlus(bedF, bed, fieldCount, 0, outputMinMax, BIGDOUBLE, -BIGDOUBLE); } } } verbose(1, "\n"); } void bigWigAverageOverBed(char *inBw, char *inBed, char *outTab) /* bigWigAverageOverBed - Compute average score of big wig over each bed, which may have introns. */ { struct bed *bedList; int fieldCount; boolean minMax = optionExists("minMax"); bedLoadAllReturnFieldCount(inBed, &bedList, &fieldCount); checkUniqueNames(bedList); struct bbiFile *bbi = bigWigFileOpen(inBw); FILE *f = mustOpen(outTab, "w"); +if (tsv) + { + fprintf(f, "name\tsize\tcovered\tsum\tmean0\tmean"); + if (minMax) + fprintf(f, "\tmin\tmax"); + fputc('\n', f); + } FILE *bedF = NULL; if (bedOut != NULL) bedF = mustOpen(bedOut, "w"); /* Count up number of blocks in file. It takes about 1/100th of of second to * look up a single block in a bigWig. On the other hand to stream through * the whole file setting a array of doubles takes about 30 seconds, so we change * strategy at 3,000 blocks. * I (Jim) usually avoid having two paths through the code like this, and am tempted * to always go the ~30 second chromosome-at-a-time way. On the other hand the block-way * was developed first, and it was useful to have both ways to test against each other. * (This found a bug where the chromosome way wasn't handling beds in chromosomes not * covered by the bigWig for instance). Since this code is not likely to change too * much, keeping both implementations in seems reasonable. */ int blockCount = countBlocks(bedList, fieldCount); verbose(2, "Got %d blocks, if >= 3000 will use chromosome-at-a-time method\n", blockCount); if (blockCount < 3000) averageFetchingEachBlock(bbi, bedList, fieldCount, minMax, f, bedF); else averageFetchingEachChrom(bbi, &bedList, fieldCount, minMax, f, bedF); if (statsRa != NULL) outputSums(statsRa, bbi); carefulClose(&bedF); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); bedOut = optionVal("bedOut", bedOut); statsRa = optionVal("stats", statsRa); sampleAroundCenter = optionInt("sampleAroundCenter", sampleAroundCenter); +tsv = optionExists("tsv"); bigWigAverageOverBed(argv[1], argv[2], argv[3]); return 0; }