7f98bf75bd712a44b8ed9ededdd10e0f387cd35a angie Thu Aug 13 10:31:58 2015 -0700 Moved averaging of wiggle values back from annoFormatTab to annoGrateWig, so that it can be done properly across multiple regions with data. A user was trying to get average GC over 1MB regions, but averages of smaller subregions were returned instead. refs #15834 diff --git src/lib/annoGrateWig.c src/lib/annoGrateWig.c index b0814f1..dd816b5 100644 --- src/lib/annoGrateWig.c +++ src/lib/annoGrateWig.c @@ -1,114 +1,200 @@ /* annoGrateWig -- subclass of annoGrator for bigWig or wiggle data */ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "annoGrateWig.h" #include "annoStreamBigWig.h" struct annoGrateWig { struct annoGrator grator; // external annoGrator interface struct annoGrator *mySource; // internal annoGrator of wigDb rows struct lm *lm; // localmem for rows from mySource int lmRowCount; // counter for periodic cleanup + enum annoGrateWigMode mode; // output: either one annoRow with average value in range + // or one or more annoRows excluding bases with NANs }; -static void tidyUp(const struct annoRow *rowIn, struct annoRow **pOutList, +static struct annoRow *normalizeRows(const struct annoRow *rowsIn, uint primaryStart, uint primaryEnd, struct lm *callerLm) -/* This takes a wiggle chunk coming from a .wig/database row and makes it into +/* This takes a series of wiggle chunks coming from .wig/database rows and makes it into * zero or more tidy little NAN-less annoRows. Trim rowIn to the bounds of * primary, trim NANs from beginning and break into multiple rows where there * are NANs in the middle. If the rowIn is contiguous with the row at the * head of outList, expand that row to include rowIn's data. */ { +struct annoRow *rowOutList = NULL; +const struct annoRow *rowIn; +for (rowIn = rowsIn; rowIn != NULL; rowIn = rowIn->next) + { uint start = max(rowIn->start, primaryStart); uint end = min(rowIn->end, primaryEnd); float *vector = rowIn->data; while (end > start) { uint offset = start - rowIn->start; if (isnan(vector[offset])) start++; else { // If there is a NAN before end, that's the end of this row: uint thisEnd = start; while (thisEnd < end && ! isnan(vector[thisEnd - rowIn->start])) thisEnd++; - struct annoRow *headRow = *pOutList; + struct annoRow *headRow = rowOutList; if (headRow == NULL || rowIn->start > headRow->end) { // allocate a new row - struct annoRow *rowOut = annoRowWigNew(rowIn->chrom, start, thisEnd, FALSE, + struct annoRow *rowOut = annoRowWigVecNew(rowIn->chrom, start, thisEnd, FALSE, vector + offset, callerLm); - slAddHead(pOutList, rowOut); + slAddHead(&rowOutList, rowOut); } else { // glom new data onto headRow if (thisEnd < headRow->end) - errAbort("Expected thisEnd (%u) to be greater than or equal to headRow->end (%u)", + errAbort("Expected thisEnd (%u) to be greater than or equal to " + "headRow->end (%u)", thisEnd, headRow->end); uint oldEnd = headRow->end; uint oldLen = oldEnd - headRow->start; uint newLen = thisEnd - headRow->start; + //#*** TODO: precompute how many bytes we will need, so we don't have to realloc!! headRow->data = lmAllocMoreMem(callerLm, headRow->data, oldLen*sizeof(vector[0]), newLen*sizeof(vector[0])); headRow->end = thisEnd; float *newData = (float *)rowIn->data + (oldEnd - rowIn->start); float *newSpace = (float *)headRow->data + oldLen; CopyArray(newData, newSpace, (thisEnd - oldEnd)); } start = thisEnd; } } } +slReverse(&rowOutList); +return rowOutList; +} + +static struct annoRow *averageRows(struct annoRow *rowList, uint primaryStart, uint primaryEnd, + struct lm *callerLm) +/* Return an annoRow with the average value of floats across all row->data (within primary item's + * region), ignoring missing data and NANs. Return NULL if rowList is NULL or if all bases + * overlapping the primary item are NAN (unlikely, but could happen). */ +{ +if (rowList == NULL) + return NULL; +int count = 0; +double sum = 0.0; +uint start = max(rowList->start, primaryStart); +uint end = start; +struct annoRow *row; +for (row = rowList; row != NULL; row = row->next) + { + uint rowStart = max(row->start, primaryStart); + uint rowEnd = min(row->end, primaryEnd); + if (rowEnd > rowStart) + { + float *vector = row->data; + int iStart = rowStart - row->start; + int iEnd = rowEnd - row->start; + int i; + for (i = iStart; i < iEnd; i++) + { + // skip NAN values so they don't convert sum to a NAN: + if (! isnan(vector[i])) + { + sum += vector[i]; + count++; + } + } + end = rowEnd; + } + } +if (count > 0) + { + double average = (sum / (double)count); + return annoRowWigSingleNew(rowList->chrom, start, end, FALSE, average, callerLm); + } +return NULL; +} static struct annoRow *agwIntegrate(struct annoGrator *gSelf, struct annoStreamRows *primaryData, boolean *retRJFilterFailed, struct lm *callerLm) /* Return wig annoRows that overlap primaryRow position, with NANs weeded out. */ { struct annoGrateWig *self = (struct annoGrateWig *)gSelf; // Cleanup internal lm every so often: if (self->lmRowCount >= 4096) { lmCleanup(&(self->lm)); self->lmRowCount = 0; } if (self->lm == NULL) self->lm = lmInit(0); struct annoRow *rowsIn = annoGratorIntegrate(self->mySource, primaryData, retRJFilterFailed, self->lm); self->lmRowCount += slCount(rowsIn); if (retRJFilterFailed && *retRJFilterFailed) return NULL; struct annoRow *primaryRow = primaryData->rowList; -struct annoRow *rowIn, *rowOutList = NULL;; -for (rowIn = rowsIn; rowIn != NULL; rowIn = rowIn->next) - tidyUp(rowIn, &rowOutList, primaryRow->start, primaryRow->end, callerLm); -slReverse(&rowOutList); -return rowOutList; +if (self->mode == agwmAverage) + return averageRows(rowsIn, primaryRow->start, primaryRow->end, callerLm); +else if (self->mode == agwmPerBase) + return normalizeRows(rowsIn, primaryRow->start, primaryRow->end, callerLm); +else + errAbort("agwIntegrate: invalid self->mode %d", self->mode); +return NULL; } -struct annoGrator *annoGrateWigNew(struct annoStreamer *wigSource) -/* Create an annoGrator subclass for source with rowType == arWig. */ +static void appendAverageToValueLabel(struct asObject *asObj) +/* When we are averaging wiggle values, tweak the "value" column label to remind the user. */ +{ +struct asColumn *col; +for (col = asObj->columnList; col != NULL; col = col->next) + { + if (sameString(col->name, "value")) { -if (wigSource->rowType != arWig) - errAbort("annoGrateWigNew: expected source->rowType arWig (%d), got %d from source %s", - arWig, wigSource->rowType, wigSource->name); + freez(&col->name); + col->name = cloneString("valueAverage"); + } + } +} + +struct annoGrator *annoGrateWigNew(struct annoStreamer *wigSource, enum annoGrateWigMode mode) +/* Create an annoGrator subclass for source with wiggle or bigWig data. + * If mode is agwmAverage, then this produces at most one annoRow per primary item + * with type arWigSingle, containing the average of all overlapping non-NAN values. + * If mode is agwmPerBase, then this produces a list of annoRows split by NANs, with type + * arWigVec (vector of per-base values). */ +{ +if (wigSource->rowType != arWigVec) + errAbort("annoGrateWigNew: expected source->rowType arWigVec (%d), got %d from source %s", + arWigVec, wigSource->rowType, wigSource->name); struct annoGrateWig *self; AllocVar(self); struct annoGrator *gSelf = (struct annoGrator *)self; annoGratorInit(gSelf, wigSource); gSelf->integrate = agwIntegrate; self->mySource = annoGratorNew(wigSource); +self->mode = mode; +struct annoStreamer *sSelf = (struct annoStreamer *)gSelf; +if (mode == agwmAverage) + { + sSelf->rowType = arWigSingle; + appendAverageToValueLabel(sSelf->asObj); + } +else if (mode == agwmPerBase) + sSelf->rowType = arWigVec; +else + errAbort("annoGrateWigNew: unrecognized enum annoGrateWigMode %d", mode); return gSelf; } -struct annoGrator *annoGrateBigWigNew(char *fileOrUrl, struct annoAssembly *aa) -/* Create an annoGrator subclass for bigWig file or URL. */ +struct annoGrator *annoGrateBigWigNew(char *fileOrUrl, struct annoAssembly *aa, + enum annoGrateWigMode mode) +/* Create an annoGrator subclass for bigWig file or URL. See above description of mode. */ { struct annoStreamer *bigWigSource = annoStreamBigWigNew(fileOrUrl, aa); -return annoGrateWigNew(bigWigSource); +return annoGrateWigNew(bigWigSource, mode); }