7f98bf75bd712a44b8ed9ededdd10e0f387cd35a angie Thu Aug 13 10:31:58 2015 -0700 Moved averaging of wiggle values back from annoFormatTab to annoGrateWig, so that it can be done properly across multiple regions with data. A user was trying to get average GC over 1MB regions, but averages of smaller subregions were returned instead. refs #15834 diff --git src/lib/annoFormatTab.c src/lib/annoFormatTab.c index 8971be0..06259e7 100644 --- src/lib/annoFormatTab.c +++ src/lib/annoFormatTab.c @@ -5,33 +5,30 @@ #include "annoFormatTab.h" #include "annoGratorQuery.h" #include "dystring.h" struct annoFormatTab { struct annoFormatter formatter; // External interface char *fileName; // Output file name, can be "stdout" FILE *f; // Output file handle struct hash *columnVis; // Hash of columns that have been explicitly selected // or deselected by user. boolean needHeader; // TRUE if we should print out the header }; -//#*** config options: avg? more stats? list of values? -static boolean doAvg = TRUE; - static void makeFullColumnName(char *fullName, size_t size, char *sourceName, char *colName) /* If sourceName is non-empty, make fullName sourceName.colName, otherwise just colName. */ { if (isNotEmpty(sourceName)) safef(fullName, size, "%s.%s", sourceName, colName); else safecpy(fullName, size, colName); } void annoFormatTabSetColumnVis(struct annoFormatter *vSelf, char *sourceName, char *colName, boolean enabled) /* Explicitly include or exclude column in output. sourceName must be the same * as the corresponding annoStreamer source's name. */ { struct annoFormatTab *self = (struct annoFormatTab *)vSelf; @@ -44,166 +41,132 @@ static boolean columnIsIncluded(struct annoFormatTab *self, char *sourceName, char *colName) // Return TRUE if column has not been explicitly deselected. { if (self->columnVis) { char fullName[PATH_LEN]; makeFullColumnName(fullName, sizeof(fullName), sourceName, colName); int vis = hashIntValDefault(self->columnVis, fullName, 1); if (vis == 0) return FALSE; } return TRUE; } -static boolean isWiggle(struct asColumn *colList) -/* Recognize wiggle/bigWig data by its column signature. */ -{ -return (slCount(colList) == 4 && - sameString(colList->name, "chrom") && - sameString(colList->next->name, "chromStart") && - sameString(colList->next->next->name, "chromEnd") && - sameString(colList->next->next->next->name, "value")); -} - static void printHeaderColumns(struct annoFormatTab *self, struct annoStreamer *source, boolean isFirst) /* Print names of included columns from this source. */ { FILE *f = self->f; char *sourceName = source->name; -boolean isAvg = isWiggle(source->asObj->columnList) && doAvg; struct asColumn *col; -int i; -for (col = source->asObj->columnList, i = 0; col != NULL; col = col->next, i++) +for (col = source->asObj->columnList; col != NULL; col = col->next) { if (columnIsIncluded(self, sourceName, col->name)) { if (isFirst) { fputc('#', f); isFirst = FALSE; } else fputc('\t', f); char fullName[PATH_LEN]; makeFullColumnName(fullName, sizeof(fullName), sourceName, col->name); - if (isAvg && sameString(col->name, "value")) - safecat(fullName, sizeof(fullName), "Average"); fputs(fullName, f); } } } static void aftInitialize(struct annoFormatter *vSelf, struct annoStreamer *primary, struct annoStreamer *integrators) /* Print header, regardless of whether we get any data after this. */ { struct annoFormatTab *self = (struct annoFormatTab *)vSelf; if (self->needHeader) { char *primaryHeader = primary->getHeader(primary); if (isNotEmpty(primaryHeader)) fprintf(self->f, "# Header from primary input:\n%s", primaryHeader); printHeaderColumns(self, primary, TRUE); struct annoStreamer *grator; for (grator = integrators; grator != NULL; grator = grator->next) printHeaderColumns(self, grator, FALSE); fputc('\n', self->f); self->needHeader = FALSE; } } -static double wigRowAvg(struct annoRow *row) -/* Return the average value of floats in row->data. */ -{ -float *vector = row->data; -int len = row->end - row->start; -int count = 0; -double sum = 0.0; -int i; -for (i = 0; i < len; i++) - { - // skip NAN values so they don't convert sum to a NAN: - if (! isnan(vector[i])) - { - sum += vector[i]; - count++; - } - } -// I expected "double avg = sum / (double)count" to yield NAN if count is 0 -- -// but avg was negative NAN! Avoid the fp exception and weird value by testing count: -return (count > 0) ? sum / (double)count : NAN; -} - static char **bed4WordsFromAnnoRow(struct annoRow *row, char *fourth) /* Return an array of 4 words with row's chrom, chromStart, and chromEnd, and cloned fourth. */ { char **words; AllocArray(words, 4); words[0] = cloneString(row->chrom); char buf[PATH_LEN]; safef(buf, sizeof(buf), "%u", row->start); words[1] = cloneString(buf); safef(buf, sizeof(buf), "%u", row->end); words[2] = cloneString(buf); words[3] = cloneString(fourth); return words; } -static char **wordsFromWigRowAvg(struct annoRow *row) -/* Return chrom, chromStart, chromEnd and a string containing the average of values in row. */ +static char **wordsFromWigRow(struct annoRow *row, enum annoRowType rowType) +/* Return chrom, chromStart, chromEnd and a string containing comma-sep per-base wiggle values + * if rowType is arWigVec, or one value if rowType is arWigSingle. */ { -double avg = wigRowAvg(row); -if (isnan(avg)) - return NULL; -char avgStr[32]; -safef(avgStr, sizeof(avgStr), "%lf", avg); -return bed4WordsFromAnnoRow(row, avgStr); -} - -static char **wordsFromWigRowVals(struct annoRow *row) -/* Return chrom, chromStart, chromEnd and a string containing comma-sep per-base wiggle values. */ +struct dyString *dy = NULL; +if (rowType == arWigVec) { float *vector = row->data; int len = row->end - row->start; -struct dyString *dy = dyStringNew(10*len); + dy = dyStringNew(10*len); int i; for (i = 0; i < len; i++) - dyStringPrintf(dy, "%g,", vector[i]); + { + if (i > 0) + dyStringAppendC(dy, ','); + dyStringPrintf(dy, "%g", vector[i]); + } + } +else if (rowType == arWigSingle) + { + double *pValue = row->data; + dy = dyStringCreate("%lf", *pValue); + } +else + errAbort("wordsFromWigRow: invalid rowType %d", rowType); char **words = bed4WordsFromAnnoRow(row, dy->string); dyStringFree(&dy); return words; } static char **wordsFromRow(struct annoRow *row, struct annoStreamer *source, boolean *retFreeWhenDone) /* If source->rowType is arWords, return its words. Otherwise translate row->data into words. */ { if (row == NULL) return NULL; boolean freeWhenDone = FALSE; char **words = NULL; if (source->rowType == arWords) words = row->data; -else if (source->rowType == arWig) +else if (source->rowType == arWigVec || source->rowType == arWigSingle) { - if (doAvg) - words = wordsFromWigRowAvg(row); - else - words = wordsFromWigRowVals(row); + words = wordsFromWigRow(row, source->rowType); if (words != NULL) freeWhenDone = TRUE; } else errAbort("annoFormatTab: unrecognized row type %d from source %s", source->rowType, source->name); if (retFreeWhenDone != NULL) *retFreeWhenDone = freeWhenDone; return words; } static void printColumns(struct annoFormatTab *self, struct annoStreamer *streamer, struct annoRow *row, boolean isFirst) /* Print columns in streamer's row (if NULL, print the right number of empty fields). */ {