7f98bf75bd712a44b8ed9ededdd10e0f387cd35a
angie
  Thu Aug 13 10:31:58 2015 -0700
Moved averaging of wiggle values back from annoFormatTab to annoGrateWig,
so that it can be done properly across multiple regions with data.
A user was trying to get average GC over 1MB regions, but averages of
smaller subregions were returned instead.
refs #15834

diff --git src/lib/annoGrateWig.c src/lib/annoGrateWig.c
index b0814f1..dd816b5 100644
--- src/lib/annoGrateWig.c
+++ src/lib/annoGrateWig.c
@@ -1,114 +1,200 @@
 /* annoGrateWig -- subclass of annoGrator for bigWig or wiggle data */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "annoGrateWig.h"
 #include "annoStreamBigWig.h"
 
 struct annoGrateWig
 {
     struct annoGrator grator;		// external annoGrator interface
     struct annoGrator *mySource;	// internal annoGrator of wigDb rows
     struct lm *lm;			// localmem for rows from mySource
     int lmRowCount;			// counter for periodic cleanup
+    enum annoGrateWigMode mode;         // output: either one annoRow with average value in range
+                                        // or one or more annoRows excluding bases with NANs
 };
 
-static void tidyUp(const struct annoRow *rowIn, struct annoRow **pOutList,
+static struct annoRow *normalizeRows(const struct annoRow *rowsIn,
                                      uint primaryStart, uint primaryEnd, struct lm *callerLm)
-/* This takes a wiggle chunk coming from a .wig/database row and makes it into
+/* This takes a series of wiggle chunks coming from .wig/database rows and makes it into
  * zero or more tidy little NAN-less annoRows.  Trim rowIn to the bounds of
  * primary, trim NANs from beginning and break into multiple rows where there
  * are NANs in the middle.  If the rowIn is contiguous with the row at the
  * head of outList, expand that row to include rowIn's data. */
 {
+struct annoRow *rowOutList = NULL;
+const struct annoRow *rowIn;
+for (rowIn = rowsIn;  rowIn != NULL;  rowIn = rowIn->next)
+    {
     uint start = max(rowIn->start, primaryStart);
     uint end = min(rowIn->end, primaryEnd);
     float *vector = rowIn->data;
     while (end > start)
         {
         uint offset = start - rowIn->start;
         if (isnan(vector[offset]))
             start++;
         else
             {
             // If there is a NAN before end, that's the end of this row:
             uint thisEnd = start;
             while (thisEnd < end && ! isnan(vector[thisEnd - rowIn->start]))
                 thisEnd++;
-	struct annoRow *headRow = *pOutList;
+            struct annoRow *headRow = rowOutList;
             if (headRow == NULL || rowIn->start > headRow->end)
                 {
                 // allocate a new row
-	    struct annoRow *rowOut = annoRowWigNew(rowIn->chrom, start, thisEnd, FALSE,
+                struct annoRow *rowOut = annoRowWigVecNew(rowIn->chrom, start, thisEnd, FALSE,
                                                           vector + offset, callerLm);
-	    slAddHead(pOutList, rowOut);
+                slAddHead(&rowOutList, rowOut);
                 }
             else
                 {
                 // glom new data onto headRow
                 if (thisEnd < headRow->end)
-                errAbort("Expected thisEnd (%u) to be greater than or equal to headRow->end (%u)",
+                    errAbort("Expected thisEnd (%u) to be greater than or equal to "
+                             "headRow->end (%u)",
                              thisEnd, headRow->end);
                 uint oldEnd = headRow->end;
                 uint oldLen = oldEnd - headRow->start;
                 uint newLen = thisEnd - headRow->start;
+                //#*** TODO: precompute how many bytes we will need, so we don't have to realloc!!
                 headRow->data = lmAllocMoreMem(callerLm, headRow->data, oldLen*sizeof(vector[0]),
                                                newLen*sizeof(vector[0]));
                 headRow->end = thisEnd;
                 float *newData = (float *)rowIn->data + (oldEnd - rowIn->start);
                 float *newSpace = (float *)headRow->data + oldLen;
                 CopyArray(newData, newSpace, (thisEnd - oldEnd));
                 }
             start = thisEnd;
             }
         }
     }
+slReverse(&rowOutList);
+return rowOutList;
+}
+
+static struct annoRow *averageRows(struct annoRow *rowList, uint primaryStart, uint primaryEnd,
+                                   struct lm *callerLm)
+/* Return an annoRow with the average value of floats across all row->data (within primary item's
+ * region), ignoring missing data and NANs.  Return NULL if rowList is NULL or if all bases
+ * overlapping the primary item are NAN (unlikely, but could happen). */
+{
+if (rowList == NULL)
+    return NULL;
+int count = 0;
+double sum = 0.0;
+uint start = max(rowList->start, primaryStart);
+uint end = start;
+struct annoRow *row;
+for (row = rowList;  row != NULL;  row = row->next)
+    {
+    uint rowStart = max(row->start, primaryStart);
+    uint rowEnd = min(row->end, primaryEnd);
+    if (rowEnd > rowStart)
+        {
+        float *vector = row->data;
+        int iStart = rowStart - row->start;
+        int iEnd = rowEnd - row->start;
+        int i;
+        for (i = iStart;  i < iEnd;  i++)
+            {
+            // skip NAN values so they don't convert sum to a NAN:
+            if (! isnan(vector[i]))
+                {
+                sum += vector[i];
+                count++;
+                }
+            }
+        end = rowEnd;
+        }
+    }
+if (count > 0)
+    {
+    double average = (sum / (double)count);
+    return annoRowWigSingleNew(rowList->chrom, start, end, FALSE, average, callerLm);
+    }
+return NULL;
+}
 
 static struct annoRow *agwIntegrate(struct annoGrator *gSelf, struct annoStreamRows *primaryData,
 				    boolean *retRJFilterFailed, struct lm *callerLm)
 /* Return wig annoRows that overlap primaryRow position, with NANs weeded out. */
 {
 struct annoGrateWig *self = (struct annoGrateWig *)gSelf;
 // Cleanup internal lm every so often:
 if (self->lmRowCount >= 4096)
     {
     lmCleanup(&(self->lm));
     self->lmRowCount = 0;
     }
 if (self->lm == NULL)
     self->lm = lmInit(0);
 struct annoRow *rowsIn = annoGratorIntegrate(self->mySource, primaryData, retRJFilterFailed,
 					     self->lm);
 self->lmRowCount += slCount(rowsIn);
 if (retRJFilterFailed && *retRJFilterFailed)
     return NULL;
 struct annoRow *primaryRow = primaryData->rowList;
-struct annoRow *rowIn, *rowOutList = NULL;;
-for (rowIn = rowsIn;  rowIn != NULL;  rowIn = rowIn->next)
-    tidyUp(rowIn, &rowOutList, primaryRow->start, primaryRow->end, callerLm);
-slReverse(&rowOutList);
-return rowOutList;
+if (self->mode == agwmAverage)
+    return averageRows(rowsIn, primaryRow->start, primaryRow->end, callerLm);
+else if (self->mode == agwmPerBase)
+    return normalizeRows(rowsIn, primaryRow->start, primaryRow->end, callerLm);
+else
+    errAbort("agwIntegrate: invalid self->mode %d", self->mode);
+return NULL;
 }
 
-struct annoGrator *annoGrateWigNew(struct annoStreamer *wigSource)
-/* Create an annoGrator subclass for source with rowType == arWig. */
+static void appendAverageToValueLabel(struct asObject *asObj)
+/* When we are averaging wiggle values, tweak the "value" column label to remind the user. */
+{
+struct asColumn *col;
+for (col = asObj->columnList;  col != NULL;  col = col->next)
+    {
+    if (sameString(col->name, "value"))
         {
-if (wigSource->rowType != arWig)
-    errAbort("annoGrateWigNew: expected source->rowType arWig (%d), got %d from source %s",
-	     arWig, wigSource->rowType, wigSource->name);
+        freez(&col->name);
+        col->name = cloneString("valueAverage");
+        }
+    }
+}
+
+struct annoGrator *annoGrateWigNew(struct annoStreamer *wigSource, enum annoGrateWigMode mode)
+/* Create an annoGrator subclass for source with wiggle or bigWig data.
+ * If mode is agwmAverage, then this produces at most one annoRow per primary item
+ * with type arWigSingle, containing the average of all overlapping non-NAN values.
+ * If mode is agwmPerBase, then this produces a list of annoRows split by NANs, with type
+ * arWigVec (vector of per-base values). */
+{
+if (wigSource->rowType != arWigVec)
+    errAbort("annoGrateWigNew: expected source->rowType arWigVec (%d), got %d from source %s",
+	     arWigVec, wigSource->rowType, wigSource->name);
 struct annoGrateWig *self;
 AllocVar(self);
 struct annoGrator *gSelf = (struct annoGrator *)self;
 annoGratorInit(gSelf, wigSource);
 gSelf->integrate = agwIntegrate;
 self->mySource = annoGratorNew(wigSource);
+self->mode = mode;
+struct annoStreamer *sSelf = (struct annoStreamer *)gSelf;
+if (mode == agwmAverage)
+    {
+    sSelf->rowType = arWigSingle;
+    appendAverageToValueLabel(sSelf->asObj);
+    }
+else if (mode == agwmPerBase)
+    sSelf->rowType = arWigVec;
+else
+    errAbort("annoGrateWigNew: unrecognized enum annoGrateWigMode %d", mode);
 return gSelf;
 }
 
-struct annoGrator *annoGrateBigWigNew(char *fileOrUrl, struct annoAssembly *aa)
-/* Create an annoGrator subclass for bigWig file or URL. */
+struct annoGrator *annoGrateBigWigNew(char *fileOrUrl, struct annoAssembly *aa,
+                                      enum annoGrateWigMode mode)
+/* Create an annoGrator subclass for bigWig file or URL.  See above description of mode. */
 {
 struct annoStreamer *bigWigSource = annoStreamBigWigNew(fileOrUrl, aa);
-return annoGrateWigNew(bigWigSource);
+return annoGrateWigNew(bigWigSource, mode);
 }