aaf7309dce63a05d18d0e639246fd3fa1f030d47
kate
  Wed Apr 12 14:11:02 2017 -0700
Implement boxplot on details version for hub version of track. refs #18736

diff --git src/hg/hgc/barChartClick.c src/hg/hgc/barChartClick.c
index 62c46fa..0465037 100644
--- src/hg/hgc/barChartClick.c
+++ src/hg/hgc/barChartClick.c
@@ -5,156 +5,288 @@
 
 #include "common.h"
 #include "hash.h"
 #include "hdb.h"
 #include "hvGfx.h"
 #include "trashDir.h"
 #include "hgc.h"
 #include "hCommon.h"
 
 #include "barChartBed.h"
 #include "barChartCategory.h"
 #include "barChartData.h"
 #include "barChartSample.h"
 #include "barChartUi.h"
 
-// TODO: Consider moving these to lib/{barChartBed,bigBarChart}.c
+struct barChartItemData
+/* One sample's measured value at a locus, together with that sample's category.
+ * A list of these is what the barChart track details page turns into a boxplot. */
+    {
+    struct barChartItemData *next;  /* Next in singly linked list. */
+    char *sample;	/* Sample identifier */
+    char *category;     /* Sample category (from barChartSample table or barChartSampleUrl file) */
+    double value;	/* Measured value (e.g. expression level) */
+    };
+
+static struct hash *getTrackCategories(struct trackDb *tdb)
+/* Return hash keyed by the category names in the track's BAR_CHART_CATEGORY_LABELS
+ * setting.  This may be a subset of those in matrix (though maybe better to prune
+ * matrix for performance).  An absent setting yields an empty hash. */
+{
+struct hash *categoryHash = hashNew(0);
+char *categs = trackDbSetting(tdb, BAR_CHART_CATEGORY_LABELS);
+if (categs == NULL)
+    return categoryHash;
+char *words[BAR_CHART_MAX_CATEGORIES];
+// Chop a clone so the trackDb setting string itself is left untouched
+int i, wordCt = chopLine(cloneString(categs), words);
+for (i=0; i<wordCt; i++)
+    hashStore(categoryHash, words[i]);
+return categoryHash;
+}
 
-static struct bed *getBarChartFromFile(char *item, char *chrom, int start, int end, 
+static struct barChartBed *getBarChartFromFile(char *item, char *chrom, int start, int end, 
                                                 char *file)
 /* Retrieve barChart BED item from big file */
 {
 struct bbiFile *bbi = bigBedFileOpen(file);
 struct lm *lm = lmInit(0);
 struct bigBedInterval *bb, *bbList =  bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
 for (bb = bbList; bb != NULL; bb = bb->next)
     {
     char startBuf[16], endBuf[16];
-    char *bedRow[32];
-    bigBedIntervalToRow(bb, chrom, startBuf, endBuf, bedRow, ArraySize(bedRow));
-    struct bed *barChart = barChartSimpleBedLoad(bedRow);
+    char *row[32];      // field strings for this interval, as set by bigBedIntervalToRow
+    bigBedIntervalToRow(bb, chrom, startBuf, endBuf, row, ArraySize(row));
+    struct barChartBed *barChart = barChartBedLoadOptionalOffsets(row, TRUE);
     if (sameString(barChart->name, item))
         return barChart;
     }
 return NULL;
 }
 
-static struct bed *getBarChartFromTable(char *item, char *chrom, int start, int end, 
+static struct barChartBed *getBarChartFromTable(char *item, char *chrom, int start, int end, 
                                                 char *table)
 /* Retrieve barChart BED item from track table */
 {
-struct bed *barChart = NULL;
+struct barChartBed *barChart = NULL;
 struct sqlConnection *conn = hAllocConn(database);
 char **row;
 char query[512];
 struct sqlResult *sr;
 if (sqlTableExists(conn, table))
     {
     sqlSafef(query, sizeof query, 
                 "SELECT * FROM %s WHERE name='%s'"
                     "AND chrom='%s' AND chromStart=%d AND chromEnd=%d", 
                                 table, item, chrom, start, end);
     sr = sqlGetResult(conn, query);
     row = sqlNextRow(sr);
+    // TODO: Fix or retire -- the offset-column check below is currently disabled
+    /*
+    boolean hasOffsets = sqlColumnExists(conn, table, BARCHART_OFFSET_COLUMN);
+    */
+
     if (row != NULL)
         {
-        barChart = barChartSimpleBedLoad(row);
+        // TODO: Fix or retire -- table rows are always loaded with the offsets flag FALSE
+        barChart = barChartBedLoadOptionalOffsets(row, FALSE);
         }
     sqlFreeResult(&sr);
     }
 hFreeConn(&conn);
 return barChart;
 }
 
-static struct bed *getBarChart(char *item, char *chrom, int start, int end, 
+static struct barChartBed *getBarChart(char *item, char *chrom, int start, int end, 
                                         struct trackDb *tdb)
 /* Retrieve barChart BED item from track */
 {
-struct bed *barChart = NULL;
+struct barChartBed *barChart = NULL;    // a NULL result means the item was not found
 char *file = trackDbSetting(tdb, "bigDataUrl");
 if (file != NULL)
     barChart = getBarChartFromFile(item, chrom, start, end, file);
 else
     barChart = getBarChartFromTable(item, chrom, start, end, tdb->table);
 return barChart;
 }
 
-static struct barChartData *getSampleVals(char *track, char *item)
-/* Get data values for this item (locus) from all samples */
+static struct barChartItemData *getSampleValsFromFile(struct trackDb *tdb, 
+                                                        struct hash *categoryHash,
+                                                        struct barChartBed *bed)
+/* Get data values for this item (locus) from the barChartDataUrl matrix file, using the
+ * item's _dataOffset/_dataLen to locate its row.  Keeps only samples whose category
+ * (from the barChartSampleUrl file) is in categoryHash.  Returns NULL if none qualify. */
 {
-char table[256];
-safef(table, sizeof(table), "%s%s", track, "Data");
-struct sqlConnection *conn = hAllocConn(database);
-if (!sqlTableExists(conn, table))
+// Get sample categories from sample file (format: id, category, extras)
+char *url = trackDbSetting(tdb, "barChartSampleUrl");
+struct lineFile *lf = udcWrapShortLineFile(url, NULL, 0);
+struct hash *sampleHash = hashNew(0);
+char *words[2];         // only id and category are used
+int sampleCt = 0;
+while (lineFileChopNext(lf, words, ArraySize(words)))   // limit is a word count, not bytes
     {
-    hFreeConn(&conn);
-    conn = hAllocConn("hgFixed");
-    if (!sqlTableExists(conn, table))
+    hashAdd(sampleHash, words[0], words[1]);
+    sampleCt++;
+    }
+lineFileClose(&lf);
+
+// Open matrix file
+url = trackDbSetting(tdb, "barChartDataUrl");
+struct udcFile *f = udcFileOpen(url, NULL);
+
+// Get header line with sample ids
+char *header = udcReadLine(f);
+int wordCt = sampleCt+1;        // initial field is label or empty 
+char **samples;
+AllocArray(samples, wordCt);
+
+// Get this item's row of data values: a leading field, then one value per header sample
+// TODO: check data types for offset & len
+bits64 offset = (bits64)bed->_dataOffset, size = (bits64)bed->_dataLen;
+udcSeek(f, offset);
+bits64 seek = udcTell(f);
+if (seek != offset)
+    warn("UDC seek mismatch: expecting %llx, got %llx. ", offset, seek);
+char *buf = needMem(size+1);    // spare zeroed byte terminates the row for chopByWhite
+bits64 count = udcRead(f, buf, size);
+if (count != size)
+    warn("UDC read mismatch: expecting %llu bytes, got %llu. ", size, count);
+char **vals;
+AllocArray(vals, wordCt);
+udcFileClose(&f);
+if (header == NULL || chopByWhite(header, samples, wordCt) < wordCt
+        || chopByWhite(buf, vals, wordCt) < wordCt)
+    errAbort("barChart track %s: matrix file has too few columns for samples", tdb->track);
+
+// Construct list of sample data with category
+struct barChartItemData *sampleVals = NULL, *data = NULL;
+int i;
+for (i=1; i<wordCt; i++)
     {
-        hFreeConn(&conn);
-        return NULL;
+    char *sample = samples[i];
+    char *categ = (char *)hashFindVal(sampleHash, sample);
+    if (categ == NULL)
+        warn("barChart track %s: unknown category for sample %s", tdb->track, sample);
+    else if (hashLookup(categoryHash, categ))
+        {
+        AllocVar(data);
+        // TODO: try w/o clone
+        data->sample = cloneString(sample);
+        data->category = cloneString(categ);
+        data->value = sqlDouble(vals[i]);
+        slAddHead(&sampleVals, data);
         }
     }
-struct barChartData *vals = barChartDataLoadForLocus(conn, table, item);
-hFreeConn(&conn);
-return vals;
+return sampleVals;
 }
 
-static char *makeDataFrame(char *track, struct barChartData *vals)
-/* Create R data frame from sample data.  This is a tab-sep file, one row per sample.
-   Return filename. */
+static struct sqlConnection *getConnectionAndTable(struct trackDb *tdb, char *suffix,
+                                                         char **retTable)
+/* Look for <table><suffix> in database, then hgFixed, and return connection (NULL if in neither) */
 {
-// Get category for samples 
 char table[256];
-safef(table, sizeof(table), "%s%s", track, "Sample");
+assert(retTable);
+safef(table, sizeof(table), "%s%s", tdb->table, suffix);
+*retTable = cloneString(table);         // set even when NULL is returned below
 struct sqlConnection *conn = hAllocConn(database);
 if (!sqlTableExists(conn, table))
     {
     hFreeConn(&conn);
     conn = hAllocConn("hgFixed");
     if (!sqlTableExists(conn, table))
         {
         hFreeConn(&conn);
         return NULL;
         }
     }
+return conn;    // callers in this file release it with hFreeConn
+}
+
+static struct barChartItemData *getSampleValsFromTable(struct trackDb *tdb, 
+                                                        struct hash *categoryHash,
+                                                        struct barChartBed *bed)
+/* Get data values for this item (locus) from all samples; NULL if Data or Sample table missing */
+// TODO: Consider retiring table-based version.  Use files instead, like hub version
+{
+char *table = NULL;
+struct sqlConnection *conn = getConnectionAndTable(tdb, "Data", &table);
+if (conn == NULL)
+    return NULL;        // no <table>Data in database or hgFixed, so nothing to plot
+struct barChartData *val, *vals = barChartDataLoadForLocus(conn, table, bed->name);
+hFreeConn(&conn);
+conn = getConnectionAndTable(tdb, "Sample", &table);    // table giving category for samples
+if (conn == NULL)
+    return NULL;        // values can't be categorized without <table>Sample
 char query[512];
 sqlSafef(query, sizeof(query), "SELECT * FROM %s", table);
 struct barChartSample *sample, *samples = barChartSampleLoadByQuery(conn, query);
 hFreeConn(&conn);
 struct hash *sampleHash = hashNew(0);
 for (sample = samples; sample != NULL; sample = sample->next)
     {
     hashAdd(sampleHash, sample->sample, sample);
     }
 
+struct barChartItemData *sampleVals = NULL, *data = NULL;  // list of sample data with category
+for (val = vals; val != NULL; val = val->next)
+    {
+    struct barChartSample *sample = hashFindVal(sampleHash, val->sample);
+    if (sample == NULL)
+        warn("barChart track %s: unknown category for sample %s", tdb->track, val->sample);
+    else if (hashLookup(categoryHash, sample->category))
+        {
+        AllocVar(data);
+        data->sample = cloneString(val->sample);
+        data->category = cloneString(sample->category);
+        data->value = val->value;
+        slAddHead(&sampleVals, data);
+        }
+    }
+return sampleVals;
+}
+
+static struct barChartItemData *getSampleVals(struct trackDb *tdb, struct barChartBed *chartItem)
+/* Get per-sample data values for this item (locus), restricted to the track's categories.
+ * Reads the matrix file if the barChartDataUrl setting is present, else the track tables. */
+{
+char *dataUrl = trackDbSetting(tdb, "barChartDataUrl");
+struct hash *trackCategories = getTrackCategories(tdb);
+
+// A track that names a data file is read from files; otherwise fall back to tables
+if (dataUrl == NULL)
+    return getSampleValsFromTable(tdb, trackCategories, chartItem);
+return getSampleValsFromFile(tdb, trackCategories, chartItem);
+}
+
+static char *makeDataFrame(char *track, struct barChartItemData *vals)
+/* Create R data frame from sample data: a tab-sep temp file with a header line and one
+   row per sample.  Return cloned filename.  Aborts if the file can't be created. */
+{
+// NOTE: 'track' is not used in this version of the function
 // Create data frame with columns for sample, category, value */
 struct tempName dfTn;
 trashDirFile(&dfTn, "hgc", "barChart", ".df.txt");
 FILE *f = fopen(dfTn.forCgi, "w");
 if (f == NULL)
     errAbort("can't create temp file %s", dfTn.forCgi);
 fprintf(f, "sample\tcategory\tvalue\n");
 int i = 0;
-struct barChartData *val;
+struct barChartItemData *val;
 for (val = vals; val != NULL; val = val->next)
     {
-    struct barChartSample *sample = hashFindVal(sampleHash, val->sample);
-    if (sample == NULL)
-        warn("barChart track %s: unknown category for sample %s", track, val->sample);
-    else
-        fprintf(f, "%d\t%s\t%0.3f\n", i++, sample->category, val->value);
+    // NOTE: sample ID isn't needed in the file -- a running int serves as the unique row id
+    fprintf(f, "%d\t%s\t%0.3f\n", i++, val->category, val->value);
     }
 fclose(f);
 return cloneString(dfTn.forCgi);
 }
 
 char *makeColorFile(struct trackDb *tdb)
 /* Make a file with category + color */
 {
 struct tempName colorTn;
 trashDirFile(&colorTn, "hgc", "barChartColors", ".txt");
 FILE *f = fopen(colorTn.forCgi, "w");
 if (f == NULL)
     errAbort("can't create temp file %s", colorTn.forCgi);
 struct barChartCategory *categs = barChartUiGetCategories(database, tdb);
 struct barChartCategory *categ;
@@ -176,45 +308,45 @@
 
 /* Exec R in quiet mode, without reading/saving environment or workspace */
 char cmd[256];
 safef(cmd, sizeof(cmd), "Rscript --vanilla --slave hgcData/barChartBoxplot.R %s %s %s %s %s",
                                 item, units, colorFile, df, pngTn.forHtml);
 int ret = system(cmd);
 if (ret == 0)
     printf("<img src = \"%s\" border=1><br>\n", pngTn.forHtml);
 }
 
 void doBarChartDetails(struct trackDb *tdb, char *item)
 /* Details of barChart item */
 {
 int start = cartInt(cart, "o");
 int end = cartInt(cart, "t");
-struct bed *chartItem = getBarChart(item, seqName, start, end, tdb);
+struct barChartBed *chartItem = getBarChart(item, seqName, start, end, tdb);
 if (chartItem == NULL)
-    errAbort("Can't find item %s in barChart table %s\n", item, tdb->table);
+    errAbort("Can't find item %s in barChart table/file %s\n", item, tdb->table);
 
 genericHeader(tdb, item);
 int categId;
 float highLevel = barChartMaxValue(chartItem, &categId);
 char *units = trackDbSettingClosestToHomeOrDefault(tdb, BAR_CHART_UNIT, "");
 printf("<b>Maximum value: </b> %0.2f %s in %s<br>\n", 
                 highLevel, units, barChartUiGetCategoryLabelById(categId, database, tdb));
 printf("<b>Total all values: </b> %0.2f<br>\n", barChartTotalValue(chartItem));
 printf("<b>Score: </b> %d<br>\n", chartItem->score); 
 printf("<b>Genomic position: "
                 "</b>%s <a href='%s&db=%s&position=%s%%3A%d-%d'>%s:%d-%d</a><br>\n", 
                     database, hgTracksPathAndSettings(), database, 
                     chartItem->chrom, chartItem->chromStart+1, chartItem->chromEnd,
                     chartItem->chrom, chartItem->chromStart+1, chartItem->chromEnd);
-struct barChartData *vals = getSampleVals(tdb->table, item);
+struct barChartItemData *vals = getSampleVals(tdb, chartItem);  // NULL: no data, skip boxplot
 if (vals != NULL)
     {
     // Print boxplot
     puts("<p>");
     char *df = makeDataFrame(tdb->table, vals);
     char *colorFile = makeColorFile(tdb);
     printBoxplot(df, item, units, colorFile);
     }
 puts("<br>");
 printTrackHtml(tdb);
 }