ba9c4b8c144df33072d58154549b7cf531d15bde
kate
  Tue May 3 09:14:22 2016 -0700
Libifying GTEx boxplot for use by hgGene (as well as hgc). refs #17244

diff --git src/hg/lib/gtexBoxplot.c src/hg/lib/gtexBoxplot.c
new file mode 100644
index 0000000..2f6909f
--- /dev/null
+++ src/hg/lib/gtexBoxplot.c
@@ -0,0 +1,160 @@
+/* Create a PNG file with boxplot of gene expression 
+ *      for GTEx (Genotype Tissue Expression) data. */
+
+/* Copyright (C) 2016 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
+#include "common.h"
+#include "hash.h"
+#include "trashDir.h"
+#include "gtexInfo.h"
+#include "gtexTissue.h"
+#include "gtexSampleData.h"
+
+struct tissueSampleVals
+/* RPKM expression values for multiple samples */
+/* (Data structure shared by different implementations of boxplot (e.g. C, JS),
+ * later abandoned) */
+    {
+    struct tissueSampleVals *next;
+    char *name;         /* GTEx tissue name */
+    char *description;  /* GTEx tissue description */
+    int color;          /* GTEx tissue color */
+    int count;          /* number of samples */
+    double *vals;       /* RPKM values */
+    double min, max;    /* minimum, maximum value */
+    double q1, median, q3;      /* quartiles */
+    struct slDouble *valList;   /* used to create val array */
+    };
+
+static struct tissueSampleVals *getTissueSampleVals(char *geneId, boolean doLogTransform,
+                                                char *version, double *maxValRet)
+/* Get sample data for the gene.  Optionally log10 it. Return maximum value seen */
+{
+struct hash *tsHash = hashNew(0);
+struct tissueSampleVals *tsv;
+struct hashEl *hel;
+struct slDouble *val;
+double maxVal = 0;
+struct gtexSampleData *sd = NULL;
+char query[256];
+char **row;
+char buf[256];
+char *sampleDataTable = "gtexSampleData";
+safef(buf, sizeof(buf), "%s%s", sampleDataTable, gtexVersionSuffixFromVersion(version));
+struct sqlConnection *conn = hAllocConn("hgFixed");
+assert(sqlTableExists(conn, buf));
+sqlSafef(query, sizeof(query), "select * from %s where geneId='%s'", buf, geneId);
+struct sqlResult *sr = sqlGetResult(conn, query);
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    sd = gtexSampleDataLoad(row);
+    if ((hel = hashLookup(tsHash, sd->tissue)) == NULL)
+        {
+        AllocVar(tsv);
+        hashAdd(tsHash, sd->tissue, tsv);
+        }
+    else
+        tsv = (struct tissueSampleVals *)hel->val;
+    maxVal = max(maxVal, sd->score);
+    val = slDoubleNew(sd->score);
+    slAddHead(&tsv->valList, val);
+    }
+/*  Fill in tissue descriptions, fill values array and calculate stats for plotting
+        Then make a list, suitable for sorting by tissue or score
+    NOTE: Most of this not needed for R implementation */
+struct gtexTissue *tis = NULL, *tissues = gtexGetTissues(version);
+struct tissueSampleVals *tsList = NULL;
+int i;
+if (doLogTransform)
+    maxVal = log10(maxVal+1.0);
+for (tis = tissues; tis != NULL; tis = tis->next)
+    {
+    tsv = hashFindVal(tsHash, tis->name);
+    if (tsv == NULL)
+        {
+        /* no non-zero values for this tissue/gene */
+        AllocVar(tsv);
+        val = slDoubleNew(0.0);
+        slAddHead(&tsv->valList, val);
+        }
+    tsv->name = tis->name;
+    tsv->description = tis->description;
+    tsv->color = tis->color;
+    int count = tsv->count = slCount(tsv->valList);
+    double *vals = AllocArray(tsv->vals, count);
+    for (i=0; i<count; i++)
+        {
+        val = slPopHead(&tsv->valList);
+        if (doLogTransform)
+            vals[i] = log10(val->val+1.0);
+        else
+            vals[i] = val->val;
+        }
+    doubleBoxWhiskerCalc(tsv->count, tsv->vals, 
+                                &tsv->min, &tsv->q1, &tsv->median, &tsv->q3, &tsv->max);
+    slAddHead(&tsList, tsv);
+    }
+if (maxValRet != NULL)
+    *maxValRet = maxVal;
+return tsList;
+}
+
+/********************************************************/
+/* R implementation of GTEx boxplot.  This invokes R interpreter on an R script */
+
+static boolean drawGtexRBoxplot(char *geneName, struct tissueSampleVals *tsvList,
+                        boolean doLogTransform, char *version, struct tempName *pngTn)
+/* Draw a box-and-whiskers plot from GTEx sample data, using R boxplot */
+{
+/* Create R data frame.  This is a tab-sep file, one row per sample, 
+ * with columns for sample, tissue, rpkm */
+struct tempName dfTn;
+trashDirFile(&dfTn, "hgc", "gtexGene", ".df.txt");
+FILE *f = fopen(dfTn.forCgi, "w");
+if (f == NULL)
+    errAbort("can't create temp file %s", dfTn.forCgi);
+fprintf(f, "sample\ttissue\trpkm\n");
+struct tissueSampleVals *tsv;
+int sampleId=1;
+int i;
+for (tsv = tsvList; tsv != NULL; tsv = tsv->next)
+    {
+    int count = tsv->count;
+    // remove trailing parenthesized phrases as not worth label length
+    chopSuffixAt(tsv->description, '(');
+    for (i=0; i<count; i++)
+        fprintf(f, "%d\t%s\t%0.3f\n", sampleId++, tsv->description, tsv->vals[i]);
+    }
+fclose(f);
+
+// Plot to PNG file
+if (!pngTn)
+    return FALSE;
+trashDirFile(pngTn, "hgc", "gtexGene", ".png");
+char cmd[256];
+
+/* Exec R in quiet mode, without reading/saving environment or workspace */
+safef(cmd, sizeof(cmd), "Rscript --vanilla --slave hgcData/gtexBoxplot.R %s %s %s %s %s %s",  
+                                geneName, dfTn.forCgi, pngTn->forHtml, 
+                                doLogTransform ? "log=TRUE" : "log=FALSE", "order=alpha", version);
+//NOTE: use "order=score" to order bargraph by median RPKM, descending
+
+int ret = system(cmd);
+if (ret == 0)
+    return TRUE;
+return FALSE;
+}
+
+boolean gtexGeneBoxplot(char *geneId, char *geneName, char *version, 
+                                boolean doLogTransform, struct tempName *pngTn)
+/* Create a png temp file with boxplot of GTEx expression values for this gene. 
+ * GeneId is the Ensembl gene ID.  GeneName is the HUGO name, used for graph title;
+ * If NULL, label with the Ensembl gene ID */
+{
+struct tissueSampleVals *tsvs;
+tsvs  = getTissueSampleVals(geneId, doLogTransform, version, NULL);
+char *label = geneName ? geneName : geneId;
+return drawGtexRBoxplot(label, tsvs, doLogTransform, version, pngTn);
+}
+