5f443dee868d091ae2523a1b6f9b4e6ddc90b3d0
kent
  Tue Jan 5 15:37:01 2021 -0800
Freen plays with colors for new single cell RNA tracks, and generates many shades of mostly brown.

diff --git src/hg/oneShot/freen/freen.c src/hg/oneShot/freen/freen.c
index 745564a..8905370 100644
--- src/hg/oneShot/freen/freen.c
+++ src/hg/oneShot/freen/freen.c
@@ -1,128 +1,205 @@
 /* freen - My Pet Freen.  A pet freen is actually more dangerous than a wild one. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "cheapcgi.h"
 #include "jksql.h"
 #include "portable.h"
 #include "obscure.h"
 #include "localmem.h"
 #include "csv.h"
-#include "tokenizer.h"
-#include "strex.h"
-#include "hmac.h"
-#include "hdf5.h"
+#include "memgfx.h"
+#include "correlate.h"
+#include "vMatrix.h"
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 void usage()
 {
 errAbort("freen - test some hairbrained thing.\n"
-         "usage:  freen input\n");
+         "usage:  freen refMatrix.tsv sampleMatrix.tsv outColors.tsv\n");
 }
 
-#define FULLDATASET         "obs/__categories/cell_type"
-#define DIM0            4
+double columnVectorLength(struct memMatrix *m, int column)
+{
+double **rows = m->rows;
+int ySize = m->ySize;
+int y;
+double sumSquares = 0.0;
+for (y=0; y<ySize; ++y)
+    {
+    double val = rows[y][column];
+    sumSquares += val*val;
+    }
+return sqrt(sumSquares);
+}
 
-hid_t  h5dOpen(hid_t parent, char *name)
-/* Open up a subpart or die trying */
+void normalizeColumn(struct memMatrix *m, int column)
+/* Normalize a column in matrix so that as a vector it has length 1 */
+{
+double **rows = m->rows;
+double length = columnVectorLength(m, column);
+if (length > 0.0)
     {
-hid_t ret = H5Dopen(parent, name, H5P_DEFAULT);
-if (ret < 0)
-   errAbort("Couldn't find %s in parent", name);
-return ret;
+    int ySize = m->ySize;
+    int y;
+    double invLength = 1/length;
+    for (y=0; y<ySize; ++y)
+	rows[y][column] *= invLength;
+    }
+verbose(2, "oldLength %g, newLength %g\n", length, columnVectorLength(m, column));
 }
 
-hid_t h5fOpen(char *fileName,  int accessType)
+void normalizeColumns(struct memMatrix *m)
+/* Normalize columns so that they have Euclidian length one as a vector */
 {
-hid_t ret = H5Fopen(fileName, accessType, H5P_DEFAULT);
-if (ret < 0)
-   errAbort("Couldn't open %s", fileName);
-return ret;
+int x;
+for (x=0; x<m->xSize; ++x)
+    normalizeColumn(m, x);
 }
 
-void freen(char *fileName)
-/* Test something */
+double refDistance(struct memMatrix *refMatrix, struct hash *refHash, int refCol,
+    struct memMatrix *qMatrix, int qCol)
+/* Return a dotProduct between a column in a reference matrix and a query matrix.
+ * The reference matrix has a row hash so that we can look up the corresponding row
+ * based on the label in the qMatrix row.  If somethign is missing from refMatrix
+ * that's ok, it just won't contribute to the dot product. */
+{
+int i;
+double sumSquares = 0.0;
+for (i=0; i<qMatrix->ySize; ++i)
     {
+    char *label = qMatrix->yLabels[i];
+    double *row = hashFindVal(refHash, label);
+    if (row != NULL)
+	{
+	double d = row[refCol] - qMatrix->rows[i][qCol];
+	sumSquares += d*d;
+	}
+    }
+return sqrt(sumSquares);
+}
 
-/*
- * Open file, dataset, and attribute.
- */
-hid_t file = h5fOpen (fileName, H5F_ACC_RDONLY);
-hid_t dset = h5dOpen (file, "obs/__categories");
-
-/*
- * Get the datatype.
- */
-hid_t filetype = H5Dget_type (dset);
-uglyf("filetype = %d\n", filetype);
-
-/*
- * Get dataspace and allocate memory for read buffer.
- */
-hid_t space = H5Dget_space (dset);
-hsize_t     dims[1] = {DIM0};
-int ndims = H5Sget_simple_extent_dims (space, dims, NULL);
-uglyf("Got %d ndims\n", ndims);
-char **rdata = (char **) malloc (dims[0] * sizeof (char *));
-
-/*
- * Create the memory datatype.
- */
-hid_t memtype = H5Tcopy (H5T_C_S1);
-herr_t status = H5Tset_size (memtype, H5T_VARIABLE);
-status = H5Tset_cset(memtype, H5T_CSET_UTF8) ;
-
-/*
- * Read the data.
- */
-status = H5Dread (dset, memtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, rdata);
-if (status < 0)
-    errAbort("status %d, rats", status);
-
-/*
- * Output the data to the screen.
- */
+double refCorrelate(struct memMatrix *refMatrix, struct hash *refHash, int refCol,
+    struct memMatrix *qMatrix, int qCol)
+/* Return a dotProduct between a column in a reference matrix and a query matrix.
+ * The reference matrix has a row hash so that we can look up the corresponding row
+ * based on the label in the qMatrix row.  If somethign is missing from refMatrix
+ * that's ok, it just won't contribute to the dot product. */
+{
+struct correlate *c = correlateNew();
 int i;
-for (i=0; i<dims[0]; i++)
-    printf ("%s[%d]: %s\n", "cell_type", i, rdata[i]);
+for (i=0; i<qMatrix->ySize; ++i)
+    {
+    char *label = qMatrix->yLabels[i];
+    double *row = hashFindVal(refHash, label);
+    if (row != NULL)
+	correlateNext(c, row[refCol], qMatrix->rows[i][qCol]);
+    }
+double result = correlateResult(c);
+correlateFree(&c);
+return result;
+}
 
+struct rgbColor colors[5] = {{0x33,0x33,0xFF},{ 0xCC,0x00,0xCC},{ 0xFF,0x11,0x11},{ 0x00,0xB0,0x00},{ 0xF0,0x99,0x00}};
 
-/* Close the dataset. */
-/*
- * Close and release resources.  Note that H5Dvlen_reclaim works
- * for variable-length strings as well as variable-length arrays.
- * Also note that we must still free the array of pointers stored
- * in rdata, as H5Tvlen_reclaim only frees the data these point to.
- */
-status = H5Dvlen_reclaim (memtype, space, H5P_DEFAULT, rdata);
-free (rdata);
-status = H5Dclose (dset);
-status = H5Sclose (space);
-status = H5Tclose (filetype);
-status = H5Tclose (memtype);
-status = H5Fclose (file);
-if (status < 0)
-   warn("status %d", status);
+void freen(char *refMatrixTsv, char *sampleMatrixTsv, char *colorsOut)
+/* Test something */
+{
+struct memMatrix *refMatrix = memMatrixFromTsv(refMatrixTsv);
+verbose(1, "%s is %d x %d\n", refMatrixTsv, refMatrix->xSize, refMatrix->ySize);
+normalizeColumns(refMatrix);
+struct memMatrix *sampleMatrix = memMatrixFromTsv(sampleMatrixTsv);
+verbose(1, "%s is %d x %d\n", sampleMatrixTsv, sampleMatrix->xSize, sampleMatrix->ySize);
+normalizeColumns(sampleMatrix);
+
+/* Strip dots off of gene names in sample matrix */
+int i;
+for (i=0; i<sampleMatrix->ySize; ++i)
+    chopSuffix(sampleMatrix->yLabels[i]);
+
+struct hash *refRowHash = hashNew(0);
+for (i=0; i<refMatrix->ySize; ++i)
+    hashAdd(refRowHash, refMatrix->yLabels[i], refMatrix->rows[i]);
+
+FILE *f = mustOpen(colorsOut, "w");
+fprintf(f, "#sample");
+for (i=0; i<refMatrix->xSize; ++i)
+     fprintf(f, "\t%s", refMatrix->xLabels[i]);
+fprintf(f, "\tcolor\n");
+int sampleIx;
+for (sampleIx=0; sampleIx<sampleMatrix->xSize; ++sampleIx)
+     {
+     /* Print label */
+     char *sampleLabel = sampleMatrix->xLabels[sampleIx];
+     fprintf(f, "%s", sampleLabel);
+
+     /* Figure out distances from sample to each reference and save min/max distance */
+     int refIx;
+     double minDistance = 3.0;	// 2 is as big as it gets
+     double maxDistance = 0;
+     double distances [refMatrix->xSize];
+     for (refIx=0; refIx < refMatrix->xSize; ++refIx)
+         {
+	 double distance = refDistance(refMatrix, refRowHash, refIx,  sampleMatrix, sampleIx);
+	 if (distance < minDistance)
+	     minDistance = distance;
+	 if (distance > maxDistance)
+	     maxDistance = distance;
+	 distances[refIx] = distance;
+	 }
+
+     minDistance *= 0.75;
+     for (refIx = 0; refIx < refMatrix->xSize; ++refIx)
+	 distances[refIx] -= minDistance - 0.01;
+
+     double totalForce = 0;
+     double forces[refMatrix->xSize];
+     for (refIx=0; refIx < refMatrix->xSize; ++refIx)
+         {
+	 double distance = distances[refIx];
+	 double force = 1.0/(distance*distance);
+	 fprintf(f, "\t%g", force);
+	 totalForce += force;
+	 forces[refIx] = force;
+	 }
+
+     /* Figure out RGB values */
+     double r=0, g=0, b=0;
+     if (totalForce > 0)
+	 {
+	 for (refIx=0; refIx < refMatrix->xSize; ++refIx)
+	     {
+	     double normForce = forces[refIx]/totalForce;
+	     r += colors[refIx].r * normForce;
+	     g += colors[refIx].g * normForce;
+	     b += colors[refIx].b * normForce;
+	     }
+	 }
+     else
+          r = g = b = 0x70;
+     fprintf(f, "\t#%02x%02x%02x\n", round(r), round(g), round(b));
+     }
+carefulClose(&f);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
-if (argc != 2)
+if (argc != 4)
     usage();
-freen(argv[1]);
+freen(argv[1], argv[2], argv[3]);
 return 0;
 }