src/hg/instinct/hgGeneset/hgStats.c 1.4
1.4 2010/01/29 00:34:07 jsanborn
added 2D clustering
Index: src/hg/instinct/hgGeneset/hgStats.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/hgGeneset/hgStats.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/instinct/hgGeneset/hgStats.c 28 Jan 2010 23:45:26 -0000 1.3
+++ src/hg/instinct/hgGeneset/hgStats.c 29 Jan 2010 00:34:07 -0000 1.4
@@ -1,1352 +1,1392 @@
/*
* hgStats -- wrappers that set up hgHeatmap data for cluster statistical routines
*/
#include "common.h"
#include "bed.h"
#include "memgfx.h"
#include "vGfx.h"
#include "cart.h"
#include "errCatch.h"
#include "hash.h"
#include "hdb.h"
#include "web.h"
#include "htmshell.h"
#include "featuresLib.h"
#include "hPrint.h"
#include "trashDir.h"
#include "obscure.h"
#include "hgStats.h"
#define MAX_ROWS 10000 // Maximum number of rows to cluster
#define MAX_COLUMNS 10000 // Maximum number of columns to cluster
#define CLIP(p,limit) if (p < 0) p = 0; if (p >= (limit)) p = (limit)-1;
void getNodeOrderFromTree(Node* tree, int nNodes, const double* order,
double *nodeorder, int *nodecounts, char metric)
{
if (tree == NULL || nodeorder == NULL || nodecounts == NULL)
return;
int i;
if (metric=='e' || metric=='b')
/* Scale all distances such that they are between 0 and 1 */
{
double scale = 0.0;
for (i = 0; i < nNodes; i++)
if (tree[i].distance > scale)
scale = tree[i].distance;
if (scale)
for (i = 0; i < nNodes; i++)
tree[i].distance /= scale;
}
/* Now we join nodes */
for (i = 0; i < nNodes; i++)
{
int min1 = tree[i].left;
int min2 = tree[i].right;
/* min1 and min2 are the elements that are to be joined */
double order1;
double order2;
int counts1;
int counts2;
if (min1 < 0)
{
int index1 = -min1-1;
order1 = nodeorder[index1];
counts1 = nodecounts[index1];
tree[i].distance = max(tree[i].distance, tree[index1].distance);
}
else
{
order1 = order[min1];
counts1 = 1;
}
if (min2 < 0)
{
int index2 = -min2-1;
order2 = nodeorder[index2];
counts2 = nodecounts[index2];
tree[i].distance = max(tree[i].distance, tree[index2].distance);
}
else
{
order2 = order[min2];
counts2 = 1;
}
nodecounts[i] = counts1 + counts2;
nodeorder[i] = (counts1*order1 + counts2*order2) / (counts1 + counts2);
}
}
static void TreeSort(const int nNodes, const int nData, int *dataindex, const double* order,
const double* nodeorder, const int* nodecounts, Node* tree)
{
const int nElements = nNodes + 1;
int i;
double* neworder = needMem(nElements*sizeof(double)); /* initialized to 0.0 */
int* clusterids = needMem(nElements*sizeof(int));
for (i = 0; i < nElements; i++)
clusterids[i] = i;
for (i = 0; i < nNodes; i++)
{
const int i1 = tree[i].left;
const int i2 = tree[i].right;
const double order1 = (i1<0) ? nodeorder[-i1-1] : order[i1];
const double order2 = (i2<0) ? nodeorder[-i2-1] : order[i2];
const int count1 = (i1<0) ? nodecounts[-i1-1] : 1;
const int count2 = (i2<0) ? nodecounts[-i2-1] : 1;
/* If order1 and order2 are equal, their order is determined by
* the order in which they were clustered */
if (i1<i2)
{
const double increase = (order1<order2) ? count1 : count2;
int j;
for (j = 0; j < nElements; j++)
{
const int clusterid = clusterids[j];
if (clusterid==i1 && order1>=order2)
neworder[j] += increase;
if (clusterid==i2 && order1<order2)
neworder[j] += increase;
if (clusterid==i1 || clusterid==i2)
clusterids[j] = -i-1;
}
}
else
{
const double increase = (order1<=order2) ? count1 : count2;
int j;
for (j = 0; j < nElements; j++)
{
const int clusterid = clusterids[j];
if (clusterid==i1 && order1>order2)
neworder[j] += increase;
if (clusterid==i2 && order1<=order2)
neworder[j] += increase;
if (clusterid==i1 || clusterid==i2)
clusterids[j] = -i-1;
}
}
}
freez(&clusterids);
for (i = 0; i < nData; i++)
dataindex[i] = i;
sort(nData, neworder, dataindex);
freez(&neworder);
return;
}
void initializeData(int rows, int columns, struct hash *geneHash, struct slName *genes,
double **data, int **mask, double *arrayweight,
double *geneorder, char **genename)
/*
* Code was adapted from LOAD function found in ../cluster/data.c to prepare
* data for input to treecluster program.
* Copyright (C) 2002 Michiel Jan Laurens de Hoon.
*/
{
if (geneHash == NULL || data == NULL || mask == NULL || arrayweight == NULL ||
geneorder == NULL || genename == NULL)
return;
int column, row;
struct slName *sl;
/* Set up array weights and array order (not used since we're ordering by gene) */
for (column = 0; column < columns; column++)
arrayweight[column] = 1.0;
/* Fill data, make sure mask is set to one for all data so that
* data is used in clustering */
for (row = 0; row < rows; row++)
{
data[row] = needMem(columns*sizeof(double));
mask[row] = needMem(columns*sizeof(int));
}
/* Fill the genename with gene names */
row = 0;
for (sl = genes; sl; sl = sl->next)
{
genename[row] = cloneString(sl->name);
row++;
}
/* Fill the gene weights with the default value of 1.0 */
for (row = 0; row < rows; row++)
geneorder[row] = row;
row = 0;
for (sl = genes; sl; sl = sl->next)
{
struct hashEl *el = hashLookup(geneHash, sl->name);
if (!el)
{
// uglyf("<BR>%s not found<BR>", sl->name);
for (column = 0; column < columns; column++)
{
data[row][column] = 0.0;
mask[row][column] = 0;
}
}
else
{
// uglyf("<BR>%s FOUND<BR>", sl->name);
struct bed *nb = el->val;
for(column = 0; column < nb->expCount; column++)
{
if (column < columns)
{
data[row][column] = nb->expScores[column];
mask[row][column] = 1;
}
}
}
row++;
}
}
void initializeRawData(int rows, int columns,
struct rawData *rdList, struct mapSettings *settings,
double **data, int **mask, double *arrayweight,
- double *geneorder, char **genename)
+ double *geneorder, char **genename,
+ double *sampleorder, char **samplename)
/*
* Code was adapted from LOAD function found in ../cluster/data.c to prepare
* data for input to treecluster program.
* Copyright (C) 2002 Michiel Jan Laurens de Hoon.
*/
{
-if (!rdList || !settings || !mask || !arrayweight || !geneorder || !genename)
+if (!rdList || !settings || !mask || !arrayweight || !geneorder || !genename
+ || !sampleorder || !samplename)
return;
int column, row;
/* Set up array weights and array order (not used since we're ordering by gene) */
for (column = 0; column < columns; column++)
arrayweight[column] = 1.0;
/* Fill data, make sure mask is set to one for all data so that
* data is used in clustering */
for (row = 0; row < rows; row++)
{
data[row] = needMem(columns*sizeof(double));
mask[row] = needMem(columns*sizeof(int));
for (column = 0; column < columns; column++)
{
data[row][column] = 0.0;
mask[row][column] = 0;
}
}
/* Fill the genename with gene names */
struct hashEl *el;
struct hashCookie cookie = hashFirst(settings->y_index);
while ((el = hashNext(&cookie)) != NULL)
{
char *name = el->name;
int val = ptToInt(el->val);
if (val >= rows)
continue;
genename[val] = cloneString(name);
}
+cookie = hashFirst(settings->x_index);
+while ((el = hashNext(&cookie)) != NULL)
+ {
+ char *name = el->name;
+ int val = ptToInt(el->val);
+
+ if (val >= columns)
+ continue;
+ samplename[val] = cloneString(name);
+ }
+
/* Fill the gene weights with the default value of 1.0 */
for (row = 0; row < rows; row++)
geneorder[row] = row;
+for (column = 0; column < columns; column++)
+ sampleorder[column] = column;
+
int prevSampleId = -1;
int prevFeatureId = -1;
char id[128];
struct rawData *rd;
for (rd = rdList; rd; rd = rd->next)
{
// Attempt to reduce number of hash calls by checking if we *just* searched
// for sample or feature id.
if (rd->feature_id != prevFeatureId)
{
safef(id, sizeof (id), "%d", rd->feature_id);
row = hashIntValDefault(settings->y_index, id, -1);
prevFeatureId = rd->feature_id;
}
if (rd->sample_id != prevSampleId)
{
safef(id, sizeof (id), "%d", rd->sample_id);
column = hashIntValDefault(settings->x_index, id, -1);
prevSampleId = rd->sample_id;
}
if (row < 0 || column < 0)
continue;
if (row >= rows || column >= columns)
continue;
data[row][column] = rd->val;
mask[row][column] = 1;
}
}
Node *clusterSamplesByGeneSet(struct hash *geneHash, struct slName *allGenes,
char method, char metric, int transpose, int *nRows, int *nCols)
/* Set up data structures for cluster software, perform hierarchical clustering, and
* return a new sorted list of gene names according to clustering */
{
if (geneHash == NULL || allGenes == NULL)
return NULL;
if (method == 'x') // should never happen ('x' = don't cluster)
return NULL;
struct bed *nb = NULL;
struct slName *sl, *genes = NULL;
struct hashEl *el;
for (sl = allGenes; sl ; sl = sl->next)
{
el = hashLookup(geneHash, sl->name);
if (el != NULL)
{
slNameAddHead(&genes, sl->name);
nb = el->val;
}
}
slReverse(&genes);
if (!nb)
return NULL;
int columns = nb->expCount;
int rows = slCount(genes);
/* Allocate space for array weights */
double *arrayweight = needMem(columns*sizeof(double));
/* Allocate space for data */
double **data = needMem(rows*sizeof(double*));
int **mask = needMem(rows*sizeof(int*));
/* Allocate space for gene quantities */
double *geneorder = needMem(rows*sizeof(double));
int *geneindex = needMem(rows*sizeof(int));
char **genename = needMem(rows*sizeof(char*));
initializeData(rows, columns, geneHash, genes, data, mask, arrayweight, geneorder, genename);
Node* tree = treecluster(rows, columns, data, mask, arrayweight, transpose, metric, method, NULL);
freez(&geneorder);
freez(&geneindex);
freez(&genename);
freez(&arrayweight);
freez(&data);
freez(&mask);
*nRows = rows;
*nCols = columns;
return tree;
}
struct slName *clusterGeneSet(struct hash *geneHash, struct slName *allGenes,
char method, char metric)
/* Set up data structures for cluster software, perform hierarchical clustering, and
* return a new sorted list of gene names according to clustering */
{
if (geneHash == NULL || allGenes == NULL)
return NULL;
if (method == 'x') // should never happen ('x' = don't cluster)
return NULL;
struct bed *nb = NULL;
struct slName *sl, *genes = NULL;
struct hashEl *el;
for (sl = allGenes; sl ; sl = sl->next)
{
el = hashLookup(geneHash, sl->name);
if (el != NULL)
{
slNameAddHead(&genes, sl->name);
nb = el->val;
}
}
slReverse(&genes);
if (nb == NULL)
return NULL;
int columns = nb->expCount;
int rows = slCount(genes);
if (rows > MAX_ROWS || columns > MAX_COLUMNS)
return NULL;
/* Allocate space for array weights */
double *arrayweight = needMem(columns*sizeof(double));
/* Allocate space for data */
double **data = needMem(rows*sizeof(double*));
int **mask = needMem(rows*sizeof(int*));
/* Allocate space for gene quantities */
double *geneorder = needMem(rows*sizeof(double));
int *geneindex = needMem(rows*sizeof(int));
char **genename = needMem(rows*sizeof(char*));
initializeData(rows, columns, geneHash, genes, data, mask, arrayweight, geneorder, genename);
/* Perform hierarchical clustering. */
Node* tree = treecluster(rows, columns, data, mask, arrayweight, 0, metric, method, NULL);
if (tree == NULL)
return NULL;
-
const int nNodes = rows - 1;
double *nodeorder = needMem(nNodes*sizeof(double));
int *nodecounts = needMem(nNodes*sizeof(int));
getNodeOrderFromTree(tree, nNodes, geneorder, nodeorder, nodecounts, metric);
TreeSort(nNodes, rows, geneindex, geneorder, nodeorder, nodecounts, tree);
int i;
struct slName *newGeneOrder = NULL;
for (i = 0; i < nNodes; i++)
slNameAddHead(&newGeneOrder, genename[geneindex[i]]);
slReverse(&newGeneOrder);
freez(&nodecounts);
freez(&nodeorder);
freez(&tree);
freez(&geneorder);
freez(&geneindex);
freez(&genename);
freez(&arrayweight);
freez(&data);
freez(&mask);
+
return newGeneOrder;
}
-struct slName *clusterDataByGene(struct rawData *rdList, struct mapSettings *settings,
- char method, char metric)
+void clusterData(struct rawData *rdList, struct mapSettings *settings,
+ char method, char metric,
+ struct slName **geneOrder, struct slName **sampleOrder)
/* Set up data structures for cluster software, perform hierarchical clustering, and
* return a new sorted list of gene names according to clustering */
{
if (rdList == NULL || settings == NULL)
- return NULL;
+ return;
int rows = hashNumEntries(settings->y_index);
int columns = hashNumEntries(settings->x_index);
if (rows > MAX_ROWS || columns > MAX_COLUMNS)
- return NULL;
+ return;
/* Allocate space for array weights */
double *arrayweight = needMem(columns*sizeof(double));
/* Allocate space for data */
double **data = needMem(rows*sizeof(double*));
int **mask = needMem(rows*sizeof(int*));
/* Allocate space for gene quantities */
double *geneorder = needMem(rows*sizeof(double));
int *geneindex = needMem(rows*sizeof(int));
char **genename = needMem(rows*sizeof(char*));
-initializeRawData(rows, columns, rdList, settings, data, mask, arrayweight, geneorder, genename);
+double *sampleorder = needMem(columns*sizeof(double));
+int *sampleindex = needMem(columns*sizeof(int));
+char **samplename = needMem(columns*sizeof(char*));
-/* Perform hierarchical clustering. */
-Node* tree = treecluster(rows, columns, data, mask, arrayweight, 0, metric, method, NULL);
+initializeRawData(rows, columns, rdList, settings, data, mask, arrayweight, geneorder, genename,
+ sampleorder, samplename);
-if (tree == NULL)
- return NULL;
+/* Perform hierarchical clustering in gene direction*/
+Node* tree = treecluster(rows, columns, data, mask, arrayweight, 0, metric, method, NULL);
+if (!tree)
+ return;
const int nNodes = rows - 1;
double *nodeorder = needMem(nNodes*sizeof(double));
int *nodecounts = needMem(nNodes*sizeof(int));
getNodeOrderFromTree(tree, nNodes, geneorder, nodeorder, nodecounts, metric);
TreeSort(nNodes, rows, geneindex, geneorder, nodeorder, nodecounts, tree);
int i;
struct slName *newGeneOrder = NULL;
for (i = 0; i < rows; i++)
slNameAddHead(&newGeneOrder, genename[geneindex[i]]);
-
slReverse(&newGeneOrder);
+*geneOrder = newGeneOrder;
+
+freez(&nodecounts);
+freez(&nodeorder);
+
+/* Perform hierarchical clustering in sample direction. */
+tree = treecluster(rows, columns, data, mask, arrayweight, 1, metric, method, NULL);
+if (!tree)
+ return;
+
+const int ncNodes = columns - 1;
+nodeorder = needMem(ncNodes*sizeof(double));
+nodecounts = needMem(ncNodes*sizeof(int));
+getNodeOrderFromTree(tree, ncNodes, geneorder, nodeorder, nodecounts, metric);
+
+TreeSort(ncNodes, columns, sampleindex, sampleorder, nodeorder, nodecounts, tree);
+
+struct slName *newSampleOrder = NULL;
+for (i = 0; i < columns; i++)
+ slNameAddHead(&newSampleOrder, samplename[sampleindex[i]]);
+slReverse(&newSampleOrder);
+*sampleOrder = newSampleOrder;
freez(&nodecounts);
freez(&nodeorder);
freez(&tree);
freez(&geneorder);
freez(&geneindex);
freez(&genename);
freez(&arrayweight);
freez(&data);
freez(&mask);
-
-return newGeneOrder;
}
void outputGenePCA(int rows, int columns, struct slName *genes,
double eigenvalues[], double** eigenvectors, int order[])
{
int row, column;
/* Fill the genename with gene names */
row = 0;
struct slName *sl;
char **genename = needMem(rows*sizeof(char*));
for (sl = genes; sl; sl = sl->next)
{
genename[row] = cloneString(sl->name);
row++;
}
uglyf("\n");
for (column = 0; column < min(rows, columns); column++)
uglyf("%f,", eigenvalues[order[column]]);
uglyf("\n");
if (rows > columns)
{
for (row = 0; row < rows; row++)
{
uglyf("%s ", genename[row]);
for (column = 0; column < columns; column++)
uglyf("%f,", eigenvectors[row][order[column]]);
uglyf("\n");
}
}
else
{
for (row = 0; row < rows; row++)
{
uglyf("%s ", genename[order[row]]);
for (column = 0; column < rows; column++)
uglyf("%f,", eigenvectors[column][order[row]]);
uglyf("\n");
}
}
freez(&genename);
}
void pcaGeneSet(struct hash *geneHash, struct slName *genes,
int rows, int columns, double **data, char **genename, double **u, double **v,
double *w, int *svdorder)
/* Set up data structures for cluster software and perform PCA on gene set
* adapted from PerformPCA routine in "data.c" in Cluster 3.0 package */
{
if (geneHash == NULL || genes == NULL)
return;
/* Allocate space for array weights */
double *arrayweight = needMem(columns*sizeof(double));
/* Allocate space for data */
//double **data = needMem(rows*sizeof(double*));
int **mask = needMem(rows*sizeof(int*));
/* Allocate space for gene quantities */
double *geneorder = needMem(rows*sizeof(double));
initializeData(rows, columns, geneHash, genes, data, mask, arrayweight, geneorder, genename);
/* free data that is not needed for PCA (should i have a specialized initializeData
* routines for PCA and hierarchical clustering? */
freez(&arrayweight);
freez(&geneorder);
int row, column;
//int* svdorder= NULL;
double* svdsortval= NULL;
int ierr = 0;
const int nvals = min(rows,columns);
if (!u || !v || !w)
{
if (u)
freez(&u);
if (v)
freez(&v);
if (w)
freez(&w);
errAbort("Memory allocation error in pcaGeneSet");
}
if (rows > columns)
{
/* [u] = [row, columns]
* [v] = [columns, columns]
* [w] = [columns, columns] diagonal
*/
for (row = 0; row < rows; row++)
{
double mag = 0.0;
u[row] = needMem(columns*sizeof(double));
if (!u[row])
{
while(--row >= 0)
freez(&u[row]);
freez(&u);
freez(&v);
freez(&w);
errAbort("Memory allocation error in pcaGeneSet");
}
for (column = 0; column < columns; column++)
{
if (mask[row][column])
{
const double val = data[row][column];
mag += val*val;
}
}
mag = sqrt(mag);
if (mag == 0.0)
mag = 1.0;
for (column = 0; column < columns; column++)
{
if (mask[row][column])
u[row][column] = data[row][column] / mag;
else
u[row][column] = 0.0;
}
}
for (row = 0; row < columns; row++)
{
v[row] = needMem(columns*sizeof(double));
if (!v[row])
{
while (--row >= 0)
freez(&v[row]);
for (row = 0; row < rows; row++)
freez(&u[row]);
freez(&u);
freez(&v);
freez(&w);
errAbort("Memory allocation error in pcaGeneSet");
}
}
svd(rows, columns, u, w, v, &ierr);
}
else /* rows < columns */
{
/* [u] = [rows, rows]
* [v] = [column, rows]
* [w] = [rows, rows] diagonal
*/
for (column = 0; column < columns; column++)
{
v[column] = needMem(rows*sizeof(double));
if (!v[column])
{
while (--column >= 0)
freez(&v[column]);
freez(&u);
freez(&v);
freez(&w);
errAbort("Memory allocation error in pcaGeneSet");
}
}
for (row = 0; row < rows; row++)
{
double mag = 0.0;
for (column = 0; column < columns; column++)
{
if (mask[row][column])
{
const double val = data[row][column];
mag += val*val;
}
}
mag = sqrt(mag);
if (mag == 0.0)
mag = 1.0;
for (column = 0; column < columns; column++)
{
if (mask[row][column])
v[column][row] = data[row][column] / mag;
else
v[column][row] = 0.0;
}
}
for (row = 0; row < rows; row++)
{
u[row] = needMem(rows*sizeof(double));
if(!u[row])
{
while(--row >= 0)
freez(&u[row]);
for (column = 0; column < columns; column++)
freez(&v[column]);
freez(&u);
freez(&v);
freez(&w);
errAbort("Memory allocation error in pcaGeneSet");
}
}
svd(columns, rows, v, w, u, &ierr);
}
if(!ierr)
svdsortval= needMem(nvals*sizeof(double));
if (!svdorder || !svdsortval)
{
if(svdorder)
freez(&svdorder);
if(svdsortval)
freez(&svdsortval);
for (row = 0; row < rows; row++)
freez(&u[row]);
for (column = 0; column < columns; column++)
freez(&v[column]);
freez(&u);
freez(&v);
freez(&w);
errAbort("Memory allocation error in pcaGeneSet");
}
for (column = 0; column < nvals; column++)
{
svdorder[column] = column;
svdsortval[column] = -w[column];
}
sort(nvals,svdsortval,svdorder);
/* Free all used space */
//for (row = 0; row < rows; row++)
// freez(&u[row]);
//for (column = 0; column < columns; column++)
// freez(&v[column]);
//freez(&u);
//freez(&v);
//freez(&w);
//freez(&svdorder);
freez(&svdsortval);
}
static void verticalTextCentered(struct vGfx *vg, int x, int y,
int width, int height, int colorIx, MgFont *font, char *string)
/* Draw a vertical line of text in middle of the box.
* The string will read from bottom to top
*/
{
/* don't let this run wild */
CLIP(width, vg->width);
CLIP(height, vg->height);
if ((width > 0) && (height > 0))
{
struct vGfx *vgHoriz;
int i, j;
/* reversed meanings of width and height here since this is going
* to rotate 90 degrees
*/
vgHoriz = vgOpenGif(height, width, "/dev/null", FALSE);
vgTextCentered(vgHoriz, 0, 0, height, width, colorIx, font, string);
/* now, blit from the horizontal to the vertical, rotate -90 (CCW) */
for (i = 0; i < height; ++i) /* xSrc -> yDest */
{
int yDest = height - i;
for (j = 0; j < width; ++j) /* ySrc -> xDest */
{
vgDot(vg,j+x,yDest+y, vgGetDot(vgHoriz, i, j));
}
}
vgClose(&vgHoriz);
}
}
static void addLabels(struct vGfx *vg, MgFont *font, char *minXStr,
char *maxXStr, char *minYStr, char *maxYStr, char *shortLabel,
char *longLabel, int totalWidth, int totalHeight,
int graphWidth, int graphHeight, int startX, int startY,
int leftMargin, int fontHeight, char *vertLabel)
{
int x1 = 0;
int y1 = 0;
int x2 = 0;
int y2 = 0;
int shortLabelSize = 0;
int longLabelSize = 0;
int textWidth = 0;
/* border around the whole thing */
vgLine(vg, startX, startY, totalWidth-1+startX, startY, MG_MAGENTA); /* top */
vgLine(vg, startX, startY, startX, totalHeight-1+startY, MG_MAGENTA); /* left */
vgLine(vg, totalWidth-1+startX, startY,
totalWidth-1+startX, totalHeight-1+startY, MG_MAGENTA); /* right */
vgLine(vg, startX, totalHeight-1+startY,
totalWidth-1+startX, totalHeight-1+startY, MG_MAGENTA);/* bottom */
/* border just around the graph */
vgLine(vg, leftMargin-PLOT_MARGIN+startX, PLOT_MARGIN+startY,
leftMargin-PLOT_MARGIN+startX,PLOT_MARGIN+graphHeight+startY, MG_CYAN); /* left */
vgLine(vg, leftMargin+startX, PLOT_MARGIN+graphHeight+1+startY,
totalWidth-PLOT_MARGIN+startX, PLOT_MARGIN+graphHeight+1+startY, MG_CYAN);
/* bottom */
x1 = leftMargin+startX;
y1 = totalHeight-PLOT_MARGIN-fontHeight+startY;
x2 = totalWidth-PLOT_MARGIN+startX;
y2 = totalHeight-PLOT_MARGIN+startY;
if (shortLabel)
shortLabelSize = mgFontStringWidth(font, shortLabel);
if (longLabel)
longLabelSize = mgFontStringWidth(font, longLabel);
if ((longLabel != NULL) && (longLabelSize < (x2-x1)))
vgTextCentered(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, longLabel);
else if (shortLabel != NULL)
vgTextCentered(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, shortLabel);
x1 = leftMargin+startX;
y1 = totalHeight-PLOT_MARGIN-fontHeight-fontHeight+2+startY;
vgText(vg, x1, y1, MG_BLACK, font, minXStr);
textWidth = mgFontStringWidth(font, maxXStr);
x1 = totalWidth - PLOT_MARGIN - textWidth;
vgText(vg, x1, y1, MG_BLACK, font, maxXStr);
x1 = startX;
x2 = leftMargin - PLOT_MARGIN+startX;
y1 = PLOT_MARGIN+startY;
y2 = PLOT_MARGIN + fontHeight+startY;
if (maxYStr)
vgTextRight(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, maxYStr);
y1 = PLOT_MARGIN+graphHeight-fontHeight+startY;
y2 = PLOT_MARGIN+graphHeight+startY;
if (minYStr)
vgTextRight(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, minYStr);
x1 = PLOT_MARGIN+startX;
y1 = PLOT_MARGIN + fontHeight+startY;
x2 = leftMargin - PLOT_MARGIN - PLOT_MARGIN+startX;
y2 = graphHeight - fontHeight+startY;
if (vertLabel)
verticalTextCentered(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, vertLabel);
}
static MgFont *fontSetup(int *height)
{
MgFont *font = mgSmallFont();
if (height)
{
*height = mgFontLineHeight(font);
}
return (font);
}
static void ordinaryPlot(int **densityCounts, Color **dataColors, struct vGfx *vg,
int totalWidth, int totalHeight,
int graphWidth, int graphHeight,
int startX, int startY, int leftMargin)
/* a simple point plot, not density, the inversion of the Y axis takes
* place here. leftMargin is what has been passed in, the top margin is
* fixed at PLOT_MARGIN
* Function borrowed from hgTables/correlatePlot.c and adjusted. */
{
int i, j;
/* the dots are going to overlap because of DOT_SIZE, but that is
* OK for this situation, they are all MG_BLACK
*/
for (j = 0; j < graphHeight; ++j)
for (i = 0; i < graphWidth; ++i)
if (densityCounts[j][i])
{
vgBox(vg, i+leftMargin+startX, PLOT_MARGIN+(graphHeight-j)+startY,
DOT_SIZE+2, DOT_SIZE+2, dataColors[j][i]);
}
/* the (totalHeight-j) is the inversion of the Y axis */
}
struct tempName *scatterPlot(struct vGfx *vg, struct dataVector *xList,
struct dataVector *yList, int totalWidth, int totalHeight,
int startX, int startY)
/* create scatter plot gif file in trash, return path name
* Function borrowed from hgTables/correlatePlot.c and adjusted. */
{
//static struct tempName gifFileName;
MgFont *font = NULL;
struct dataVector *x;
struct dataVector *y;
double yMin = INFINITY;
double xMin = INFINITY;
double yMax = -INFINITY;
double xMax = -INFINITY;
double yRange = 0.0;
double xRange = 0.0;
int **densityCounts; /* densityCounts[GRAPH_HEIGHT] [GRAPH_WIDTH] */
Color **dataColors;
int i, j;
int pointsPlotted = 0;
int fontHeight = 0;
char minXStr[32];
char maxXStr[32];
char minYStr[32];
char maxYStr[32];
char xLabel[256];
char yLabel[256];
int leftLabelSize = 0;
int bottomLabelSize = 0;
int leftMargin = 0;
int bottomMargin = 0;
font = fontSetup(&fontHeight);
/* find overall min and max */
for (x = xList, y = yList; (y != NULL) && (x !=NULL); y = y->next, x=x->next)
{
yMin = min(yMin,y->min);
xMin = min(xMin,x->min);
yMax = max(yMax,y->max);
xMax = max(xMax,x->max);
}
yRange = yMax - yMin;
xRange = xMax - xMin;
if (xRange == 0.0)
xRange = 1.0;
if (yRange == 0.0)
yRange = 1.0;
safef(minXStr,ArraySize(minXStr), "%.4g", xMin);
safef(maxXStr,ArraySize(maxXStr), "%.4g", xMax);
safef(minYStr,ArraySize(minYStr), "%.4g", yMin);
safef(maxYStr,ArraySize(maxYStr), "%.4g", yMax);
/* the axes labels will be horizontal, thus the widest establishes
* the left margin.
*/
leftLabelSize = max(mgFontStringWidth(font, minYStr),
mgFontStringWidth(font, maxYStr));
/* the vertical text may be even wider than the numbers */
leftLabelSize = max(leftLabelSize,fontHeight);
/* The bottom label is one line for the axes labels, and a second
* line for the X axis table longLabel
*/
bottomLabelSize = fontHeight * 2;
/* y data is the Y axis, x data is the X axis */
/* Do not worry about the fact that the Y axis is 0 at the top
* here, that will get translated during the actual plot.
*/
leftMargin = PLOT_MARGIN + leftLabelSize + PLOT_MARGIN + PLOT_MARGIN;
bottomMargin = PLOT_MARGIN + bottomLabelSize + PLOT_MARGIN;
int graphWidth = totalWidth - leftMargin - PLOT_MARGIN;
int graphHeight = totalHeight - PLOT_MARGIN - bottomMargin;
if (graphWidth <= 0 || graphHeight <= 0)
return NULL;
struct lm *lm = lmInit(graphWidth);
/* Initialize density plot count array */
/* space for the row pointers, first */
lmAllocArray(lm, densityCounts, graphHeight);
lmAllocArray(lm, dataColors, graphHeight);
/* then space for each row */
for (j = 0; j < graphHeight; ++j)
{
lmAllocArray(lm, densityCounts[j], graphWidth);
lmAllocArray(lm, dataColors[j], graphWidth);
}
for (j = 0; j < graphHeight; ++j)
for (i = 0; i < graphWidth; ++i)
{
densityCounts[j][i] = 0;
dataColors[j][i] = MG_GRAY;
}
for (x = xList, y = yList; (y != NULL) && (x !=NULL); y = y->next, x=x->next)
{
pointsPlotted++;
double yValue = y->value;
double xValue = x->value;
int x1 = ((xValue - xMin)/xRange) * graphWidth;
int y1 = ((yValue - yMin)/yRange) * graphHeight;
CLIP(x1,graphWidth);
CLIP(y1,graphHeight);
densityCounts[y1][x1]++;
dataColors[y1][x1] = x->color;
}
struct tempName *gifFileName = NULL;
if (vg == NULL)
{
gifFileName = AllocA(struct tempName);
trashDirFile(gifFileName, "hgtData", "hgtaScatter", ".gif");
vg = vgOpenGif(totalWidth, totalHeight, gifFileName->forCgi, FALSE);
}
vgSetClip(vg, leftMargin+startX, PLOT_MARGIN+startY, graphWidth+startX, graphHeight+startY);
ordinaryPlot(densityCounts, dataColors, vg, totalWidth, totalHeight,
graphWidth, graphHeight, startX, startY, leftMargin);
safef(xLabel, ArraySize(xLabel), "PCA Component 1");
safef(yLabel, ArraySize(yLabel), "PCA Component 2");
/* allow the entire area to be drawn into for the labels */
vgSetClip(vg, startX, startY, totalWidth+startX, totalHeight+startY);
addLabels(vg, font, minXStr, maxXStr, minYStr, maxYStr,
NULL, xLabel, totalWidth, totalHeight, graphWidth, graphHeight,
startX, startY, leftMargin, fontHeight, yLabel);
vgUnclip(vg);
if (gifFileName)
vgClose(&vg);
lmCleanup(&lm);
return gifFileName;
}
struct hash *getPCAComponentsXY(int rows, int columns,
struct dataVector **x, struct dataVector **y,
struct slName **list1, struct slName **list2,
double **data, char **genename, double **u, double **v, double *w,
int *svdorder, Color *colors)
{
if (rows < 2 || columns < 2)
return NULL;
int row, column;
struct dataVector *newX, *newY;
int c1 = 0;
int c2 = 1;
double mult = 10000.0;
int* geneorder1 = needMem(rows*sizeof(double));
int* geneorder2 = needMem(rows*sizeof(double));
double** weight = needMem(rows*sizeof(double*));
double** weightAbs = needMem(rows*sizeof(double*));
int nvals = columns;
if (rows < columns)
nvals = rows;
for (row = 0; row < rows; row++)
{
weight[row] = needMem(nvals*sizeof(double));
weightAbs[row] = needMem(nvals*sizeof(double));
}
if (rows > columns)
{
for (row = 0; row < rows; row++)
{
geneorder1[row] = row;
geneorder2[row] = row;
for (column = 0; column < nvals; column++)
{
weight[row][column] = u[row][svdorder[column]];
weightAbs[row][column] = -fabs(weight[row][column]);
}
}
for (column = 0; column < columns; column++)
{
newX = AllocA(struct dataVector);
newY = AllocA(struct dataVector);
newX->value = 0;
newY->value = 0;
for (row = 0; row < rows; row++)
{
newX->value += data[row][column] * u[row][svdorder[c1]];
newY->value += data[row][column] * u[row][svdorder[c2]];
}
newX->min = newX->value;
newX->max = newX->value;
newY->min = newY->value;
newY->max = newY->value;
newX->count = 1;
newY->count = 1;
newX->color = MG_GRAY;
newY->color = MG_GRAY;
if (colors != NULL)
{
newX->color = colors[column];
newY->color = colors[column];
}
slAddHead(x, newX);
slAddHead(y, newY);
}
}
else
{
for (row = 0; row < rows; row++)
{
geneorder1[row] = row;
geneorder2[row] = row;
for (column = 0; column < nvals; column++)
{
weight[row][column] = u[column][row];
weightAbs[row][column] = -fabs(weight[row][column]);
}
}
for (column = 0; column < columns; column++)
{
newX = AllocA(struct dataVector);
newY = AllocA(struct dataVector);
newX->value = 0;
newY->value = 0;
for (row = 0; row < rows; row++)
{
newX->value += data[svdorder[row]][column] * u[c1][svdorder[row]];
newY->value += data[svdorder[row]][column] * u[c2][svdorder[row]];
}
newX->min = newX->value;
newX->max = newX->value;
newY->min = newY->value;
newY->max = newY->value;
newX->count = 1;
newY->count = 1;
newX->color = MG_GRAY;
newY->color = MG_GRAY;
if (colors != NULL)
{
newX->color = colors[column];
newY->color = colors[column];
}
slAddHead(x, newX);
slAddHead(y, newY);
}
}
double* w1Abs = needMem(rows*sizeof(double));
double* w2Abs = needMem(rows*sizeof(double));
for (row = 0; row < rows; row++)
{
w1Abs[row] = weightAbs[row][c1]; // get first component, for scatter plot
w2Abs[row] = weightAbs[row][c2]; // get second component, for scatter plot
}
sort(rows,w1Abs,geneorder1);
sort(rows,w2Abs,geneorder2);
struct hash *statHash = NULL;
if (rows > 0)
statHash = hashNew(0);
for (row = 0; row < rows; row++)
{
int index1 = geneorder1[row];
int index2 = geneorder2[row];
if (ceil(w1Abs[index1]*mult) != 0)
slNameAddHead(list1, genename[index1]);
if (ceil(w2Abs[index2]*mult) != 0)
slNameAddHead(list2, genename[index2]);
struct pcaData *pd = AllocA(struct pcaData);
pd->numComponents = nvals;
AllocArray(pd->components, pd->numComponents);
for (column = 0; column < nvals; column++)
pd->components[column] = weight[row][column];
hashAdd(statHash, genename[row], pd);
}
slReverse(list1);
slReverse(list2);
return statHash;
}
void printComponents(char *name, struct slName *list1, struct slName *list2)
{
if (list1 == NULL || list2 == NULL)
return;
struct slName *sl;
int count = 0;
int numGenesPerLine = 20;
webNewSection("PCA Components: %s", name);
hPrintf("<TABLE border = \"0\">");
hPrintf("<TR><TH>First:<TD>");
for (sl = list1; sl; sl = sl->next)
{
hPrintf("%s,", sl->name);
count++;
if (count == numGenesPerLine)
{
hPrintf("...");
break;
}
}
count = 0;
hPrintf("<TR><TH>Second:<TD>");
for (sl = list2; sl; sl = sl->next)
{
hPrintf("%s,", sl->name);
count++;
if (count == numGenesPerLine)
{
hPrintf("...");
break;
}
}
hPrintf("</TABLE>");
hPrintf("\n</TD><TD WIDTH=15></TD></TR></TABLE>\n");
hPrintf("</TD></TR></TABLE>\n");
hPrintf("</TD></TR></TABLE>\n");
}
struct hash *performPCAandPlot(struct vGfx *vg, int totalWidth, int totalHeight,
int startX, int startY, struct hash *geneHash,
struct slName *allGenes, struct featureColor *fcList)
{
if (geneHash == NULL || allGenes == NULL)
return NULL;
struct bed *nb = NULL;
struct slName *sl, *genes = NULL;
struct hashEl *el;
for (sl = allGenes; sl ; sl = sl->next)
{
el = hashLookup(geneHash, sl->name);
if (el != NULL)
{
slNameAddHead(&genes, sl->name);
nb = el->val;
}
}
slReverse(&genes);
if (nb == NULL)
return NULL;
int row, column;
int columns = nb->expCount;
int rows = slCount(genes);
int nvals = min(rows, columns);
if (rows > MAX_ROWS || columns > MAX_COLUMNS)
return NULL;
double **data = needMem(rows*sizeof(double*));
char **genename = needMem(rows*sizeof(char*));
double** u = needMem(rows*sizeof(double*));
double** v = needMem(columns*sizeof(double*));
double* w = needMem(nvals*sizeof(double));
int *svdorder = needMem(nvals*sizeof(int));
pcaGeneSet(geneHash, genes, rows, columns,
data, genename, u, v, w, svdorder);
Color colors[columns];
for (column = 0; column < columns; column++)
colors[column] = MG_GRAY;
struct featureColor *fc;
for (fc = fcList; fc; fc = fc->next)
{
if (fc->index < columns)
colors[fc->index] = fc->color;
}
struct dataVector *x = NULL;
struct dataVector *y = NULL;
struct slName *list1 = NULL;
struct slName *list2 = NULL;
struct hash *statHash = getPCAComponentsXY(rows, columns, &x, &y, &list1, &list2,
data, genename, u, v, w, svdorder, colors);
struct tempName *scatterPlotGif = NULL;
if (vg)
scatterPlotGif = scatterPlot(vg, x, y, totalWidth, totalHeight, startX, startY);
//#if 0
outputGenePCA(rows, columns, genes, w, u, svdorder);
//#endif
/* Free all used space */
for (row = 0; row < rows; row++)
freez(&u[row]);
for (column = 0; column < columns; column++)
freez(&v[column]);
freez(&u);
freez(&v);
freez(&w);
return statHash;
}
struct hash *performPCA(struct hash *geneHash, struct slName *allGenes)
/* use plotting function, but pass NULL vGfx ptr to avoid drawing plot, just returning statHash */
{
return performPCAandPlot(NULL, 0, 0, 0, 0, geneHash, allGenes, NULL);
}