src/hg/instinct/hgGeneset/hgStats.c 1.1
1.1 2010/01/28 22:59:07 jsanborn
added clustering
Index: src/hg/instinct/hgGeneset/hgStats.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/hgGeneset/hgStats.c,v
retrieving revision 1.4
retrieving revision 1.1
diff -b -B -U 4 -r1.4 -r1.1
--- src/hg/instinct/hgGeneset/hgStats.c 29 Jan 2010 00:34:07 -0000 1.4
+++ src/hg/instinct/hgGeneset/hgStats.c 28 Jan 2010 22:59:07 -0000 1.1
@@ -17,10 +17,10 @@
#include "trashDir.h"
#include "obscure.h"
#include "hgStats.h"
-#define MAX_ROWS 10000 // Maximum number of rows to cluster
-#define MAX_COLUMNS 10000 // Maximum number of columns to cluster
+#define MAX_ROWS 2000 // Maximum number of rows to cluster
+#define MAX_COLUMNS 2000 // Maximum number of columns to cluster
#define CLIP(p,limit) if (p < 0) p = 0; if (p >= (limit)) p = (limit)-1;
@@ -228,18 +228,16 @@
void initializeRawData(int rows, int columns,
struct rawData *rdList, struct mapSettings *settings,
double **data, int **mask, double *arrayweight,
- double *geneorder, char **genename,
- double *sampleorder, char **samplename)
+ double *geneorder, char **genename)
/*
* Code was adapted from LOAD function found in ../cluster/data.c to prepare
* data for input to treecluster program.
* Copyright (C) 2002 Michiel Jan Laurens de Hoon.
*/
{
-if (!rdList || !settings || !mask || !arrayweight || !geneorder || !genename
- || !sampleorder || !samplename)
+if (!rdList || !settings || !mask || !arrayweight || !geneorder || !genename)
return;
int column, row;
@@ -262,9 +260,9 @@
}
/* Fill the genename with gene names */
struct hashEl *el;
-struct hashCookie cookie = hashFirst(settings->y_index);
+struct hashCookie cookie = hashFirst(settings->x_index);
while ((el = hashNext(&cookie)) != NULL)
{
char *name = el->name;
int val = ptToInt(el->val);
@@ -273,26 +271,12 @@
continue;
genename[val] = cloneString(name);
}
-cookie = hashFirst(settings->x_index);
-while ((el = hashNext(&cookie)) != NULL)
- {
- char *name = el->name;
- int val = ptToInt(el->val);
-
- if (val >= columns)
- continue;
- samplename[val] = cloneString(name);
- }
-
/* Fill the gene weights with the default value of 1.0 */
for (row = 0; row < rows; row++)
geneorder[row] = row;
-for (column = 0; column < columns; column++)
- sampleorder[column] = column;
-
int prevSampleId = -1;
int prevFeatureId = -1;
char id[128];
struct rawData *rd;
@@ -302,16 +286,16 @@
// for sample or feature id.
if (rd->feature_id != prevFeatureId)
{
safef(id, sizeof (id), "%d", rd->feature_id);
- row = hashIntValDefault(settings->y_index, id, -1);
+ row = hashIntValDefault(settings->x_index, id, -1);
prevFeatureId = rd->feature_id;
}
if (rd->sample_id != prevSampleId)
{
safef(id, sizeof (id), "%d", rd->sample_id);
- column = hashIntValDefault(settings->x_index, id, -1);
+ column = hashIntValDefault(settings->y_index, id, -1);
prevSampleId = rd->sample_id;
}
if (row < 0 || column < 0)
@@ -467,22 +451,21 @@
return newGeneOrder;
}
-void clusterData(struct rawData *rdList, struct mapSettings *settings,
- char method, char metric,
- struct slName **geneOrder, struct slName **sampleOrder)
+struct slName *clusterDataByGene(struct rawData *rdList, struct mapSettings *settings,
+ char method, char metric)
/* Set up data structures for cluster software, perform hierarchical clustering, and
* return a new sorted list of gene names according to clustering */
{
if (rdList == NULL || settings == NULL)
- return;
+ return NULL;
-int rows = hashNumEntries(settings->y_index);
-int columns = hashNumEntries(settings->x_index);
+int rows = hashNumEntries(settings->x_index);
+int columns = hashNumEntries(settings->y_index);
if (rows > MAX_ROWS || columns > MAX_COLUMNS)
- return;
+ return NULL;
/* Allocate space for array weights */
double *arrayweight = needMem(columns*sizeof(double));
@@ -494,19 +477,15 @@
double *geneorder = needMem(rows*sizeof(double));
int *geneindex = needMem(rows*sizeof(int));
char **genename = needMem(rows*sizeof(char*));
-double *sampleorder = needMem(columns*sizeof(double));
-int *sampleindex = needMem(columns*sizeof(int));
-char **samplename = needMem(columns*sizeof(char*));
-
-initializeRawData(rows, columns, rdList, settings, data, mask, arrayweight, geneorder, genename,
- sampleorder, samplename);
+initializeRawData(rows, columns, rdList, settings, data, mask, arrayweight, geneorder, genename);
-/* Perform hierarchical clustering in gene direction*/
+/* Perform hierarchical clustering. */
Node* tree = treecluster(rows, columns, data, mask, arrayweight, 0, metric, method, NULL);
-if (!tree)
- return;
+
+if (tree == NULL)
+ return NULL;
const int nNodes = rows - 1;
double *nodeorder = needMem(nNodes*sizeof(double));
int *nodecounts = needMem(nNodes*sizeof(int));
@@ -515,33 +494,12 @@
TreeSort(nNodes, rows, geneindex, geneorder, nodeorder, nodecounts, tree);
int i;
struct slName *newGeneOrder = NULL;
-for (i = 0; i < rows; i++)
+for (i = 0; i < nNodes; i++)
slNameAddHead(&newGeneOrder, genename[geneindex[i]]);
-slReverse(&newGeneOrder);
-*geneOrder = newGeneOrder;
-freez(&nodecounts);
-freez(&nodeorder);
-
-/* Perform hierarchical clustering in sample direction. */
-tree = treecluster(rows, columns, data, mask, arrayweight, 1, metric, method, NULL);
-if (!tree)
- return;
-
-const int ncNodes = columns - 1;
-nodeorder = needMem(ncNodes*sizeof(double));
-nodecounts = needMem(ncNodes*sizeof(int));
-getNodeOrderFromTree(tree, ncNodes, geneorder, nodeorder, nodecounts, metric);
-
-TreeSort(ncNodes, columns, sampleindex, sampleorder, nodeorder, nodecounts, tree);
-
-struct slName *newSampleOrder = NULL;
-for (i = 0; i < columns; i++)
- slNameAddHead(&newSampleOrder, samplename[sampleindex[i]]);
-slReverse(&newSampleOrder);
-*sampleOrder = newSampleOrder;
+slReverse(&newGeneOrder);
freez(&nodecounts);
freez(&nodeorder);
freez(&tree);
@@ -550,8 +508,10 @@
freez(&genename);
freez(&arrayweight);
freez(&data);
freez(&mask);
+
+return newGeneOrder;
}
void outputGenePCA(int rows, int columns, struct slName *genes,
double eigenvalues[], double** eigenvectors, int order[])