src/hg/instinct/hgGeneset/hgStats.c 1.1

1.1 2010/01/28 22:59:07 jsanborn
added clustering
Index: src/hg/instinct/hgGeneset/hgStats.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/hgGeneset/hgStats.c,v
retrieving revision 1.4
retrieving revision 1.1
diff -b -B -U 1000000 -r1.4 -r1.1
--- src/hg/instinct/hgGeneset/hgStats.c	29 Jan 2010 00:34:07 -0000	1.4
+++ src/hg/instinct/hgGeneset/hgStats.c	28 Jan 2010 22:59:07 -0000	1.1
@@ -1,1392 +1,1352 @@
 /* 
  * hgStats -- wrappers that set up hgHeatmap data for cluster statistical routines
  */
 
 #include "common.h"
 #include "bed.h"
 #include "memgfx.h"
 #include "vGfx.h" 
 #include "cart.h"
 #include "errCatch.h"
 #include "hash.h"
 #include "hdb.h"
 #include "web.h"
 #include "htmshell.h"
 #include "featuresLib.h"
 #include "hPrint.h"
 #include "trashDir.h"
 #include "obscure.h"
 #include "hgStats.h"
 
-#define MAX_ROWS 10000    // Maximum number of rows to cluster
-#define MAX_COLUMNS 10000 // Maximum number of columns to cluster
+#define MAX_ROWS 2000    // Maximum number of rows to cluster
+#define MAX_COLUMNS 2000 // Maximum number of columns to cluster
 
 #define CLIP(p,limit) if (p < 0) p = 0; if (p >= (limit)) p = (limit)-1; 
 
 
 void getNodeOrderFromTree(Node* tree, int nNodes, const double* order, 
 			  double *nodeorder, int *nodecounts, char metric)
 {
 if (tree == NULL || nodeorder == NULL || nodecounts == NULL) 
     return;
 
 int i;
 
 if (metric=='e' || metric=='b')
     /* Scale all distances such that they are between 0 and 1 */
     { 
     double scale = 0.0;
     for (i = 0; i < nNodes; i++)
 	if (tree[i].distance > scale) 
 	    scale = tree[i].distance;
     
     if (scale) 
 	for (i = 0; i < nNodes; i++) 
 	    tree[i].distance /= scale;
     }
 
 /* Now we join nodes */
 for (i = 0; i < nNodes; i++)
     { 
     int min1 = tree[i].left;
     int min2 = tree[i].right;
 
     /* min1 and min2 are the elements that are to be joined */
     double order1;
     double order2;
     int counts1;
     int counts2;
 
     if (min1 < 0)
 	{ 
 	int index1 = -min1-1;
 	order1 = nodeorder[index1];
 	counts1 = nodecounts[index1];
 	
 	tree[i].distance = max(tree[i].distance, tree[index1].distance);
 	}
     else
 	{ 
 	order1 = order[min1];
 	counts1 = 1;
 	}
 
     if (min2 < 0)
 	{ 
 	int index2 = -min2-1;
 	order2 = nodeorder[index2];
 	counts2 = nodecounts[index2];
 	
 	tree[i].distance = max(tree[i].distance, tree[index2].distance);
 	}
     else
 	{ 
 	order2 = order[min2];
 	counts2 = 1;
 	}
  
     nodecounts[i] = counts1 + counts2;
     nodeorder[i] = (counts1*order1 + counts2*order2) / (counts1 + counts2);
     }
 }
 
 
 static void TreeSort(const int nNodes, const int nData, int *dataindex, const double* order,
 		     const double* nodeorder, const int* nodecounts, Node* tree)
 { 
 const int nElements = nNodes + 1;
 int i;
 double* neworder = needMem(nElements*sizeof(double)); /* initialized to 0.0 */
 int* clusterids = needMem(nElements*sizeof(int));
 
 for (i = 0; i < nElements; i++) 
     clusterids[i] = i;
 
 for (i = 0; i < nNodes; i++)
     { 
     const int i1 = tree[i].left;
     const int i2 = tree[i].right;
     const double order1 = (i1<0) ? nodeorder[-i1-1] : order[i1];
     const double order2 = (i2<0) ? nodeorder[-i2-1] : order[i2];
     const int count1 = (i1<0) ? nodecounts[-i1-1] : 1;
     const int count2 = (i2<0) ? nodecounts[-i2-1] : 1;
     /* If order1 and order2 are equal, their order is determined by 
      * the order in which they were clustered */
     if (i1<i2)
 	{ 
 	const double increase = (order1<order2) ? count1 : count2;
 	int j;
 	for (j = 0; j < nElements; j++)
 	    { 
 	    const int clusterid = clusterids[j];
 	    if (clusterid==i1 && order1>=order2) 
 		neworder[j] += increase;
 	    if (clusterid==i2 && order1<order2) 
 		neworder[j] += increase;
 	    if (clusterid==i1 || clusterid==i2) 
 		clusterids[j] = -i-1;
 	    }
 	}
     else
 	{ 
 	const double increase = (order1<=order2) ? count1 : count2;
 	int j;
 	for (j = 0; j < nElements; j++)
 	    { 
 	    const int clusterid = clusterids[j];
 	    if (clusterid==i1 && order1>order2) 
 		neworder[j] += increase;
 	    if (clusterid==i2 && order1<=order2) 
 		neworder[j] += increase;
 	    if (clusterid==i1 || clusterid==i2) 
 		clusterids[j] = -i-1;
 	    }
 	}
     }
 freez(&clusterids);
 
 for (i = 0; i < nData; i++)
     dataindex[i] = i;
 
 sort(nData, neworder, dataindex);
 
 freez(&neworder);
 return;
 }
 
 
 void initializeData(int rows, int columns, struct hash *geneHash, struct slName *genes, 
 		    double **data, int **mask, double *arrayweight, 
 		    double *geneorder, char **genename)
 /* 
  * Code was adapted from LOAD function found in ../cluster/data.c to prepare 
  * data for input to treecluster program. 
  * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
  */
 {
 if (geneHash == NULL || data == NULL || mask == NULL || arrayweight == NULL ||
     geneorder == NULL || genename == NULL)
     return;
 
 int column, row;
 struct slName *sl;
 
 /* Set up array weights and array order (not used since we're ordering by gene) */
 for (column = 0; column < columns; column++)
     arrayweight[column] = 1.0;
 
 /* Fill data, make sure mask is set to one for all data so that
  * data is used in clustering */
 for (row = 0; row < rows; row++)
     { 
     data[row] = needMem(columns*sizeof(double));
     mask[row] = needMem(columns*sizeof(int));
     }
 
 /* Fill the genename with gene names */
 row = 0;
 for (sl = genes; sl; sl = sl->next)
     {
     genename[row] = cloneString(sl->name); 
     row++;
     }
 
 /* Fill the gene weights with the default value of 1.0 */
 for (row = 0; row < rows; row++) 
     geneorder[row] = row;
 
 row = 0;
 for (sl = genes; sl; sl = sl->next)
     {
     struct hashEl *el = hashLookup(geneHash, sl->name);
     if (!el)
 	{
 //	uglyf("<BR>%s not found<BR>", sl->name);
 	for (column = 0; column < columns; column++)
 	    {
 	    data[row][column] = 0.0;
 	    mask[row][column] = 0;
 	    }
 	}
     else
 	{
 //	uglyf("<BR>%s FOUND<BR>", sl->name);
 	struct bed *nb = el->val;
 	for(column = 0; column < nb->expCount; column++)
 	    {
 	    if (column < columns)
 		{
 		data[row][column] = nb->expScores[column];
 		mask[row][column] = 1;
 		}
 	    }
 	}
     row++;
     }
 }
 
 
 void initializeRawData(int rows, int columns, 
 		       struct rawData *rdList, struct mapSettings *settings, 
 		       double **data, int **mask, double *arrayweight, 
-		       double *geneorder, char **genename, 
-		       double *sampleorder, char **samplename)
+		       double *geneorder, char **genename)
 /* 
  * Code was adapted from LOAD function found in ../cluster/data.c to prepare 
  * data for input to treecluster program. 
  * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
  */
 {
-if (!rdList || !settings || !mask || !arrayweight || !geneorder || !genename
-    || !sampleorder || !samplename)
+if (!rdList || !settings || !mask || !arrayweight || !geneorder || !genename)
     return;
 
 int column, row;
 
 /* Set up array weights and array order (not used since we're ordering by gene) */
 for (column = 0; column < columns; column++)
     arrayweight[column] = 1.0;
 
 /* Fill data, make sure mask is set to one for all data so that
  * data is used in clustering */
 for (row = 0; row < rows; row++)
     { 
     data[row] = needMem(columns*sizeof(double));
     mask[row] = needMem(columns*sizeof(int));
 
     for (column = 0; column < columns; column++)
 	{
 	data[row][column] = 0.0;
 	mask[row][column] = 0;
 	}
     }
 
 /* Fill the genename with gene names */
 struct hashEl *el;
-struct hashCookie cookie = hashFirst(settings->y_index);
+struct hashCookie cookie = hashFirst(settings->x_index);
 while ((el = hashNext(&cookie)) != NULL)
     {
     char *name = el->name;
     int val    = ptToInt(el->val);
 
     if (val >= rows)
 	continue;
     genename[val] = cloneString(name);
     }
 
-cookie = hashFirst(settings->x_index);
-while ((el = hashNext(&cookie)) != NULL)
-    {
-    char *name = el->name;
-    int val    = ptToInt(el->val);
-
-    if (val >= columns)
-	continue;
-    samplename[val] = cloneString(name);
-    }
-
 /* Fill the gene weights with the default value of 1.0 */
 for (row = 0; row < rows; row++) 
     geneorder[row] = row;
 
-for (column = 0; column < columns; column++)
-    sampleorder[column] = column;
-
 int prevSampleId  = -1;
 int prevFeatureId = -1;
 char id[128];
 struct rawData *rd;
 for (rd = rdList; rd; rd = rd->next)
     {
     // Attempt to reduce number of hash calls by checking if we *just* searched
     // for sample or feature id.
     if (rd->feature_id != prevFeatureId)
 	{
 	safef(id, sizeof (id), "%d", rd->feature_id);
-	row = hashIntValDefault(settings->y_index, id, -1);
+	row = hashIntValDefault(settings->x_index, id, -1);
 	prevFeatureId = rd->feature_id;
 	}
 
     if (rd->sample_id != prevSampleId)
 	{
 	safef(id, sizeof (id), "%d", rd->sample_id);
-	column = hashIntValDefault(settings->x_index, id, -1);
+	column = hashIntValDefault(settings->y_index, id, -1);
 	prevSampleId = rd->sample_id;
 	}
 
     if (row < 0 || column < 0)
 	continue;
     if (row >= rows || column >= columns)
 	continue;
 
     data[row][column] = rd->val;
     mask[row][column] = 1;
     }
 }
 
 Node *clusterSamplesByGeneSet(struct hash *geneHash, struct slName *allGenes, 
 			      char method, char metric, int transpose, int *nRows, int *nCols)
 /* Set up data structures for cluster software, perform hierarchical clustering, and 
  * return a new sorted list of gene names according to clustering */
 {
 if (geneHash == NULL || allGenes == NULL)
     return NULL;
 
 if (method == 'x')  // should never happen ('x' = don't cluster)
     return NULL;
 
 struct bed *nb = NULL;
 struct slName *sl, *genes = NULL;
 struct hashEl *el;
 
 for (sl = allGenes; sl ; sl = sl->next)
     {
     el = hashLookup(geneHash, sl->name);
     if (el != NULL)
         {
         slNameAddHead(&genes, sl->name);
         nb = el->val;
         }
     }
 slReverse(&genes);
   
 if (!nb)
     return NULL;
 
 int columns = nb->expCount;
 int rows = slCount(genes);
 
 /* Allocate space for array weights */
 double *arrayweight = needMem(columns*sizeof(double));
 
 /* Allocate space for data */
 double **data = needMem(rows*sizeof(double*));
 int **mask = needMem(rows*sizeof(int*));
  
 /* Allocate space for gene quantities */
 double *geneorder = needMem(rows*sizeof(double));
 int *geneindex = needMem(rows*sizeof(int));
 char **genename = needMem(rows*sizeof(char*));
 
 initializeData(rows, columns, geneHash, genes, data, mask, arrayweight, geneorder, genename); 
 
 Node* tree = treecluster(rows, columns, data, mask, arrayweight, transpose, metric, method, NULL);
 
 freez(&geneorder);
 freez(&geneindex);
 freez(&genename);
 freez(&arrayweight);
 freez(&data);
 freez(&mask);
 
 *nRows = rows;
 *nCols = columns;
 return tree;
 }
 
 struct slName *clusterGeneSet(struct hash *geneHash, struct slName *allGenes, 
 			      char method, char metric)
 /* Set up data structures for cluster software, perform hierarchical clustering, and 
  * return a new sorted list of gene names according to clustering */
 {
 if (geneHash == NULL || allGenes == NULL)
     return NULL;
 
 if (method == 'x')  // should never happen ('x' = don't cluster)
     return NULL;
 
 struct bed *nb = NULL;
 struct slName *sl, *genes = NULL;
 struct hashEl *el;
 
 for (sl = allGenes; sl ; sl = sl->next)
     {
     el = hashLookup(geneHash, sl->name);
     if (el != NULL)
         {
         slNameAddHead(&genes, sl->name);
         nb = el->val;
         }
     }
 slReverse(&genes);
   
 if (nb == NULL)
     return NULL;
 
 int columns = nb->expCount;
 int rows = slCount(genes);
 
 if (rows > MAX_ROWS || columns > MAX_COLUMNS)
     return NULL;
 
 /* Allocate space for array weights */
 double *arrayweight = needMem(columns*sizeof(double));
 
 /* Allocate space for data */
 double **data = needMem(rows*sizeof(double*));
 int **mask = needMem(rows*sizeof(int*));
  
 /* Allocate space for gene quantities */
 double *geneorder = needMem(rows*sizeof(double));
 int *geneindex = needMem(rows*sizeof(int));
 char **genename = needMem(rows*sizeof(char*));
 
 initializeData(rows, columns, geneHash, genes, data, mask, arrayweight, geneorder, genename); 
 
 /* Perform hierarchical clustering. */
 Node* tree = treecluster(rows, columns, data, mask, arrayweight, 0, metric, method, NULL);
 
 if (tree == NULL)
     return NULL;
 
+
 const int nNodes = rows - 1;
 double *nodeorder = needMem(nNodes*sizeof(double));
 int *nodecounts = needMem(nNodes*sizeof(int));
 getNodeOrderFromTree(tree, nNodes, geneorder, nodeorder, nodecounts, metric); 
 
 TreeSort(nNodes, rows, geneindex, geneorder, nodeorder, nodecounts, tree);
 
 int i;
 struct slName *newGeneOrder = NULL;
 for (i = 0; i < nNodes; i++)
     slNameAddHead(&newGeneOrder, genename[geneindex[i]]);
 
 slReverse(&newGeneOrder);
 
 freez(&nodecounts);
 freez(&nodeorder);
 freez(&tree);
 freez(&geneorder);
 freez(&geneindex);
 freez(&genename);
 freez(&arrayweight);
 freez(&data);
 freez(&mask);
 
-
 return newGeneOrder;
 }
 
 
-void clusterData(struct rawData *rdList, struct mapSettings *settings,
-		 char method, char metric, 
-		 struct slName **geneOrder, struct slName **sampleOrder)
+struct slName *clusterDataByGene(struct rawData *rdList, struct mapSettings *settings,
+				 char method, char metric)
 /* Set up data structures for cluster software, perform hierarchical clustering, and 
  * return a new sorted list of gene names according to clustering */
 {
 if (rdList == NULL || settings == NULL)
-    return;
+    return NULL;
 
-int rows    = hashNumEntries(settings->y_index);
-int columns = hashNumEntries(settings->x_index);
+int rows    = hashNumEntries(settings->x_index);
+int columns = hashNumEntries(settings->y_index);
 
 if (rows > MAX_ROWS || columns > MAX_COLUMNS)
-    return;
+    return NULL;
 
 /* Allocate space for array weights */
 double *arrayweight = needMem(columns*sizeof(double));
 
 /* Allocate space for data */
 double **data = needMem(rows*sizeof(double*));
 int **mask    = needMem(rows*sizeof(int*));
  
 /* Allocate space for gene quantities */
 double *geneorder = needMem(rows*sizeof(double));
 int *geneindex    = needMem(rows*sizeof(int));
 char **genename   = needMem(rows*sizeof(char*));
 
-double *sampleorder = needMem(columns*sizeof(double));
-int *sampleindex    = needMem(columns*sizeof(int));
-char **samplename   = needMem(columns*sizeof(char*));
-
-initializeRawData(rows, columns, rdList, settings, data, mask, arrayweight, geneorder, genename, 
-		  sampleorder, samplename);
+initializeRawData(rows, columns, rdList, settings, data, mask, arrayweight, geneorder, genename); 
 
-/* Perform hierarchical clustering in gene direction*/
+/* Perform hierarchical clustering. */
 Node* tree = treecluster(rows, columns, data, mask, arrayweight, 0, metric, method, NULL);
-if (!tree)
-    return;
+
+if (tree == NULL)
+    return NULL;
 
 const int nNodes  = rows - 1;
 double *nodeorder = needMem(nNodes*sizeof(double));
 int *nodecounts   = needMem(nNodes*sizeof(int));
 getNodeOrderFromTree(tree, nNodes, geneorder, nodeorder, nodecounts, metric); 
 
 TreeSort(nNodes, rows, geneindex, geneorder, nodeorder, nodecounts, tree);
 
 int i;
 struct slName *newGeneOrder = NULL;
-for (i = 0; i < rows; i++)
+for (i = 0; i < nNodes; i++)
     slNameAddHead(&newGeneOrder, genename[geneindex[i]]);
-slReverse(&newGeneOrder);
-*geneOrder = newGeneOrder;
 
-freez(&nodecounts);
-freez(&nodeorder);
-
-/* Perform hierarchical clustering in sample direction. */
-tree = treecluster(rows, columns, data, mask, arrayweight, 1, metric, method, NULL);
-if (!tree)
-    return;
-
-const int ncNodes  = columns - 1;
-nodeorder  = needMem(ncNodes*sizeof(double));
-nodecounts = needMem(ncNodes*sizeof(int));
-getNodeOrderFromTree(tree, ncNodes, geneorder, nodeorder, nodecounts, metric); 
-
-TreeSort(ncNodes, columns, sampleindex, sampleorder, nodeorder, nodecounts, tree);
-
-struct slName *newSampleOrder = NULL;
-for (i = 0; i < columns; i++)
-    slNameAddHead(&newSampleOrder, samplename[sampleindex[i]]);
-slReverse(&newSampleOrder);
-*sampleOrder = newSampleOrder;
+slReverse(&newGeneOrder);
 
 freez(&nodecounts);
 freez(&nodeorder);
 freez(&tree);
 freez(&geneorder);
 freez(&geneindex);
 freez(&genename);
 freez(&arrayweight);
 freez(&data);
 freez(&mask);
+
+return newGeneOrder;
 }
 
 void outputGenePCA(int rows, int columns, struct slName *genes, 
 		   double eigenvalues[], double** eigenvectors, int order[])
 { 
 int row, column;
 
 /* Fill the genename with gene names */
 row = 0;
 struct slName *sl;
 char **genename = needMem(rows*sizeof(char*)); 
 for (sl = genes; sl; sl = sl->next)
     {
     genename[row] = cloneString(sl->name);
     row++;
     }
    
 uglyf("\n"); 
 for (column = 0; column < min(rows, columns); column++)
     uglyf("%f,", eigenvalues[order[column]]);
 
 uglyf("\n");
 if (rows > columns)
     { 
     for (row = 0; row < rows; row++)
 	{ 
 	uglyf("%s   ", genename[row]); 
 	for (column = 0; column < columns; column++)
 	    uglyf("%f,", eigenvectors[row][order[column]]);
 	uglyf("\n");
 	}
     }
 else
     { 
     for (row = 0; row < rows; row++)
 	{ 
 	uglyf("%s   ", genename[order[row]]);
 	for (column = 0; column < rows; column++)
 	    uglyf("%f,", eigenvectors[column][order[row]]);
 	uglyf("\n");
 	}
     }
 
 freez(&genename); 
 }
 
 
 void pcaGeneSet(struct hash *geneHash, struct slName *genes, 
 		int rows, int columns, double **data, char **genename, double **u, double **v, 
 		double *w, int *svdorder)      
 /* Set up data structures for cluster software and perform PCA on gene set
  * adapted from PerformPCA routine in "data.c" in Cluster 3.0 package */
 {
 if (geneHash == NULL || genes == NULL)
     return;
 
 /* Allocate space for array weights */
 double *arrayweight = needMem(columns*sizeof(double));
 
 /* Allocate space for data */
 //double **data = needMem(rows*sizeof(double*));
 int **mask = needMem(rows*sizeof(int*));
 
 /* Allocate space for gene quantities */
 double *geneorder = needMem(rows*sizeof(double));
 
 initializeData(rows, columns, geneHash, genes, data, mask, arrayweight, geneorder, genename);    
 /* free data that is not needed for PCA (should i have a specialized initializeData 
  * routines for PCA and hierarchical clustering? */ 
 freez(&arrayweight);
 freez(&geneorder);
        
 int row, column;
 //int* svdorder= NULL;
 double* svdsortval= NULL;
 int ierr = 0;
 const int nvals = min(rows,columns);
 
 if (!u || !v || !w)
     { 
     if (u) 
 	freez(&u);
     if (v) 
 	freez(&v);
     if (w) 
 	freez(&w);
     errAbort("Memory allocation error in pcaGeneSet");
     }
 
 if (rows > columns)
     { 
     /* [u] = [row, columns]
      * [v] = [columns, columns]
      * [w] = [columns, columns] diagonal
      */
     for (row = 0; row < rows; row++)
 	{ 
 	double mag = 0.0;
 	u[row] = needMem(columns*sizeof(double));
 
 	if (!u[row])
 	    { 
 	    while(--row >= 0) 
 		freez(&u[row]);
 	    freez(&u);
 	    freez(&v);
 	    freez(&w);
 	    errAbort("Memory allocation error in pcaGeneSet");
 	    }
 
 	for (column = 0; column < columns; column++)
 	    { 
 	    if (mask[row][column])
 		{ 
 		const double val = data[row][column];
 		mag += val*val;
 		}
 	    }
 	mag = sqrt(mag);
 
 	if (mag == 0.0) 
 	    mag = 1.0;
 
 	for (column = 0; column < columns; column++)
 	    { 
 	    if (mask[row][column]) 
 		u[row][column] = data[row][column] / mag;
 	    else 
 		u[row][column] = 0.0;
 	    }
 	}
 
     for (row = 0; row < columns; row++)
 	{ 
 	v[row] = needMem(columns*sizeof(double));
 
 	if (!v[row])
 	    { 
 	    while (--row >= 0) 
 		freez(&v[row]);
 	    for (row = 0; row < rows; row++) 
 		freez(&u[row]);
 	    freez(&u);
 	    freez(&v);
 	    freez(&w);
 	    errAbort("Memory allocation error in pcaGeneSet");
 	    }
 	}
 
     svd(rows, columns, u, w, v, &ierr);
     }
 else /* rows < columns */
     { 
     /* [u] = [rows, rows]
      * [v] = [column, rows]
      * [w] = [rows, rows] diagonal
      */ 
     for (column = 0; column < columns; column++)
 	{ 
 	v[column] = needMem(rows*sizeof(double));
 	if (!v[column])
 	    { 
 	    while (--column >= 0) 
 		freez(&v[column]);
 	    freez(&u);
 	    freez(&v);
 	    freez(&w);
 	    errAbort("Memory allocation error in pcaGeneSet");
 	    }
 	}
     for (row = 0; row < rows; row++)
 	{ 
 	double mag = 0.0;
 	for (column = 0; column < columns; column++)
 	    { 
 	    if (mask[row][column])
 		{ 
 		const double val = data[row][column];
 		mag += val*val;
 		}
 	    }
 
 	mag = sqrt(mag);
 	if (mag == 0.0) 
 	    mag = 1.0;
 
 	for (column = 0; column < columns; column++)
 	    { 
 	    if (mask[row][column]) 
 		v[column][row] = data[row][column] / mag;
 	    else 
 		v[column][row] = 0.0;
 	    }
 	}
     for (row = 0; row < rows; row++)
 	{ 
 	u[row] = needMem(rows*sizeof(double));
 	if(!u[row])
 	    { 
 	    while(--row >= 0) 
 		freez(&u[row]);
 	    for (column = 0; column < columns; column++) 
 		freez(&v[column]);
 	    freez(&u);
 	    freez(&v);
 	    freez(&w);
 	    errAbort("Memory allocation error in pcaGeneSet");
 	    }
 	}
 
     svd(columns, rows, v, w, u, &ierr);
     }
 
 if(!ierr)
     svdsortval= needMem(nvals*sizeof(double));
 
 if (!svdorder || !svdsortval)
     { 
     if(svdorder) 
 	freez(&svdorder);
     if(svdsortval) 
 	freez(&svdsortval);
     
     for (row = 0; row < rows; row++) 
 	freez(&u[row]);
     for (column = 0; column < columns; column++) 
 	freez(&v[column]);
     freez(&u);
     freez(&v);
     freez(&w); 
     errAbort("Memory allocation error in pcaGeneSet");
     }
 
 for (column = 0; column < nvals; column++)
     { 
     svdorder[column] = column;
     svdsortval[column] = -w[column];
     }
 
 sort(nvals,svdsortval,svdorder);
 
 /* Free all used space */
 //for (row = 0; row < rows; row++) 
 //    freez(&u[row]);
 //for (column = 0; column < columns; column++) 
 //    freez(&v[column]);
 //freez(&u);
 //freez(&v);
 //freez(&w); 
 //freez(&svdorder);
 freez(&svdsortval);
 }
 
 static void verticalTextCentered(struct vGfx *vg, int x, int y,
 				 int width, int height, int colorIx, MgFont *font, char *string)
 /* Draw a vertical line of text in middle of the box.
  *      The string will read from bottom to top
  */
 {
 /*      don't let this run wild */
 CLIP(width, vg->width);
 CLIP(height, vg->height);
 if ((width > 0) && (height > 0))
     {
     struct vGfx *vgHoriz;
     int i, j;
     /*  reversed meanings of width and height here since this is going
      *  to rotate 90 degrees
      */
     vgHoriz = vgOpenGif(height, width, "/dev/null", FALSE);
     vgTextCentered(vgHoriz, 0, 0, height, width, colorIx, font, string);
     /*  now, blit from the horizontal to the vertical, rotate -90 (CCW) */
     for (i = 0; i < height; ++i)        /* xSrc -> yDest */
         {
         int yDest = height - i;
         for (j = 0; j < width; ++j)     /* ySrc -> xDest */
             {
             vgDot(vg,j+x,yDest+y, vgGetDot(vgHoriz, i, j));
             }
         }
     vgClose(&vgHoriz);
     }
 }     
 
 static void addLabels(struct vGfx *vg, MgFont *font, char *minXStr,
 		      char *maxXStr, char *minYStr, char *maxYStr, char *shortLabel,
 		      char *longLabel, int totalWidth, int totalHeight, 
 		      int graphWidth, int graphHeight, int startX, int startY, 
 		      int leftMargin, int fontHeight, char *vertLabel)
 {
 int x1 = 0;
 int y1 = 0;
 int x2 = 0;
 int y2 = 0;
 int shortLabelSize = 0;
 int longLabelSize = 0;
 int textWidth = 0;
 
 /*      border around the whole thing   */
 vgLine(vg, startX, startY, totalWidth-1+startX, startY, MG_MAGENTA);  /*      top     */
 vgLine(vg, startX, startY, startX, totalHeight-1+startY, MG_MAGENTA); /*      left    */
 vgLine(vg, totalWidth-1+startX, startY, 
        totalWidth-1+startX, totalHeight-1+startY, MG_MAGENTA); /* right */
 vgLine(vg, startX, totalHeight-1+startY, 
        totalWidth-1+startX, totalHeight-1+startY, MG_MAGENTA);/* bottom */
 
 /*      border just around the graph    */
 vgLine(vg, leftMargin-PLOT_MARGIN+startX, PLOT_MARGIN+startY, 
        leftMargin-PLOT_MARGIN+startX,PLOT_MARGIN+graphHeight+startY, MG_CYAN); /* left */
 
 vgLine(vg, leftMargin+startX, PLOT_MARGIN+graphHeight+1+startY,
        totalWidth-PLOT_MARGIN+startX, PLOT_MARGIN+graphHeight+1+startY, MG_CYAN);
 /*      bottom  */
 
 x1 = leftMargin+startX;
 y1 = totalHeight-PLOT_MARGIN-fontHeight+startY;
 x2 = totalWidth-PLOT_MARGIN+startX;
 y2 = totalHeight-PLOT_MARGIN+startY;
 
 if (shortLabel)
     shortLabelSize = mgFontStringWidth(font, shortLabel);
 
 if (longLabel)
     longLabelSize = mgFontStringWidth(font, longLabel);
 
 if ((longLabel != NULL) && (longLabelSize < (x2-x1)))
     vgTextCentered(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, longLabel);
 else if (shortLabel != NULL)
     vgTextCentered(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, shortLabel);
 
 x1 = leftMargin+startX;
 y1 = totalHeight-PLOT_MARGIN-fontHeight-fontHeight+2+startY;
 
 vgText(vg, x1, y1, MG_BLACK, font, minXStr);
 textWidth = mgFontStringWidth(font, maxXStr);
 x1 = totalWidth - PLOT_MARGIN - textWidth;
 vgText(vg, x1, y1, MG_BLACK, font, maxXStr);
 
 x1 = startX;
 x2 = leftMargin - PLOT_MARGIN+startX;
 y1 = PLOT_MARGIN+startY;
 y2 = PLOT_MARGIN + fontHeight+startY;
 
 if (maxYStr)
     vgTextRight(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, maxYStr);
 
 y1 = PLOT_MARGIN+graphHeight-fontHeight+startY;
 y2 = PLOT_MARGIN+graphHeight+startY;
 if (minYStr)
     vgTextRight(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, minYStr);
 
 x1 = PLOT_MARGIN+startX;
 y1 = PLOT_MARGIN + fontHeight+startY;
 x2 = leftMargin - PLOT_MARGIN - PLOT_MARGIN+startX;
 y2 = graphHeight - fontHeight+startY;
 if (vertLabel)
     verticalTextCentered(vg, x1, y1, x2-x1, y2-y1, MG_BLACK, font, vertLabel);
 }       
 
 static MgFont *fontSetup(int *height)
 {
 MgFont *font = mgSmallFont();
 if (height)
     {
     *height = mgFontLineHeight(font);
     }
 return (font);
 }
 
      
 static void ordinaryPlot(int **densityCounts, Color **dataColors, struct vGfx *vg,
 			 int totalWidth, int totalHeight, 
 			 int graphWidth, int graphHeight, 
 			 int startX, int startY, int leftMargin)
 /* a simple point plot, not density, the inversion of the Y axis takes
  * place here.  leftMargin is what has been passed in, the top margin is
  * fixed at PLOT_MARGIN 
  *      Function borrowed from hgTables/correlatePlot.c and adjusted. */
 {
 int i, j;
 
 /*      the dots are going to overlap because of DOT_SIZE, but that is
  *      OK for this situation, they are all MG_BLACK
  */
 for (j = 0; j < graphHeight; ++j)
     for (i = 0; i < graphWidth; ++i)
         if (densityCounts[j][i])
             {
             vgBox(vg, i+leftMargin+startX, PLOT_MARGIN+(graphHeight-j)+startY,
 		  DOT_SIZE+2, DOT_SIZE+2, dataColors[j][i]);
             }
 
 /*  the (totalHeight-j) is the inversion of the Y axis */
 }
 
 struct tempName *scatterPlot(struct vGfx *vg, struct dataVector *xList,
 			     struct dataVector *yList, int totalWidth, int totalHeight, 
 			     int startX, int startY)
 /*      create scatter plot gif file in trash, return path name 
  *      Function borrowed from hgTables/correlatePlot.c and adjusted. */
 {
 //static struct tempName gifFileName;
 MgFont *font = NULL;
 struct dataVector *x;
 struct dataVector *y; 
 double yMin = INFINITY;
 double xMin = INFINITY;
 double yMax = -INFINITY;
 double xMax = -INFINITY;
 double yRange = 0.0;
 double xRange = 0.0;
 int **densityCounts;    /*      densityCounts[GRAPH_HEIGHT] [GRAPH_WIDTH] */
 Color **dataColors;
 int i, j;
 int pointsPlotted = 0;
 int fontHeight = 0;
 char minXStr[32];
 char maxXStr[32];
 char minYStr[32];
 char maxYStr[32];
 char xLabel[256];
 char yLabel[256];
 int leftLabelSize = 0;
 int bottomLabelSize = 0;
 int leftMargin = 0;
 int bottomMargin = 0;
 
 font = fontSetup(&fontHeight);
 
 /*      find overall min and max        */
 for (x = xList, y = yList; (y != NULL) && (x !=NULL); y = y->next, x=x->next)
     {
     yMin = min(yMin,y->min);
     xMin = min(xMin,x->min);
     yMax = max(yMax,y->max);
     xMax = max(xMax,x->max);
     }
 
 yRange = yMax - yMin;
 xRange = xMax - xMin;
 if (xRange == 0.0)
     xRange = 1.0;                                                                               
 if (yRange == 0.0)
     yRange = 1.0;
 
 safef(minXStr,ArraySize(minXStr), "%.4g", xMin);
 safef(maxXStr,ArraySize(maxXStr), "%.4g", xMax);
 safef(minYStr,ArraySize(minYStr), "%.4g", yMin);
 safef(maxYStr,ArraySize(maxYStr), "%.4g", yMax);
 
 /*      the axes labels will be horizontal, thus the widest establishes
  *      the left margin.
  */
 leftLabelSize = max(mgFontStringWidth(font, minYStr),
 		    mgFontStringWidth(font, maxYStr));
 
 /*      the vertical text may be even wider than the numbers    */
 leftLabelSize = max(leftLabelSize,fontHeight);
 
 /*      The bottom label is one line for the axes labels, and a second
  *      line for the X axis table longLabel
  */
 bottomLabelSize = fontHeight * 2;
 
 /*      y data is the Y axis, x data is the X axis      */
 /*      Do not worry about the fact that the Y axis is 0 at the top
  *      here, that will get translated during the actual plot.
  */
 
 leftMargin = PLOT_MARGIN + leftLabelSize + PLOT_MARGIN + PLOT_MARGIN;
 bottomMargin = PLOT_MARGIN + bottomLabelSize + PLOT_MARGIN;
 
 int graphWidth = totalWidth - leftMargin - PLOT_MARGIN;
 int graphHeight = totalHeight - PLOT_MARGIN - bottomMargin;
 
 if (graphWidth <= 0 || graphHeight <= 0)
     return NULL;
 
 struct lm *lm = lmInit(graphWidth);
 /*      Initialize density plot count array     */
 /*      space for the row pointers, first       */
 lmAllocArray(lm, densityCounts, graphHeight);
 lmAllocArray(lm, dataColors, graphHeight); 
 
 /*      then space for each row */
 for (j = 0; j < graphHeight; ++j)
     {
     lmAllocArray(lm, densityCounts[j], graphWidth);
     lmAllocArray(lm, dataColors[j], graphWidth);
     }
 
 for (j = 0; j < graphHeight; ++j)
     for (i = 0; i < graphWidth; ++i)
 	{
         densityCounts[j][i] = 0;
 	dataColors[j][i] = MG_GRAY;
 	}
 for (x = xList, y = yList; (y != NULL) && (x !=NULL); y = y->next, x=x->next)
     {
     pointsPlotted++;
 
     double yValue = y->value;
     double xValue = x->value;
     int x1 = ((xValue - xMin)/xRange) * graphWidth;
     int y1 = ((yValue - yMin)/yRange) * graphHeight;
     
     CLIP(x1,graphWidth);
     CLIP(y1,graphHeight);
     
     densityCounts[y1][x1]++;
     dataColors[y1][x1] = x->color;
     }
 	
 struct tempName *gifFileName = NULL;  
 if (vg == NULL)
     {
     gifFileName = AllocA(struct tempName);
     trashDirFile(gifFileName, "hgtData", "hgtaScatter", ".gif");
 
     vg = vgOpenGif(totalWidth, totalHeight, gifFileName->forCgi, FALSE);
     }
 
 vgSetClip(vg, leftMargin+startX, PLOT_MARGIN+startY, graphWidth+startX, graphHeight+startY);
 
 ordinaryPlot(densityCounts, dataColors, vg, totalWidth, totalHeight, 
 	     graphWidth, graphHeight, startX, startY, leftMargin);
 
 safef(xLabel, ArraySize(xLabel), "PCA Component 1"); 
 safef(yLabel, ArraySize(yLabel), "PCA Component 2");
 
 /*      allow the entire area to be drawn into for the labels   */
 vgSetClip(vg, startX, startY, totalWidth+startX, totalHeight+startY);
 
 addLabels(vg, font, minXStr, maxXStr, minYStr, maxYStr,
 	  NULL, xLabel, totalWidth, totalHeight, graphWidth, graphHeight,
 	  startX, startY, leftMargin, fontHeight, yLabel);
 
 vgUnclip(vg);
 
 if (gifFileName)
     vgClose(&vg);
 
 lmCleanup(&lm);
 
 return gifFileName;
 }     
 
 
 struct hash *getPCAComponentsXY(int rows, int columns, 
 				struct dataVector **x, struct dataVector **y,
 				struct slName **list1, struct slName **list2, 
 				double **data, char **genename, double **u, double **v, double *w, 
 				int *svdorder, Color *colors)
 {
 if (rows < 2 || columns < 2)
     return NULL;
 
 int row, column;
 struct dataVector *newX, *newY;
 int c1 = 0;
 int c2 = 1;
 
 double mult = 10000.0;
 
 int* geneorder1 = needMem(rows*sizeof(double));
 int* geneorder2 = needMem(rows*sizeof(double));
 double** weight = needMem(rows*sizeof(double*)); 
 double** weightAbs = needMem(rows*sizeof(double*));
 
 int nvals = columns;
 if (rows < columns)
     nvals = rows;
 
 for (row = 0; row < rows; row++)
     {
     weight[row] = needMem(nvals*sizeof(double));
     weightAbs[row] = needMem(nvals*sizeof(double));
     }    
 
 if (rows > columns)
     {
     for (row = 0; row < rows; row++)
 	{
 	geneorder1[row] = row;
 	geneorder2[row] = row;
 	for (column = 0; column < nvals; column++)
 	    {
 	    weight[row][column] = u[row][svdorder[column]];
 	    weightAbs[row][column] = -fabs(weight[row][column]);
 	    }
 	}
 
     for (column = 0; column < columns; column++)
 	{
 	newX = AllocA(struct dataVector);
 	newY = AllocA(struct dataVector);
 	newX->value = 0;
 	newY->value = 0;
 	for (row = 0; row < rows; row++)
 	    {
 	    newX->value += data[row][column] * u[row][svdorder[c1]];
 	    newY->value += data[row][column] * u[row][svdorder[c2]];
 	    }
 
 	newX->min = newX->value;
 	newX->max = newX->value;
 	newY->min = newY->value;
 	newY->max = newY->value;
 
 	newX->count = 1;
 	newY->count = 1;
 	
 	newX->color = MG_GRAY;
 	newY->color = MG_GRAY;
 	if (colors != NULL)
 	    {
 	    newX->color = colors[column];
 	    newY->color = colors[column];
 	    }
 
 	slAddHead(x, newX);
 	slAddHead(y, newY);
 	}
     }
 else
     {
     for (row = 0; row < rows; row++)                                                            
 	{
 	geneorder1[row] = row;
 	geneorder2[row] = row;
 	for (column = 0; column < nvals; column++)
 	    {
 	    weight[row][column] = u[column][row];
 	    weightAbs[row][column] = -fabs(weight[row][column]);
 	    }
 	}
 
     for (column = 0; column < columns; column++)
 	{
         newX = AllocA(struct dataVector);
         newY = AllocA(struct dataVector); 
 	newX->value = 0;
 	newY->value = 0;
 	for (row = 0; row < rows; row++)
 	    {
 	    newX->value += data[svdorder[row]][column] * u[c1][svdorder[row]];
 	    newY->value += data[svdorder[row]][column] * u[c2][svdorder[row]];
 	    }
 
 	newX->min = newX->value;
         newX->max = newX->value;
         newY->min = newY->value;
         newY->max = newY->value;
 
         newX->count = 1;
         newY->count = 1;
 
         newX->color = MG_GRAY;
         newY->color = MG_GRAY;
         if (colors != NULL)
             {
             newX->color = colors[column];
             newY->color = colors[column];
             }
  
         slAddHead(x, newX);
         slAddHead(y, newY);
 	}
     }
 
 double* w1Abs = needMem(rows*sizeof(double));
 double* w2Abs = needMem(rows*sizeof(double));
 for (row = 0; row < rows; row++)
     {
     w1Abs[row] = weightAbs[row][c1];  // get first component, for scatter plot
     w2Abs[row] = weightAbs[row][c2];  // get second component, for scatter plot
     }
 sort(rows,w1Abs,geneorder1);
 sort(rows,w2Abs,geneorder2);
 
 struct hash *statHash = NULL;
 if (rows > 0)
     statHash = hashNew(0);
 
 for (row = 0; row < rows; row++)
     {
     int index1 = geneorder1[row];
     int index2 = geneorder2[row];
 
     if (ceil(w1Abs[index1]*mult) != 0)
 	slNameAddHead(list1, genename[index1]);
 
     if (ceil(w2Abs[index2]*mult) != 0)
 	slNameAddHead(list2, genename[index2]);
 
     struct pcaData *pd = AllocA(struct pcaData);
     pd->numComponents = nvals;
     AllocArray(pd->components, pd->numComponents);
     for (column = 0; column < nvals; column++)
 	pd->components[column] = weight[row][column];
 
     hashAdd(statHash, genename[row], pd);
     }
 slReverse(list1);
 slReverse(list2);
 
 return statHash;
 }
 
 void printComponents(char *name, struct slName *list1, struct slName *list2)
 {
 if (list1 == NULL || list2 == NULL)
     return;
 
 struct slName *sl;
 int count = 0;
 int numGenesPerLine = 20;
 webNewSection("PCA Components: %s", name);
 
 hPrintf("<TABLE border = \"0\">");
 hPrintf("<TR><TH>First:<TD>");
 for (sl = list1; sl; sl = sl->next)
     {
     hPrintf("%s,", sl->name);
     count++;
     if (count == numGenesPerLine)
 	{
 	hPrintf("...");
 	break;
 	}
     }
 count = 0;
 hPrintf("<TR><TH>Second:<TD>");
 for (sl = list2; sl; sl = sl->next)
     {
     hPrintf("%s,", sl->name);
     count++;
     if (count == numGenesPerLine)
 	{
 	hPrintf("...");
 	break;
 	}
     }
 hPrintf("</TABLE>");
 
 hPrintf("\n</TD><TD WIDTH=15></TD></TR></TABLE>\n");
 hPrintf("</TD></TR></TABLE>\n");
 hPrintf("</TD></TR></TABLE>\n");
 }
 
 struct hash *performPCAandPlot(struct vGfx *vg, int totalWidth, int totalHeight, 
 		    int startX, int startY, struct hash *geneHash, 
 		    struct slName *allGenes, struct featureColor *fcList)
 {
 if (geneHash == NULL || allGenes == NULL)
     return NULL;
 
 struct bed *nb = NULL;
 struct slName *sl, *genes = NULL;
 struct hashEl *el;
 
 for (sl = allGenes; sl ; sl = sl->next)
     {
     el = hashLookup(geneHash, sl->name);
     if (el != NULL)
 	{
 	slNameAddHead(&genes, sl->name);
 	nb = el->val;
 	}
     }
 slReverse(&genes);
 
 if (nb == NULL)
     return NULL;
 
 int row, column;
 int columns = nb->expCount; 
 int rows = slCount(genes);
 int nvals = min(rows, columns); 
 
 if (rows > MAX_ROWS || columns > MAX_COLUMNS)
     return NULL;
 
 double **data = needMem(rows*sizeof(double*));
 char **genename = needMem(rows*sizeof(char*));
 double** u = needMem(rows*sizeof(double*));
 double** v = needMem(columns*sizeof(double*));
 double* w = needMem(nvals*sizeof(double));
 int *svdorder = needMem(nvals*sizeof(int));
 
 pcaGeneSet(geneHash, genes, rows, columns, 
 	   data, genename, u, v, w, svdorder);
 
 Color colors[columns];
 for (column = 0; column < columns; column++)
     colors[column] = MG_GRAY;
 
 struct featureColor *fc;
 for (fc = fcList; fc; fc = fc->next)
     {
     if (fc->index < columns)
 	colors[fc->index] = fc->color;
     }
 
 struct dataVector *x = NULL; 
 struct dataVector *y = NULL;
 struct slName *list1 = NULL;
 struct slName *list2 = NULL;
  
 struct hash *statHash = getPCAComponentsXY(rows, columns, &x, &y, &list1, &list2, 
 					   data, genename, u, v, w, svdorder, colors);
 
 struct tempName *scatterPlotGif = NULL; 
 if (vg)
     scatterPlotGif = scatterPlot(vg, x, y, totalWidth, totalHeight, startX, startY);    
 
 //#if 0
 outputGenePCA(rows, columns, genes, w, u, svdorder);
 //#endif
   
 
 /* Free all used space */
 for (row = 0; row < rows; row++)
     freez(&u[row]);
 for (column = 0; column < columns; column++)
     freez(&v[column]);                                                                            
 freez(&u);
 freez(&v);
 freez(&w);
 
 return statHash;
 }
 
 struct hash *performPCA(struct hash *geneHash, struct slName *allGenes)
 /* use plotting function, but pass NULL vGfx ptr to avoid drawing plot, just returning statHash */
 {
 return performPCAandPlot(NULL, 0, 0, 0, 0, geneHash, allGenes, NULL);
 }