b2a26f49df9172f0900b4516dbf7710fc13f681c
ceisenhart
  Wed Jul 16 15:51:50 2014 -0700
Added a new output which is a json formatted for hierarchical d3 visualizations
diff --git src/hg/expData/expData.c src/hg/expData/expData.c
index c874ef0..c25c336 100644
--- src/hg/expData/expData.c
+++ src/hg/expData/expData.c
@@ -1,279 +1,334 @@
 /* expData -  Takes in a relational database and outputs expression information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
+#include "obscure.h"
 #include "jksql.h"
 #include "expData.h"
 #include "sqlList.h"
 #include "hacTree.h"
 #include "jsonWrite.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "expData -  Takes in a relational database and outputs expression information\n"
   "usage:\n"
   "   expData biosamples matrix output\n"
   "options:\n"
   "   -xxx=XXX\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 int target = 0;  // Used for the target value in rlinkJson.
 float longest = 0;  // Used to normalize link distances in rlinkJson.
 
 struct bioExpVector
 /* Contains expression information for a biosample on many genes. */
     {
     struct bioExpVector *next;
     char *name;	    // name of biosample.
     int count;	    // Number of genes we have data for.
     double *vector;   //  An array allocated dynamically.
     };
 
 struct bioExpVector *bioExpVectorListFromFile(char *matrixFile)
 // Read a tab-delimited file and return list of bioExpVector.
 
 {
 int vectorSize = 0;
 struct lineFile *lf = lineFileOpen(matrixFile, TRUE);
 char *line, **row = NULL;
 struct bioExpVector *list = NULL, *el;
 while (lineFileNextReal(lf, &line))
     {
     if (vectorSize == 0)
         {
 	// Detect first row.
 	vectorSize = chopByWhite(line, NULL, 0);  // counting up
 	AllocArray(row, vectorSize);
 	continue;
 	}
     AllocVar(el);
     AllocArray(el->vector, vectorSize);
     el->count = chopByWhite(line, row, vectorSize);
     assert(el->count == vectorSize);
     int i;
     for (i = 0; i < el->count; ++i)
 	el->vector[i] = sqlDouble(row[i]);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 void fillInNames(struct bioExpVector *list, char *nameFile)
 /* Fill in name field from file. */
 {
 struct lineFile *lf = lineFileOpen(nameFile, TRUE);
 char *line;
 struct bioExpVector *el = list;
 while (lineFileNextReal(lf, &line))
      {
      if (el == NULL)
 	 {
          warn("More names than items in list");
 	 break;
 	 }
      el->name = cloneString(line);
      el = el->next;
      }
 lineFileClose(&lf);
 }
 
-void rPrintNodes(FILE *f, struct hacTree *tree)
+void rPrintNodes(struct jsonWrite *jw, FILE *f, struct hacTree *tree)
 {
 // Recursively prints out the nodes in a depth first order starting on the left
 char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name;
-struct jsonWrite *jw = jsonWriteNew();
+jsonWriteObjectStart(jw);
 if (tree->childDistance != 0) 
     // If the current object is a node then we assign no name.
     {
     fprintf(f,"    %s\"%s\"%s\"%s\"%s", "{","name", ":", " ", ",");
     jsonWriteString(jw, "name", " ");
     jsonWriteNumber(jw, "y", tree->childDistance);
     fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , tree->childDistance, "},");
     }
 else {
     // Otherwise the current object is a leaf, and the tissue name is printed. 
     jsonWriteString(jw, "name", tissue);
     jsonWriteNumber(jw, "y", tree->childDistance);
     fprintf(f,"    %s\"%s\"%s\"%s\"%s", "{","name", ":", tissue, ",");
     fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , tree->childDistance, "},");
     }
+jsonWriteObjectEnd(jw);
 if (tree->left == NULL && tree->right == NULL)
     { 
     return;
     }
 else if (tree->left == NULL || tree->right == NULL)
     errAbort("\nHow did we get a node with one NULL kid??");
-rPrintNodes(f, tree->left);
-rPrintNodes(f, tree->right);
+rPrintNodes(jw, f, tree->left);
+rPrintNodes(jw, f, tree->right);
 }
 
    
+
 void rPrintLinks(FILE *f, struct hacTree *tree, int source)
 {
 // Recursively prints the links in json format. 
 if (tree->childDistance > longest)
     // the first distance will be the longest, and is used for normalization
     longest = tree->childDistance;
 /* if the current location is a leaf */
 if (tree->left == NULL && tree->right == NULL)
     { 
     return;
     }
 else if (tree->left == NULL || tree->right == NULL)
     errAbort("\nHow did we get a node with one NULL kid??");
 /* check for the end of the tree */
 
 /* Left recursion; the source and target are always ofset by 1. */
 ++target;
 fprintf(f,"    %s\"%s\"%s%d%s", "{","source", ":", target - 1, ",");
 fprintf(f,"\"%s\"%s%d%s", "target", ":" , target, ",");  
 fprintf(f,"\"%s\"%s%0.31f%s\n", "distance", ":" , 100*(tree->childDistance/longest) , "},");  
 /* Prepares the source for the right recursion. */
 source = target;
 rPrintLinks(f, tree->left, source);
 /* Right recursion. */
 ++target;
 fprintf(f,"    %s\"%s\"%s%d%s", "{","source", ":", source - 1 , ",");
 fprintf(f,"\"%s\"%s%d%s", "target", ":" , target, ",");  
 fprintf(f,"\"%s\"%s%0.31f%s\n", "distance", ":" , 100*(tree->childDistance/longest) , "},");  
 rPrintLinks(f, tree->right, ++source);
 }
 
 void printJson(FILE *f, struct hacTree *tree)
 /* Prints the hacTree into a Json file format */
 {
 int source = 0;
 // Basic json template for d3 visualizations
-//struct jsonWrite *jw = jsonWriteNew();
+struct jsonWrite *jw = jsonWriteNew();
+jsonWriteObjectStart(jw);
 fprintf(f,"%s\n", "{");
-//jsonWriteTag(struct jsonWrite *jw, char *var);
+jsonWriteListStart(jw, "nodes");
 fprintf(f,"  \"%s\"%s\n", "nodes", ":[" );
-rPrintNodes(f, tree);
+rPrintNodes(jw, f, tree);
 fprintf(f,  "%s\n", "],");
 // Basic json template for d3 visualizations
 fprintf(f,  "\"%s\"%s\n", "links", ":[" );
 rPrintLinks(f,tree, source);
 fprintf(f,"  %s\n", "]");
 
 fprintf(f,"%s\n", "}");
+jsonWriteObjectEnd(jw);
+writeGulp("jsonWrite.out", jw->dy->string, jw->dy->stringSize);
 }
 
 
 static void rPrintSlBioExpVectorTree(FILE *f, struct hacTree *tree, int level,double distance)
 /* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */
 {
 char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name;
 int i;
 for (i = 0;  i < level;  i++)
     fputc(' ', f);
 if (tree->left == NULL && tree->right == NULL)
     {
     fprintf(f, "{%s}%0.31f", tissue, distance);
     return;
     }
 else if (tree->left == NULL || tree->right == NULL)
     errAbort("\nHow did we get a node with one NULL kid??");
 fprintf(f, "(%s%f\n", "node", tree->childDistance);
 distance += tree->childDistance;
 rPrintSlBioExpVectorTree(f, tree->left, level+1, distance);
 fputs(",\n", f);
 rPrintSlBioExpVectorTree(f, tree->right, level+1, distance);
 fputc('\n', f);
 for (i=0;  i < level;  i++)
     fputc(' ', f);
 fputs(")", f);
 }
 
 void printSlBioExpVectorTree(FILE *f, struct hacTree *tree)
 /* Print out cluster as nested-parens with {}'s around leaf nodes. */
 {
 if (tree == NULL)
     {
     fputs("Empty tree.\n", f);
     return;
     }
 double distance = 0;
 rPrintSlBioExpVectorTree(f, tree, 0, distance);
 fputc('\n', f);
 }
 
+static void rPrintNestedJson(FILE *f, struct hacTree *tree, int level, double distance)
+/* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */
+{
+char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name;
+int i;
+if (tree->childDistance > longest)
+    // the first distance will be the longest, and is used for normalization
+    longest = tree->childDistance;
+for (i = 0;  i < level;  i++)
+    fputc(' ', f);
+if (tree->left == NULL && tree->right == NULL)
+    {
+    fprintf(f, "{\"%s\"%s \"%s\"%s\"%s\"%s %f}", "name", ":", tissue, ", ", "size", ":", distance);
+    return;
+    }
+else if (tree->left == NULL || tree->right == NULL)
+    errAbort("\nHow did we get a node with one NULL kid??");
+//fprintf(f, "{\"%s\"%s \"%f\"%s\n", "name", ":", " ", ",");
+fprintf(f, "{\"%s\"%s \"%f\"%s\n", "name", ":", 100*(tree->childDistance/longest), ",");
+for (i = 0;  i < level + 1;  i++)
+    fputc(' ', f);
+fprintf(f, "\"%s\"%s\n", "children", ": [");
+distance += tree->childDistance;
+rPrintNestedJson(f, tree->left, level+1, distance);
+fputs(",\n", f);
+rPrintNestedJson(f, tree->right, level+1, distance);
+fputc('\n', f);
+for (i=0;  i < level + 1;  i++)
+    fputc(' ', f);
+fputs("]\n", f);
+for (i=0;  i < level;  i++)
+    fputc(' ', f);
+fputs("}", f);
+}
+
+void printNestedJson(FILE *f, struct hacTree *tree)
+/* Print out cluster as nested-parens with {}'s around leaf nodes. */
+{
+if (tree == NULL)
+    {
+    fputs("Empty tree.\n", f);
+    return;
+    }
+double distance = 0;
+rPrintNestedJson(f, tree, 0, distance);
+fputc('\n', f);
+}
+
 char* floatToString(float input)
 {
 char* result = needMem(sizeof(result));
 sprintf(result,"%f", input);
 return result;
 freez(result);
 }
 
 double slBioExpVectorDistance(const struct slList *item1, const struct slList *item2, void *extraData)
 /* Return the absolute difference between the two kids' values. 
  * Designed for HAC tree use*/
 {
 const struct bioExpVector *kid1 = (const struct bioExpVector *)item1;
 const struct bioExpVector *kid2 = (const struct bioExpVector *)item2;
 int j;
 double diff = 0, sum = 0;
 for (j = 0; j < kid1->count; ++j)
     {
     diff = kid1->vector[j] - kid2->vector[j];
     sum += (diff * diff);
     }
 return sqrt(sum);
 }
 
 
 struct slList *slBioExpVectorMerge(const struct slList *item1, const struct slList *item2,
 				void *unusedExtraData)
 /* Make a new slPair where the name is the children names concattenated and the 
  * value is the average of kids' values.
  * Designed for HAC tree use*/
 {
 const struct bioExpVector *kid1 = (const struct bioExpVector *)item1;
 const struct bioExpVector *kid2 = (const struct bioExpVector *)item2;
 struct bioExpVector *el;
 AllocVar(el);
 AllocArray(el->vector, kid1->count);
 el->count = kid1->count; 
 el->name = catTwoStrings(kid1->name, kid2->name);
 int i;
 for (i = 0; i < el->count; ++i)
     {
     el->vector[i] = (kid1->vector[i] + kid2->vector[i])/2;
     }
 return (struct slList *)(el);
 }
 
 void expData(char *matrixFile, char *nameFile, char *outFile)
 /* Read matrix and names into a list of bioExpVectors, run hacTree to
  * associate them, and write output. */
 {
 struct bioExpVector *list = bioExpVectorListFromFile(matrixFile);
 FILE *f = mustOpen(outFile,"w");
 struct lm *localMem = lmInit(0);
 fillInNames(list, nameFile);
 struct hacTree *clusters = hacTreeFromItems((struct slList *)list, localMem,
 					    slBioExpVectorDistance, slBioExpVectorMerge, NULL, NULL);
-printJson(f,clusters);
+//printJson(f,clusters);
 //printSlBioExpVectorTree(f,clusters);
+printNestedJson(f,clusters);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 expData(argv[1], argv[2], argv[3]);
 return 0;
 }