dc6522a598cca732573eb62d9202d2d2f5917821
Merge parents 0ae26a1 751c92f
ceisenhart
  Mon Sep 15 12:42:17 2014 -0700
Working through a merge conflict
diff --cc src/utils/bigWigCluster/bigWigCluster.c
index 7cad967,0fea77b..0572a9c
--- src/utils/bigWigCluster/bigWigCluster.c
+++ src/utils/bigWigCluster/bigWigCluster.c
@@@ -1,286 -1,406 +1,416 @@@
  /* bigWigCluster - Cluster bigWigs using a hactree. */
  #include "sqlNum.h"
  #include "common.h"
  #include "linefile.h"
  #include "hash.h"
  #include "options.h"
  #include "hacTree.h"
  #include "rainbow.h"
+ #include "portable.h"
+ 
+ int gThreadCount = 10;
+ char *gTmpDir = ".";
  
  void usage()
  /* Explain usage and exit. */
  {
  errAbort(
-   "bigWigCluster - Cluster bigWigs using a hactree\n"
+   "bigWigCluster - Cluster bigWigs using a hacTree\n"
    "usage:\n"
-   "   bigWigCluster input.list chrom.sizes output.json\n"
+   "   bigWigCluster input.list chrom.sizes output.json output.tab\n"
+   "where: input.list is a list of bigWig file names\n"
+   "       chrom.sizes is tab separated <chrom><size> for assembly for bigWigs\n"
+   "       output.json is json formatted output suitable for graphing with D3\n"
+   "       output.tab is tab-separated file of  of items ordered by tree with the fields\n"
+   "           label - label from -labels option or from file name with no dir or extention\n"
+   "           pos - number from 0-1 representing position according to tree and distance\n"
+   "           red - number from 0-255 representing recommended red component of color\n"
+   "           green - number from 0-255 representing recommended green component of color\n"
+   "           blue - number from 0-255 representing recommended blue component of color\n"
+   "           path - file name from input.list including directory and extension\n" 
    "options:\n"
-   "   -threads = int Sets the thread count for the multiThreads option, default is 10 \n"
-   "   -hacTree = Dictates how the tree is generated;  multiThreads or costlyMerges or fromItems. fromItems is default \n"
+   "   -labels=fileName - label files from tabSeparated file with fields\n"
+   "           path - path to bigWig file\n"
+   "           label - a string with no tabs\n"
+   "   -precalc=precalc.tab - tab separated file with <file1> <file2> <distance>\n"
+   "            columns.\n"
+   "   -threads=N - number of threads to use, default %d\n"
+   "   -tmpDir=/tmp/path - place to put temp files, default current dir\n"
+   , gThreadCount
    );
  }
  
+ char tmpPrefix[PATH_LEN] = "bwc_tmp_";  /* Prefix for temp file names */
+ 
  /* Command line validation table. */
  static struct optionSpec options[] = {
+    {"precalc", OPTION_STRING},
+    {"threads", OPTION_INT},
+    {"labels", OPTION_STRING},
+    {"tmpDir", OPTION_STRING},
     {NULL, 0},
 +   {"threads", OPTION_INT},
 +   {"hacTree", OPTION_STRING},
  };
  
 +
 +double longest = 0;
 +char* clHacTree = "fromItems";
 +int clThreads = 10; // The number of threads to run with the multiThreads option
 +int nameCount = 0;
 +
+ int bigWigNextId;
  
  struct bigWig
- {
- struct bigWig *next;  //next item in series
- char *name;	//name of the bigWig filei
- struct rgbColor color; //for coloring
- };
+ /* Information on a bigWig */
+     {
+     struct bigWig *next;  //next item in series
+     int id;	// Unique numerical identifier
+     char *fileName; // Associated file name
+     char *label;    // Associated labels if any
+     struct rgbColor color; //for coloring
+     double pos;	    // Position between 0.0 and 1.0 when ordered by tree and distance
+     };
  
- struct bigWig *getBigWigs(char* input)
+ struct bigWig *readBigWigList(char* input)
  // get the bigWig files 
  {
- struct bigWig **list;
- AllocVar(list);
- char* line = NULL;
- int i = 0;
+ struct bigWig *list = NULL;
  struct lineFile *lf = lineFileOpen(input,TRUE);
- while(lineFileNext(lf, &line, NULL))
+ char *line = NULL;
+ while(lineFileNextReal(lf, &line))
      {
-     ++i;
+     /* Allocate new item and initialize id and fileName fields */
      struct bigWig *bw;
      AllocVar(bw);
-     bw->name = cloneString(line);
+     bw->id = ++bigWigNextId;
+     bw->fileName = cloneString(trimSpaces(line));
+ 
+     // Make up default label - same as file name without directory and extension
+     char root[FILENAME_LEN];
+     splitPath(line, NULL, root, NULL);
+     bw->label = cloneString(root);
+ 
+     /* Add to list */
      slAddHead(&list,bw);
      }
+ 
+ /* Clean up and go home */
+ lineFileClose(&lf);
  slReverse(&list);
- return *list;
+ return list;
  }
  
  
- 
  static void rAddLeaf(struct hacTree *tree, struct slRef **pList)
- /* Recursively add leaf to list */
+ /* Recursively add itemOrCluster from leaf nodes of hacTree to list */
  {
  if (tree->left == NULL && tree->right == NULL)
      refAdd(pList, tree->itemOrCluster);
  else
      {
      rAddLeaf(tree->left, pList);
      rAddLeaf(tree->right, pList);
      }
  }
  
  struct slRef *getOrderedLeafList(struct hacTree *tree)
- /* Return list of references to bioExpVectors from leaf nodes
+ /* Return list of references to bigWigs from leaf nodes
   * ordered by position in hacTree */
  {
  struct slRef *leafList = NULL;
  rAddLeaf(tree, &leafList);
  slReverse(&leafList);
  return leafList;
  }
  
- 
  static void rPrintHierarchicalJson(FILE *f, struct hacTree *tree, int level, double distance,
  		int normConstant, int cgConstant)
  /* Recursively prints out the elements of the hierarchical .json file. */
  {
- struct bigWig *bio = (struct bigWig *)tree->itemOrCluster;
- char *tissue = bio->name;
- struct rgbColor colors = bio->color;
+ struct bigWig *bw = (struct bigWig *)tree->itemOrCluster;
+ char *label = bw->label;
+ struct rgbColor colors = bw->color;
  int i;
  for (i = 0;  i < level;  i++)
      fputc(' ', f); // correct spacing for .json format
  if (tree->left == NULL && tree->right == NULL)
      {
      // Prints out the leaf objects
-     fprintf(f, "{\"%s\"%s \"%s\"%s\"%s\"%s %f %s\"%s\"%s \"rgb(%i,%i,%i)\"}", "name", ":", tissue, ", ",
+     fprintf(f, "{\"%s\"%s \"%s\"%s\"%s\"%s %f %s\"%s\"%s \"rgb(%i,%i,%i)\"}", "name", ":", label, ", ",
      		"similarity", ":", distance, "," , "colorGroup", ":", colors.r, colors.g, colors.b);
      return;
      }
  else if (tree->left == NULL || tree->right == NULL)
      errAbort("\nHow did we get a node with one NULL kid??");
  
  // Prints out the node object and opens a new children block
  fprintf(f, "{\"%s\"%s \"%s\"%s", "name", ":", " ", ",");
  fprintf(f, "\"colorGroup\": \"rgb(%i,%i,%i)\",", colors.r, colors.g, colors.b );
- fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", 20 *(tree->childDistance/longest), ",");
+ fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", 100*tree->childDistance, ",");
  for (i = 0;  i < level + 1;  i++)
      fputc(' ', f);
  fprintf(f, "\"%s\"%s\n", "children", ": [");
- distance = 20*(tree->childDistance/longest); 
+ distance = tree->childDistance; 
  rPrintHierarchicalJson(f, tree->left, level+1, distance, normConstant, cgConstant);
  fputs(",\n", f);
  rPrintHierarchicalJson(f, tree->right, level+1, distance, normConstant, cgConstant);
  fputc('\n', f);
  // Closes the children block for node objects
  for (i=0;  i < level + 1;  i++)
      fputc(' ', f);
  fputs("]\n", f);
  for (i = 0;  i < level;  i++)
      fputc(' ', f);
  fputs("}", f);
  }
  
- void printHierarchicalJson(FILE *f, struct hacTree *tree, int normConstant, int cgConstant)
+ void printHierarchicalJson(char *fileName, struct hacTree *tree, int normConstant, int cgConstant)
  /* Prints out the binary tree into .json format intended for d3
   * hierarchical layouts */
  {
+ FILE *f = mustOpen(fileName, "w");
  if (tree == NULL)
      {
      fputs("Empty tree.\n", f);
      return;
      }
- double distance = 0;
- rPrintHierarchicalJson(f, tree, 0, distance, normConstant, cgConstant);
+ rPrintHierarchicalJson(f, tree, 0, 0, normConstant, cgConstant);
  fputc('\n', f);
+ carefulClose(&f);
  }
  
- 
- 
+ void printOrderedTab(char *fileName, struct slRef *refList)
+ /* Print list of references to bigWigs */
+ {
+ FILE *f = mustOpen(fileName, "w");
+ struct slRef *ref;
+ for (ref = refList; ref != NULL; ref = ref->next)
+     {
+     struct bigWig *bw = ref->val;
+     fprintf(f, "%s\t%g\t%d\t%d\t%d\t%s\n", bw->label, bw->pos,
+ 	bw->color.r, bw->color.g, bw->color.b, bw->fileName);
+     }
+ carefulClose(&f);
+ }
  
  double slBigWigDistance(const struct slList *item1, const struct slList *item2, void *extraData)
  /* Return the absolute difference between the two kids' values. 
   * Designed for HAC tree use*/
  {
- static int count = 0;
- ++count;
  verbose(1,"Calculating Distance...\n");
  const struct bigWig *kid1 = (const struct bigWig *)item1;
  const struct bigWig *kid2 = (const struct bigWig *)item2;
- char cmd[1024];
  char tmpName[PATH_LEN];
- safef(tmpName, sizeof(tmpName), "tmp_%p_%p", kid1, kid2);
- safef(cmd, 1024, "bigWigCorrelate %s %s > %s", kid1->name, kid2->name, tmpName);
+ safef(tmpName, sizeof(tmpName), "%s%d_%d.txt", tmpPrefix, kid1->id, kid2->id);
+ char cmd[1024];
+ safef(cmd, 1024, "bigWigCorrelate %s %s > %s", kid1->fileName, kid2->fileName, tmpName);
  double diff = 0;
  mustSystem(cmd);
  struct lineFile *lf = lineFileOpen(tmpName,TRUE);
- char* line = NULL;
+ char *line = NULL;
  if (!lineFileNext(lf, &line, NULL))
      errAbort("no difference output, check bigWigCorrelate");
- diff = 100 * sqlDouble(cloneString(line));
+ lineFileClose(&lf);
+ diff = sqlDouble(line);
  remove(tmpName);
- return  100 - diff;
+ return  1 - diff;
  }
  
+ 
  struct slList *slBigWigMerge(const struct slList *item1, const struct slList *item2,
- 				void *unusedExtraData)
+ 				void *extraData)
  /* Make a new slPair where the name is the children names concattenated and the 
   * value is the average of kids' values.
   * Designed for HAC tree use*/
  {
+ char *chromSizesFile = extraData;
  verbose(1,"Merging...\n");
- ++ nameCount;
  struct bigWig *result;
  AllocVar(result);
  const struct bigWig *kid1 = (const struct bigWig *)item1;
  const struct bigWig *kid2 = (const struct bigWig *)item2;
+ char tmpName[PATH_LEN];
+ safef(tmpName, sizeof(tmpName), "%s%d_%d.bedGraph", tmpPrefix, kid1->id, kid2->id);
  char cmd1[1024];
  char cmd2[1024];
- safef(cmd1, 1024, "bigWigMerge %s %s output -verbose=0", kid1->name, kid2->name);
- char name[1024];
- safef(name,1024, "%i", nameCount);
- safef(cmd2, 1024, "bedGraphToBigWig output chrom.sizes %s", name);
+ safef(cmd1, 1024, "bigWigMerge %s %s %s -verbose=0", kid1->fileName, kid2->fileName, tmpName);
+ char fileName[PATH_LEN];
+ safef(fileName, sizeof(fileName), "%s%d_%d.bw", tmpPrefix, kid1->id, kid2->id);
+ safef(cmd2, 1024, "bedGraphToBigWig %s %s %s", tmpName, chromSizesFile, fileName);
  mustSystem(cmd1);
  mustSystem(cmd2);
- result->name = cloneString(name); 
+ remove(tmpName);
+ result->id = ++bigWigNextId;
+ result->fileName = cloneString(fileName);
+ result->label = " ";
  return (struct slList *)result;
  }
  
+ double findCachedDistance(struct hash *distanceHash, struct bigWig *bw1, struct bigWig *bw2)
+ /* Return cached distance - from hash if possible */
+ {
+ double distance;
+ double *cached = hacTreeDistanceHashLookup(distanceHash, bw1, bw2);
+ if (cached != NULL)
+    distance = *cached;
+ else
+    {
+    distance = slBigWigDistance((struct slList *)bw1, (struct slList *)bw2, NULL);
+    hacTreeDistanceHashAdd(distanceHash, bw1, bw2, distance);
+    }
+ return distance;
+ }
+ 
  
- void colorLeaves(struct slRef *leafList)
+ void colorLeaves(struct slRef *leafList, struct hash *distanceHash)
  /* Assign colors of rainbow to leaves. */
  {
  /* Loop through list once to figure out total, since we need to
   * normalize */
  double total = 0;
  double purplePos = 0.80;
  struct slRef *el, *nextEl;
  for (el = leafList; el != NULL; el = nextEl)
     {
     nextEl = el->next;
     if (nextEl == NULL)
         break;
-    struct bigWig *bio1 = el->val;
-    struct bigWig *bio2 = nextEl->val;
-    double distance = slBigWigDistance((struct slList *)bio1, (struct slList *)bio2, NULL);
-    if (distance > longest)
-        longest = distance;
+    double distance = findCachedDistance(distanceHash, el->val, nextEl->val);
     total += distance;
     }
  
  /* Loop through list a second time to generate actual colors. */
  double soFar = 0;
  for (el = leafList; el != NULL; el = nextEl)
     {
     nextEl = el->next;
     if (nextEl == NULL)
         break;
-    struct bigWig *bio1 = el->val;
-    struct bigWig *bio2 = nextEl->val;
-    double distance = slBigWigDistance((struct slList *)bio1, (struct slList *)bio2, NULL);
+    struct bigWig *bw = nextEl->val;
+    double distance = findCachedDistance(distanceHash, el->val, bw);
     soFar += distance;
     double normalized = soFar/total;
-    bio2->color = saturatedRainbowAtPos(normalized * purplePos);
+    bw->pos = normalized;
+    bw->color = saturatedRainbowAtPos(normalized * purplePos);
     }
  
  /* Set first color to correspond to 0, since not set in above loop */
- struct bigWig *bio = leafList->val;
- bio->color = saturatedRainbowAtPos(0);
+ struct bigWig *bw = leafList->val;
+ bw->color = saturatedRainbowAtPos(0);
  }
  
- void bigWigCluster(char *inputList, char* chromSizes, char* output)
- /* bigWigCluster - Cluster bigWigs using a hactree. */
+ void preloadDistances(char *distanceFile, struct bigWig *itemList, struct hash *distanceHash)
+ /* Load tab-separated distanceFile into a hash, making sure all items
+  * in distanceFile correspond to file in list */
  {
- struct bigWig *list = getBigWigs(inputList);
- FILE *f = mustOpen(output,"w");
- struct lm *localMem = lmInit(0);
- struct hacTree *clusters = NULL;
- if (sameString(clHacTree, "multiThreads"))
+ /* Create hash of items in list keyed by file name */
+ struct hash *itemHash = hashNew(0);
+ struct bigWig *item;
+ for (item = itemList; item != NULL; item = item->next)
+     hashAdd(itemHash, item->fileName, item);
+ 
+ /* Loop through distance file's three columns building up distance hash. */
+ struct lineFile *lf = lineFileOpen(distanceFile, TRUE);
+ char *row[3];
+ while (lineFileRow(lf, row))
      {
-     clusters = hacTreeMultiThread(clThreads, (struct slList *)list, localMem,
-   					    slBigWigDistance, slBigWigMerge, NULL, NULL);
+     char *a = row[0], *b = row[1];
+     double correlation = sqlDouble(row[2]);
+     double distance = 1.0 - correlation;
+     struct bigWig *aItem = hashMustFindVal(itemHash, a);
+     struct bigWig *bItem = hashMustFindVal(itemHash, b);
+     hacTreeDistanceHashAdd(distanceHash, aItem, bItem, distance);
+     hacTreeDistanceHashAdd(distanceHash, bItem, aItem, distance);
      }
- else if (sameString(clHacTree, "costlyMerges"))
+ 
+ /* Report, clean up and go home. */
+ verbose(1, "Preloaded %d distances on %d items from %s\n", distanceHash->elCount, itemHash->elCount,
+     distanceFile);
+ lineFileClose(&lf);
+ hashFree(&itemHash);
+ }
+ 
+ void addLabels(char *labelFile, struct bigWig *list)
+ /* Update label field of bigWig list according to labels in tab-separated file of format
+  *    <fileName> <labels> */
+ {
+ /* Build up hash of labels keyed by file name */
+ struct hash *hash = hashNew(0);
+ struct lineFile *lf = lineFileOpen(labelFile, TRUE);
+ char *row[2];
+ while (lineFileRowTab(lf, row))
+      hashAdd(hash, row[0], cloneString(row[1]));
+ lineFileClose(&lf);
+ 
+ /* Loop through list looking up labels in hash */
+ struct bigWig *bw;
+ for (bw = list; bw != NULL; bw = bw->next)
+      {
+      bw->label = hashFindVal(hash, bw->fileName);
+      if (bw->label == NULL)
+          errAbort("No label for %s\n", bw->fileName);
+      }
+ 
+ /* Clean up and go home */
+ freeHash(&hash);
+ }
+ 
+ void bigWigCluster(char *inputList, char *chromSizes, char *outputJson, char *outputTab)
+ /* bigWigCluster - Cluster bigWigs using a hactree. */
+ {
+ /* Read in input hash */
+ struct bigWig *list = readBigWigList(inputList);
+ 
+ /* Set up distance cache, preloading it if possible from file */
+ struct hash *distanceHash = hashNew(0);
+ char *precalcFile = optionVal("precalc", NULL);
+ if (precalcFile)
+     preloadDistances(precalcFile, list, distanceHash);
+ 
+ /* Supply better labels if they exist */
+ char *labelFile = optionVal("labels", NULL);
+ if (labelFile)
+     addLabels(labelFile, list);
+ 
+ struct hacTree *clusters;
+ struct lm *localMem = lmInit(0);
+ if (gThreadCount > 1)
      {
-     clusters = hacTreeForCostlyMerges((struct slList *)list, localMem,
- 						slBigWigDistance, slBigWigMerge, NULL);
+     clusters = hacTreeMultiThread(gThreadCount, (struct slList *)list, localMem,
+ 					    slBigWigDistance, slBigWigMerge, chromSizes, distanceHash);
      }
- else if (sameString(clHacTree, "fromItems"))
+ else
      {
+     /* Use older code for single threaded case, just to be able to compare results to
+      * parallelized version */
      clusters = hacTreeFromItems((struct slList *)list, localMem,
- 						slBigWigDistance, slBigWigMerge, NULL, NULL);
-     }
- else 
-     {
-     uglyAbort("Unrecognized input option: %s", clHacTree);
+ 					    slBigWigDistance, slBigWigMerge, NULL, chromSizes);
      }
+ 
+ /* Convert tree to ordered list, do coloring, and make outputs */
  struct slRef *orderedList = getOrderedLeafList(clusters);
- colorLeaves(orderedList);
- printHierarchicalJson(f, clusters, 20, 20);
- // some cleanup
- int i;
- for (i = 0 ; i <= nameCount; ++i)
-    {
-    char name[1024];
-    safef(name,1024, "%i", i);
-    remove(name);
-    }
+ colorLeaves(orderedList, distanceHash);
+ printHierarchicalJson(outputJson, clusters, 20, 20);
+ printOrderedTab(outputTab, orderedList);
+ 
+ // Clean up remaining temp files 
+ char cmd[1024];
+ safef(cmd, sizeof(cmd), "rm %s*", tmpPrefix);
+ mustSystem(cmd);
  }
  
  int main(int argc, char *argv[])
  /* Process command line. */
  {
 +clThreads = optionInt("threads", clThreads);
 +clHacTree = optionVal("hacTree", clHacTree);
  optionInit(&argc, argv, options);
- if (argc != 4)
+ if (argc != 5)
      usage();
- bigWigCluster(argv[1], argv[2], argv[3]);
+ gThreadCount = optionInt("threads", gThreadCount);
+ gTmpDir = optionVal("tmpDir", gTmpDir);
+ safef(tmpPrefix, sizeof(tmpPrefix), "%s/bwc_tmp_", gTmpDir);
+ bigWigCluster(argv[1], argv[2], argv[3], argv[4]);
  return 0;
  }