1e7f2a7a806fccac0d5633d8191b8475821b0480 ceisenhart Sat Aug 23 11:55:44 2014 -0700 ExpData.c takes in a matrix of data and a corresponding matrix of names.The output is a .json file which can be used for visualizations. forceLayout.html is a d3 visualization that is generated with a .json file. radialDend.html is a d3 visualization that is generated with a .json file. hacTree.c, refactored the code slightly to remove uneccesary merge calls. bigWigCluster.c runs on a list of bigWig files, uses the hacTree library to cluster the bigWigs into a binary tree. The output is a .json file which can be used for visualizations diff --git src/hg/expData/expData.c src/hg/expData/expData.c index c25c336..91d54dd 100644 --- src/hg/expData/expData.c +++ src/hg/expData/expData.c @@ -1,334 +1,463 @@ -/* expData - Takes in a relational database and outputs expression information. */ +/* expData - Takes in GNF expression data, organizes it using a hierarchical agglomerative clustering algorithm. The output defaults to a hierarchichal .json format, with two additional options. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "obscure.h" #include "jksql.h" #include "expData.h" #include "sqlList.h" #include "hacTree.h" -#include "jsonWrite.h" +#include "rainbow.h" + +/* Visually the nodes will be assigned a color corresponding to a range of sizes. + * This constant dictates how many colors will be displayed (1-20). */ +int clCgConstant = 20; +/* A normalizing constant for the distances, this corresponds to + * the size of the nodes in the standard output and the link distance in the forceLayout output*/ +int clNormConstant = 20; +boolean clForceLayout = FALSE; // Prints the data in .json format for d3 force layout visualizations +int target = 0; // Used for the target value in rlinkJson. +float longest = 0; // Used to normalize link distances in rlinkJson. void usage() /* Explain usage and exit. */ { errAbort( - "expData - Takes in a relational database and outputs expression information\n" + "expData - Takes in a relational database and outputs expression information.\n" + "Standard output is .json format intended for d3 hierarchical displays. \n" "usage:\n" - " expData biosamples matrix output\n" + " expData matrix biosamples output\n" "options:\n" - " -xxx=XXX\n" + " -forceLayout = bool Prints the output in .json format for d3 forceLayouts\n" + " -normConstant = int Normalizing constant for d3, default is 20. For forceLayout 100 is reccomended \n" + " -cgConstant = int Defines the number of possible colors for nodes, 1 - 20 \n" ); } /* Command line validation table. */ static struct optionSpec options[] = { + {"normConstant", OPTION_INT}, + {"cgConstant", OPTION_INT}, + {"structuredOutput", OPTION_BOOLEAN}, + {"forceLayout", OPTION_BOOLEAN}, {NULL, 0}, }; -int target = 0; // Used for the target value in rlinkJson. -float longest = 0; // Used to normalize link distances in rlinkJson. +struct jsonNodeLine +/* Stores the information for a single json node line */ + { + struct jsonNodeLine *next; + char* name; // the source for a given link + double distance; // the distance for a given link + }; + +struct jsonLinkLine +/* Stores the information for a single json link line */ + { + struct jsonLinkLine *next; + int source; // the source for a given link + int target; // the target for a given link + double distance; // the distance for a given link + }; struct bioExpVector /* Contains expression information for a biosample on many genes. */ { struct bioExpVector *next; char *name; // name of biosample. int count; // Number of genes we have data for. double *vector; // An array allocated dynamically. + struct rgbColor color; // Color for this one }; struct bioExpVector *bioExpVectorListFromFile(char *matrixFile) // Read a tab-delimited file and return list of bioExpVector. - { int vectorSize = 0; struct lineFile *lf = lineFileOpen(matrixFile, TRUE); char *line, **row = NULL; struct bioExpVector *list = NULL, *el; while (lineFileNextReal(lf, &line)) { if (vectorSize == 0) { // Detect first row. vectorSize = chopByWhite(line, NULL, 0); // counting up AllocArray(row, vectorSize); continue; } AllocVar(el); AllocArray(el->vector, vectorSize); el->count = chopByWhite(line, row, vectorSize); - assert(el->count == vectorSize); +// assert(el->count == vectorSize); int i; for (i = 0; i < el->count; ++i) el->vector[i] = sqlDouble(row[i]); slAddHead(&list, el); } + lineFileClose(&lf); slReverse(&list); return list; } void fillInNames(struct bioExpVector *list, char *nameFile) /* Fill in name field from file. */ { struct lineFile *lf = lineFileOpen(nameFile, TRUE); char *line; struct bioExpVector *el = list; while (lineFileNextReal(lf, &line)) { if (el == NULL) { warn("More names than items in list"); break; } el->name = cloneString(line); el = el->next; } lineFileClose(&lf); } -void rPrintNodes(struct jsonWrite *jw, FILE *f, struct hacTree *tree) +void printJsonNodeLine(FILE *f, struct jsonNodeLine *node) +{ +fprintf(f," %s\"%s\"%s\"%s\"%s", "{","name", ":", node->name, ","); +fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , node->distance, "},"); +} + +void printEndJsonNodeLine(FILE *f, struct jsonNodeLine *node) +{ +fprintf(f," %s\"%s\"%s\"%s\"%s", "{","name", ":", node->name, ","); +fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , node->distance, "}"); +} + +void printJsonLinkLine(FILE *f, struct jsonLinkLine *links) +/* Prints out a single json link line */ +{ +fprintf(f," %s\"%s\"%s%d%s", "{","source", ":", links->source, ","); +fprintf(f,"\"%s\"%s%d%s", "target", ":" , links->target, ","); +fprintf(f,"\"%s\"%s%0.31f%s\n", "distance", ":" , links->distance , "},"); +} + +void printEndJsonLinkLine(FILE *f, struct jsonLinkLine *links) +/* Prints out a single json link line */ +{ +fprintf(f," %s\"%s\"%s%d%s", "{","source", ":", links->source, ","); +fprintf(f,"\"%s\"%s%d%s", "target", ":" , links->target, ","); +fprintf(f,"\"%s\"%s%0.31f%s\n", "distance", ":" , links->distance , "}"); +} + +void rPrintNodes(FILE *f, struct hacTree *tree, struct jsonNodeLine *nodes) +// Recursively adds the node information to a linked list { -// Recursively prints out the nodes in a depth first order starting on the left char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name; -jsonWriteObjectStart(jw); if (tree->childDistance != 0) // If the current object is a node then we assign no name. { - fprintf(f," %s\"%s\"%s\"%s\"%s", "{","name", ":", " ", ","); - jsonWriteString(jw, "name", " "); - jsonWriteNumber(jw, "y", tree->childDistance); - fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , tree->childDistance, "},"); + struct jsonNodeLine *nNode; + AllocVar(nNode); + nNode->name = " "; + nNode->distance = tree->childDistance; + slAddHead(nodes, nNode); // add the node } else { // Otherwise the current object is a leaf, and the tissue name is printed. - jsonWriteString(jw, "name", tissue); - jsonWriteNumber(jw, "y", tree->childDistance); - fprintf(f," %s\"%s\"%s\"%s\"%s", "{","name", ":", tissue, ","); - fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , tree->childDistance, "},"); + struct jsonNodeLine *lNode; + AllocVar(lNode); + lNode->name = tissue; + lNode->distance = tree->childDistance; + slAddHead(nodes, lNode); // add the node } -jsonWriteObjectEnd(jw); + if (tree->left == NULL && tree->right == NULL) + // Stop at the last element { return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); -rPrintNodes(jw, f, tree->left); -rPrintNodes(jw, f, tree->right); +rPrintNodes(f, tree->left, nodes); +rPrintNodes(f, tree->right, nodes); } - -void rPrintLinks(FILE *f, struct hacTree *tree, int source) +void rPrintLinks(int normConstant, FILE *f, struct hacTree *tree, int source, struct jsonLinkLine *links) +// Recursively loads the link information into a linked list { -// Recursively prints the links in json format. if (tree->childDistance > longest) // the first distance will be the longest, and is used for normalization longest = tree->childDistance; -/* if the current location is a leaf */ if (tree->left == NULL && tree->right == NULL) + // Stop at the last element { return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); -/* check for the end of the tree */ -/* Left recursion; the source and target are always ofset by 1. */ +// Left recursion. +struct jsonLinkLine *lLink; +AllocVar(lLink); ++target; -fprintf(f," %s\"%s\"%s%d%s", "{","source", ":", target - 1, ","); -fprintf(f,"\"%s\"%s%d%s", "target", ":" , target, ","); -fprintf(f,"\"%s\"%s%0.31f%s\n", "distance", ":" , 100*(tree->childDistance/longest) , "},"); -/* Prepares the source for the right recursion. */ -source = target; -rPrintLinks(f, tree->left, source); -/* Right recursion. */ +lLink->source = target - 1; // The source and target are always ofset by 1. +lLink->target = target; +lLink->distance = normConstant*(tree->childDistance/longest); //Calculates the link distance +source = target; // Prepares the source for the right recursion. +slAddHead(links, lLink); // Add the link +rPrintLinks(normConstant,f, tree->left, source, links); + +// Right recursion. +struct jsonLinkLine *rLink; +AllocVar(rLink); ++target; -fprintf(f," %s\"%s\"%s%d%s", "{","source", ":", source - 1 , ","); -fprintf(f,"\"%s\"%s%d%s", "target", ":" , target, ","); -fprintf(f,"\"%s\"%s%0.31f%s\n", "distance", ":" , 100*(tree->childDistance/longest) , "},"); -rPrintLinks(f, tree->right, ++source); +rLink->source = source - 1; // The source is dependent on the last target of the left recursion +rLink->target = target; +rLink->distance = normConstant*(tree->childDistance/longest); +slAddHead(links, rLink); // Add the link +rPrintLinks(normConstant,f, tree->right, ++source, links); } -void printJson(FILE *f, struct hacTree *tree) -/* Prints the hacTree into a Json file format */ +void printForceLayoutJson(int normConstant, FILE *f, struct hacTree *tree) +// Prints the hacTree into a .json file format for d3 forceLayout visualizations. { -int source = 0; // Basic json template for d3 visualizations -struct jsonWrite *jw = jsonWriteNew(); -jsonWriteObjectStart(jw); fprintf(f,"%s\n", "{"); -jsonWriteListStart(jw, "nodes"); fprintf(f," \"%s\"%s\n", "nodes", ":[" ); -rPrintNodes(jw, f, tree); + +// Print the nodes +struct jsonNodeLine *nodes; +AllocVar(nodes); +rPrintNodes(f, tree, nodes); +slReverse(&nodes); +int nodeCount = slCount(nodes); +int j; +for (j = 0; j < nodeCount -1; ++j) + // iterate through the linked nodelist printing nodes + { + if (j == nodeCount - 2) + printEndJsonNodeLine(f, nodes); + else + { + printJsonNodeLine(f, nodes); + nodes = nodes -> next; + } + } fprintf(f, "%s\n", "],"); -// Basic json template for d3 visualizations fprintf(f, "\"%s\"%s\n", "links", ":[" ); -rPrintLinks(f,tree, source); -fprintf(f," %s\n", "]"); +// Print the links +struct jsonLinkLine *links; +AllocVar(links); +rPrintLinks(normConstant,f,tree, 0, links); +slReverse(&links); +int linkCount = slCount(links); +int i; +for (i = 0; i< linkCount -1; ++i) + // iterate through the linked linklist printing links + { + if (i == linkCount - 2) + printEndJsonLinkLine(f, links); + else + { + printJsonLinkLine(f, links); + links = links -> next; + } + } +fprintf(f," %s\n", "]"); fprintf(f,"%s\n", "}"); -jsonWriteObjectEnd(jw); -writeGulp("jsonWrite.out", jw->dy->string, jw->dy->stringSize); } -static void rPrintSlBioExpVectorTree(FILE *f, struct hacTree *tree, int level,double distance) -/* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */ +static void rAddLeaf(struct hacTree *tree, struct slRef **pList) +/* Recursively add leaf to list */ { -char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name; -int i; -for (i = 0; i < level; i++) - fputc(' ', f); if (tree->left == NULL && tree->right == NULL) + refAdd(pList, tree->itemOrCluster); +else { - fprintf(f, "{%s}%0.31f", tissue, distance); - return; + rAddLeaf(tree->left, pList); + rAddLeaf(tree->right, pList); } -else if (tree->left == NULL || tree->right == NULL) - errAbort("\nHow did we get a node with one NULL kid??"); -fprintf(f, "(%s%f\n", "node", tree->childDistance); -distance += tree->childDistance; -rPrintSlBioExpVectorTree(f, tree->left, level+1, distance); -fputs(",\n", f); -rPrintSlBioExpVectorTree(f, tree->right, level+1, distance); -fputc('\n', f); -for (i=0; i < level; i++) - fputc(' ', f); -fputs(")", f); } -void printSlBioExpVectorTree(FILE *f, struct hacTree *tree) -/* Print out cluster as nested-parens with {}'s around leaf nodes. */ -{ -if (tree == NULL) +struct slRef *getOrderedLeafList(struct hacTree *tree) +/* Return list of references to bioExpVectors from leaf nodes + * ordered by position in hacTree */ { - fputs("Empty tree.\n", f); - return; - } -double distance = 0; -rPrintSlBioExpVectorTree(f, tree, 0, distance); -fputc('\n', f); +struct slRef *leafList = NULL; +rAddLeaf(tree, &leafList); +slReverse(&leafList); +return leafList; } -static void rPrintNestedJson(FILE *f, struct hacTree *tree, int level, double distance) -/* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */ +static void rPrintHierarchicalJson(FILE *f, struct hacTree *tree, int level, double distance, + int normConstant, int cgConstant) +/* Recursively prints out the elements of the hierarchical .json file. */ { -char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name; -int i; +struct bioExpVector *bio = (struct bioExpVector *)tree->itemOrCluster; +char *tissue = bio->name; +struct rgbColor colors = bio->color; if (tree->childDistance > longest) // the first distance will be the longest, and is used for normalization longest = tree->childDistance; +int i; for (i = 0; i < level; i++) - fputc(' ', f); + fputc(' ', f); // correct spacing for .json format if (tree->left == NULL && tree->right == NULL) { - fprintf(f, "{\"%s\"%s \"%s\"%s\"%s\"%s %f}", "name", ":", tissue, ", ", "size", ":", distance); + // Prints out the leaf objects + // fprintf(f, "{\"name\": \"%s\",\"similarity\": %f,\"linkGroup\": \" \"", tissue, distance); + fprintf(f, "{\"%s\"%s \"%s\"%s\"%s\"%s %f %s\"%s\"%s \"rgb(%i,%i,%i)\"}", "name", ":", tissue, ", ", + "similarity", ":", distance, "," , "colorGroup", ":", colors.r, colors.g, colors.b); return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); -//fprintf(f, "{\"%s\"%s \"%f\"%s\n", "name", ":", " ", ","); -fprintf(f, "{\"%s\"%s \"%f\"%s\n", "name", ":", 100*(tree->childDistance/longest), ","); + +// Prints out the node object and opens a new children block +fprintf(f, "{\"%s\"%s \"%s\"%s", "name", ":", " ", ","); +fprintf(f, "\"colorGroup\": \"rgb(%i,%i,%i)\",", colors.r, colors.g, colors.b ); +fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", normConstant * (tree->childDistance/longest), ","); for (i = 0; i < level + 1; i++) fputc(' ', f); fprintf(f, "\"%s\"%s\n", "children", ": ["); -distance += tree->childDistance; -rPrintNestedJson(f, tree->left, level+1, distance); +distance = tree->childDistance/longest; +rPrintHierarchicalJson(f, tree->left, level+1, distance, normConstant, cgConstant); fputs(",\n", f); -rPrintNestedJson(f, tree->right, level+1, distance); +rPrintHierarchicalJson(f, tree->right, level+1, distance, normConstant, cgConstant); fputc('\n', f); +// Closes the children block for node objects for (i=0; i < level + 1; i++) fputc(' ', f); fputs("]\n", f); for (i = 0; i < level; i++) fputc(' ', f); fputs("}", f); } -void printNestedJson(FILE *f, struct hacTree *tree) -/* Print out cluster as nested-parens with {}'s around leaf nodes. */ +void printHierarchicalJson(FILE *f, struct hacTree *tree, int normConstant, int cgConstant) +/* Prints out the binary tree into .json format intended for d3 + * hierarchical layouts */ { if (tree == NULL) { fputs("Empty tree.\n", f); return; } double distance = 0; -rPrintNestedJson(f, tree, 0, distance); +rPrintHierarchicalJson(f, tree, 0, distance, normConstant, cgConstant); fputc('\n', f); } -char* floatToString(float input) -{ -char* result = needMem(sizeof(result)); -sprintf(result,"%f", input); -return result; -freez(result); -} double slBioExpVectorDistance(const struct slList *item1, const struct slList *item2, void *extraData) /* Return the absolute difference between the two kids' values. * Designed for HAC tree use*/ { +verbose(1,"Calculating Distance...\n"); const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; int j; double diff = 0, sum = 0; for (j = 0; j < kid1->count; ++j) { diff = kid1->vector[j] - kid2->vector[j]; sum += (diff * diff); } return sqrt(sum); } struct slList *slBioExpVectorMerge(const struct slList *item1, const struct slList *item2, void *unusedExtraData) /* Make a new slPair where the name is the children names concattenated and the * value is the average of kids' values. * Designed for HAC tree use*/ { +verbose(1,"Merging...\n"); const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; struct bioExpVector *el; AllocVar(el); AllocArray(el->vector, kid1->count); el->count = kid1->count; el->name = catTwoStrings(kid1->name, kid2->name); int i; for (i = 0; i < el->count; ++i) { el->vector[i] = (kid1->vector[i] + kid2->vector[i])/2; } return (struct slList *)(el); } -void expData(char *matrixFile, char *nameFile, char *outFile) +void colorLeaves(struct slRef *leafList) +/* Assign colors of rainbow to leaves. */ +{ +/* Loop through list once to figure out total, since we need to + * normalize */ +double total = 0; +double purplePos = 0.80; +struct slRef *el, *nextEl; +for (el = leafList; el != NULL; el = nextEl) + { + nextEl = el->next; + if (nextEl == NULL) + break; + struct bioExpVector *bio1 = el->val; + struct bioExpVector *bio2 = nextEl->val; + double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); + total += distance; + } + +/* Loop through list a second time to generate actual colors. */ +double soFar = 0; +for (el = leafList; el != NULL; el = nextEl) + { + nextEl = el->next; + if (nextEl == NULL) + break; + struct bioExpVector *bio1 = el->val; + struct bioExpVector *bio2 = nextEl->val; + double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); + soFar += distance; + double normalized = soFar/total; + bio2->color = saturatedRainbowAtPos(normalized * purplePos); + } + +/* Set first color to correspond to 0, since not set in above loop */ +struct bioExpVector *bio = leafList->val; +bio->color = saturatedRainbowAtPos(0); +} + +void expData(char *matrixFile, char *nameFile, char *outFile, bool forceLayout, int normConstant, int cgConstant) /* Read matrix and names into a list of bioExpVectors, run hacTree to * associate them, and write output. */ { struct bioExpVector *list = bioExpVectorListFromFile(matrixFile); FILE *f = mustOpen(outFile,"w"); struct lm *localMem = lmInit(0); fillInNames(list, nameFile); struct hacTree *clusters = hacTreeFromItems((struct slList *)list, localMem, slBioExpVectorDistance, slBioExpVectorMerge, NULL, NULL); -//printJson(f,clusters); -//printSlBioExpVectorTree(f,clusters); -printNestedJson(f,clusters); +struct slRef *orderedList = getOrderedLeafList(clusters); +colorLeaves(orderedList); +if (forceLayout) + printForceLayoutJson(normConstant,f,clusters); +if (!clForceLayout) + printHierarchicalJson(f, clusters, normConstant, cgConstant); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); +clForceLayout = optionExists("forceLayout"); +clNormConstant = optionInt("normConstant", clNormConstant); +clCgConstant = optionInt("cgConstant", clCgConstant); if (argc != 4) usage(); -expData(argv[1], argv[2], argv[3]); +expData(argv[1], argv[2], argv[3], clForceLayout, clNormConstant, clCgConstant); return 0; }