93cdd24ca26274f6283c9b9dd4938e95aa1260e0 ceisenhart Tue Jun 16 17:06:59 2015 -0700 Fixed the program up a bit, documented and removed some unecessary options diff --git src/hg/expMatrixToJson/expMatrixToJson.c src/hg/expMatrixToJson/expMatrixToJson.c index 0f3d479..e180bd5 100644 --- src/hg/expMatrixToJson/expMatrixToJson.c +++ src/hg/expMatrixToJson/expMatrixToJson.c @@ -1,69 +1,69 @@ -/* expData - Takes in GNF expression data, organizes it using a hierarchical agglomerative clustering algorithm. The output defaults to a hierarchichal .json format, with two additional options. */ +/* expData - Takes in an expression matrix and clusters it using a hierarchical agglomerative clustering algorithm. The output defaults to a hierarchichal .json format, with two additional options. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "obscure.h" #include "memalloc.h" #include "jksql.h" #include "expData.h" #include "sqlList.h" #include "hacTree.h" #include "rainbow.h" -/* Visually the nodes will be assigned a color corresponding to a range of sizes. - * This constant dictates how many colors will be displayed (1-20). */ -int clCgConstant = 20; -/* A normalizing constant for the distances, this corresponds to - * the size of the nodes in the standard output and the link distance in the forceLayout output*/ -int clThreads = 10; // The number of threads to run with the multiThreads option -int clNormConstant = 99; boolean clForceLayout = FALSE; // Prints the data in .json format for d3 force layout visualizations +boolean clCSV = FALSE; // Converts the comma separated matrix into a tab based file. +boolean clMultiThreads = FALSE; // Allows the user to run the program with multiple threads, default is off. +int clThreads = 10; // The number of threads to run with the multiThreads option +int clMemLim = 4; // The amount of memeory the program can use, read in Gigabytes. int target = 0; // Used for the target value in rlinkJson. float longest = 0; // Used to normalize link distances in rlinkJson. -char* clHacTree = "fromItems"; -char* clDescFile = NULL; +char* clDescFile = NULL; // The user can provide a description file void usage() /* Explain usage and exit. */ { errAbort( - "expMatrixToJson - Takes in an expression matrix and outputs a binary tree clustering the data in .json format.\n" + "expMatrixToJson - Takes in an expression matrix and outputs a binary tree clustering the data.\n" + " The tree is output as a .json file, a .html file is generated to view the \n" + " tree. The files are named using the output argument (ex output.json, output.html).\n" "usage:\n" - " expMatrixToJson matrix output\n" + " expMatrixToJson [options] matrix output\n" "options:\n" - " -forceLayout = bool Prints the output in .json format for d3 forceLayouts\n" - " -normConstant = int Normalizing constant for d3, default is 20. For forceLayout 100 is reccomended \n" - " -cgConstant = int Defines the number of possible colors for nodes, 1 - 20 \n" + " -multiThread The program will run on multiple threads. \n" + " -forceLayout Prints the output in .json format for d3 forceLayouts. NOTE no .html file will be \n" + " generated using this option.\n" + " -CSV The input matrix is in .csv format. \n" " -threads=int Sets the thread count for the multiThreads option, default is 10 \n" - " -hacTree = Dictates how the tree is generated; multiThreads or costlyMerges or fromItems. fromItems is default \n" - " -descFile = The user is providing a description file. The description file must provide a \n" - " description for each cell line in the expression matrix. There should be one \n" - " description per line, starting on the left side of the expression matrix. The \n" - " description will appear over a leaf node when hovered over.\n" + " -memLim=int Sets the amount of memeory the program can use before aborting. The default is 4G. \n" + " -descFile=string The user is providing a description file. The description file must provide a \n" + " description for each cell line in the expression matrix. There should be one description per \n" + " line, starting on the left side of the expression matrix. The description will appear over a \n" + " leaf node when hovered over.\n" + " -verbose=2 Show basic run stats. \n" + " -verbose=3 Show all run stats. \n" ); } /* Command line validation table. */ static struct optionSpec options[] = { - {"normConstant", OPTION_INT}, - {"cgConstant", OPTION_INT}, - {"threads", OPTION_INT}, - {"structuredOutput", OPTION_BOOLEAN}, + {"multiThread", OPTION_BOOLEAN}, {"forceLayout", OPTION_BOOLEAN}, - {"hacTree", OPTION_STRING}, + {"CSV", OPTION_BOOLEAN}, + {"threads", OPTION_INT}, + {"memLim", OPTION_INT}, {"descFile", OPTION_STRING}, {NULL, 0}, }; struct jsonNodeLine /* Stores the information for a single json node line */ { struct jsonNodeLine *next; char* name; // the source for a given link double distance; // the distance for a given link }; struct jsonLinkLine /* Stores the information for a single json link line */ { @@ -204,98 +204,98 @@ slAddHead(nodes, lNode); // add the node } if (tree->left == NULL && tree->right == NULL) // Stop at the last element { return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); rPrintNodes(f, tree->left, nodes); rPrintNodes(f, tree->right, nodes); } -void rPrintLinks(int normConstant, FILE *f, struct hacTree *tree, int source, struct jsonLinkLine *links) +void rPrintLinks(FILE *f, struct hacTree *tree, int source, struct jsonLinkLine *links) // Recursively loadslist->children = 0 ; the link information into a linked list { if (tree->childDistance > longest) // the first distance will be the longest, and is used for normalization longest = tree->childDistance; if (tree->left == NULL && tree->right == NULL) // Stop at the last element { return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); // Left recursion. struct jsonLinkLine *lLink; AllocVar(lLink); ++target; lLink->source = target - 1; // The source and target are always ofset by 1. lLink->target = target; -lLink->distance = normConstant*(tree->childDistance/longest); //Calculates the link distance +lLink->distance = 100*(tree->childDistance/longest); //Calculates the link distance source = target; // Prepares the source for the right recursion. slAddHead(links, lLink); // Add the link -rPrintLinks(normConstant,f, tree->left, source, links); +rPrintLinks(f, tree->left, source, links); // Right recursion. struct jsonLinkLine *rLink; AllocVar(rLink); ++target; rLink->source = source - 1; // The source is dependent on the last target of the left recursion rLink->target = target; -rLink->distance = normConstant*(tree->childDistance/longest); +rLink->distance = 100*(tree->childDistance/longest); slAddHead(links, rLink); // Add the link -rPrintLinks(normConstant,f, tree->right, ++source, links); +rPrintLinks(f, tree->right, ++source, links); } -void printForceLayoutJson(int normConstant, FILE *f, struct hacTree *tree) +void printForceLayoutJson(FILE *f, struct hacTree *tree) // Prints the hacTree into a .json file format for d3 forceLayout visualizations. { // Basic json template for d3 visualizations fprintf(f,"%s\n", "{"); fprintf(f," \"%s\"%s\n", "nodes", ":[" ); // Print the nodes struct jsonNodeLine *nodes; AllocVar(nodes); rPrintNodes(f, tree, nodes); slReverse(&nodes); int nodeCount = slCount(nodes); int j; for (j = 0; j < nodeCount -1; ++j) // iterate through the linked nodelist printing nodes { if (j == nodeCount - 2) printEndJsonNodeLine(f, nodes); else { printJsonNodeLine(f, nodes); nodes = nodes -> next; } } fprintf(f, "%s\n", "],"); fprintf(f, "\"%s\"%s\n", "links", ":[" ); // Print the links struct jsonLinkLine *links; AllocVar(links); -rPrintLinks(normConstant,f,tree, 0, links); +rPrintLinks(f,tree, 0, links); slReverse(&links); int linkCount = slCount(links); int i; for (i = 0; i< linkCount -1; ++i) // iterate through the linked linklist printing links { if (i == linkCount - 2) printEndJsonLinkLine(f, links); else { printJsonLinkLine(f, links); links = links -> next; } } fprintf(f," %s\n", "]"); @@ -313,127 +313,123 @@ rAddLeaf(tree->left, pList); rAddLeaf(tree->right, pList); } } struct slRef *getOrderedLeafList(struct hacTree *tree) /* Return list of references to bioExpVectors from leaf nodes * ordered by position in hacTree */ { struct slRef *leafList = NULL; rAddLeaf(tree, &leafList); slReverse(&leafList); return leafList; } -static void rPrintHierarchicalJson(FILE *f, struct hacTree *tree, int level, double distance, - int normConstant, int cgConstant) +static void rPrintHierarchicalJson(FILE *f, struct hacTree *tree, int level, double distance) /* Recursively prints out the elements of the hierarchical .json file. */ { struct bioExpVector *bio = (struct bioExpVector *)tree->itemOrCluster; char *tissue = bio->name; struct rgbColor colors = bio->color; if (tree->childDistance > longest) // the first distance will be the longest, and is used for normalization longest = tree->childDistance; int i; for (i = 0; i < level; i++) fputc(' ', f); // correct spacing for .json format + if (tree->left == NULL && tree->right == NULL) + // Print the leaf nodes { - // Prints out the leaf objects - //fprintf(f, "{\"%s\"%s \"%s\"%s\"%s\"%s %f %s\"%s\"%s \"rgb(%i,%i,%i)\"}", "name", ":", tissue, ", ", - // "similarity", ":", bio->desc , "," , "colorGroup", ":", colors.r, colors.g, colors.b); + if (bio->desc) fprintf(f, "{\"name\":\"%s\",\"distance\":\"%s\",\"colorGroup\":\"rgb(%i,%i,%i)\"}", tissue, bio->desc, colors.r, colors.g, colors.b); + else + fprintf(f, "{\"name\":\"%s\",\"distance\":\"%s\",\"colorGroup\":\"rgb(%i,%i,%i)\"}", tissue, " ", colors.r, colors.g, colors.b); return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); // Prints out the node object and opens a new children block fprintf(f, "{\"%s\"%s \"%s\"%s", "name", ":", " ", ","); fprintf(f, "\"colorGroup\": \"rgb(%i,%i,%i)\",", colors.r, colors.g, colors.b ); distance = tree->childDistance/longest; if (distance != distance) distance = 0; -fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", (1 + normConstant * (distance)), ","); +fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", 100*distance, ","); for (i = 0; i < level + 1; i++) fputc(' ', f); fprintf(f, "\"%s\"%s\n", "children", ": ["); -rPrintHierarchicalJson(f, tree->left, level+1, distance, normConstant, cgConstant); +rPrintHierarchicalJson(f, tree->left, level+1, distance); fputs(",\n", f); -rPrintHierarchicalJson(f, tree->right, level+1, distance, normConstant, cgConstant); +rPrintHierarchicalJson(f, tree->right, level+1, distance); fputc('\n', f); // Closes the children block for node objects for (i=0; i < level + 1; i++) fputc(' ', f); fputs("]\n", f); for (i = 0; i < level; i++) fputc(' ', f); fputs("}", f); } -void printHierarchicalJson(FILE *f, struct hacTree *tree, int normConstant, int cgConstant) +void printHierarchicalJson(FILE *f, struct hacTree *tree) /* Prints out the binary tree into .json format intended for d3 * hierarchical layouts */ { if (tree == NULL) { fputs("Empty tree.\n", f); return; } double distance = 0; -rPrintHierarchicalJson(f, tree, 0, distance, normConstant, cgConstant); +rPrintHierarchicalJson(f, tree, 0, distance); fputc('\n', f); } double slBioExpVectorDistance(const struct slList *item1, const struct slList *item2, void *extraData) -/* Return the absolute difference between the two kids' values. - * Designed for HAC tree use*/ +/* Return the absolute difference between the two kids' values. Weight based on how many nodes have been merged + * to create the current node. Designed for HAC tree use*/ { -verbose(1,"Calculating Distance...\n"); +verbose(3,"Calculating Distance...\n"); const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; int j; double diff = 0, sum = 0; -//float kid1Weight = 0.0, kid2Weight = 0.0; float kid1Weight = kid1->children / (float)(kid1->children + kid2->children); float kid2Weight = kid2->children / (float)(kid1->children + kid2->children); -//printf("Kid1 weight is %f kid2 weight is %f \n", kid1Weight, kid2Weight); -//uglyAbort("%f %f\n", kid1Weight, kid2Weight); for (j = 0; j < kid1->count; ++j) { diff = (kid1Weight*kid1->vector[j]) - (kid2Weight*kid2->vector[j]); sum += (diff * diff); } -//printf("%f\n",sqrt(sum)); return sqrt(sum); } struct slList *slBioExpVectorMerge(const struct slList *item1, const struct slList *item2, void *unusedExtraData) /* Make a new slPair where the name is the children names concattenated and the * value is the average of kids' values. * Designed for HAC tree use*/ { -verbose(1,"Merging...\n"); +verbose(3,"Merging...\n"); const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; float kid1Weight = kid1->children / (float)(kid1->children + kid2->children); float kid2Weight = kid2->children / (float)(kid1->children + kid2->children); -printf("Kid1weight %g kid2weight %g the sum is %g \n", kid1Weight, kid2Weight, kid1Weight + kid2Weight); struct bioExpVector *el; AllocVar(el); AllocArray(el->vector, kid1->count); assert(kid1->count == kid2->count); el->count = kid1->count; el->name = catTwoStrings(kid1->name, kid2->name); int i; for (i = 0; i < el->count; ++i) { el->vector[i] = (kid1Weight*kid1->vector[i] + kid2Weight*kid2->vector[i]); } el->children = kid1->children + kid2->children; return (struct slList *)(el); } @@ -465,113 +461,120 @@ break; struct bioExpVector *bio1 = el->val; struct bioExpVector *bio2 = nextEl->val; double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); if (distance != distance ) distance = 0 ; soFar += distance; double normalized = soFar/total; bio2->color = saturatedRainbowAtPos(normalized * purplePos); } /* Set first color to correspond to 0, since not set in above loop */ struct bioExpVector *bio = leafList->val; bio->color = saturatedRainbowAtPos(0); } -void convertInput(char *expMatrix, char *descFile) +void convertInput(char *expMatrix, char *descFile, bool csv) /* Takes in a expression matrix and makes the inputs that this program will use. * Namely a transposed table with the first column removed. Makes use of system calls * to use cut, sed, kent utility rowsToCols, and paste (for descFile option). */ { char cmd1[1024], cmd2[1024]; +if (csv) + /* A sed one liner will convert comma separated values into a tab separated values*/ + { + char cmd3[1024]; + safef(cmd3, 1024, "sed -i 's/,/\\t/g' %s ",expMatrix); + verbose(2,"%s\n", cmd3); + mustSystem(cmd3); + } safef(cmd1, 1024, "cat %s | sed '1d' | rowsToCols stdin %s.transposedMatrix", expMatrix, expMatrix); -printf("%s\n", cmd1); +/* Exp matrices are X axis of cell lines and Y axis of transcripts. This causes long Y axis and short + * X axis, which are not handled well in C. The matrix is transposed to get around this issue. */ +verbose(2,"%s\n", cmd1); mustSystem(cmd1); +/* Pull out the cell names, and store them in a separate file. This allows the actual data matrix to + * have the first row identify the transcript, then all following rows contain only expression values. + * By removing the name before hand the computation was made faster and easier. */ if (descFile) { char cmd3[1024]; safef(cmd2, 1024, "rowsToCols %s stdout | cut -f1 | sed \'1d\' > %s.cellNamesTemp", expMatrix, expMatrix); safef(cmd3, 1024, "paste %s.cellNamesTemp %s > %s.cellNames", expMatrix, descFile, expMatrix); + verbose(2,"%s\n", cmd2); mustSystem(cmd2); + verbose(2,"%s\n", cmd3); mustSystem(cmd3); } else { safef(cmd2, 1024, "rowsToCols %s stdout | cut -f1 | sed \'1d\' > %s.cellNames", expMatrix, expMatrix); - printf("%s\n", cmd2); + verbose(2,"%s\n", cmd2); mustSystem(cmd2); } } void generateHtml(FILE *outputFile, int nameSize, char* jsonFile) // Generates a new .html file for the dendrogram. { -fprintf(outputFile, "