dfaf18b4d00a9e2a795a231cb3b161da3b98a6e0 ceisenhart Wed Jun 10 11:41:17 2015 -0700 Changed the input/output. The program now takes an expression matrix as input and generates a .json and a .html diff --git src/hg/expMatrixToJson/expMatrixToJson.c src/hg/expMatrixToJson/expMatrixToJson.c index c977304..0f3d479 100644 --- src/hg/expMatrixToJson/expMatrixToJson.c +++ src/hg/expMatrixToJson/expMatrixToJson.c @@ -1,89 +1,96 @@ /* expData - Takes in GNF expression data, organizes it using a hierarchical agglomerative clustering algorithm. The output defaults to a hierarchichal .json format, with two additional options. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "obscure.h" +#include "memalloc.h" #include "jksql.h" #include "expData.h" #include "sqlList.h" #include "hacTree.h" #include "rainbow.h" /* Visually the nodes will be assigned a color corresponding to a range of sizes. * This constant dictates how many colors will be displayed (1-20). */ int clCgConstant = 20; /* A normalizing constant for the distances, this corresponds to * the size of the nodes in the standard output and the link distance in the forceLayout output*/ int clThreads = 10; // The number of threads to run with the multiThreads option -int clNormConstant = 20; +int clNormConstant = 99; boolean clForceLayout = FALSE; // Prints the data in .json format for d3 force layout visualizations int target = 0; // Used for the target value in rlinkJson. float longest = 0; // Used to normalize link distances in rlinkJson. char* clHacTree = "fromItems"; +char* clDescFile = NULL; void usage() /* Explain usage and exit. */ { errAbort( - "expData - Takes in a relational database and outputs expression information.\n" - "Standard output is .json format intended for d3 hierarchical displays. \n" + "expMatrixToJson - Takes in an expression matrix and outputs a binary tree clustering the data in .json format.\n" "usage:\n" - " expData matrix biosamples output\n" + " expMatrixToJson matrix output\n" "options:\n" " -forceLayout = bool Prints the output in .json format for d3 forceLayouts\n" " -normConstant = int Normalizing constant for d3, default is 20. For forceLayout 100 is reccomended \n" " -cgConstant = int Defines the number of possible colors for nodes, 1 - 20 \n" " -threads = int Sets the thread count for the multiThreads option, default is 10 \n" " -hacTree = Dictates how the tree is generated; multiThreads or costlyMerges or fromItems. fromItems is default \n" + " -descFile = The user is providing a description file. The description file must provide a \n" + " description for each cell line in the expression matrix. There should be one \n" + " description per line, starting on the left side of the expression matrix. The \n" + " description will appear over a leaf node when hovered over.\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"normConstant", OPTION_INT}, {"cgConstant", OPTION_INT}, {"threads", OPTION_INT}, {"structuredOutput", OPTION_BOOLEAN}, {"forceLayout", OPTION_BOOLEAN}, {"hacTree", OPTION_STRING}, + {"descFile", OPTION_STRING}, {NULL, 0}, }; struct jsonNodeLine /* Stores the information for a single json node line */ { struct jsonNodeLine *next; char* name; // the source for a given link double distance; // the distance for a given link }; struct jsonLinkLine /* Stores the information for a single json link line */ { struct jsonLinkLine *next; int source; // the source for a given link int target; // the target for a given link double distance; // the distance for a given link }; struct bioExpVector /* Contains expression information for a biosample on many genes. */ { struct bioExpVector *next; char *name; // name of biosample. + char *desc; // description of biosample. int count; // Number of genes we have data for. double *vector; // An array allocated dynamically. struct rgbColor color; // Color for this one int children; // Number of bioExpVectors used to build the current }; struct bioExpVector *bioExpVectorListFromFile(char *matrixFile) // Read a tab-delimited file and return list of bioExpVector. { int vectorSize = 0; struct lineFile *lf = lineFileOpen(matrixFile, TRUE); char *line, **row = NULL; struct bioExpVector *list = NULL, *el; while (lineFileNextReal(lf, &line)) { @@ -97,47 +104,66 @@ AllocVar(el); AllocArray(el->vector, vectorSize); el->count = chopByWhite(line, row, vectorSize); // assert(el->count == vectorSize); int i; for (i = 0; i < el->count; ++i) el->vector[i] = sqlDouble(row[i]); el->children = 1; slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } -void fillInNames(struct bioExpVector *list, char *nameFile) + +int fillInNames(struct bioExpVector *list, char *nameFile) /* Fill in name field from file. */ { struct lineFile *lf = lineFileOpen(nameFile, TRUE); char *line; struct bioExpVector *el = list; +int maxSize = 0 ; + while (lineFileNextReal(lf, &line)) { if (el == NULL) { warn("More names than items in list"); break; } - el->name = cloneString(line); + char *fields[2]; + if (strlen(line) > maxSize) maxSize = strlen(line); + int fieldCount = chopTabs(line, fields); + if (fieldCount >= 1) + { + el->name = cloneString(fields[0]); + if (fieldCount >= 2) + el->desc = cloneString(fields[1]); + else + el->desc = cloneString("n/a"); + } + el = el->next; } + +if (el != NULL) + errAbort("More items in matrix file than %s", nameFile); + lineFileClose(&lf); +return maxSize; } void printJsonNodeLine(FILE *f, struct jsonNodeLine *node) { fprintf(f," %s\"%s\"%s\"%s\"%s", "{","name", ":", node->name, ","); fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , node->distance, "},"); } void printEndJsonNodeLine(FILE *f, struct jsonNodeLine *node) { fprintf(f," %s\"%s\"%s\"%s\"%s", "{","name", ":", node->name, ","); fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , node->distance, "}"); } void printJsonLinkLine(FILE *f, struct jsonLinkLine *links) @@ -303,46 +329,47 @@ int normConstant, int cgConstant) /* Recursively prints out the elements of the hierarchical .json file. */ { struct bioExpVector *bio = (struct bioExpVector *)tree->itemOrCluster; char *tissue = bio->name; struct rgbColor colors = bio->color; if (tree->childDistance > longest) // the first distance will be the longest, and is used for normalization longest = tree->childDistance; int i; for (i = 0; i < level; i++) fputc(' ', f); // correct spacing for .json format if (tree->left == NULL && tree->right == NULL) { // Prints out the leaf objects - // fprintf(f, "{\"name\": \"%s\",\"similarity\": %f,\"linkGroup\": \" \"", tissue, distance); - fprintf(f, "{\"%s\"%s \"%s\"%s\"%s\"%s %f %s\"%s\"%s \"rgb(%i,%i,%i)\"}", "name", ":", tissue, ", ", - "similarity", ":", distance, "," , "colorGroup", ":", colors.r, colors.g, colors.b); + //fprintf(f, "{\"%s\"%s \"%s\"%s\"%s\"%s %f %s\"%s\"%s \"rgb(%i,%i,%i)\"}", "name", ":", tissue, ", ", + // "similarity", ":", bio->desc , "," , "colorGroup", ":", colors.r, colors.g, colors.b); + fprintf(f, "{\"name\":\"%s\",\"distance\":\"%s\",\"colorGroup\":\"rgb(%i,%i,%i)\"}", tissue, bio->desc, colors.r, colors.g, colors.b); return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); // Prints out the node object and opens a new children block fprintf(f, "{\"%s\"%s \"%s\"%s", "name", ":", " ", ","); fprintf(f, "\"colorGroup\": \"rgb(%i,%i,%i)\",", colors.r, colors.g, colors.b ); -fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", normConstant * (tree->childDistance/longest), ","); +distance = tree->childDistance/longest; +if (distance != distance) distance = 0; +fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", (1 + normConstant * (distance)), ","); for (i = 0; i < level + 1; i++) fputc(' ', f); fprintf(f, "\"%s\"%s\n", "children", ": ["); -distance = tree->childDistance/longest; rPrintHierarchicalJson(f, tree->left, level+1, distance, normConstant, cgConstant); fputs(",\n", f); rPrintHierarchicalJson(f, tree->right, level+1, distance, normConstant, cgConstant); fputc('\n', f); // Closes the children block for node objects for (i=0; i < level + 1; i++) fputc(' ', f); fputs("]\n", f); for (i = 0; i < level; i++) fputc(' ', f); fputs("}", f); } void printHierarchicalJson(FILE *f, struct hacTree *tree, int normConstant, int cgConstant) /* Prints out the binary tree into .json format intended for d3 @@ -380,118 +407,171 @@ } //printf("%f\n",sqrt(sum)); return sqrt(sum); } struct slList *slBioExpVectorMerge(const struct slList *item1, const struct slList *item2, void *unusedExtraData) /* Make a new slPair where the name is the children names concattenated and the * value is the average of kids' values. * Designed for HAC tree use*/ { verbose(1,"Merging...\n"); const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; +float kid1Weight = kid1->children / (float)(kid1->children + kid2->children); +float kid2Weight = kid2->children / (float)(kid1->children + kid2->children); +printf("Kid1weight %g kid2weight %g the sum is %g \n", kid1Weight, kid2Weight, kid1Weight + kid2Weight); struct bioExpVector *el; AllocVar(el); AllocArray(el->vector, kid1->count); +assert(kid1->count == kid2->count); el->count = kid1->count; el->name = catTwoStrings(kid1->name, kid2->name); int i; for (i = 0; i < el->count; ++i) { - el->vector[i] = (kid1->vector[i] + kid2->vector[i])/2; + el->vector[i] = (kid1Weight*kid1->vector[i] + kid2Weight*kid2->vector[i]); } el->children = kid1->children + kid2->children; return (struct slList *)(el); } void colorLeaves(struct slRef *leafList) /* Assign colors of rainbow to leaves. */ { /* Loop through list once to figure out total, since we need to * normalize */ -double total = 0; +float total = 0.0; double purplePos = 0.80; struct slRef *el, *nextEl; for (el = leafList; el != NULL; el = nextEl) { nextEl = el->next; if (nextEl == NULL) break; struct bioExpVector *bio1 = el->val; struct bioExpVector *bio2 = nextEl->val; double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); + if (distance != distance ) distance = 0; total += distance; } /* Loop through list a second time to generate actual colors. */ double soFar = 0; for (el = leafList; el != NULL; el = nextEl) { nextEl = el->next; if (nextEl == NULL) break; struct bioExpVector *bio1 = el->val; struct bioExpVector *bio2 = nextEl->val; double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); + if (distance != distance ) distance = 0 ; soFar += distance; double normalized = soFar/total; bio2->color = saturatedRainbowAtPos(normalized * purplePos); } /* Set first color to correspond to 0, since not set in above loop */ struct bioExpVector *bio = leafList->val; bio->color = saturatedRainbowAtPos(0); } -void expData(char *matrixFile, char *nameFile, char *outFile, bool forceLayout, int normConstant, int cgConstant) +void convertInput(char *expMatrix, char *descFile) +/* Takes in a expression matrix and makes the inputs that this program will use. + * Namely a transposed table with the first column removed. Makes use of system calls + * to use cut, sed, kent utility rowsToCols, and paste (for descFile option). */ +{ +char cmd1[1024], cmd2[1024]; +safef(cmd1, 1024, "cat %s | sed '1d' | rowsToCols stdin %s.transposedMatrix", expMatrix, expMatrix); +printf("%s\n", cmd1); +mustSystem(cmd1); +if (descFile) + { + char cmd3[1024]; + safef(cmd2, 1024, "rowsToCols %s stdout | cut -f1 | sed \'1d\' > %s.cellNamesTemp", expMatrix, expMatrix); + safef(cmd3, 1024, "paste %s.cellNamesTemp %s > %s.cellNames", expMatrix, descFile, expMatrix); + mustSystem(cmd2); + mustSystem(cmd3); + } +else + { + safef(cmd2, 1024, "rowsToCols %s stdout | cut -f1 | sed \'1d\' > %s.cellNames", expMatrix, expMatrix); + printf("%s\n", cmd2); + mustSystem(cmd2); + } +} + +void generateHtml(FILE *outputFile, int nameSize, char* jsonFile) +// Generates a new .html file for the dendrogram. +{ +fprintf(outputFile, " Radial Dendrogram "); + +} + + + +void expData(char *matrixFile, char *outDir, char *descFile, bool forceLayout, int normConstant, int cgConstant) +//void expData(char *matrixFile, char *nameFile, char *outFile, bool forceLayout, int normConstant, int cgConstant) /* Read matrix and names into a list of bioExpVectors, run hacTree to * associate them, and write output. */ { -struct bioExpVector *list = bioExpVectorListFromFile(matrixFile); -FILE *f = mustOpen(outFile,"w"); +convertInput(matrixFile, descFile); +struct bioExpVector *list = bioExpVectorListFromFile(catTwoStrings(matrixFile,".transposedMatrix")); +uglyf("%lld allocated after bioExpVectorListFromFile\n", (long long)carefulTotalAllocated()); +FILE *f = mustOpen(catTwoStrings(outDir,".json"),"w"); struct lm *localMem = lmInit(0); -fillInNames(list, nameFile); +int size = fillInNames(list, catTwoStrings(matrixFile,".cellNames")); +uglyf("The size here is %i\n", size); +//char *catTwoStrings(char *a, char *b) +/* Allocate new string that is a concatenation of two strings. */ +uglyf("%lld allocated after fillInNames\n", (long long)carefulTotalAllocated()); struct hacTree *clusters = NULL; if (sameString(clHacTree, "multiThreads")) { clusters = hacTreeMultiThread(clThreads, (struct slList *)list, localMem, slBioExpVectorDistance, slBioExpVectorMerge, NULL, NULL); } /*else if (sameString(clHacTree, "costlyMerges")) { clusters = hacTreeForCostlyMerges((struct slList *)list, localMem, slBioExpVectorDistance, slBioExpVectorMerge, NULL); }*/ else if (sameString(clHacTree, "fromItems")) { clusters = hacTreeFromItems((struct slList *)list, localMem, slBioExpVectorDistance, slBioExpVectorMerge, NULL, NULL); } else { uglyAbort("Unrecognized input option: %s", clHacTree); } //uglyAbort("Made it through the binary tree generation"); struct slRef *orderedList = getOrderedLeafList(clusters); colorLeaves(orderedList); if (forceLayout) printForceLayoutJson(normConstant,f,clusters); if (!clForceLayout) printHierarchicalJson(f, clusters, normConstant, cgConstant); + FILE *htmlF = mustOpen(catTwoStrings(outDir,".html"),"w"); + generateHtml(htmlF,size,catTwoStrings(outDir,".json")); +uglyf("%lld allocated at end\n", (long long)carefulTotalAllocated()); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); clForceLayout = optionExists("forceLayout"); clNormConstant = optionInt("normConstant", clNormConstant); clCgConstant = optionInt("cgConstant", clCgConstant); clThreads = optionInt("threads", clThreads); clHacTree = optionVal("hacTree", clHacTree); -if (argc != 4) +clDescFile = optionVal("descFile", clDescFile); +if (argc != 3) usage(); -expData(argv[1], argv[2], argv[3], clForceLayout, clNormConstant, clCgConstant); +pushCarefulMemHandler(4L*1024*1024*1024); +expData(argv[1], argv[2], clDescFile, clForceLayout, clNormConstant, clCgConstant); return 0; }