3172ebe9afedb10851f907cb2bf6f2a81bc80588 ceisenhart Wed Oct 14 11:15:49 2015 -0700 Changed the .html output file, removed some of the unecessary code and split it apart into more fprintf statements diff --git src/hg/expMatrixToJson/expMatrixToJson.c src/hg/expMatrixToJson/expMatrixToJson.c index b9e3989..51e72fa 100644 --- src/hg/expMatrixToJson/expMatrixToJson.c +++ src/hg/expMatrixToJson/expMatrixToJson.c @@ -7,64 +7,67 @@ #include "memalloc.h" #include "jksql.h" #include "expData.h" #include "sqlList.h" #include "hacTree.h" #include "rainbow.h" boolean clForceLayout = FALSE; // Prints the data in .json format for d3 force layout visualizations boolean clCSV = FALSE; // Converts the comma separated matrix into a tab based file. boolean clMultiThreads = FALSE; // Allows the user to run the program with multiple threads, default is off. int clThreads = 10; // The number of threads to run with the multiThreads option int clMemLim = 4; // The amount of memeory the program can use, read in Gigabytes. int target = 0; // Used for the target value in rlinkJson. float longest = 0; // Used to normalize link distances in rlinkJson. char* clDescFile = NULL; // The user can provide a description file +char* clAttributeTable = NULL; // The user can provide an attributes table... this may get removed soon. +int nodeCount; //The number of nodes. void usage() /* Explain usage and exit. */ { errAbort( "expMatrixToJson - Takes in an expression matrix and outputs a binary tree clustering the data.\n" " The tree is output as a .json file, a .html file is generated to view the \n" " tree. The files are named using the output argument (ex output.json, output.html).\n" "usage:\n" " expMatrixToJson [options] matrix output\n" "options:\n" - " -multiThread The program will run on multiple threads. \n" + " -multiThreads The program will run on multiple threads. \n" " -forceLayout Prints the output in .json format for d3 forceLayouts. NOTE no .html file will be \n" " generated using this option.\n" " -CSV The input matrix is in .csv format. \n" " -threads=int Sets the thread count for the multiThreads option, default is 10 \n" " -memLim=int Sets the amount of memeory the program can use before aborting. The default is 4G. \n" " -descFile=string The user is providing a description file. The description file must provide a \n" " description for each cell line in the expression matrix. There should be one description per \n" " line, starting on the left side of the expression matrix. The description will appear over a \n" " leaf node when hovered over.\n" " -verbose=2 Show basic run stats. \n" " -verbose=3 Show all run stats. \n" ); } /* Command line validation table. */ static struct optionSpec options[] = { - {"multiThread", OPTION_BOOLEAN}, + {"multiThreads", OPTION_BOOLEAN}, {"forceLayout", OPTION_BOOLEAN}, {"CSV", OPTION_BOOLEAN}, {"threads", OPTION_INT}, {"memLim", OPTION_INT}, {"descFile", OPTION_STRING}, + {"attributeTable", OPTION_STRING}, {NULL, 0}, }; struct jsonNodeLine /* Stores the information for a single json node line */ { struct jsonNodeLine *next; char* name; // the source for a given link double distance; // the distance for a given link }; struct jsonLinkLine /* Stores the information for a single json link line */ { struct jsonLinkLine *next; @@ -73,53 +76,67 @@ double distance; // the distance for a given link }; struct bioExpVector /* Contains expression information for a biosample on many genes. */ { struct bioExpVector *next; char *name; // name of biosample. char *desc; // description of biosample. int count; // Number of genes we have data for. double *vector; // An array allocated dynamically. struct rgbColor color; // Color for this one int children; // Number of bioExpVectors used to build the current }; +double stringToDouble(char *s) +/* Convert string to a double. Assumes all of string is number + * and aborts on an error. */ +{ +char* end; +double val = strtod(s, &end); + +if (val != val) errAbort("A value of %f was encountered. Please change this value then re run the program.", val); +if ((end == s) || (*end != '\0')) + errAbort("invalid double: %s", s); +return val; +} + struct bioExpVector *bioExpVectorListFromFile(char *matrixFile) // Read a tab-delimited file and return list of bioExpVector. { int vectorSize = 0; struct lineFile *lf = lineFileOpen(matrixFile, TRUE); char *line, **row = NULL; struct bioExpVector *list = NULL, *el; while (lineFileNextReal(lf, &line)) { + ++nodeCount; if (vectorSize == 0) { // Detect first row. vectorSize = chopByWhite(line, NULL, 0); // counting up AllocArray(row, vectorSize); continue; } AllocVar(el); AllocArray(el->vector, vectorSize); el->count = chopByWhite(line, row, vectorSize); // assert(el->count == vectorSize); int i; for (i = 0; i < el->count; ++i) - el->vector[i] = sqlDouble(row[i]); + el->vector[i] = stringToDouble(row[i]); el->children = 1; slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } int fillInNames(struct bioExpVector *list, char *nameFile) /* Fill in name field from file. */ { struct lineFile *lf = lineFileOpen(nameFile, TRUE); char *line; struct bioExpVector *el = list; @@ -330,47 +347,52 @@ { struct bioExpVector *bio = (struct bioExpVector *)tree->itemOrCluster; char *tissue = bio->name; struct rgbColor colors = bio->color; if (tree->childDistance > longest) // the first distance will be the longest, and is used for normalization longest = tree->childDistance; int i; for (i = 0; i < level; i++) fputc(' ', f); // correct spacing for .json format if (tree->left == NULL && tree->right == NULL) // Print the leaf nodes { if (bio->desc) - fprintf(f, "{\"name\":\"%s\",\"distance\":\"%s\",\"colorGroup\":\"rgb(%i,%i,%i)\"}", tissue, bio->desc, colors.r, colors.g, colors.b); + fprintf(f, "{\"name\":\"%s\",\"kids\":\"%i\",\"distance\":\"%s\",\"colorGroup\":\"rgb(%i,%i,%i)\"}", tissue, 0, bio->desc, colors.r, colors.g, colors.b); else - fprintf(f, "{\"name\":\"%s\",\"distance\":\"%s\",\"colorGroup\":\"rgb(%i,%i,%i)\"}", tissue, " ", colors.r, colors.g, colors.b); + fprintf(f, "{\"name\":\"%s\",\"kids\":\"%i\",\"distance\":\"%s\",\"colorGroup\":\"rgb(%i,%i,%i)\"}", tissue, 0, " ", colors.r, colors.g, colors.b); return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); -// Prints out the node object and opens a new children block -fprintf(f, "{\"%s\"%s \"%s\"%s", "name", ":", " ", ","); -fprintf(f, "\"colorGroup\": \"rgb(%i,%i,%i)\",", colors.r, colors.g, colors.b ); +// Prints out the node object and opens a new children block. +//fprintf(f, "{\"%s\"%s \"%s\"%s", "name", ":", " ", ","); +fprintf(f, "{\"name\": \" \", \"kids\": \"%f\", \"colorGroup\": \"rgb(%i,%i,%i)\",",sqrt(bio->children), colors.r,colors.g,colors.b); +//fprintf(f, "\"colorGroup\": \"rgb(%i,%i,%i)\",", colors.r, colors.g, colors.b ); distance = tree->childDistance/longest; if (distance != distance) distance = 0; -fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", 100*distance, ","); +//fprintf(f, "\"%s\"%s \"%f\"%s\n", "distance", ":", 100*distance, ","); +struct rgbColor wTB; +if (distance == 0) {wTB = whiteToBlackRainbowAtPos(.95-(distance*.95));} +else {wTB = whiteToBlackRainbowAtPos(.95-(sqrt(distance)*.95));} +fprintf(f, "\"distance\": \"%f\", \"whiteToBlack\":\"rgb(%i,%i,%i)\",\n", 100*distance, wTB.r, wTB.g, wTB.b); for (i = 0; i < level + 1; i++) fputc(' ', f); -fprintf(f, "\"%s\"%s\n", "children", ": ["); +fprintf(f, "\"children\":[\n"); rPrintHierarchicalJson(f, tree->left, level+1, distance); fputs(",\n", f); rPrintHierarchicalJson(f, tree->right, level+1, distance); fputc('\n', f); // Closes the children block for node objects for (i=0; i < level + 1; i++) fputc(' ', f); fputs("]\n", f); for (i = 0; i < level; i++) fputc(' ', f); fputs("}", f); } void printHierarchicalJson(FILE *f, struct hacTree *tree) /* Prints out the binary tree into .json format intended for d3 @@ -384,35 +406,33 @@ double distance = 0; rPrintHierarchicalJson(f, tree, 0, distance); fputc('\n', f); } double slBioExpVectorDistance(const struct slList *item1, const struct slList *item2, void *extraData) /* Return the absolute difference between the two kids' values. Weight based on how many nodes have been merged * to create the current node. Designed for HAC tree use*/ { verbose(3,"Calculating Distance...\n"); const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; int j; double diff = 0, sum = 0; -float kid1Weight = kid1->children / (float)(kid1->children + kid2->children); -float kid2Weight = kid2->children / (float)(kid1->children + kid2->children); for (j = 0; j < kid1->count; ++j) { - diff = (kid1Weight*kid1->vector[j]) - (kid2Weight*kid2->vector[j]); + diff = kid1->vector[j] - kid2->vector[j]; sum += (diff * diff); } return sqrt(sum); } struct slList *slBioExpVectorMerge(const struct slList *item1, const struct slList *item2, void *unusedExtraData) /* Make a new slPair where the name is the children names concattenated and the * value is the average of kids' values. * Designed for HAC tree use*/ { verbose(3,"Merging...\n"); const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; @@ -440,51 +460,82 @@ * normalize */ float total = 0.0; double purplePos = 0.80; struct slRef *el, *nextEl; for (el = leafList; el != NULL; el = nextEl) { nextEl = el->next; if (nextEl == NULL) break; struct bioExpVector *bio1 = el->val; struct bioExpVector *bio2 = nextEl->val; double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); if (distance != distance ) distance = 0; total += distance; } +if (total == 0) errAbort("There doesn't seem to be any difference between these matrix columns"); /* Loop through list a second time to generate actual colors. */ double soFar = 0; for (el = leafList; el != NULL; el = nextEl) { nextEl = el->next; if (nextEl == NULL) break; struct bioExpVector *bio1 = el->val; struct bioExpVector *bio2 = nextEl->val; double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); if (distance != distance ) distance = 0 ; soFar += distance; double normalized = soFar/total; - bio2->color = saturatedRainbowAtPos(normalized * purplePos); + bio2->color = saturatedRainbowAtPos(normalized*purplePos);// *purplePos } /* Set first color to correspond to 0, since not set in above loop */ struct bioExpVector *bio = leafList->val; bio->color = saturatedRainbowAtPos(0); } + +void colorLeavesFromAttbFile(struct slRef *leafList, char *attributef, char *colorHash) +/* Assign colors of rainbow to leaves using a weird attribute file. + * And a hashing file that corresponds the group to a color. */ +{ +struct hash *hT = hashTwoColumnFile(attributef); +struct slRef *el, *nextEl; +for (el = leafList; el != NULL; el = nextEl) + { + nextEl = el->next; + if (nextEl == NULL) + break; + struct bioExpVector *bio = el->val; + struct hashEl *hEl = hashLookup(hT, bio->name); + float *something = hEl->val; + double normalized = *something / 10.0 ; + bio->color = saturatedRainbowAtPos(normalized); + } + +} + +//void verifyInput(char *expMatrix) +/* This program keeps on breaking on weird matrix column values, hopefully this + * function will weed these issues out. It will be set on by default and can be + * turned off for speed with a flag */ +//{ +//char cmd1[1024], cmd2[1024]; +//} + + void convertInput(char *expMatrix, char *descFile, bool csv) /* Takes in a expression matrix and makes the inputs that this program will use. * Namely a transposed table with the first column removed. Makes use of system calls * to use cut, sed, kent utility rowsToCols, and paste (for descFile option). */ { char cmd1[1024], cmd2[1024]; if (csv) /* A sed one liner will convert comma separated values into a tab separated values*/ { char cmd3[1024]; safef(cmd3, 1024, "sed -i 's/,/\\t/g' %s ",expMatrix); verbose(2,"%s\n", cmd3); mustSystem(cmd3); } safef(cmd1, 1024, "cat %s | sed '1d' | rowsToCols stdin %s.transposedMatrix", expMatrix, expMatrix); @@ -502,75 +553,89 @@ safef(cmd3, 1024, "paste %s.cellNamesTemp %s > %s.cellNames", expMatrix, descFile, expMatrix); verbose(2,"%s\n", cmd2); mustSystem(cmd2); verbose(2,"%s\n", cmd3); mustSystem(cmd3); } else { safef(cmd2, 1024, "rowsToCols %s stdout | cut -f1 | sed \'1d\' > %s.cellNames", expMatrix, expMatrix); verbose(2,"%s\n", cmd2); mustSystem(cmd2); } } void generateHtml(FILE *outputFile, int nameSize, char* jsonFile) -// Generates a new .html file for the dendrogram. +// Generates a new .html file for the dendrogram. Will do some size calculations as well. { char *pageName = cloneString(jsonFile); chopSuffix(pageName); -fprintf(outputFile, " %s Radial Dendrogram "); +int textSize = 12 - log(nodeCount); +int radius = 540 + 270*log10(nodeCount); +int labelLength = 10+nameSize*(15-textSize); +if (labelLength > 100) labelLength = 100; +fprintf(outputFile," %s "); +carefulClose(&outputFile); } void expData(char *matrixFile, char *outDir, char *descFile) /* Read matrix and names into a list of bioExpVectors, run hacTree to * associate them, and write output. */ { convertInput(matrixFile, descFile, clCSV); struct bioExpVector *list = bioExpVectorListFromFile(catTwoStrings(matrixFile,".transposedMatrix")); verbose(2,"%lld allocated after bioExpVectorListFromFile\n", (long long)carefulTotalAllocated()); FILE *f = mustOpen(catTwoStrings(outDir,".json"),"w"); struct lm *localMem = lmInit(0); int size = fillInNames(list, catTwoStrings(matrixFile,".cellNames")); /* Allocate new string that is a concatenation of two strings. */ struct hacTree *clusters = NULL; if (clMultiThreads) { + uglyf("Using %i threads. \n", clThreads); clusters = hacTreeMultiThread(clThreads, (struct slList *)list, localMem, slBioExpVectorDistance, slBioExpVectorMerge, NULL, NULL); } else { + uglyf("Using 1 threads. \n"); clusters = hacTreeFromItems((struct slList *)list, localMem, slBioExpVectorDistance, slBioExpVectorMerge, NULL, NULL); } struct slRef *orderedList = getOrderedLeafList(clusters); colorLeaves(orderedList); if (clForceLayout) printForceLayoutJson(f,clusters); else{ printHierarchicalJson(f, clusters); FILE *htmlF = mustOpen(catTwoStrings(outDir,".html"),"w"); generateHtml(htmlF,size,catTwoStrings(outDir,".json")); } // Remove temporary files -char cleanup[1024]; -safef(cleanup, 1024, "rm %s", catTwoStrings(matrixFile, ".*")); +char cleanup[1024], cleanup2[1024]; +safef(cleanup, 1024, "rm %s.cellNames", matrixFile); +safef(cleanup2, 1024, "rm %s.transposedMatrix", matrixFile); mustSystem(cleanup); +mustSystem(cleanup2); verbose(2,"%lld allocated at end\n", (long long)carefulTotalAllocated()); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); clForceLayout = optionExists("forceLayout"); clCSV = optionExists("CSV"); clMultiThreads = optionExists("multiThreads"); clThreads = optionInt("threads", clThreads); clMemLim = optionInt("memLim", clMemLim); clDescFile = optionVal("descFile", clDescFile); if (argc != 3) usage();