56faab879b052fc2cdf3265cd34b36064faf6fe9 ceisenhart Tue Mar 8 12:33:32 2016 -0800 Changing the program name as suggested by Jim in #16789, refs #16182 diff --git src/hg/expMatrixToJson/expMatrixToJson.c src/hg/expMatrixToJson/expMatrixToJson.c deleted file mode 100644 index 8bea130..0000000 --- src/hg/expMatrixToJson/expMatrixToJson.c +++ /dev/null @@ -1,636 +0,0 @@ -/* expData - Takes in an expression matrix and clusters it using a hierarchical agglomerative clustering algorithm. The output defaults to a hierarchichal .json format, with two additional options. */ -#include "common.h" -#include "linefile.h" -#include "hash.h" -#include "options.h" -#include "obscure.h" -#include "memalloc.h" -#include "jksql.h" -#include "expData.h" -#include "jsonWrite.h" -#include "sqlList.h" -#include "hacTree.h" -#include "rainbow.h" - -boolean clCSV = FALSE; // Converts the comma separated matrix into a tab based file. -int clThreads = 10; // The number of threads to run with the multiThreads option -int clMemLim = 4; // The amount of memeory the program can use, read in Gigabytes. -float clLongest = 0; // Used to normalize link distances in rlinkJson. -char* clDescFile = NULL; // The user can provide a description file -char* clAttributeTable = NULL; // The user can provide an attributes table... this may get removed soon. -int nodeCount; //The number of nodes. -int internalNodes = 0; -void usage() -/* Explain usage and exit. */ -{ -errAbort( - "expMatrixToJson - Takes in an expression matrix and outputs a binary tree clustering the data.\n" - " The tree is output as a .json file, a .html file is generated to view the \n" - " tree. The files are named using the output argument (ex output.json, output.html).\n" - "usage:\n" - " expMatrixToJson [options] matrix output\n" - "options:\n" - " -CSV The input matrix is in .csv format. \n" - " -threads=int Sets the thread count for the multiThreads option, default is 10 \n" - " -memLim=int Sets the amount of memeory the program can use before aborting. The default is 4G. \n" - " -verbose=2 Show basic run stats. \n" - " -verbose=3 Show all run stats. Very ugly, avoid at all costs. \n" - " -descFile=string The user is providing a description file. The description file must provide a \n" - " description for each cell line in the expression matrix. There should be one description per \n" - " line, starting on the left side of the expression matrix. The description will appear over a \n" - " leaf node when hovered over.\n" - ); -} - -/* Command line validation table. */ -static struct optionSpec options[] = { - {"CSV", OPTION_BOOLEAN}, - {"threads", OPTION_INT}, - {"memLim", OPTION_INT}, - {"descFile", OPTION_STRING}, - {"attributeTable", OPTION_STRING}, - {NULL, 0}, - }; - -struct slDoubleInt -/* Used to keep track of the top ten genes contributed */ - { - struct slDoubleInt *next; - double val; - int index; - }; - -struct bioExpVector -/* Contains expression information for a biosample on many genes. */ - { - struct bioExpVector *next; - char *name; // name of biosample. - char *desc; // description of biosample. - int count; // Number of genes we have data for. - double *vector; // An array allocated dynamically. - struct rgbColor color; // Color for this one - int children; // Number of bioExpVectors used to build the current - struct slDoubleInt *topGeneIndeces; // The indeces for the top 10 genes that drove the clustering up to this point - int contGenes; //The number of contributing genes - int mergeCount; //Number in the merge chain. - }; - - -struct slDoubleInt *slDoubleIntNew(double x, int y) -/* Return a new doubleInt */ - { - struct slDoubleInt *a; - AllocVar(a); - a->val = x; - a->index = y; - return a; - } - -int slDoubleIntCmp(const void *va, const void *vb) -/* Compare two doubleInts */ -{ - const struct slDoubleInt *a = *((struct slDoubleInt **)va); - const struct slDoubleInt *b = *((struct slDoubleInt **)vb); - double diff = a->val - b->val; - if (diff < 0) - return -1; - else if (diff > 0) - return 1; - else - return 0; - } - -double stringToDouble(char *s) -/* Convert string to a double. Assumes all of string is number - * and aborts on an error. Errors on 'nan'*/ - { - char* end; - double val = strtod(s, &end); - - if (val != val) errAbort("A value of %f was encountered. Please change this value then re run the program.", val); - if ((end == s) || (*end != '\0')) - errAbort("invalid double: %s", s); - return val; - } - -struct bioExpVector *bioExpVectorListFromFile(char *matrixFile) -/* Read a tab-delimited file and return list of bioExpVectors */ - { - int vectorSize = 0; - struct lineFile *lf = lineFileOpen(matrixFile, TRUE); - char *line, **row = NULL; - struct bioExpVector *list = NULL, *el; - while (lineFileNextReal(lf, &line)) - { - ++nodeCount; - if (vectorSize == 0) - { - // Detect first row. - vectorSize = chopByWhite(line, NULL, 0); // counting up - AllocArray(row, vectorSize); - continue; - } - AllocVar(el); - AllocArray(el->vector, vectorSize); - el->count = chopByWhite(line, row, vectorSize); - assert(el->count == vectorSize); - int i; - for (i = 0; i < el->count; ++i) - el->vector[i] = stringToDouble(row[i]); - el->children = 1; - el->contGenes = 0; - slAddHead(&list, el); - } - lineFileClose(&lf); - slReverse(&list); - return list; - } - -int fillInNames(struct bioExpVector *list, char *nameFile) -/* Fill in name field from file. */ - { - struct lineFile *lf = lineFileOpen(nameFile, TRUE); - char *line; - struct bioExpVector *el = list; - int maxSize = 0 ; - - while (lineFileNextReal(lf, &line)) - { - if (el == NULL) - { - warn("More names than items in list"); - break; - } - char *fields[2]; - if (strlen(line) > maxSize) maxSize = strlen(line); - int fieldCount = chopTabs(line, fields); - if (fieldCount >= 1) - { - el->name = cloneString(fields[0]); - if (fieldCount >= 2) - el->desc = cloneString(fields[1]); - else - el->desc = cloneString("0"); - } - el = el->next; - } - if (el != NULL) - errAbort("More items in matrix file than %s", nameFile); - lineFileClose(&lf); - return maxSize; - } - -static void rAddLeaf(struct hacTree *tree, struct slRef **pList) -/* Recursively add leaf to list */ - { - if (tree->left == NULL && tree->right == NULL) - refAdd(pList, tree->itemOrCluster); - else - { - rAddLeaf(tree->left, pList); - rAddLeaf(tree->right, pList); - } - } - -struct slRef *getOrderedLeafList(struct hacTree *tree) -/* Return list of references to bioExpVectors from leaf nodes - * ordered by position in hacTree */ - { - struct slRef *leafList = NULL; - rAddLeaf(tree, &leafList); - slReverse(&leafList); - return leafList; - } - - -static void printJsonNode(FILE *f, struct hacTree *tree, int level, double distance, struct slName *geneNames) - { - int i; - struct bioExpVector *bio = (struct bioExpVector *)tree->itemOrCluster; - struct rgbColor colors = bio->color; - for (i = 0; i < level + 1; i++) - fputc(' ', f); - distance = tree->childDistance/clLongest; - double length = 0; - if (tree->parent != NULL) - { - length = tree->parent->childDistance; - } - ++internalNodes; - fprintf(f, "{\"name\":\" \", \"number\":\"%i\", \"kids\":\"%i\", \"tpmDistance\":" - "\"%f\", \"length\": \"%f\", \"colorGroup\": \"rgb(%i,""%i,%i)\"," - "\"contGenes\":\"%i\",\"geneList\": {",internalNodes, bio->children, - tree->childDistance, length, colors.r,colors.g,colors.b, bio->contGenes); - - struct slDoubleInt *j; - for (j = bio->topGeneIndeces; j != NULL; j = j->next) - { - struct slName *geneName = slElementFromIx(geneNames, j->index); - fprintf(f,"\"%s\":\"%f\"", geneName->name, j->val); - if (j->next != NULL) fprintf(f, ", "); - - } - fprintf(f , "},"); - if (distance != distance) distance = 0; - struct rgbColor wTB; - struct rgbColor wTBsqrt; - struct rgbColor wTBquad; - if (distance == 0) { - wTB = whiteToBlackRainbowAtPos(0); - wTBsqrt = whiteToBlackRainbowAtPos(0); - wTBquad = whiteToBlackRainbowAtPos(0); - } - else { - wTB = whiteToBlackRainbowAtPos(distance*.95); - wTBsqrt = whiteToBlackRainbowAtPos(sqrt(distance*.95)); - wTBquad = whiteToBlackRainbowAtPos(sqrt(sqrt(distance*.95))); - } - fprintf(f, "\"normalizedDistance\": \"%f\", \"whiteToBlack\":\"rgb(%i,%i,%i)\", \"whiteTo", - distance, wTB.r, wTB.g, wTB.b); - fprintf(f, "BlackSqrt\":\"rgb(%i,%i,%i)\", \"whiteToBlackQuad\":\"rgb(%i,%i,%i)\",\n", - wTBsqrt.r, wTBsqrt.g, wTBsqrt.b, wTBquad.r, wTBquad.g, wTBquad.b); - for (i = 0; i < level + 1; i++) - fputc(' ', f); - fprintf(f, "\"children\":[\n"); - } - -static void rPrintHierarchicalJson(FILE *f, struct hacTree *tree, int level, double distance, struct slName *geneNames) -/* Recursively prints out the elements of the hierarchical .json file. */ - { - struct bioExpVector *bio = (struct bioExpVector *)tree->itemOrCluster; - char *tissue = bio->name; - struct rgbColor colors = bio->color; - if (tree->childDistance > clLongest) - /* In practice the first distance will be the longest, and is used for normalization. */ - clLongest = tree->childDistance; - - int i; - for (i = 0; i < level; i++) - fputc(' ', f); // correct spacing for .json format - - if (tree->left == NULL && tree->right == NULL) // Check if the current node is a leaf - { - fprintf(f, "{\"name\":\"%s\",\"kids\":\"0\",\"length\":\"%f\",\"colorGroup\":\"rgb(%i,%i,%i)\"}", - tissue, tree->parent->childDistance, colors.r, colors.g, colors.b); - return; - } - // Sanity check leaf nodes, they should not have any kids, definitely should not have a single kid. - else if (tree->left == NULL || tree->right == NULL) - errAbort("\nHow did we get a node with one NULL kid??"); - - printJsonNode(f, tree, level, distance, geneNames); - - rPrintHierarchicalJson(f, tree->left, level+1, distance, geneNames); - fputs(",\n", f); - rPrintHierarchicalJson(f, tree->right, level+1, distance, geneNames); - fputc('\n', f); - // Closes the children block for node objects - for (i=0; i < level + 1; i++) - fputc(' ', f); - fputs("]\n", f); - for (i = 0; i < level; i++) - fputc(' ', f); - fputs("}", f); - } - -void printHierarchicalJson(FILE *f, struct hacTree *tree, char *geneNamesFile) -/* Prints out the binary tree into .json format intended for d3 - * hierarchical layouts */ - { - if (tree == NULL) - { - fputs("Empty tree.\n", f); - return; - } - double distance = 0; - struct lineFile *lf = lineFileOpen(geneNamesFile, TRUE); - char *line; - struct slName *geneNames; - AllocVar(geneNames); - while (lineFileNextReal(lf, &line)) - { - struct slName *geneName = newSlName(cloneString(line)); - slAddTail(&geneNames, geneName); - } - lineFileClose(&lf); - rPrintHierarchicalJson(f, tree, 0, distance, geneNames); - } - -double slBioExpVectorDistance(const struct slList *item1, const struct slList *item2, void *extraData) -/* Return the absolute difference between the two kids' values. Weight based on how many nodes have been merged - * to create the current node. Designed for HAC tree use*/ - { - verbose(3,"Calculating Distance...\n"); - const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; - const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; - int j; - double diff = 0, sum = 0; - for (j = 0; j < kid1->count; ++j) - { - diff = kid1->vector[j] - kid2->vector[j]; - sum += (diff * diff); - } - return sqrt(sum); - } - -int mergeCount = 0; - -struct slList *slBioExpVectorMerge(const struct slList *item1, const struct slList *item2, - void *unusedExtraData) -/* Make a new slPair where the name is the children names concattenated and the - * value is the average of kids' values. - * Designed for HAC tree use*/ - { - verbose(3,"Merging...\n"); - const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; - const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; - float kid1Weight = kid1->children / (float)(kid1->children + kid2->children); //Weight based on number of children. - float kid2Weight = kid2->children / (float)(kid1->children + kid2->children); - struct bioExpVector *el; - AllocVar(el); - AllocArray(el->vector, kid1->count); - assert(kid1->count == kid2->count); - el->count = kid1->count; - el->name = catTwoStrings(kid1->name, kid2->name); - int i; - mergeCount++; - int gCount = 0; - for (i = 0; i < el->count; ++i) - { - if (kid1->vector[i] == kid2->vector[i]) // We are doing thousands of merges, lets cut out useless compute where we can. - { - el->vector[i] = kid1->vector[i]; - continue; - } - el->vector[i] = (kid1Weight*kid1->vector[i] + kid2Weight*kid2->vector[i]); - float diff; - if (((float)(kid1->vector[i])) > ((float)(kid2->vector[i]))) diff = ((float)(kid1->vector[i])) - ((float)(kid2->vector[i])); - else diff = ((float)(kid2->vector[i])) - ((float)(kid1->vector[i])); - if (diff != 0.0) // Only consider diffs that are non zero - { - ++el->contGenes; // This gene is non zero so it is contributing - ++gCount; - int index = i + 1; - if (gCount <= 10){ - struct slDoubleInt *newGene = slDoubleIntNew(diff, index); - slAddHead(&el->topGeneIndeces, newGene); - slSort(&el->topGeneIndeces, slDoubleIntCmp); - } - else{ - if (el->vector[i] > el->topGeneIndeces->val){ - slPopHead(el->topGeneIndeces); - struct slDoubleInt *newGene = slDoubleIntNew(diff, index); - slAddHead(&el->topGeneIndeces, newGene); - slSort(&el->topGeneIndeces, slDoubleIntCmp); - } - } - } - } - slReverse(&el->topGeneIndeces); - el->mergeCount = mergeCount; - el->children = kid1->children + kid2->children; - return (struct slList *)(el); - } - -const struct slList *lastLeaf = NULL; - -bool slBioExpVectorCmp(const struct slList *item1, const struct slList *item2, - void *unusedExtraData) -/* Compare two bioExpVectors to decide which one becomes the left child and which becomes the right. - * Return TRUE if item 1 is left child and item 2 is right child. */ - { - const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; - const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; - if (kid1->children !=1 && kid2->children ==1) lastLeaf = item2; - if (kid1->children ==1 && kid2->children !=1) lastLeaf = item1; - - if (kid1->children > kid2->children) - { - return TRUE; // Whoever has more kids gets to be first born - } - else if (kid1->children < kid2->children)return FALSE; - else{ - if (kid1->mergeCount < kid2->mergeCount) return TRUE; - else if (kid1->mergeCount > kid2->mergeCount) return FALSE; - else{ - // These are two leaf siblings, things get a bit tricky here... - // First we find the closest neighbor (who should be a firstborn) - if (lastLeaf == NULL) - { - // This is the very first merge, not certain how to handle it so lets just return True for now - lastLeaf = item1; - return TRUE; - } - else{ - double score1 = slBioExpVectorDistance(item1, lastLeaf, NULL); - double score2 = slBioExpVectorDistance(item2, lastLeaf, NULL); - if (score1 < score2) - { - lastLeaf = item2; - return TRUE; - } - else{ - lastLeaf = item1; - return FALSE; - } - } - } - } - } - -void colorLeaves(struct slRef *leafList) -/* Assign colors to leaves. I am very unhappy with the inconsistencies here. Namely the when you have two leaf siblings - * they are essentially assigned at random, which makes this metric fairly useless? Im less than stoked*/ - { - float total = 0.0; - struct slRef *el, *nextEl; - - /* Loop through list once to figure out the longest distance since we need to normalize */ - for (el = leafList; el != NULL; el = nextEl) - { - nextEl = el->next; - if (nextEl == NULL) - break; - struct bioExpVector *bio1 = el->val; - struct bioExpVector *bio2 = nextEl->val; - double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); - if (distance > total) total = distance; - } - - /* Loop through list a second time to generate actual colors. */ - bool firstLine = TRUE; - for (el = leafList; el != NULL; el = nextEl) - { - nextEl = el->next; - if (nextEl == NULL) - break; - struct bioExpVector *bio1 = el->val; - struct bioExpVector *bio2 = nextEl->val; - double distance = slBioExpVectorDistance((struct slList *)bio1, (struct slList *)bio2, NULL); - if (firstLine) // Handle the first two nodes, color them both the same - { - double normalized = distance/total; - struct rgbColor col = whiteToBlackRainbowAtPos(normalized); - bio1->color = col; - bio2->color = col; - firstLine = FALSE; - continue; - } - double normalized = distance/total; - // Color the next node based on distance from previous leaf node - if (normalized >= .95) bio2->color = whiteToBlackRainbowAtPos(.95); - else bio2->color = whiteToBlackRainbowAtPos(normalized); - } - } - -void convertInput(char *expMatrix, char *descFile, bool csv) -/* Takes in a expression matrix and makes the inputs that this program will use. - * Namely a transposed table with the first column removed. Makes use of system calls - * to use cut, sed, kent utility rowsToCols, and paste (for descFile option). */ - { - char cmd[1024],cmd1[1024], cmd2[1024]; - if (csv) - /* A sed one liner will convert comma separated values into a tab separated values*/ - { - char cmd3[1024]; - safef(cmd3, 1024, "sed -i 's/,/\\t/g' %s ",expMatrix); - verbose(2,"%s\n", cmd3); - mustSystem(cmd3); - } - safef(cmd, 1024, "cut -f 1 %s | sed \'1d\' > %s.genes", expMatrix, expMatrix); - mustSystem(cmd); - safef(cmd1, 1024, "cat %s | sed '1d' | rowsToCols stdin %s.transposedMatrix", expMatrix, expMatrix); - /* Exp matrices are X axis of cell lines and Y axis of transcripts. This causes long Y axis and short - * X axis, which are not handled well in C. The matrix is transposed to get around this issue. */ - verbose(2,"%s\n", cmd1); - mustSystem(cmd1); - /* Pull out the cell names, and store them in a separate file. This allows the actual data matrix to - * have the first row identify the transcript, then all following rows contain only expression values. - * By removing the name before hand the computation was made faster and easier. */ - if (descFile) - { - char cmd3[1024]; - safef(cmd2, 1024, "rowsToCols %s stdout | cut -f1 | sed \'1d\' > %s.cellNamesTemp", expMatrix, expMatrix); - safef(cmd3, 1024, "paste %s.cellNamesTemp %s > %s.cellNames", expMatrix, descFile, expMatrix); - verbose(2,"%s\n", cmd2); - mustSystem(cmd2); - verbose(2,"%s\n", cmd3); - mustSystem(cmd3); - } - else - { - safef(cmd2, 1024, "rowsToCols %s stdout | cut -f1 | sed \'1d\' > %s.cellNames", expMatrix, expMatrix); - verbose(2,"%s\n", cmd2); - mustSystem(cmd2); - } - } - -void generateHtml(FILE *outputFile, int nameSize, char* jsonFile) -// Generates a new .html file for the dendrogram. Will do some size calculations as well. - { - char *pageName = cloneString(jsonFile); - chopSuffix(pageName); - int textSize = 12 - log(nodeCount); - int labelLength = 10+nameSize*(15-textSize); - if (labelLength > 100) labelLength = 100; - - fprintf(outputFile,"\n"); - fprintf(outputFile,"
\n"); - fprintf(outputFile,"\n");
- fprintf(outputFile," Dendrogram\n"); - fprintf(outputFile," \n"); - fprintf(outputFile," | \n");
- fprintf(outputFile,"