6d929adf2631fc69b2bdb3f813f12b8590cbdd17 ceisenhart Fri Jul 11 16:23:44 2014 -0700 Added new comments, json output is still not working diff --git src/hg/expData/expData.c src/hg/expData/expData.c index 65fbbac..c874ef0 100644 --- src/hg/expData/expData.c +++ src/hg/expData/expData.c @@ -1,275 +1,279 @@ /* expData - Takes in a relational database and outputs expression information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "expData.h" #include "sqlList.h" #include "hacTree.h" #include "jsonWrite.h" void usage() /* Explain usage and exit. */ { errAbort( "expData - Takes in a relational database and outputs expression information\n" "usage:\n" " expData biosamples matrix output\n" "options:\n" " -xxx=XXX\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; -int gc = 0; -int gc2 = 0; -struct link -/* Contains the info for a Json link */ - { - struct link *next; - int source; // the source node - int target; // the target node - }; + +int target = 0; // Used for the target value in rlinkJson. +float longest = 0; // Used to normalize link distances in rlinkJson. + struct bioExpVector -/* Contains expression information for a biosample on many genes */ +/* Contains expression information for a biosample on many genes. */ { struct bioExpVector *next; - char *name; // name of biosample - int count; // Number of genes we have data for - double *vector; // An array allocated dynamically + char *name; // name of biosample. + int count; // Number of genes we have data for. + double *vector; // An array allocated dynamically. }; struct bioExpVector *bioExpVectorListFromFile(char *matrixFile) // Read a tab-delimited file and return list of bioExpVector. { int vectorSize = 0; struct lineFile *lf = lineFileOpen(matrixFile, TRUE); char *line, **row = NULL; struct bioExpVector *list = NULL, *el; while (lineFileNextReal(lf, &line)) { if (vectorSize == 0) { - // Detect first row + // Detect first row. vectorSize = chopByWhite(line, NULL, 0); // counting up AllocArray(row, vectorSize); continue; } AllocVar(el); AllocArray(el->vector, vectorSize); el->count = chopByWhite(line, row, vectorSize); assert(el->count == vectorSize); int i; for (i = 0; i < el->count; ++i) el->vector[i] = sqlDouble(row[i]); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } void fillInNames(struct bioExpVector *list, char *nameFile) -/* Fill in name field from file */ +/* Fill in name field from file. */ { struct lineFile *lf = lineFileOpen(nameFile, TRUE); char *line; struct bioExpVector *el = list; while (lineFileNextReal(lf, &line)) { if (el == NULL) { warn("More names than items in list"); break; } el->name = cloneString(line); el = el->next; } lineFileClose(&lf); } void rPrintNodes(FILE *f, struct hacTree *tree) { // Recursively prints out the nodes in a depth first order starting on the left char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name; +struct jsonWrite *jw = jsonWriteNew(); if (tree->childDistance != 0) + // If the current object is a node then we assign no name. { fprintf(f," %s\"%s\"%s\"%s\"%s", "{","name", ":", " ", ","); + jsonWriteString(jw, "name", " "); + jsonWriteNumber(jw, "y", tree->childDistance); fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , tree->childDistance, "},"); } else { + // Otherwise the current object is a leaf, and the tissue name is printed. + jsonWriteString(jw, "name", tissue); + jsonWriteNumber(jw, "y", tree->childDistance); fprintf(f," %s\"%s\"%s\"%s\"%s", "{","name", ":", tissue, ","); fprintf(f,"\"%s\"%s%0.31f%s\n", "y", ":" , tree->childDistance, "},"); } if (tree->left == NULL && tree->right == NULL) { return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); rPrintNodes(f, tree->left); rPrintNodes(f, tree->right); } -int testSource = 0, testTarget = 0; void rPrintLinks(FILE *f, struct hacTree *tree, int source) { -// recursively prints the links - -if (gc == gc2 - 1) - { - return; - } +// Recursively prints the links in json format. +if (tree->childDistance > longest) + // the first distance will be the longest, and is used for normalization + longest = tree->childDistance; /* if the current location is a leaf */ if (tree->left == NULL && tree->right == NULL) { return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); /* check for the end of the tree */ -/* left recursgion, the source and target are always ofset by 1 */ -++testTarget; -fprintf(f," %s\"%s\"%s%d%s", "{","source", ":", testTarget - 1, ","); -fprintf(f,"\"%s\"%s%d%s\n", "target", ":" , testTarget, "},"); -/* preps the source for the right links */ -source = testTarget; +/* Left recursion; the source and target are always ofset by 1. */ +++target; +fprintf(f," %s\"%s\"%s%d%s", "{","source", ":", target - 1, ","); +fprintf(f,"\"%s\"%s%d%s", "target", ":" , target, ","); +fprintf(f,"\"%s\"%s%0.31f%s\n", "distance", ":" , 100*(tree->childDistance/longest) , "},"); +/* Prepares the source for the right recursion. */ +source = target; rPrintLinks(f, tree->left, source); -/* print the right link */ -++testTarget; +/* Right recursion. */ +++target; fprintf(f," %s\"%s\"%s%d%s", "{","source", ":", source - 1 , ","); -fprintf(f,"\"%s\"%s%d%s\n", "target", ":" , testTarget, "},"); +fprintf(f,"\"%s\"%s%d%s", "target", ":" , target, ","); +fprintf(f,"\"%s\"%s%0.31f%s\n", "distance", ":" , 100*(tree->childDistance/longest) , "},"); rPrintLinks(f, tree->right, ++source); } void printJson(FILE *f, struct hacTree *tree) /* Prints the hacTree into a Json file format */ { int source = 0; // Basic json template for d3 visualizations +//struct jsonWrite *jw = jsonWriteNew(); fprintf(f,"%s\n", "{"); +//jsonWriteTag(struct jsonWrite *jw, char *var); fprintf(f," \"%s\"%s\n", "nodes", ":[" ); rPrintNodes(f, tree); fprintf(f, "%s\n", "],"); // Basic json template for d3 visualizations fprintf(f, "\"%s\"%s\n", "links", ":[" ); rPrintLinks(f,tree, source); fprintf(f," %s\n", "]"); fprintf(f,"%s\n", "}"); } static void rPrintSlBioExpVectorTree(FILE *f, struct hacTree *tree, int level,double distance) /* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */ { char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name; int i; for (i = 0; i < level; i++) fputc(' ', f); if (tree->left == NULL && tree->right == NULL) { fprintf(f, "{%s}%0.31f", tissue, distance); return; } else if (tree->left == NULL || tree->right == NULL) errAbort("\nHow did we get a node with one NULL kid??"); fprintf(f, "(%s%f\n", "node", tree->childDistance); distance += tree->childDistance; rPrintSlBioExpVectorTree(f, tree->left, level+1, distance); fputs(",\n", f); rPrintSlBioExpVectorTree(f, tree->right, level+1, distance); fputc('\n', f); for (i=0; i < level; i++) fputc(' ', f); fputs(")", f); } void printSlBioExpVectorTree(FILE *f, struct hacTree *tree) /* Print out cluster as nested-parens with {}'s around leaf nodes. */ { if (tree == NULL) { fputs("Empty tree.\n", f); return; } double distance = 0; rPrintSlBioExpVectorTree(f, tree, 0, distance); fputc('\n', f); } char* floatToString(float input) { char* result = needMem(sizeof(result)); sprintf(result,"%f", input); return result; freez(result); } double slBioExpVectorDistance(const struct slList *item1, const struct slList *item2, void *extraData) -/* Return the absolute difference between the two kids' values. */ +/* Return the absolute difference between the two kids' values. + * Designed for HAC tree use*/ { const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; int j; double diff = 0, sum = 0; for (j = 0; j < kid1->count; ++j) { diff = kid1->vector[j] - kid2->vector[j]; sum += (diff * diff); } return sqrt(sum); } struct slList *slBioExpVectorMerge(const struct slList *item1, const struct slList *item2, void *unusedExtraData) -/* Make a new slPair where the name is the both kids names and the +/* Make a new slPair where the name is the children names concattenated and the * value is the average of kids' values. * Designed for HAC tree use*/ { const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; struct bioExpVector *el; AllocVar(el); AllocArray(el->vector, kid1->count); el->count = kid1->count; el->name = catTwoStrings(kid1->name, kid2->name); int i; for (i = 0; i < el->count; ++i) { el->vector[i] = (kid1->vector[i] + kid2->vector[i])/2; } return (struct slList *)(el); } void expData(char *matrixFile, char *nameFile, char *outFile) /* Read matrix and names into a list of bioExpVectors, run hacTree to * associate them, and write output. */ { struct bioExpVector *list = bioExpVectorListFromFile(matrixFile); FILE *f = mustOpen(outFile,"w"); struct lm *localMem = lmInit(0); fillInNames(list, nameFile); struct hacTree *clusters = hacTreeFromItems((struct slList *)list, localMem, slBioExpVectorDistance, slBioExpVectorMerge, NULL, NULL); printJson(f,clusters); //printSlBioExpVectorTree(f,clusters); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); expData(argv[1], argv[2], argv[3]); return 0; }