8e4929ab14748403859c5a400a348f882a0cbcfb ceisenhart Sat Jun 28 12:53:24 2014 -0700 Takes in input files and performs an expression analysis using a Hac Tree. The output file is a binary tree, each leaf node is a specific tissue. diff --git src/hg/expData/expData.c src/hg/expData/expData.c index 3d0e285..4c4af77 100644 --- src/hg/expData/expData.c +++ src/hg/expData/expData.c @@ -1,204 +1,191 @@ /* expData - Takes in a relational database and outputs expression information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "expData.h" #include "sqlList.h" - +#include "hacTree.h" void usage() /* Explain usage and exit. */ { errAbort( "expData - Takes in a relational database and outputs expression information\n" "usage:\n" - " expData inputDataBase output\n" + " expData biosamples matrix output\n" "options:\n" " -xxx=XXX\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; -typedef struct -{ -char *name; -int count; -} expCell; -typedef struct +struct bioExpVector +/* Contains expression information for a biosample on many genes */ { -char *name; -float value; -}nameValue; + struct bioExpVector *next; + char *name; // name of biosample + int count; // Number of genes we have data for + double *vector; // An array allocated dynamically + }; +struct bioExpVector *bioExpVectorListFromFile(char *matrixFile) +// Read a tab-delimited file and return list of bioExpVector. -char* floatToString(float input) { -char* result = needMem(sizeof(result)); -sprintf(result,"%f", input); -return result; -freez(result); -} - -int compareExpCells(expCell exp1, expCell exp2) -/* a compare function for the HAC tree */ +int vectorSize = 0; +struct lineFile *lf = lineFileOpen(matrixFile, TRUE); +char *line, **row = NULL; +struct bioExpVector *list = NULL, *el; +while (lineFileNextReal(lf, &line)) { -int result = exp1.count - exp2.count; -return sqrt(result*result); -} - -expCell mergeExpCells(expCell exp1, expCell exp2) -/* a merge function for the HAC tree */ + if (vectorSize == 0) { -expCell result; -result.count = (exp1.count + exp2.count)/2; -result.name = catTwoStrings(exp1.name,exp2.name); -return result; + // Detect first row + vectorSize = chopByWhite(line, NULL, 0); // counting up + AllocArray(row, vectorSize); + continue; } - -int compareTwoRows(char *line1, char* line2) -/* seems to be working - * Takes in two rows from the table, as strings. - * Parses the rows and finds the summation of distance between - * the expression values of each row. */ -{ -char *output1[10000]; -char *output2[10000]; -char *temp1 = cloneString(line1); -char *temp2 = cloneString(line2); -int size1 = 0; -size1 = chopTabs(temp1,output1); -int size2 = 0; -size2 = chopTabs(temp2,output2); -//double result = 1000000; -//if (size1 != size2) -// { -// return result; -// } + AllocVar(el); + AllocArray(el->vector, vectorSize); + el->count = chopByWhite(line, row, vectorSize); + assert(el->count == vectorSize); int i; -double sum = 0; -for (i = 0; i<size1; ++i) - { - double diff = atoi(output1[i]) - atoi(output2[i]); - sum += sqrt((diff * diff)); + for (i = 0; i < el->count; ++i) + el->vector[i] = sqlDouble(row[i]); + slAddHead(&list, el); } -return sum; +lineFileClose(&lf); +slReverse(&list); +return list; } - - -void fileIO (char *input, char *output) +void fillInNames(struct bioExpVector *list, char *nameFile) +/* Fill in name field from file */ { -FILE *f = mustOpen(output,"w"); -//struct lm *localMem = lmInit(0); -struct lineFile *lf = lineFileOpen(input, TRUE); +struct lineFile *lf = lineFileOpen(nameFile, TRUE); char *line; -long count = 1; -int i, j; -struct hash *hashRows = hashNew(0); -if (!lineFileNext(lf, &line, NULL)) - { - errAbort("%s This should be the column names", lf->fileName); - } -while (lineFileNext(lf,&line,NULL)) - { - char *temp = cloneString(line); - hashAdd(hashRows,floatToString(count),temp); - ++ count; - } -for (i = 1; i < count; ++i) +struct bioExpVector *el = list; +while (lineFileNextReal(lf, &line)) { - for (j = i + 1 ; j < count ; ++j) + if (el == NULL) { - expCell exp; - exp.name = catTwoStrings(floatToString(i), floatToString(j)); - char *temp1 = hashFindVal(hashRows, floatToString(i)); - char *temp2 = hashFindVal(hashRows, floatToString(j)); - exp.count = compareTwoRows(temp1,temp2); - fprintf(f,"%d",compareTwoRows(temp1, temp2)); - fprintf(f,"%s\n"," "); + warn("More names than items in list"); + break; } + el->name = cloneString(line); + el = el->next; } -//struct hacTree *clusters = hacTreeFromItems((struct slList *)sldList, localMem, compareExpCell, mergeExpCell, NULL, NULL); +lineFileClose(&lf); } -void printOutput(FILE *f, struct hash *hash, int count) -/* Prints the data to the output file. - * Each row corresponds to a tissue sample; the first element. - * ALl subsequent elements are in sequence name : expression value pairs. */ +static void rPrintSlBioExpVectorTree(FILE *f, struct hacTree *tree, int level) +/* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */ { -nameValue *output; +char *tissue = ((struct bioExpVector *)(tree->itemOrCluster))->name; int i; -for (i = 0 ; i < count; ++i) +for (i = 0; i < level; i++) + fputc(' ', f); +if (tree->left == NULL && tree->right == NULL) { - output = hashFindVal(hash,floatToString(i)); - struct hashEl *hel; - fprintf(f,"%s", floatToString(i)); - fprintf(f,"%s", " "); - for (hel = hashLookup(hash,floatToString(i)); hel != NULL; hel = hashLookupNext(hel)) - { - nameValue *temp = hel->val; - char *name = temp->name; - long score = temp->value; - fprintf(f, "%s", name); - fprintf(f, "%s", ":"); - fprintf(f, "%ld", score); - fprintf(f, "%s", " + "); + fprintf(f, "{%s}", tissue); + return; + } +else if (tree->left == NULL || tree->right == NULL) + errAbort("\nHow did we get a node with one NULL kid??"); +fprintf(f, "(%s\n", tissue); +rPrintSlBioExpVectorTree(f, tree->left, level+1); +fputs(",\n", f); +rPrintSlBioExpVectorTree(f, tree->right, level+1); +fputc('\n', f); +for (i=0; i < level; i++) + fputc(' ', f); +fputs(")", f); } - fprintf(f, "\n%s", " " ); + +void printSlBioExpVectorTree(FILE *f, struct hacTree *tree) +/* Print out cluster as nested-parens with {}'s around leaf nodes. */ +{ +if (tree == NULL) + { + fputs("Empty tree.\n", f); + return; } +rPrintSlBioExpVectorTree(f, tree, 0); +fputc('\n', f); } +char* floatToString(float input) +{ +char* result = needMem(sizeof(result)); +sprintf(result,"%f", input); +return result; +freez(result); +} +double slBioExpVectorDistance(const struct slList *item1, const struct slList *item2, void *extraData) +/* Return the absolute difference between the two kids' values. */ +{ +const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; +const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; +int j; +double diff = 0, sum = 0; +for (j = 0; j < kid1->count; ++j) + { + diff = kid1->vector[j] - kid2->vector[j]; + sum += (diff * diff); + } +return sqrt(sum); +} -void expData(char *output) -/* Grabs expression data and formats it nicely */ +struct slList *slBioExpVectorMerge(const struct slList *item1, const struct slList *item2, + void *unusedExtraData) +/* Make a new slPair where the name is the both kids names and the + * value is the average of kids' values. + * Designed for HAC tree use*/ { -FILE *f = mustOpen(output, "w"); -struct sqlConnection *conn = sqlConnect("hgFixed"); -char query[512]; -sqlSafef(query, sizeof(query), "select * from gnfHumanAtlas2Median limit 2"); -struct sqlResult *sr = sqlGetResult(conn, query); -char **row; -char *line = needMem(sizeof(line)); -struct expData *list = NULL; -struct hash *hash = hashNew(0); -int count = 0; -while ((row = sqlNextRow(sr)) != NULL) - { - struct expData *exp = expDataLoad(row); +const struct bioExpVector *kid1 = (const struct bioExpVector *)item1; +const struct bioExpVector *kid2 = (const struct bioExpVector *)item2; +struct bioExpVector *el; +AllocVar(el); +AllocArray(el->vector, kid1->count); +el->count = kid1->count; +el->name = catTwoStrings(kid1->name, kid2->name); int i; - count = exp->expCount; - for (i = 0; i<exp->expCount; ++i) +for (i = 0; i < el->count; ++i) { - nameValue *pair = needMem(sizeof(pair)); - pair->name = exp->name; - pair->value = exp->expScores[i]; - hashAdd(hash,floatToString(i),pair); - } - slAddHead(&list, exp); + el->vector[i] = (kid1->vector[i] + kid2->vector[i])/2; } -printOutput(f,hash,count); -slReverse(&list); -expDataFreeList(&list); -carefulClose(&f); +return (struct slList *)(el); } +void expData(char *matrixFile, char *nameFile, char *outFile) +/* Read matrix and names into a list of bioExpVectors, run hacTree to + * associate them, and write output. */ +{ +struct bioExpVector *list = bioExpVectorListFromFile(matrixFile); +FILE *f = mustOpen(outFile,"w"); +struct lm *localMem = lmInit(0); +fillInNames(list, nameFile); +struct hacTree *clusters = hacTreeFromItems((struct slList *)list, localMem, + slBioExpVectorDistance, slBioExpVectorMerge, NULL, NULL); +printSlBioExpVectorTree(f,clusters); +} int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); -if (argc != 3) +if (argc != 4) usage(); -fileIO(argv[1],argv[2]); +expData(argv[1], argv[2], argv[3]); return 0; }