4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/regulate/regClusterTreeCells/regClusterTreeCells.c src/hg/regulate/regClusterTreeCells/regClusterTreeCells.c index 1e9efcb..b34df66 100644 --- src/hg/regulate/regClusterTreeCells/regClusterTreeCells.c +++ src/hg/regulate/regClusterTreeCells/regClusterTreeCells.c @@ -1,493 +1,493 @@ /* regClusterTreeCells - Create a binary tree of cell types based on hierarchical clustering (Eisen style) of expression data.. */ /* Copyright (C) 2011 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" #include "bedGraph.h" #include "hmmstats.h" #include "obscure.h" void usage() /* Explain usage and exit. */ { errAbort( "regClusterTreeCells - Create a binary tree of cell types based on hierarchical clustering\n" "(Eisen style) of expression data.\n" "usage:\n" " regClusterTreeCells inFiles.lst output.tree output.distances\n" "Where inFiles.lst is a list of bedGraph format files: <chrom><start><end><score>\n" "options:\n" " -short - Abbreviate output by removing stuff shared by all.\n" " -rename=twoCol.tab - Rename files according to file/name columns\n" ); } struct hash *renameHash = NULL; static struct optionSpec options[] = { {"short", OPTION_BOOLEAN}, {"rename", OPTION_STRING}, {NULL, 0}, }; struct treeNode /* An element in a hierarchical binary tree. */ { struct treeNode *left, *right; /* Two children */ int vectorSize; /* # of items in vector below. */ float *vector; /* Array of doubles, in this case gene expression values. */ char *fileName; /* File data came from. */ boolean merged; /* If true, has been merged. */ }; struct bedGraph *readTabFileAsBedGraph( char *fileName, int chromIx, int startIx, int endIx, int dataIx, struct lm *lm) /* Read in indicated columns from a tab-separated file and create a list of bedGraphs from them. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[128]; int maxRowSize = ArraySize(row); int rowSize; struct bedGraph *list = NULL, *el; /* Figure out minimum number of columns for file. */ int maxIx = chromIx; if (maxIx < startIx) maxIx = startIx; if (maxIx < endIx) maxIx = endIx; if (maxIx < dataIx) maxIx = dataIx; while ((rowSize = lineFileChopNext(lf, row, maxRowSize)) != 0) { /* Check row is not too big, but not too small either. */ if (rowSize == maxRowSize) errAbort("Too many words line %d of %s, can only handle %d\n", lf->lineIx, lf->fileName, maxRowSize-1); if (rowSize <= maxIx) errAbort("Need at least %d columns, got %d, line %d of %s\n", maxIx+1, rowSize, lf->lineIx, lf->fileName); /* Allocate and fill in bedGraph from local memory pool */ lmAllocVar(lm, el); el->chrom = lmCloneString(lm, row[chromIx]); el->chromStart = lineFileNeedNum(lf, row, startIx); el->chromEnd = lineFileNeedNum(lf, row, endIx); el->dataValue = lineFileNeedDouble(lf, row, dataIx); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } static void notIdenticalRegions(struct lineFile *lf) /* Complain about regions not being identical. */ { errAbort("Line %d of %s, region not the same as corresponding line of first file", lf->lineIx, lf->fileName); } struct treeNode *readTabFileAsNode( char *fileName, int chromIx, int startIx, int endIx, int dataIx, struct bedGraph *bedList, int vectorSize) /* Read file into a treeNode. Check while reading it that the chrom/start/end * are the same as bedList. Effectively this makes sure that all of the files * are ordered the same way. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[128]; int maxRowSize = ArraySize(row); int rowSize; /* Figure out minimum number of columns for file. */ int maxIx = chromIx; if (maxIx < startIx) maxIx = startIx; if (maxIx < endIx) maxIx = endIx; if (maxIx < dataIx) maxIx = dataIx; /* Allocate tree node and it's vector. */ struct treeNode *treeNode; AllocVar(treeNode); treeNode->fileName = cloneString(fileName); treeNode->vectorSize = vectorSize; float *vector = AllocArray(treeNode->vector, vectorSize); /* Loop through file. */ int count = 0; struct bedGraph *el = bedList; while ((rowSize = lineFileChopNext(lf, row, maxRowSize)) != 0) { /* Check row is not too big, but not too small either, and that there are not too * many of them. */ if (rowSize == maxRowSize) errAbort("Too many words line %d of %s, can only handle %d\n", lf->lineIx, lf->fileName, maxRowSize-1); if (rowSize <= maxIx) errAbort("Need at least %d columns, got %d, line %d of %s\n", maxIx+1, rowSize, lf->lineIx, lf->fileName); if (count >= vectorSize) errAbort("Too many items %s. First file only has %d\n", lf->fileName, vectorSize); /* Check we are covering same regions. */ if (!sameString(row[chromIx], el->chrom)) notIdenticalRegions(lf); if (el->chromStart != lineFileNeedNum(lf, row, startIx)) notIdenticalRegions(lf); if (el->chromEnd != lineFileNeedNum(lf, row, endIx)) notIdenticalRegions(lf); /* Store away the data. */ vector[count] = lineFileNeedDouble(lf, row, dataIx); /* And advance to next one. */ ++count; el = el->next; } if (count != vectorSize) errAbort("Expecting %d items in %s, just got %d", vectorSize, lf->fileName, count); lineFileClose(&lf); return treeNode; } double treeNodeDistance(struct treeNode *aNode, struct treeNode *bNode) /* Return euclidean distance between vectors of two nodes. */ { float *av = aNode->vector, *bv = bNode->vector; int size = aNode->vectorSize; double sumSquares = 0; int i; for (i=0; i<size; ++i) { double d = av[i] - bv[i]; sumSquares += d*d; } return sqrt(sumSquares); } void normalizeTreeNode(struct treeNode *treeNode) /* Convert vector to z-scores - subtract mean, divide by standard deviation. */ { /* Get local vars for stuff we use a bunch. */ float *vector = treeNode->vector; int size = treeNode->vectorSize; /* Calculate standard deviation and mean. */ double sum = 0, sumSquares = 0; int i; for (i=0; i<size; ++i) { double x = vector[i]; sum += x; sumSquares += x*x; } float mean = sum/size; float std = calcStdFromSums(sum, sumSquares, size); float invStd = 1.0/std; /* Rescale vector. */ for (i=0; i<size; ++i) vector[i] = (vector[i] - mean)*invStd; } void findClosestPair(struct treeNode *array[], int nodesUsed, double **distanceMatrix, struct treeNode **retA, struct treeNode **retB) /* Find closest pair of nodes in array using distance matrix for distance. */ { int i,j; double closestDistance = BIGDOUBLE; struct treeNode *a, *b, *closestA = NULL, *closestB = NULL; for (i=0; i<nodesUsed; ++i) { a = array[i]; if (!a->merged) { for (j=0; j<i; ++j) { b = array[j]; if (!b->merged) { double distance = distanceMatrix[i][j]; if (distance < closestDistance) { closestDistance = distance; closestA = a; closestB = b; } } } } } *retA = closestA; *retB = closestB; } float *averageVectors(float *a, float *b, int size) /* Allocate and return a vector that is the average of a & b */ { float *x = needLargeMem(size * sizeof(float)); int i; for (i=0; i<size; ++i) x[i] = 0.5 * (a[i] + b[i]); return x; } void rOutputTree(FILE *f, struct treeNode *parent, struct treeNode *node, int level, int prefixSize, int suffixSize) /* Recursively output tree. */ { if (node->fileName) { if (renameHash) { char *s = hashMustFindVal(renameHash, node->fileName); mustWrite(f, s, strlen(s)); } else { char *s = node->fileName + prefixSize; int len = strlen(s) - suffixSize; mustWrite(f, s, len); } } else { fprintf(f, "("); rOutputTree(f, node, node->left, level+1, prefixSize, suffixSize); fprintf(f, ","); rOutputTree(f, node, node->right, level+1, prefixSize, suffixSize); fprintf(f, ")"); } if (parent != NULL) fprintf(f, ":%g", treeNodeDistance(parent, node)); fprintf(f, " "); } char *findCommonPrefix(struct slName *list) /* Find common prefix to list of names. */ { /* Deal with short special cases. */ if (list == NULL) errAbort("Can't findCommonPrefix on empty list"); if (list->next == NULL) return cloneString(""); int prefixSize = BIGNUM; struct slName *el, *next; for (el = list; el != NULL; el = next) { next = el->next; if (next == NULL) break; int same = countSame(el->name, next->name); if (same < prefixSize) prefixSize = same; } return cloneStringZ(list->name, prefixSize); } int countSameAtEnd(char *a, char *b) /* Count number of chars at end of string that are the same between a and b. */ { int aLen = strlen(a), bLen = strlen(b); int minLen = min(aLen, bLen); int sameCount = 0; a += aLen-1; b += bLen-1; int i; for (i=0; i<minLen; ++i) { if (a[-i] == b[-i]) ++sameCount; else break; } return sameCount; } char *findCommonSuffix(struct slName *list) /* Find common suffix to list of names. */ { /* Deal with short special cases. */ if (list == NULL) errAbort("Can't findCommonPrefix on empty list"); if (list->next == NULL) return cloneString(""); int suffixSize = BIGNUM; struct slName *el, *next; for (el = list; el != NULL; el = next) { next = el->next; if (next == NULL) break; int same = countSameAtEnd(el->name, next->name); if (same < suffixSize) suffixSize = same; } return cloneStringZ(list->name + strlen(list->name) - suffixSize, suffixSize); } void outputTree(char *fileName, struct slName *inFileList, struct treeNode *root) /* Output tree to file. */ { FILE *f = mustOpen(fileName, "w"); char *commonPrefix = "", *commonSuffix = ""; if (optionExists("short")) { commonPrefix = findCommonPrefix(inFileList); commonSuffix = findCommonSuffix(inFileList); } rOutputTree(f, NULL, root, 0, strlen(commonPrefix), strlen(commonSuffix)); fputc('\n', f); carefulClose(&f); } struct slRef *rRefList = NULL; void rMakeRefList(struct treeNode *node) /* Recursively add references to leaf nodes to rRefList */ { if (node->fileName) { refAdd(&rRefList, node); } else { rMakeRefList(node->left); rMakeRefList(node->right); } } struct slRef *makeRefsToLeaves(struct treeNode *root) /* Make a refList of treeNodes that are leaves ordered according to tree. */ { rRefList = NULL; rMakeRefList(root); slReverse(&rRefList); return rRefList; } void outputDistancesOfNeighbors(char *fileName, struct treeNode *root) /* Output three column file of format <cellA> <cellB> <distanceA-B> * where lines in file or ordered by tree. */ { struct slRef *ref, *refList = makeRefsToLeaves(root); FILE *f = mustOpen(fileName, "w"); for (ref = refList; ref != NULL; ref = ref->next) { struct slRef *next = ref->next; if (next == NULL) next = refList; // wrap at end struct treeNode *a = ref->val; struct treeNode *b = next->val; char *aString = a->fileName; char *bString = b->fileName; if (renameHash) { aString = hashMustFindVal(renameHash, aString); bString = hashMustFindVal(renameHash, bString); } fprintf(f, "%s\t%s\t%g\n", aString, bString, treeNodeDistance(a, b)); } carefulClose(&f); } void regClusterTreeCells(char *inFiles, char *outTree, char *outDistances) /* regClusterTreeCells - Create a binary tree of cell types based on hierarchical clustering * (Eisen style) of expression data.. */ { struct slName *inFileList = readAllLines(inFiles); int inFileCount = slCount(inFileList); verbose(1, "Got %d files in %s\n", inFileCount, inFiles); if (inFileCount < 2) errAbort("Can't do this with less than 2 input files"); /* Read in first file as a bedGraph */ char *firstFileName = inFileList->name; struct lm *lm = lmInit(0); struct bedGraph *bedList = readTabFileAsBedGraph(firstFileName, 0, 1, 2, 6, lm); int itemCount = slCount(bedList); verbose(1, "Got %d items in %s\n", itemCount, firstFileName); /* Create some arrays big enough for all nodes. We'll be merging nodes a pair at a time, * but creating a new node while doing this. */ int treeNodeCount = inFileCount + inFileCount - 1; struct treeNode **array; AllocArray(array, treeNodeCount); /* Read in initial treeNodes from file. */ /* Create a doubly-linked list of treeNodes, one for each file. */ int nodesUsed = 0; struct slName *inFile; for (inFile = inFileList; inFile != NULL; inFile = inFile->next) { struct treeNode *treeNode = readTabFileAsNode(inFile->name, 0, 1, 2, 6, bedList, itemCount); normalizeTreeNode(treeNode); array[nodesUsed++] = treeNode; verboseDot(); } verbose(1, "\nRead %d x %d = %ld data values total\n", inFileCount, itemCount, (long)itemCount * (long)inFileCount); /* Create look up array for pairwise distances. This needs to be big enough to hold all nodes, * the initial nodes we just read in, and also the ones we create by joining a pair of existing * nodes. */ double **distanceMatrix; AllocArray(distanceMatrix, treeNodeCount); int i,j; for (i=0; i<treeNodeCount; ++i) { distanceMatrix[i] = needLargeZeroedMem((i+1) * sizeof(double)); } /* Populate distance array with initial values. */ for (i=0; i<nodesUsed; ++i) for (j=0; j<i; ++j) distanceMatrix[i][j] = treeNodeDistance(array[i], array[j]); verbose(1, "done initial distance matrix calcs\n"); /* Main loop - find two nearest, make new node that is an average of the two, * and replace the two with that node until only one left. */ struct treeNode *join = NULL; for ( ;nodesUsed <treeNodeCount; ++nodesUsed) { /* Find closest two and build new merged node around it. */ struct treeNode *aNode, *bNode; findClosestPair(array, nodesUsed, distanceMatrix, &aNode, &bNode); AllocVar(join); join->vectorSize = itemCount; join->vector = averageVectors(aNode->vector, bNode->vector, join->vectorSize); join->left = aNode; join->right = bNode; aNode->merged = bNode->merged = TRUE; /* Put vector in array, and in distanceMatrix */ array[nodesUsed] = join; distanceMatrix[nodesUsed] = needLargeZeroedMem((nodesUsed+1) * sizeof(double)); for (j=0; j<nodesUsed; ++j) distanceMatrix[nodesUsed][j] = treeNodeDistance(join, array[j]); verboseDot(); } verbose(1, "\ndone main loop\n"); outputTree(outTree, inFileList, join); outputDistancesOfNeighbors(outDistances, join); lmCleanup(&lm); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); char *renameFile = optionVal("rename", NULL); if (renameFile != NULL) renameHash = hashTwoColumnFile(renameFile); regClusterTreeCells(argv[1], argv[2], argv[3]); return 0; }