8135446fb6a4c338e543585aa411808eef36ee10 kent Sun Dec 20 16:00:10 2020 -0800 Optimizing speed and polishing comments. diff --git src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c index ea28755..fead46b 100644 --- src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c +++ src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c @@ -1,284 +1,297 @@ -/* clusterMatrixToBarchartBed - Take a gene matrix and a gene bed file and a way to cluster - * samples into a barchart type bed. */ +/* clusterMatrixToBarchartBed - Compute a barchart bed file from a gene matrix + * and a gene bed file and a way to cluster samples. */ + #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" +#include "obscure.h" #include "sqlNum.h" boolean clDataOffset = FALSE; boolean clMean = FALSE; void usage() /* Explain usage and exit. */ { errAbort( - "clusterMatrixToBarchartBed - Take a gene matrix and a gene bed file and a way to cluster\n" - "samples into a barchart type bed\n" + "clusterMatrixToBarchartBed - Compute a barchart bed file from a gene matrix\n" + "and a gene bed file and a way to cluster samples.\n" "usage:\n" " clusterMatrixToBarchartBed sampleClusters.tsv geneMatrix.tsv geneset.bed output.bed\n" "where:\n" - " sampleClusters.tsv is a tab separated file with the first column being sample ids\n" - " geneMatrix.tsv has a row for each gene. The first row uses the same sample ids\n" + " sampleClusters.tsv is a two column tab separated file with sampleId and clusterId\n" + " geneMatrix.tsv has a row for each gene. The first row uses the same sampleId as above\n" " geneset.bed has the maps the genes in the matrix (from it's first column) to the genome\n" " output.bed is the resulting bar chart, with one column per cluster\n" "options:\n" " -dataOffset - store the position of gene in geneMatrix.tsv file in output\n" " -mean - use mean (instead of median)\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"dataOffset", OPTION_BOOLEAN}, {"_dataOffset", OPTION_BOOLEAN}, {"mean", OPTION_BOOLEAN}, {NULL, 0}, }; struct hash *hashTsvBy(char *in, int keyColIx, int *retColCount) /* Return a hash of rows keyed by the given column */ { struct lineFile *lf = lineFileOpen(in, TRUE); struct hash *hash = hashNew(0); char *line = NULL, **row = NULL; int colCount = 0, colAlloc=0; /* Columns as counted and as allocated */ while (lineFileNextReal(lf, &line)) { if (colCount == 0) { *retColCount = colCount = chopByChar(line, '\t', NULL, 0); verbose(2, "Got %d columns in first real line\n", colCount); colAlloc = colCount + 1; // +1 so we can detect unexpected input and complain lmAllocArray(hash->lm, row, colAlloc); } int count = chopByChar(line, '\t', row, colAlloc); if (count != colCount) { errAbort("Expecting %d words, got more than that line %d of %s", colCount, lf->lineIx, lf->fileName); } hashAdd(hash, row[keyColIx], lmCloneRow(hash->lm, row, colCount) ); } lineFileClose(&lf); return hash; } void hashSamplesAndClusters(char *tsvFile, struct hash **retSampleHash, struct hash **retClusterHash) /* Read two column tsv file into a hash keyed by first column */ { struct hash *sampleHash = hashNew(0); struct hash *clusterHash = hashNew(0); char *row[2]; struct lineFile *lf = lineFileOpen(tsvFile, TRUE); while (lineFileNextRowTab(lf, row, ArraySize(row)) ) { /* Find cluster in cluster hash, if it doesn't exist make it. */ char *clusterName = row[1]; struct hashEl *hel = hashLookup(clusterHash, clusterName); if (hel == NULL) hel = hashAddInt(clusterHash, clusterName, 1); else hel->val = ((char *)hel->val)+1; // Increment hash pointer as per hashIncInt char *clusterStableName = hel->name; // This is allocated in clusterHash hashAdd(sampleHash, row[0], clusterStableName); } lineFileClose(&lf); *retSampleHash = sampleHash; *retClusterHash = clusterHash; } void clusterMatrixToBarchartBed(char *sampleClusters, char *matrixTsv, char *geneBed, char *output) -/* clusterMatrixToBarchartBed - Take a gene matrix and a gene bed file and a way to cluster samples - * into a barchart type bed. */ +/* clusterMatrixToBarchartBed - Compute a barchart bed file from a gene matrix + * and a gene bed file and a way to cluster samples. */ { +/* Figure out if we need to do medians etc */ +boolean doMedian = !clMean; + /* Load up the gene set */ verbose(1, "clusterMatrixToBarchartBed(%s,%s,%s,%s)\n", sampleClusters, matrixTsv, geneBed, output); int bedRowSize = 0; struct hash *geneHash = hashTsvBy(geneBed, 3, &bedRowSize); verbose(1, "%d genes in %s\n", geneHash->elCount, geneBed); /* Load up the sample clustering */ struct hash *sampleHash = NULL, *clusterHash = NULL; hashSamplesAndClusters(sampleClusters, &sampleHash, &clusterHash); int clusterCount = clusterHash->elCount; verbose(1, "%d samples and %d clusters in %s\n", sampleHash->elCount, clusterCount, sampleClusters); if (clusterCount <= 1 || clusterCount >= 10000) errAbort("%d is not a good number of clusters", clusterCount); double clusterTotal[clusterCount]; int clusterElements[clusterCount]; /* Alphabetize cluster names */ char *clusterNames[clusterCount]; struct hashEl *hel; struct hashCookie cookie = hashFirst(clusterHash); int clusterIx = 0; while ((hel = hashNext(&cookie)) != NULL) { clusterNames[clusterIx] = hel->name; ++clusterIx; } sortStrings(clusterNames, clusterCount); verbose(2, "%s to %s\n", clusterNames[0], clusterNames[clusterCount-1]); -/* Figure out size of each alphabetized cluster in terms of number of samples in cluster */ +/* Figure out size of each alphabetized cluster in terms of number of samples in cluster + * if we are doing median */ int clusterSize[clusterCount]; +double *clusterSamples[clusterCount]; +if (doMedian) + { for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx) { clusterSize[clusterIx] = hashIntVal(clusterHash, clusterNames[clusterIx]); verbose(2, "clusterSize[%d] = %d\n", clusterIx, clusterSize[clusterIx]); } /* Make up array of doubles for each cluster to hold all samples in that clusters */ -double *clusterSamples[clusterCount]; for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx) { double *samples; AllocArray(samples, clusterSize[clusterIx]); clusterSamples[clusterIx] = samples; } + } /* Make hash that goes from cluster name to cluster index */ struct hash *clusterToClusterIdHash = hashNew(0); for (clusterIx = 0; clusterIx<clusterCount; ++clusterIx) { hashAddInt(clusterToClusterIdHash, clusterNames[clusterIx], clusterIx); } /* Open output */ FILE *f = mustOpen(output, "w"); /* Open up matrix file and read first line into sample labeling */ struct lineFile *lf = lineFileOpen(matrixTsv, TRUE); char *line; lineFileNeedNext(lf, &line, NULL); if (line[0] == '#') // Opening sharp on labels is optional, skip it if there line = skipLeadingSpaces(line+1); int colCount = chopByChar(line, '\t', NULL, 0); int colAlloc = colCount + 1; char **sampleNames; AllocArray(sampleNames, colAlloc); chopByChar(line, '\t', sampleNames, colCount); /* Make array that maps row index to clusterID */ int colToCluster[colCount]; int colIx; for (colIx=1; colIx <colCount; colIx = colIx+1) { char *colName = sampleNames[colIx]; char *clusterName = hashFindVal(sampleHash, colName); colToCluster[colIx] = -1; if (clusterName != NULL) { int clusterId = hashIntValDefault(clusterToClusterIdHash, clusterName, -1); colToCluster[colIx] = clusterId; if (clusterId == -1) warn("%s is in expression matrix but not in sample cluster file", clusterName); } } /* Set up row for reading one row of matrix at a time. */ char **matrixRow; AllocArray(matrixRow, colAlloc); int hitCount = 0, missCount = 0; +dotForUserInit(100); for (;;) { /* Fetch next line and remember how long it is. Just skip over lines that are empty or * start with # character. */ int lineLength = 0; char *line; if (!lineFileNext(lf, &line, &lineLength)) break; char *s = skipLeadingSpaces(line); char c = s[0]; if (c == 0 || c == '#') continue; /* Chop it into tabs */ int rowSize = chopByChar(line, '\t', matrixRow, colAlloc); lineFileExpectWords(lf, colCount, rowSize); char *geneName = matrixRow[0]; struct hashEl *onePos = hashLookup(geneHash, geneName); if (onePos == NULL) { warn("Can't find gene %s in %s", geneName, geneBed); ++missCount; continue; } else { ++hitCount; } + /* A gene may map multiple places. This loop takes care of that */ for (; onePos != NULL; onePos = hashLookupNext(onePos)) { - char **geneBedVal = onePos->val; + char **geneBedVal = onePos->val; // Get our bed as string array out of hash /* Zero out cluster histogram */ int i; for (i=0; i<clusterCount; ++i) { clusterTotal[i] = 0.0; clusterElements[i] = 0; } - zeroBytes(&clusterTotal, sizeof(clusterTotal)); - zeroBytes(&clusterElements, sizeof(clusterElements)); - /* Loop through rest of row filling in histogram */ for (i=1; i<colCount; ++i) { int clusterIx = colToCluster[i]; - double val = sqlDouble(matrixRow[i]); - int pos = clusterElements[clusterIx]; - if (pos >= clusterSize[clusterIx]) + char *textVal = matrixRow[i]; + // special case so common we parse out "0" inline + double val = (textVal[0] == '0' && textVal[1] == 0) ? 0.0 : sqlDouble(textVal); + int valCount = clusterElements[clusterIx]; + clusterElements[clusterIx] = valCount+1; + if (doMedian) + { + if (valCount >= clusterSize[clusterIx]) internalErr(); - clusterElements[clusterIx] = pos+1; - clusterSamples[clusterIx][pos] = val; + clusterSamples[clusterIx][valCount] = val; + } + else clusterTotal[clusterIx] += val; } - /* Output info */ + /* Output info - first from the bed, then our barchart */ for (i=0; i<bedRowSize; ++i) fprintf(f, "%s\t", geneBedVal[i]); fprintf(f, "%d\t", clusterCount); for (i=0; i<clusterCount; ++i) { if (i != 0) fprintf(f, ","); - if (clMean) - fprintf(f, "%g", clusterTotal[i]/clusterElements[i]); - else + if (doMedian) fprintf(f, "%g", doubleMedian(clusterElements[i], clusterSamples[i])); + else + fprintf(f, "%g", clusterTotal[i]/clusterElements[i]); } /* Data file offset info */ if (clDataOffset) fprintf(f, "\t%lld\t%lld", (long long)lineFileTell(lf), (long long)lineLength); fprintf(f, "\n"); } - - + dotForUser(); } verbose(1, "%d genes found, %d missed\n", hitCount, missCount); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); clDataOffset = (optionExists("_dataOffset") || optionExists("dataOffset")); clMean = optionExists("mean"); clusterMatrixToBarchartBed(argv[1], argv[2], argv[3], argv[4]); return 0; }