b3291f717be7ee6ce33882e0c07cf815d7ac40c5 kent Thu Dec 31 15:05:10 2020 -0800 Moved hashTsvBy() to obscure.c. Removed some debugging uglyfs. diff --git src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c index 1b7fe7e..19d2053 100644 --- src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c +++ src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c @@ -31,58 +31,30 @@ "options:\n" " -simple - don't store the position of gene in geneMatrix.tsv file in output\n" " -median - use median (instead of mean)\n" " -name2=twoColFile.tsv - get name2 from file where first col is same ase geneset.bed's name\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"simple", OPTION_BOOLEAN}, {"median", OPTION_BOOLEAN}, {"name2", OPTION_STRING}, {NULL, 0}, }; -struct hash *hashTsvBy(char *in, int keyColIx, int *retColCount) -/* Return a hash of rows keyed by the given column */ -{ -struct lineFile *lf = lineFileOpen(in, TRUE); -struct hash *hash = hashNew(0); -char *line = NULL, **row = NULL; -int colCount = 0, colAlloc=0; /* Columns as counted and as allocated */ -while (lineFileNextReal(lf, &line)) - { - if (colCount == 0) - { - *retColCount = colCount = chopByChar(line, '\t', NULL, 0); - verbose(2, "Got %d columns in first real line\n", colCount); - colAlloc = colCount + 1; // +1 so we can detect unexpected input and complain - lmAllocArray(hash->lm, row, colAlloc); - } - int count = chopByChar(line, '\t', row, colAlloc); - if (count != colCount) - { - errAbort("Expecting %d words, got more than that line %d of %s", - colCount, lf->lineIx, lf->fileName); - } - hashAdd(hash, row[keyColIx], lmCloneRow(hash->lm, row, colCount) ); - } -lineFileClose(&lf); -return hash; -} - void hashSamplesAndClusters(char *tsvFile, struct hash **retSampleHash, struct hash **retClusterHash) /* Read two column tsv file into a hash keyed by first column */ { struct hash *sampleHash = hashNew(0); struct hash *clusterHash = hashNew(0); char *row[2]; struct lineFile *lf = lineFileOpen(tsvFile, TRUE); while (lineFileNextRowTab(lf, row, ArraySize(row)) ) { /* Find cluster in cluster hash, if it doesn't exist make it. */ char *clusterName = row[1]; struct hashEl *hel = hashLookup(clusterHash, clusterName); if (hel == NULL) hel = hashAddInt(clusterHash, clusterName, 1); @@ -211,30 +183,31 @@ colToCluster[colIx] = -1; if (clusterName != NULL) { int clusterId = hashIntValDefault(clusterToClusterIdHash, clusterName, -1); colToCluster[colIx] = clusterId; if (clusterId == -1) warn("%s is in expression matrix but not in sample cluster file", clusterName); } } /* Set up row for reading one row of matrix at a time. */ char **matrixRow; AllocArray(matrixRow, colAlloc); int hitCount = 0, missCount = 0; +double sumTotal = 0; dotForUserInit(100); for (;;) { /* Fetch next line and remember how long it is. Just skip over lines that are empty or * start with # character. */ int lineLength = 0; char *line; if (!lineFileNext(lf, &line, &lineLength)) break; char *s = skipLeadingSpaces(line); char c = s[0]; if (c == 0 || c == '#') continue; /* Chop it into tabs */ @@ -260,32 +233,37 @@ char **geneBedVal = onePos->val; // Get our bed as string array out of hash /* Zero out cluster histogram */ int i; for (i=0; i<clusterCount; ++i) { clusterTotal[i] = 0.0; clusterElements[i] = 0; } /* Loop through rest of row filling in histogram */ for (i=1; i<colCount; ++i) { int clusterIx = colToCluster[i]; char *textVal = matrixRow[i]; +if (clusterIx == clusterCount - 1) //ugly + { + uglyf("%s %d %s\n", geneName, i, textVal); + } // special case so common we parse out "0" inline double val = (textVal[0] == '0' && textVal[1] == 0) ? 0.0 : sqlDouble(textVal); + sumTotal += val; int valCount = clusterElements[clusterIx]; clusterElements[clusterIx] = valCount+1; if (doMedian) { if (valCount >= clusterSize[clusterIx]) internalErr(); clusterSamples[clusterIx][valCount] = val; } else clusterTotal[clusterIx] += val; } /* Output info - first six from the bed, then name2, then our barchart */ for (i=0; i<6; ++i) fprintf(f, "%s\t", geneBedVal[i]); @@ -302,42 +280,49 @@ } else name2 = geneBedVal[name2Ix]; if (name2 == NULL) name2 = name; fprintf(f, "%s\t", name2); fprintf(f, "%d\t", clusterCount); for (i=0; i<clusterCount; ++i) { if (i != 0) fprintf(f, ","); if (doMedian) fprintf(f, "%g", doubleMedian(clusterElements[i], clusterSamples[i])); else + { fprintf(f, "%g", clusterTotal[i]/clusterElements[i]); } + } /* Data file offset info */ if (!clSimple) fprintf(f, "\t%lld\t%lld", (long long)lineFileTell(lf), (long long)lineLength); fprintf(f, "\n"); } dotForUser(); } verbose(1, "\n%d genes found, %d (%0.2f%%) missed\n", hitCount, missCount, 100.0*missCount/(hitCount+missCount)); +if (!doMedian) + { + verbose(1, "matrix total %g, %d clusters, %g ave/cluster\n", + sumTotal, clusterCount, sumTotal/clusterCount); + } carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); clSimple = optionExists("simple"); clMedian = optionExists("median"); clName2 = optionVal("name2", clName2); clusterMatrixToBarchartBed(argv[1], argv[2], argv[3], argv[4]); return 0; }