src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c 8bb6956fbfc0b789de4c76938878dd3a4399b3d0

8bb6956fbfc0b789de4c76938878dd3a4399b3d0
kent
  Sun Dec 20 18:09:04 2020 -0800
Fixing camel casing to be like most of our barChart C code.

diff --git src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c
deleted file mode 100644
index fead46b..0000000
--- src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c
+++ /dev/null
@@ -1,297 +0,0 @@
-/* clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix 
- * and a gene bed file and a way to cluster samples. */
-
-#include "common.h"
-#include "linefile.h"
-#include "hash.h"
-#include "options.h"
-#include "localmem.h"
-#include "obscure.h"
-#include "sqlNum.h"
-
-boolean clDataOffset = FALSE;
-boolean clMean = FALSE;
-
-void usage()
-/* Explain usage and exit. */
-{
-errAbort(
-  "clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix\n"
-  "and a gene bed file and a way to cluster samples.\n"
-  "usage:\n"
-  "   clusterMatrixToBarchartBed sampleClusters.tsv geneMatrix.tsv geneset.bed output.bed\n"
-  "where:\n"
-  "   sampleClusters.tsv is a two column tab separated file with sampleId and clusterId\n"
-  "   geneMatrix.tsv has a row for each gene. The first row uses the same sampleId as above\n"
-  "   geneset.bed has the maps the genes in the matrix (from it's first column) to the genome\n"
-  "   output.bed is the resulting bar chart, with one column per cluster\n"
-  "options:\n"
-  "   -dataOffset - store the position of gene in geneMatrix.tsv file in output\n"
-  "   -mean - use mean (instead of median)\n"
-  );
-}
-
-/* Command line validation table. */
-static struct optionSpec options[] = {
-   {"dataOffset", OPTION_BOOLEAN},
-   {"_dataOffset", OPTION_BOOLEAN},
-   {"mean", OPTION_BOOLEAN},
-   {NULL, 0},
-};
-
-struct hash *hashTsvBy(char *in, int keyColIx, int *retColCount)
-/* Return a hash of rows keyed by the given column */
-{
-struct lineFile *lf = lineFileOpen(in, TRUE);
-struct hash *hash = hashNew(0);
-char *line = NULL, **row = NULL;
-int colCount = 0, colAlloc=0;	/* Columns as counted and as allocated */
-while (lineFileNextReal(lf, &line))
-    {
-    if (colCount == 0)
-        {
-	*retColCount = colCount = chopByChar(line, '\t', NULL, 0);
-	verbose(2, "Got %d columns in first real line\n", colCount);
-	colAlloc = colCount + 1;  // +1 so we can detect unexpected input and complain 
-	lmAllocArray(hash->lm, row, colAlloc);
-	}
-    int count = chopByChar(line, '\t', row, colAlloc);
-    if (count != colCount)
-        {
-	errAbort("Expecting %d words, got more than that line %d of %s", 
-	    colCount, lf->lineIx, lf->fileName);
-	}
-    hashAdd(hash, row[keyColIx], lmCloneRow(hash->lm, row, colCount) );
-    }
-lineFileClose(&lf);
-return hash;
-}
-
-void hashSamplesAndClusters(char *tsvFile, 
-    struct hash **retSampleHash, struct hash **retClusterHash)
-/* Read two column tsv file into a hash keyed by first column */
-{
-struct hash *sampleHash = hashNew(0);
-struct hash *clusterHash = hashNew(0);
-char *row[2];
-struct lineFile *lf = lineFileOpen(tsvFile, TRUE);
-while (lineFileNextRowTab(lf, row, ArraySize(row)) )
-    {
-    /* Find cluster in cluster hash, if it doesn't exist make it. */
-    char *clusterName = row[1];
-    struct hashEl *hel = hashLookup(clusterHash, clusterName);
-    if (hel == NULL)
-	hel = hashAddInt(clusterHash, clusterName, 1);
-    else
-	hel->val = ((char *)hel->val)+1;    // Increment hash pointer as per hashIncInt
-    char *clusterStableName = hel->name;	// This is allocated in clusterHash
-    hashAdd(sampleHash, row[0], clusterStableName);
-    }
-lineFileClose(&lf);
-*retSampleHash = sampleHash;
-*retClusterHash = clusterHash;
-}
-
-void clusterMatrixToBarchartBed(char *sampleClusters, char *matrixTsv, char *geneBed, char *output)
-/* clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix 
- * and a gene bed file and a way to cluster samples. */
-{
-/* Figure out if we need to do medians etc */
-boolean doMedian = !clMean;
-
-/* Load up the gene set */
-verbose(1, "clusterMatrixToBarchartBed(%s,%s,%s,%s)\n", sampleClusters, matrixTsv, geneBed, output);
-int bedRowSize = 0;
-struct hash *geneHash = hashTsvBy(geneBed, 3, &bedRowSize);
-verbose(1, "%d genes in %s\n", geneHash->elCount, geneBed);
-
-/* Load up the sample clustering */
-struct hash *sampleHash = NULL, *clusterHash = NULL;
-hashSamplesAndClusters(sampleClusters, &sampleHash, &clusterHash);
-int clusterCount = clusterHash->elCount;
-verbose(1, "%d samples and %d clusters in %s\n", sampleHash->elCount, clusterCount,
-    sampleClusters);
-if (clusterCount <= 1 || clusterCount >= 10000)
-    errAbort("%d is not a good number of clusters", clusterCount);
-double clusterTotal[clusterCount];
-int clusterElements[clusterCount];
-
-/* Alphabetize cluster names  */
-char *clusterNames[clusterCount];
-struct hashEl *hel;
-struct hashCookie cookie = hashFirst(clusterHash);
-int clusterIx = 0;
-while ((hel = hashNext(&cookie)) != NULL)
-    {
-    clusterNames[clusterIx] = hel->name;
-    ++clusterIx;
-    }
-sortStrings(clusterNames, clusterCount);
-verbose(2, "%s to %s\n", clusterNames[0], clusterNames[clusterCount-1]);
-
-/* Figure out size of each alphabetized cluster in terms of number of samples in cluster 
- * if we are doing median */
-int clusterSize[clusterCount];
-double *clusterSamples[clusterCount];
-if (doMedian)
-    {
-    for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx)
-	{
-	clusterSize[clusterIx] = hashIntVal(clusterHash, clusterNames[clusterIx]);
-	verbose(2, "clusterSize[%d] = %d\n", clusterIx, clusterSize[clusterIx]);
-	}
-
-    /* Make up array of doubles for each cluster to hold all samples in that clusters */
-    for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx)
-	{
-	double *samples;
-	AllocArray(samples, clusterSize[clusterIx]);
-	clusterSamples[clusterIx] = samples;
-	}
-    }
-
-/* Make hash that goes from cluster name to cluster index */
-struct hash *clusterToClusterIdHash = hashNew(0);
-for (clusterIx = 0; clusterIx<clusterCount; ++clusterIx)
-    {
-    hashAddInt(clusterToClusterIdHash, clusterNames[clusterIx], clusterIx);
-    }
-
-
-/* Open output */
-FILE *f = mustOpen(output, "w");
-
-/* Open up matrix file and read first line into sample labeling */
-struct lineFile *lf = lineFileOpen(matrixTsv, TRUE);
-char *line;
-lineFileNeedNext(lf, &line, NULL);
-if (line[0] == '#')	// Opening sharp on labels is optional, skip it if there
-    line = skipLeadingSpaces(line+1);
-int colCount = chopByChar(line, '\t', NULL, 0);
-int colAlloc = colCount + 1;
-char **sampleNames;
-AllocArray(sampleNames, colAlloc);
-chopByChar(line, '\t', sampleNames, colCount);
-
-/* Make array that maps row index to clusterID */
-int colToCluster[colCount];
-int colIx;
-for (colIx=1; colIx <colCount; colIx = colIx+1)
-    {
-    char *colName = sampleNames[colIx];
-    char *clusterName = hashFindVal(sampleHash, colName);
-    colToCluster[colIx] = -1;
-    if (clusterName != NULL)
-        {
-	int clusterId = hashIntValDefault(clusterToClusterIdHash, clusterName, -1);
-	colToCluster[colIx] = clusterId;
-	if (clusterId == -1)
-	    warn("%s is in expression matrix but not in sample cluster file", clusterName);
-	}
-    }
-
-
-/* Set up row for reading one row of matrix at a time. */
-char **matrixRow;
-AllocArray(matrixRow, colAlloc);
-int hitCount = 0, missCount = 0;
-dotForUserInit(100);
-for (;;)
-    {
-    /* Fetch next line and remember how long it is.  Just skip over lines that are empty or
-     * start with # character. */
-    int lineLength = 0;
-    char *line;
-    if (!lineFileNext(lf, &line, &lineLength))
-        break;
-    char *s = skipLeadingSpaces(line);
-    char c = s[0];
-    if (c == 0 || c == '#')
-        continue;
-
-    /* Chop it into tabs */
-    int rowSize = chopByChar(line, '\t', matrixRow, colAlloc);
-    lineFileExpectWords(lf, colCount, rowSize);
-
-    char *geneName = matrixRow[0];
-    struct hashEl *onePos = hashLookup(geneHash, geneName);
-    if (onePos == NULL)
-	{
-	warn("Can't find gene %s in %s", geneName, geneBed);
-	++missCount;
-	continue;
-	}
-    else
-	{
-	++hitCount;
-	}
-
-    /* A gene may map multiple places.  This loop takes care of that */
-    for (; onePos != NULL; onePos = hashLookupNext(onePos))
-        {
-	char **geneBedVal = onePos->val;	// Get our bed as string array out of hash
-
-	/* Zero out cluster histogram */
-	int i;
-	for (i=0; i<clusterCount; ++i)
-	    {
-	    clusterTotal[i] = 0.0;
-	    clusterElements[i] = 0;
-	    }
-
-	/* Loop through rest of row filling in histogram */
-	for (i=1; i<colCount; ++i)
-	    {
-	    int clusterIx = colToCluster[i];
-	    char *textVal = matrixRow[i];
-	    // special case so common we parse out "0" inline
-	    double val = (textVal[0] == '0' && textVal[1] == 0) ? 0.0 : sqlDouble(textVal);
-	    int valCount = clusterElements[clusterIx];
-	    clusterElements[clusterIx] = valCount+1;
-	    if (doMedian)
-		{
-		if (valCount >= clusterSize[clusterIx])
-		    internalErr();
-		clusterSamples[clusterIx][valCount] = val;
-		}
-	    else
-		clusterTotal[clusterIx] += val;
-	    }
-
-	/* Output info - first from the bed, then our barchart */
-	for (i=0; i<bedRowSize; ++i)
-	    fprintf(f, "%s\t",  geneBedVal[i]);
-	fprintf(f, "%d\t", clusterCount);
-	for (i=0; i<clusterCount; ++i)
-	    {
-	    if (i != 0)
-	       fprintf(f, ",");
-	    if (doMedian)
-		fprintf(f, "%g", doubleMedian(clusterElements[i], clusterSamples[i]));
-	    else
-		fprintf(f, "%g",  clusterTotal[i]/clusterElements[i]);
-	    }
-	
-	/* Data file offset info */
-	if (clDataOffset)
-	    fprintf(f, "\t%lld\t%lld",  (long long)lineFileTell(lf), (long long)lineLength);
-
-	fprintf(f, "\n");
-	}
-    dotForUser();
-    }
-verbose(1, "%d genes found, %d missed\n", hitCount, missCount);
-carefulClose(&f);
-}
-
-int main(int argc, char *argv[])
-/* Process command line. */
-{
-optionInit(&argc, argv, options);
-if (argc != 5)
-    usage();
-clDataOffset = (optionExists("_dataOffset") || optionExists("dataOffset"));
-clMean = optionExists("mean");
-clusterMatrixToBarchartBed(argv[1], argv[2], argv[3], argv[4]);
-return 0;
-}