8135446fb6a4c338e543585aa411808eef36ee10
kent
  Sun Dec 20 16:00:10 2020 -0800
Optimizing speed and polishing comments.

diff --git src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c
index ea28755..fead46b 100644
--- src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c
+++ src/utils/clusterMatrixToBarchartBed/clusterMatrixToBarchartBed.c
@@ -1,284 +1,297 @@
-/* clusterMatrixToBarchartBed - Take a gene matrix and a gene bed file and a way to cluster 
- * samples into a barchart type bed. */
+/* clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix 
+ * and a gene bed file and a way to cluster samples. */
+
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "localmem.h"
+#include "obscure.h"
 #include "sqlNum.h"
 
 boolean clDataOffset = FALSE;
 boolean clMean = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "clusterMatrixToBarchartBed - Take a gene matrix and a gene bed file and a way to cluster\n"
-  "samples into a barchart type bed\n"
+  "clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix\n"
+  "and a gene bed file and a way to cluster samples.\n"
   "usage:\n"
   "   clusterMatrixToBarchartBed sampleClusters.tsv geneMatrix.tsv geneset.bed output.bed\n"
   "where:\n"
-  "   sampleClusters.tsv is a tab separated file with the first column being sample ids\n"
-  "   geneMatrix.tsv has a row for each gene. The first row uses the same sample ids\n"
+  "   sampleClusters.tsv is a two column tab separated file with sampleId and clusterId\n"
+  "   geneMatrix.tsv has a row for each gene. The first row uses the same sampleId as above\n"
   "   geneset.bed has the maps the genes in the matrix (from it's first column) to the genome\n"
   "   output.bed is the resulting bar chart, with one column per cluster\n"
   "options:\n"
   "   -dataOffset - store the position of gene in geneMatrix.tsv file in output\n"
   "   -mean - use mean (instead of median)\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"dataOffset", OPTION_BOOLEAN},
    {"_dataOffset", OPTION_BOOLEAN},
    {"mean", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 struct hash *hashTsvBy(char *in, int keyColIx, int *retColCount)
 /* Return a hash of rows keyed by the given column */
 {
 struct lineFile *lf = lineFileOpen(in, TRUE);
 struct hash *hash = hashNew(0);
 char *line = NULL, **row = NULL;
 int colCount = 0, colAlloc=0;	/* Columns as counted and as allocated */
 while (lineFileNextReal(lf, &line))
     {
     if (colCount == 0)
         {
 	*retColCount = colCount = chopByChar(line, '\t', NULL, 0);
 	verbose(2, "Got %d columns in first real line\n", colCount);
 	colAlloc = colCount + 1;  // +1 so we can detect unexpected input and complain 
 	lmAllocArray(hash->lm, row, colAlloc);
 	}
     int count = chopByChar(line, '\t', row, colAlloc);
     if (count != colCount)
         {
 	errAbort("Expecting %d words, got more than that line %d of %s", 
 	    colCount, lf->lineIx, lf->fileName);
 	}
     hashAdd(hash, row[keyColIx], lmCloneRow(hash->lm, row, colCount) );
     }
 lineFileClose(&lf);
 return hash;
 }
 
 void hashSamplesAndClusters(char *tsvFile, 
     struct hash **retSampleHash, struct hash **retClusterHash)
 /* Read two column tsv file into a hash keyed by first column */
 {
 struct hash *sampleHash = hashNew(0);
 struct hash *clusterHash = hashNew(0);
 char *row[2];
 struct lineFile *lf = lineFileOpen(tsvFile, TRUE);
 while (lineFileNextRowTab(lf, row, ArraySize(row)) )
     {
     /* Find cluster in cluster hash, if it doesn't exist make it. */
     char *clusterName = row[1];
     struct hashEl *hel = hashLookup(clusterHash, clusterName);
     if (hel == NULL)
 	hel = hashAddInt(clusterHash, clusterName, 1);
     else
 	hel->val = ((char *)hel->val)+1;    // Increment hash pointer as per hashIncInt
     char *clusterStableName = hel->name;	// This is allocated in clusterHash
     hashAdd(sampleHash, row[0], clusterStableName);
     }
 lineFileClose(&lf);
 *retSampleHash = sampleHash;
 *retClusterHash = clusterHash;
 }
 
 void clusterMatrixToBarchartBed(char *sampleClusters, char *matrixTsv, char *geneBed, char *output)
-/* clusterMatrixToBarchartBed - Take a gene matrix and a gene bed file and a way to cluster samples 
- * into a barchart type bed. */
+/* clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix 
+ * and a gene bed file and a way to cluster samples. */
 {
+/* Figure out if we need to do medians etc */
+boolean doMedian = !clMean;
+
 /* Load up the gene set */
 verbose(1, "clusterMatrixToBarchartBed(%s,%s,%s,%s)\n", sampleClusters, matrixTsv, geneBed, output);
 int bedRowSize = 0;
 struct hash *geneHash = hashTsvBy(geneBed, 3, &bedRowSize);
 verbose(1, "%d genes in %s\n", geneHash->elCount, geneBed);
 
 /* Load up the sample clustering */
 struct hash *sampleHash = NULL, *clusterHash = NULL;
 hashSamplesAndClusters(sampleClusters, &sampleHash, &clusterHash);
 int clusterCount = clusterHash->elCount;
 verbose(1, "%d samples and %d clusters in %s\n", sampleHash->elCount, clusterCount,
     sampleClusters);
 if (clusterCount <= 1 || clusterCount >= 10000)
     errAbort("%d is not a good number of clusters", clusterCount);
 double clusterTotal[clusterCount];
 int clusterElements[clusterCount];
 
 /* Alphabetize cluster names  */
 char *clusterNames[clusterCount];
 struct hashEl *hel;
 struct hashCookie cookie = hashFirst(clusterHash);
 int clusterIx = 0;
 while ((hel = hashNext(&cookie)) != NULL)
     {
     clusterNames[clusterIx] = hel->name;
     ++clusterIx;
     }
 sortStrings(clusterNames, clusterCount);
 verbose(2, "%s to %s\n", clusterNames[0], clusterNames[clusterCount-1]);
 
-/* Figure out size of each alphabetized cluster in terms of number of samples in cluster */
+/* Figure out size of each alphabetized cluster in terms of number of samples in cluster 
+ * if we are doing median */
 int clusterSize[clusterCount];
+double *clusterSamples[clusterCount];
+if (doMedian)
+    {
     for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx)
 	{
 	clusterSize[clusterIx] = hashIntVal(clusterHash, clusterNames[clusterIx]);
 	verbose(2, "clusterSize[%d] = %d\n", clusterIx, clusterSize[clusterIx]);
 	}
 
     /* Make up array of doubles for each cluster to hold all samples in that clusters */
-double *clusterSamples[clusterCount];
     for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx)
 	{
 	double *samples;
 	AllocArray(samples, clusterSize[clusterIx]);
 	clusterSamples[clusterIx] = samples;
 	}
+    }
 
 /* Make hash that goes from cluster name to cluster index */
 struct hash *clusterToClusterIdHash = hashNew(0);
 for (clusterIx = 0; clusterIx<clusterCount; ++clusterIx)
     {
     hashAddInt(clusterToClusterIdHash, clusterNames[clusterIx], clusterIx);
     }
 
 
 /* Open output */
 FILE *f = mustOpen(output, "w");
 
 /* Open up matrix file and read first line into sample labeling */
 struct lineFile *lf = lineFileOpen(matrixTsv, TRUE);
 char *line;
 lineFileNeedNext(lf, &line, NULL);
 if (line[0] == '#')	// Opening sharp on labels is optional, skip it if there
     line = skipLeadingSpaces(line+1);
 int colCount = chopByChar(line, '\t', NULL, 0);
 int colAlloc = colCount + 1;
 char **sampleNames;
 AllocArray(sampleNames, colAlloc);
 chopByChar(line, '\t', sampleNames, colCount);
 
 /* Make array that maps row index to clusterID */
 int colToCluster[colCount];
 int colIx;
 for (colIx=1; colIx <colCount; colIx = colIx+1)
     {
     char *colName = sampleNames[colIx];
     char *clusterName = hashFindVal(sampleHash, colName);
     colToCluster[colIx] = -1;
     if (clusterName != NULL)
         {
 	int clusterId = hashIntValDefault(clusterToClusterIdHash, clusterName, -1);
 	colToCluster[colIx] = clusterId;
 	if (clusterId == -1)
 	    warn("%s is in expression matrix but not in sample cluster file", clusterName);
 	}
     }
 
 
 /* Set up row for reading one row of matrix at a time. */
 char **matrixRow;
 AllocArray(matrixRow, colAlloc);
 int hitCount = 0, missCount = 0;
+dotForUserInit(100);
 for (;;)
     {
     /* Fetch next line and remember how long it is.  Just skip over lines that are empty or
      * start with # character. */
     int lineLength = 0;
     char *line;
     if (!lineFileNext(lf, &line, &lineLength))
         break;
     char *s = skipLeadingSpaces(line);
     char c = s[0];
     if (c == 0 || c == '#')
         continue;
 
     /* Chop it into tabs */
     int rowSize = chopByChar(line, '\t', matrixRow, colAlloc);
     lineFileExpectWords(lf, colCount, rowSize);
 
     char *geneName = matrixRow[0];
     struct hashEl *onePos = hashLookup(geneHash, geneName);
     if (onePos == NULL)
 	{
 	warn("Can't find gene %s in %s", geneName, geneBed);
 	++missCount;
 	continue;
 	}
     else
 	{
 	++hitCount;
 	}
 
+    /* A gene may map multiple places.  This loop takes care of that */
     for (; onePos != NULL; onePos = hashLookupNext(onePos))
         {
-	char **geneBedVal = onePos->val;
+	char **geneBedVal = onePos->val;	// Get our bed as string array out of hash
 
 	/* Zero out cluster histogram */
 	int i;
 	for (i=0; i<clusterCount; ++i)
 	    {
 	    clusterTotal[i] = 0.0;
 	    clusterElements[i] = 0;
 	    }
 
-	zeroBytes(&clusterTotal, sizeof(clusterTotal));
-	zeroBytes(&clusterElements, sizeof(clusterElements));
-
 	/* Loop through rest of row filling in histogram */
 	for (i=1; i<colCount; ++i)
 	    {
 	    int clusterIx = colToCluster[i];
-	    double val = sqlDouble(matrixRow[i]);
-	    int pos = clusterElements[clusterIx];
-	    if (pos >= clusterSize[clusterIx])
+	    char *textVal = matrixRow[i];
+	    // special case so common we parse out "0" inline
+	    double val = (textVal[0] == '0' && textVal[1] == 0) ? 0.0 : sqlDouble(textVal);
+	    int valCount = clusterElements[clusterIx];
+	    clusterElements[clusterIx] = valCount+1;
+	    if (doMedian)
+		{
+		if (valCount >= clusterSize[clusterIx])
 		    internalErr();
-	    clusterElements[clusterIx] = pos+1;
-	    clusterSamples[clusterIx][pos] = val;
+		clusterSamples[clusterIx][valCount] = val;
+		}
+	    else
 		clusterTotal[clusterIx] += val;
 	    }
 
-	/* Output info */
+	/* Output info - first from the bed, then our barchart */
 	for (i=0; i<bedRowSize; ++i)
 	    fprintf(f, "%s\t",  geneBedVal[i]);
 	fprintf(f, "%d\t", clusterCount);
 	for (i=0; i<clusterCount; ++i)
 	    {
 	    if (i != 0)
 	       fprintf(f, ",");
-	    if (clMean)
-		fprintf(f, "%g",  clusterTotal[i]/clusterElements[i]);
-	    else
+	    if (doMedian)
 		fprintf(f, "%g", doubleMedian(clusterElements[i], clusterSamples[i]));
+	    else
+		fprintf(f, "%g",  clusterTotal[i]/clusterElements[i]);
 	    }
 	
 	/* Data file offset info */
 	if (clDataOffset)
 	    fprintf(f, "\t%lld\t%lld",  (long long)lineFileTell(lf), (long long)lineLength);
 
 	fprintf(f, "\n");
 	}
-
-
+    dotForUser();
     }
 verbose(1, "%d genes found, %d missed\n", hitCount, missCount);
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
 clDataOffset = (optionExists("_dataOffset") || optionExists("dataOffset"));
 clMean = optionExists("mean");
 clusterMatrixToBarchartBed(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }