src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c b3291f717be7ee6ce33882e0c07cf815d7ac40c5

b3291f717be7ee6ce33882e0c07cf815d7ac40c5
kent
  Thu Dec 31 15:05:10 2020 -0800
Moved hashTsvBy() to obscure.c. Removed some debugging uglyfs.

diff --git src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c
index 1b7fe7e..19d2053 100644
--- src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c
+++ src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c
@@ -31,58 +31,30 @@
   "options:\n"
   "   -simple - don't store the position of gene in geneMatrix.tsv file in output\n"
   "   -median - use median (instead of mean)\n"
   "   -name2=twoColFile.tsv - get name2 from file where first col is same ase geneset.bed's name\n"
   );
 }
 
 /* Command line validation table: each option this program accepts and its type.  Passed to optionInit() in main(). */
 static struct optionSpec options[] = {
    {"simple", OPTION_BOOLEAN},   /* -simple flag; main() records its presence in clSimple */
    {"median", OPTION_BOOLEAN},   /* -median flag; main() records its presence in clMedian */
    {"name2", OPTION_STRING},     /* -name2=file; main() stores the value in clName2 */
    {NULL, 0},                    /* presumably the end-of-table marker - confirm against options.h */
 };
 
-struct hash *hashTsvBy(char *in, int keyColIx, int *retColCount)
-/* Read the tab-separated file 'in' and return a hash of its rows keyed by column keyColIx.  Each value is a copy of the row's column-pointer array made with lmCloneRow in hash->lm.  *retColCount is set to the column count of the first line read; a later line with a different count triggers errAbort. */
-{
-struct lineFile *lf = lineFileOpen(in, TRUE);
-struct hash *hash = hashNew(0);
-char *line = NULL, **row = NULL;
-int colCount = 0, colAlloc=0;	/* Columns as counted and as allocated */
-while (lineFileNextReal(lf, &line))
-    {
-    if (colCount == 0)  // first line read: learn the column count and size the row buffer
-        {
-	*retColCount = colCount = chopByChar(line, '\t', NULL, 0);  // NULL output array: only the returned count is used here
-	verbose(2, "Got %d columns in first real line\n", colCount);
-	colAlloc = colCount + 1;  // one spare slot so a line with extra columns can be noticed and reported below
-	lmAllocArray(hash->lm, row, colAlloc);
-	}
-    int count = chopByChar(line, '\t', row, colAlloc);
-    if (count != colCount)  // NOTE(review): also true when the line has too few columns, though the message only says "more"
-        {
-	errAbort("Expecting %d words, got more than that line %d of %s", 
-	    colCount, lf->lineIx, lf->fileName);
-	}
-    hashAdd(hash, row[keyColIx], lmCloneRow(hash->lm, row, colCount) );  // NOTE(review): keyColIx is not checked against colCount
-    }
-lineFileClose(&lf);
-return hash;
-}
-
 void hashSamplesAndClusters(char *tsvFile, 
     struct hash **retSampleHash, struct hash **retClusterHash)
 /* Read two column tsv file into a hash keyed by first column */
 {
 struct hash *sampleHash = hashNew(0);
 struct hash *clusterHash = hashNew(0);
 char *row[2];
 struct lineFile *lf = lineFileOpen(tsvFile, TRUE);
 while (lineFileNextRowTab(lf, row, ArraySize(row)) )
     {
     /* Find cluster in cluster hash, if it doesn't exist make it. */
     char *clusterName = row[1];
     struct hashEl *hel = hashLookup(clusterHash, clusterName);
     if (hel == NULL)
 	hel = hashAddInt(clusterHash, clusterName, 1);
@@ -211,30 +183,31 @@
     colToCluster[colIx] = -1;
     if (clusterName != NULL)
         {
 	int clusterId = hashIntValDefault(clusterToClusterIdHash, clusterName, -1);
 	colToCluster[colIx] = clusterId;
 	if (clusterId == -1)
 	    warn("%s is in expression matrix but not in sample cluster file", clusterName);
 	}
     }
 
 
 /* Set up row for reading one row of matrix at a time. */
 char **matrixRow;
 AllocArray(matrixRow, colAlloc);
 int hitCount = 0, missCount = 0;
+double sumTotal = 0;
 dotForUserInit(100);
 for (;;)
     {
     /* Fetch next line and remember how long it is.  Just skip over lines that are empty or
      * start with # character. */
     int lineLength = 0;
     char *line;
     if (!lineFileNext(lf, &line, &lineLength))
         break;
     char *s = skipLeadingSpaces(line);
     char c = s[0];
     if (c == 0 || c == '#')
         continue;
 
     /* Chop it into tabs */
@@ -260,32 +233,37 @@
 	char **geneBedVal = onePos->val;	// Get our bed as string array out of hash
 
 	/* Zero out cluster histogram */
 	int i;
 	for (i=0; i<clusterCount; ++i)
 	    {
 	    clusterTotal[i] = 0.0;
 	    clusterElements[i] = 0;
 	    }
 
 	/* Loop through rest of row filling in histogram */
 	for (i=1; i<colCount; ++i)
 	    {
 	    int clusterIx = colToCluster[i];
 	    char *textVal = matrixRow[i];
+if (clusterIx == clusterCount - 1)  //ugly
+    {
+    uglyf("%s %d %s\n", geneName, i, textVal);
+    }
 	    // special case so common we parse out "0" inline
 	    double val = (textVal[0] == '0' && textVal[1] == 0) ? 0.0 : sqlDouble(textVal);
+	    sumTotal += val;
 	    int valCount = clusterElements[clusterIx];
 	    clusterElements[clusterIx] = valCount+1;
 	    if (doMedian)
 		{
 		if (valCount >= clusterSize[clusterIx])
 		    internalErr();
 		clusterSamples[clusterIx][valCount] = val;
 		}
 	    else
 		clusterTotal[clusterIx] += val;
 	    }
 
 	/* Output info - first six from the bed, then name2, then our barchart */
 	for (i=0; i<6; ++i)
 	    fprintf(f, "%s\t",  geneBedVal[i]);
@@ -302,42 +280,49 @@
 	    }
 	else
 	    name2 = geneBedVal[name2Ix];
 	if (name2 == NULL)
 	    name2 = name;
 	fprintf(f, "%s\t", name2);
 
 	fprintf(f, "%d\t", clusterCount);
 	for (i=0; i<clusterCount; ++i)
 	    {
 	    if (i != 0)
 	       fprintf(f, ",");
 	    if (doMedian)
 		fprintf(f, "%g", doubleMedian(clusterElements[i], clusterSamples[i]));
 	    else
+		{
 		fprintf(f, "%g",  clusterTotal[i]/clusterElements[i]);
 		}
+	    }
 	
 	/* Data file offset info */
 	if (!clSimple)
 	    fprintf(f, "\t%lld\t%lld",  (long long)lineFileTell(lf), (long long)lineLength);
 
 	fprintf(f, "\n");
 	}
     dotForUser();
     }
 verbose(1, "\n%d genes found, %d (%0.2f%%) missed\n", hitCount, missCount, 100.0*missCount/(hitCount+missCount));
+if (!doMedian)
+    {
+    verbose(1, "matrix total %g, %d clusters, %g ave/cluster\n", 
+	sumTotal, clusterCount, sumTotal/clusterCount);
+    }
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line: parse options, insist on four positional arguments, copy option values into the cl* variables, then run clusterMatrixToBarchartBed on argv[1] through argv[4]. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)   /* program name plus four arguments; assumes optionInit has removed the options from argv - confirm */
     usage();
 clSimple = optionExists("simple");
 clMedian = optionExists("median");
 clName2 = optionVal("name2", clName2);   /* second argument is the current clName2, presumably the fallback when -name2 is absent */
 clusterMatrixToBarchartBed(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }