src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c 992541b9ac7cdc7f292bc6cc35b8c471e15ffaea

992541b9ac7cdc7f292bc6cc35b8c471e15ffaea
kent
  Tue Dec 22 10:07:00 2020 -0800
Adding name2 option.  Making mean the default and median a flag.

diff --git src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c
index fead46b..da9f6dd 100644
--- src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c
+++ src/utils/clusterMatrixToBarChartBed/clusterMatrixToBarChartBed.c
@@ -1,53 +1,58 @@
 /* clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix 
  * and a gene bed file and a way to cluster samples. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "localmem.h"
 #include "obscure.h"
 #include "sqlNum.h"
 
 boolean clDataOffset = FALSE;
-boolean clMean = FALSE;
+boolean clMedian = FALSE;	/* Set from -median flag: use median instead of the default mean */
+char *clName2 = NULL;		/* File name from -name2 option; NULL when option not given */
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix\n"
   "and a gene bed file and a way to cluster samples.\n"
   "usage:\n"
   "   clusterMatrixToBarchartBed sampleClusters.tsv geneMatrix.tsv geneset.bed output.bed\n"
   "where:\n"
   "   sampleClusters.tsv is a two column tab separated file with sampleId and clusterId\n"
   "   geneMatrix.tsv has a row for each gene. The first row uses the same sampleId as above\n"
   "   geneset.bed has the maps the genes in the matrix (from it's first column) to the genome\n"
+  "        geneset.bed needs 6 standard bed fields.  Unless name2 is set it also needs a name2\n"
+  "        field as the last field\n"
   "   output.bed is the resulting bar chart, with one column per cluster\n"
   "options:\n"
   "   -dataOffset - store the position of gene in geneMatrix.tsv file in output\n"
-  "   -mean - use mean (instead of median)\n"
+  "   -median - use median (instead of mean)\n"
+  "   -name2=twoColFile.tsv - get name2 from file where first col is same as geneset.bed's name\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"dataOffset", OPTION_BOOLEAN},
    {"_dataOffset", OPTION_BOOLEAN},
-   {"mean", OPTION_BOOLEAN},
+   {"median", OPTION_BOOLEAN},	/* -median: use median instead of mean */
+   {"name2", OPTION_STRING},	/* -name2=twoColFile.tsv: name to name2 lookup file */
    {NULL, 0},
 };
 
 struct hash *hashTsvBy(char *in, int keyColIx, int *retColCount)
 /* Return a hash of rows keyed by the given column */
 {
 struct lineFile *lf = lineFileOpen(in, TRUE);
 struct hash *hash = hashNew(0);
 char *line = NULL, **row = NULL;
 int colCount = 0, colAlloc=0;	/* Columns as counted and as allocated */
 while (lineFileNextReal(lf, &line))
     {
     if (colCount == 0)
         {
 	*retColCount = colCount = chopByChar(line, '\t', NULL, 0);
@@ -85,37 +90,61 @@
     else
 	hel->val = ((char *)hel->val)+1;    // Increment hash pointer as per hashIncInt
     char *clusterStableName = hel->name;	// This is allocated in clusterHash
     hashAdd(sampleHash, row[0], clusterStableName);
     }
 lineFileClose(&lf);
 *retSampleHash = sampleHash;
 *retClusterHash = clusterHash;
 }
 
 void clusterMatrixToBarchartBed(char *sampleClusters, char *matrixTsv, char *geneBed, char *output)
 /* clusterMatrixToBarchartBed - Compute a barchart bed file from  a gene matrix 
  * and a gene bed file and a way to cluster samples. */
 {
 /* Figure out if we need to do medians etc */
-boolean doMedian = !clMean;
+boolean doMedian = clMedian;
 
 /* Load up the gene set */
 verbose(1, "clusterMatrixToBarchartBed(%s,%s,%s,%s)\n", sampleClusters, matrixTsv, geneBed, output);
 int bedRowSize = 0;
 struct hash *geneHash = hashTsvBy(geneBed, 3, &bedRowSize);
-verbose(1, "%d genes in %s\n", geneHash->elCount, geneBed);
+verbose(1, "%d columns about %d genes in %s\n", bedRowSize, geneHash->elCount, geneBed);
+
+/* If -name2 was given, load its two column file as a hash of rows keyed by column 0 */
+struct hash *nameToName2 = NULL;
+if (clName2 != NULL)
+    {
+    int colCount = 0;
+    nameToName2 = hashTsvBy(clName2, 0, &colCount);
+    if (colCount != 2)
+        errAbort("Expecting %s to be a two column tab separated file", clName2);
+    }
+
+/* Six standard BED fields are always needed; a trailing name2 field too unless -name2 supplies it */
+int geneBedMinSize = 6;
+int name2Ix = bedRowSize - 1;	    // Last field if it is in bed
+if (clName2 == NULL)
+    geneBedMinSize += 1;	    // name2 has to come from the bed itself
+if (bedRowSize < geneBedMinSize)
+    {
+    if (clName2 == NULL)
+	errAbort("%s needs to have at least 6 standard BED fields and a name2 field\n", geneBed);
+    else
+	errAbort("%s needs to have at least 6 standard BED fields\n", geneBed);
+    }
+
 
 /* Load up the sample clustering */
 struct hash *sampleHash = NULL, *clusterHash = NULL;
 hashSamplesAndClusters(sampleClusters, &sampleHash, &clusterHash);
 int clusterCount = clusterHash->elCount;
 verbose(1, "%d samples and %d clusters in %s\n", sampleHash->elCount, clusterCount,
     sampleClusters);
 if (clusterCount <= 1 || clusterCount >= 10000)
     errAbort("%d is not a good number of clusters", clusterCount);
 double clusterTotal[clusterCount];
 int clusterElements[clusterCount];
 
 /* Alphabetize cluster names  */
 char *clusterNames[clusterCount];
 struct hashEl *hel;
@@ -246,52 +275,70 @@
 	    char *textVal = matrixRow[i];
 	    // special case so common we parse out "0" inline
 	    double val = (textVal[0] == '0' && textVal[1] == 0) ? 0.0 : sqlDouble(textVal);
 	    int valCount = clusterElements[clusterIx];
 	    clusterElements[clusterIx] = valCount+1;
 	    if (doMedian)
 		{
 		if (valCount >= clusterSize[clusterIx])
 		    internalErr();
 		clusterSamples[clusterIx][valCount] = val;
 		}
 	    else
 		clusterTotal[clusterIx] += val;
 	    }
 
-	/* Output info - first from the bed, then our barchart */
-	for (i=0; i<bedRowSize; ++i)
+	/* Output info - first six from the bed, then name2, then our barchart */
+	for (i=0; i<6; ++i)	    // Safe: bedRowSize >= 6 was checked above
 	    fprintf(f, "%s\t",  geneBedVal[i]);
+
+	char *name = geneBedVal[3];	// By bed definition it's fourth field
+	char *name2 = NULL;
+	if (nameToName2 != NULL)
+	    {
+	    char **namedRow = hashFindVal(nameToName2, name);
+	    if (namedRow != NULL)
+		name2 = namedRow[1];	    // [0] is name, [1] is name2
+	    else
+	        warn("Can't find %s in %s", name, clName2);
+	    }
+	else
+	    name2 = geneBedVal[name2Ix];    // Last bed field holds name2
+	if (name2 == NULL)		    // Not found in -name2 file
+	    name2 = name;		    // so fall back to the gene name
+	fprintf(f, "%s\t", name2);
+
 	fprintf(f, "%d\t", clusterCount);
 	for (i=0; i<clusterCount; ++i)
 	    {
 	    if (i != 0)
 	       fprintf(f, ",");
 	    if (doMedian)
 		fprintf(f, "%g", doubleMedian(clusterElements[i], clusterSamples[i]));
 	    else
 		fprintf(f, "%g",  clusterTotal[i]/clusterElements[i]);
 	    }
 	
 	/* Data file offset info */
 	if (clDataOffset)
 	    fprintf(f, "\t%lld\t%lld",  (long long)lineFileTell(lf), (long long)lineLength);
 
 	fprintf(f, "\n");
 	}
     dotForUser();
     }
 verbose(1, "%d genes found, %d missed\n", hitCount, missCount);
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
 clDataOffset = (optionExists("_dataOffset") || optionExists("dataOffset"));
-clMean = optionExists("mean");
+clMedian = optionExists("median");	// Mean is now the default; -median asks for medians
+clName2 = optionVal("name2", clName2);	// Default passed in is the initial NULL
 clusterMatrixToBarchartBed(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }