src/utils/matrixClusterColumns/matrixClusterColumns.c ab7847c47ac5507a7e126691bb78826ed4ba345d

ab7847c47ac5507a7e126691bb78826ed4ba345d
chmalee
  Mon Sep 13 16:34:47 2021 -0700
Staging GTEx single nuclei gene expression track, refs #29954

diff --git src/utils/matrixClusterColumns/matrixClusterColumns.c src/utils/matrixClusterColumns/matrixClusterColumns.c
index d0f47bd..ff70afa 100644
--- src/utils/matrixClusterColumns/matrixClusterColumns.c
+++ src/utils/matrixClusterColumns/matrixClusterColumns.c
@@ -18,37 +18,39 @@
 {
 errAbort(
   "matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix with\n"
   "the same number of rows and generally much fewer columns. Combines columns by taking mean.\n"
   "usage:\n"
   "   matrixClusterColumns inMatrix.tsv meta.tsv cluster outMatrix.tsv outStats.tsv [cluster2 outMatrix2.tsv outStats2.tsv ... ]\n"
   "where:\n"
   "   inMatrix.tsv is a file in tsv format with cell labels in first row and gene labels in first column\n"
   "   meta.tsv is a table where the first row is field labels and the first column is sample ids\n"
   "   cluster is the name of the field with the cluster names\n"
   "You can produce multiple clusterings in the same pass through the input matrix by specifying\n"
   "additional cluster/outMatrix/outStats triples in the command line.\n"
   "options:\n"
   "   -makeIndex=index.tsv - output index tsv file with <matrix-col1><input-file-pos><line-len>\n"
   "   -median if set ouput median rather than mean cluster value\n"
+  "   -excludeZeros if set exclude zeros when calculating mean/median\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"makeIndex", OPTION_STRING},
    {"median", OPTION_BOOLEAN},
+   {"excludeZeros", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 
 void readLineArray(char *fileName, int *retCount, char ***retLines)
 /* Return an array of strings, one for each line of file.  Return # of lines in file too */
 {
 /* This is sloppy with memory but it doesn't matter since we won't free it. */
 struct slName *el, *list = readAllLines(fileName);
 if (list == NULL)
     errAbort("%s is empty", fileName);
 int count = slCount(list);
 char **lines;
 AllocArray(lines, count);
 int i;
@@ -124,43 +126,44 @@
     {
     struct clustering *next;
     char *clusterField;	    /* Field to cluster on */
     char *outMatrixFile;    /* Where to put matrix result */
     char *outStatsFile;	    /* Where to put stats result */
     int clusterMetaIx;	    /* Index of cluster field in meta table */
     int *colToCluster;	    /* Maps input matrix column to clustered column */
     int clusterCount;   /* Number of different values in column we are clustering */
     double *clusterTotal;  /* A place to hold totals for each cluster */
     double *clusterGrandTotal;	/* Total over all rows */
     int *clusterElements;  /* A place to hold counts for each cluster */
     double *clusterResults; /* Results after clustering */
     struct hash *clusterSizeHash;   /* Keyed by cluster, int value is elements in cluster */
     char **clusterNames;	    /* Holds name of each cluster */
     int *clusterSizes;	    /* An array that holds size of each cluster */
+    boolean excludeZeros; /* If true exclude zero vals from calc */
 
     /* Things needed by median handling */
     boolean doMedian;	/* If true we calculate median */
     double **clusterSamples; /* An array that holds an array big enough for all vals in cluster. */
 
     struct dyString **chunkLinesOut; /* parallel output */
     double **chunkSubtotals;	     /* Totals */
     FILE *matrixFile;		     /* serial output */
     };
 
 
 struct clustering *clusteringNew(char *clusterField, char *outMatrixFile, char *outStatsFile,
-    struct fieldedTable *metaTable, struct ccMatrix *v, boolean doMedian)
+    struct fieldedTable *metaTable, struct ccMatrix *v, boolean doMedian, boolean excludeZeros)
 /* Make up a new clustering structure */
 {
 /* Check that all column names in matrix are unique */
 int colCount = v->colCount;
 char **colLabels = v->colLabels;
 struct hash *uniqColHash = hashNew(0);
 int colIx;
 for (colIx=0; colIx < colCount; colIx = colIx+1)
     {
     char *label = colLabels[colIx];
     if (hashLookup(uniqColHash, label) == NULL)
 	hashAdd(uniqColHash, label, NULL);
     else
         errAbort("Duplicated column label %s in input matrix", label);
     }
@@ -226,30 +229,32 @@
 
 if (doMedian)
     {	
     /* Allocate arrays to hold number of samples and all sample vals for each cluster */
     clustering->doMedian = doMedian;
     AllocArray(clustering->clusterSamples, clusterCount);
     int clusterIx;
     for (clusterIx = 0; clusterIx < clusterCount; ++clusterIx)
 	{
 	double *samples;
 	AllocArray(samples, clustering->clusterSizes[clusterIx]);
 	clustering->clusterSamples[clusterIx] = samples;
 	}
     }
 
+clustering->excludeZeros = excludeZeros;
+
 
 /* Make up array that has -1 where no cluster available, otherwise output index, also
  * hash up all column labels. */
 int *colToCluster = clustering->colToCluster = needHugeMem(colCount * sizeof(colToCluster[0]));
 int unclusteredColumns = 0, missCount = 0;
 for (colIx=0; colIx < colCount; colIx = colIx+1)
     {
     char *colName = colLabels[colIx];
     char *clusterName = hashFindVal(sampleHash, colName);
     colToCluster[colIx] = -1;
     if (clusterName != NULL)
         {
 	int clusterId = hashIntValDefault(clusterIxHash, clusterName, -1);
 	colToCluster[colIx] = clusterId;
 	if (clusterId == -1)
@@ -332,30 +337,32 @@
     {
     clusterTotal[i] = 0.0;
     clusterElements[i] = 0;
     }
 
 /* Loop through rest of row filling in histogram */
 int *colToCluster = clustering->colToCluster;
 boolean doMedian = clustering->doMedian;
 for (i=0; i<colCount; ++i)
     {
     int clusterIx = colToCluster[i];
     if (clusterIx >= 0)
 	{
 	double val = a[i];
 	int valCount = clusterElements[clusterIx];
+        if (clustering->excludeZeros && val == 0.0)
+            continue;
 	clusterElements[clusterIx] = valCount+1;
 	clusterTotal[clusterIx] += val;
 	if (doMedian)
 	    {
 	    if (valCount >= clustering->clusterSizes[clusterIx])
 		internalErr();
 	    clustering->clusterSamples[clusterIx][valCount] = val;
 	    }
 	}
     }
 
 /* Do output to outstrng and do grand totalling */
 dyStringClear(out);
 dyStringPrintf(out, "%s", rowLabel);
 for (i=0; i<clusterCount; ++i)
@@ -443,55 +450,55 @@
     }
 
 /* Then do the clustering */
 struct clustering *clustering;
 for (clustering = lii->clusteringList; clustering != NULL; clustering = clustering->next)
       {
       int chunkIx = lii->chunkIx;
       clusterRow(clustering, xSize, rowLabel, 
 	    lii->vals, lii->totalingTemp, lii->elementsTemp,
 	    clustering->chunkLinesOut[chunkIx], clustering->chunkSubtotals[chunkIx]);
       }
 }
 
 void matrixClusterColumns(char *matrixFile, char *metaFile, char *sampleField,
     int outputCount, char **clusterFields, char **outMatrixFiles, char **outStatsFiles,
-    char *outputIndex, boolean doMedian)
+    char *outputIndex, boolean doMedian, boolean excludeZeros)
 /* matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix 
  * the with same number of rows and generally much fewer columns.. */
 {
 FILE *fIndex = NULL;
 if (outputIndex)
     fIndex = mustOpen(outputIndex, "w");
 
 /* Load up metadata and make sure we have all of the cluster fields we need 
  * and fill out array of clusterIx corresponding to clusterFields in metaFile. */
 struct fieldedTable *metaTable = fieldedTableFromTabFile(metaFile, metaFile, NULL, 0);
 struct hash *metaHash = fieldedTableIndex(metaTable, sampleField);
 verbose(1, "Read %d rows from %s\n", metaHash->elCount, metaFile);
 
 /* Load up input matrix first line at least */
 struct ccMatrix *v = ccMatrixOpen(matrixFile);
 verbose(1, "matrix %s has %d fields\n", matrixFile, v->colCount);
 
 /* Create a clustering for each output and find index in metaTable for each. */
 struct clustering *clusteringList = NULL, *clustering;
 int i;
 for (i=0; i<outputCount; ++i)
     {
     clustering = clusteringNew(clusterFields[i], outMatrixFiles[i], outStatsFiles[i], 
-			metaTable, v, doMedian);
+			metaTable, v, doMedian, excludeZeros);
     slAddTail(&clusteringList, clustering);
     }
 
 /* Set up buffers for pthread workers */
 struct lineIoItem chunks[chunkMaxSize];
 for (i=0; i<chunkMaxSize; ++i)
     {
     struct lineIoItem *chunk = &chunks[i];
     chunk->fileName = matrixFile;
     chunk->clusteringList = clusteringList;
     chunk->chunkIx = i;
     chunk->lineIn = dyStringNew(0);
     chunk->rowLabel = dyStringNew(0);
     AllocArray(chunk->vals, v->colCount);
     AllocArray(chunk->totalingTemp, v->colCount);
@@ -575,18 +582,18 @@
 int minArgc = 6;
 if (argc < minArgc || ((argc-minArgc)%3)!=0)  // Force minimum, even number
     usage();
 int outputCount = 1 + (argc-minArgc)/3;	      // Add one since at minimum have 1
 char *clusterFields[outputCount], *outMatrixFiles[outputCount], *outStatsFiles[outputCount];
 int i;
 char **triples = argv + minArgc - 3;
 for (i=0; i<outputCount; ++i)
     {
     clusterFields[i] = triples[0];
     outMatrixFiles[i] = triples[1];
     outStatsFiles[i] = triples[2];
     triples += 3;
     }
 matrixClusterColumns(argv[1], argv[2], argv[3], 
-    outputCount, clusterFields, outMatrixFiles, outStatsFiles, makeIndex, optionExists("median"));
+    outputCount, clusterFields, outMatrixFiles, outStatsFiles, makeIndex, optionExists("median"), optionExists("excludeZeros"));
 return 0;
 }