826d8721b1c1b0a8b1030056c90f14a8d319f5e8
kent
  Mon Dec 28 16:30:36 2020 -0800
First cut of tool designed to cluster big matrices by column.  This is a refactoring of clusterMatrixIntoBarchartBed to handle just the matrix-shrinking part, which is the most time consuming and perhaps useful elsewhere.

diff --git src/utils/matrixClusterColumns/matrixClusterColumns.c src/utils/matrixClusterColumns/matrixClusterColumns.c
new file mode 100644
index 0000000..65a92ea
--- /dev/null
+++ src/utils/matrixClusterColumns/matrixClusterColumns.c
@@ -0,0 +1,420 @@
+/* matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix
+ * with the same number of rows and generally many fewer columns. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "obscure.h"
+#include "fieldedTable.h"
+#include "sqlNum.h"
+
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix with\n"
+  "the same number of rows and generally many fewer columns.\n"
+  "usage:\n"
+  "   matrixClusterColumns input meta.tsv sample cluster output.tsv [clusterCol output2 ... ]\n"
+  "where:\n"
+  "   input is a matrix file either in tsv format or in mtx (Matrix Market) format\n"
+  "   meta.tsv is a table where the first row is field labels and the first column is sample ids\n"
+  "   sample is the name of the field with the sample (often single-cell) ids\n"
+  "   cluster is the name of the field with the cluster names\n"
+  "You can produce multiple clusterings in the same pass through the input matrix by specifying\n"
+  "additional cluster/output pairs in the command line.\n"
+  "options:\n"
+  "   -columnLabels=file.txt - a file with a line for each column, required for mtx inputs\n"
+  "   -rowLabels=file.txt - a file with a label for each row, also required for mtx inputs\n"
+  );
+}
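+
+/* Example invocation (the file and field names here are hypothetical):
+ *    matrixClusterColumns exprMatrix.tsv meta.tsv cell_id cell_type byType.tsv
+ * where meta.tsv (tab-separated) might start:
+ *    cell_id   cell_type   organ
+ *    cellA     T-cell      blood
+ *    cellB     B-cell      marrow
+ * Appending another clusterCol/output pair, for instance "organ byOrgan.tsv",
+ * produces a second clustering in the same pass through the matrix. */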
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {"columnLabels", OPTION_STRING},
+   {"rowLabels", OPTION_STRING},
+   {NULL, 0},
+};
+
+
+void readLineArray(char *fileName, int *retCount, char ***retLines)
+/* Return an array of strings, one for each line of file.  Return # of lines in file too */
+{
+/* This is sloppy with memory but it doesn't matter since we won't free it. */
+struct slName *el, *list = readAllLines(fileName);
+if (list == NULL)
+    errAbort("%s is empty", fileName);
+int count = slCount(list);
+char **lines;
+AllocArray(lines, count);
+int i;
+for (i=0, el=list; i<count; ++i, el = el->next)
+    {
+    lines[i] = el->name;
+    }
+*retCount = count;
+*retLines = lines;
+}
+
+int countNonzero(double *a, int size)
+/* Return number of nonzero items in array a */
+{
+int count = 0;
+while (--size >= 0)
+    if (*a++ != 0.0)
+        ++count;
+return count;
+}
+
+double sumArray(double *a, int size)
+/* Return sum of all items in array */
+{
+double sum = 0.0;
+while (--size >= 0)
+    sum += *a++;
+return sum;
+}
+
+struct vMatrix
+/* Virtual matrix - fast row-oriented access to either tsv or mtx files */
+    {
+    struct vMatrix *next;
+
+    /* TSV private fields */
+    struct lineFile *lf;	    // Line file for tab-sep case
+    struct fieldedTable *ft;	    // fielded table if a tab-sep file
+    double *rowAsNum;		    // Our fielded table result array of numbers
+    char **rowAsStrings;		    // Our row as an array of strings
+
+    /* Fields from here down can be read by users of this structure */
+    char **rowLabels;		    // If non-NULL array of labels for each row
+    char **colLabels;		    // Array of labels for each column
+    int colCount;		    // Number of columns in a row
+    int curRow;			    // Current row we are processing
+    struct dyString *rowLabel;	    // Label associated with this line
+    };
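+
+/* Typical use, mirroring the main loop later in this file ("matrix.tsv" is just
+ * an illustrative name):
+ *    struct vMatrix *v = vMatrixOpen("matrix.tsv", NULL, NULL);
+ *    double *row;
+ *    while ((row = vMatrixNextRow(v)) != NULL)
+ *        ... use v->colCount values in row and the label in v->rowLabel ...
+ *    vMatrixClose(&v);
+ */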
+
+struct vMatrix *vMatrixOpen(char *matrixFile, char *columnFile, char *rowFile)
+/* Figure out if it's an mtx or tsv file and open it */
+{
+/* Read in labels if there are any */
+char **columnLabels = NULL, **rowLabels = NULL;
+int columnCount = 0, rowCount = 0;
+if (columnFile != NULL)
+    readLineArray(columnFile, &columnCount, &columnLabels);
+if (rowFile != NULL)
+    readLineArray(rowFile, &rowCount, &rowLabels);
+
+struct vMatrix *v = needMem(sizeof(*v));
+struct lineFile *lf = v->lf = lineFileOpen(matrixFile, TRUE);
+struct fieldedTable *ft = v->ft = fieldedTableReadTabHeader(lf, NULL, 0);
+v->colCount = ft->fieldCount-1;	    // Don't include row label field 
+verbose(2, "v->colCount is %d inside vMatrixOpen for %s\n", v->colCount, matrixFile);
+v->rowAsNum = needHugeMem(v->colCount * sizeof(v->rowAsNum[0]));
+v->rowAsStrings = needHugeMem(ft->fieldCount * sizeof(v->rowAsStrings[0]));
+if (columnLabels == NULL)
+    columnLabels = ft->fields+1;	// +1 to skip over row label field
+v->rowLabels = rowLabels;
+v->colLabels = columnLabels;
+v->rowLabel = dyStringNew(32);
+return v;
+}
+
+void vMatrixClose(struct vMatrix **pV)
+/* Close up vMatrix */
+{
+struct vMatrix *v = *pV;
+if (v != NULL)
+    {
+    fieldedTableFree(&v->ft);
+    freeMem(v->rowAsNum);
+    freeMem(v->rowAsStrings);
+    dyStringFree(&v->rowLabel);
+    freez(pV);
+    }
+}
+
+double *vMatrixNextRow(struct vMatrix *v)
+/* Return next row of matrix or NULL if at end of file */
+{
+int colCount = v->colCount;
+if (!lineFileNextRow(v->lf, v->rowAsStrings, colCount+1))	// +1 for the row label field
+    return NULL;
+
+/* Save away the row label for later. */
+char *rowLabel;
+if (v->rowLabels)
+    rowLabel = v->rowLabels[v->curRow];
+else
+    rowLabel = v->rowAsStrings[0];
+dyStringClear(v->rowLabel);
+dyStringAppend(v->rowLabel, rowLabel);
+
+/* Convert ascii to floating point, with a little optimization for the many zeroes we usually see */
+int i;
+for (i=1; i<=colCount; ++i)
+    {
+    char *str = v->rowAsStrings[i];
+    double val = ((str[0] == '0' && str[1] == 0) ? 0.0 : sqlDouble(str));
+    v->rowAsNum[i-1] = val;
+    }
+v->curRow += 1;
+return v->rowAsNum;
+}
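+
+/* For instance (hypothetical line), a tsv row "GENE1<tab>0<tab>3.5<tab>0" with
+ * colCount of 3 yields rowLabel "GENE1" and rowAsNum {0.0, 3.5, 0.0}; the bare
+ * "0" fields skip the sqlDouble conversion entirely. */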
+
+struct clustering
+/* Stuff we need to cluster the matrix one way.  Several of these may be
+ * applied to the same input matrix in a single pass. */
+    {
+    struct clustering *next;
+    char *clusterField;	    /* Field to cluster on */
+    char *outputFile;	    /* Where to put result */
+    int clusterMetaIx;	    /* Index of cluster field in meta table */
+    int *colToCluster;	    /* Maps input matrix column to clustered column */
+    int clusterCount;   /* Number of different values in column we are clustering */
+    double *clusterTotal;  /* A place to hold totals for each cluster */
+    int *clusterElements;  /* A place to hold counts for each cluster */
+    double *clusterResults; /* Results after clustering */
+    FILE *file;		    /* output */
+    };
+
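+/* Illustration of colToCluster (hypothetical data): with matrix columns
+ * [cellA, cellB, cellC], meta assigning cellA->T, cellB->B, cellC->T, and the
+ * alphabetized cluster names [B, T], colToCluster would be {1, 0, 1} and the
+ * output matrix would have two data columns, B and T. */
+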
+
+struct clustering *clusteringNew(char *clusterField, char *outputFile, 
+    struct fieldedTable *metaTable, int sampleFieldIx, struct vMatrix *v)
+/* Make up a new clustering job structure */
+{
+struct clustering *job;
+AllocVar(job);
+job->clusterField = clusterField;
+job->outputFile = outputFile;
+int clusterFieldIx = job->clusterMetaIx = fieldedTableMustFindFieldIx(metaTable, clusterField);
+
+/* Make up hash of sample names with cluster values */
+struct hash *sampleHash = hashNew(0);	/* Keyed by sample id, value is cluster name */
+struct fieldedRow *fr;
+for (fr = metaTable->rowList; fr != NULL; fr = fr->next)
+    {
+    char **row = fr->row;
+    hashAdd(sampleHash, row[sampleFieldIx], row[clusterFieldIx]);
+    }
+
+/* Find all unique cluster names */
+struct slName *nameList = NULL;
+struct hash *uniqHash = hashNew(0);
+for (fr = metaTable->rowList; fr != NULL; fr = fr->next)
+    {
+    char *cluster = fr->row[clusterFieldIx];
+    if (hashLookup(uniqHash, cluster) == NULL)
+        {
+	slNameAddHead(&nameList, cluster);
+	hashAdd(uniqHash, cluster, NULL);
+	}
+    }
+hashFree(&uniqHash);
+
+/* Just alphabetize names for now */
+slNameSort(&nameList);
+verbose(2, "Got %d unique values for %s\n", slCount(nameList), clusterField);
+
+/* Make up hash that maps cluster names to cluster ids */
+struct hash *clusterHash = hashNew(0);	/* Keyed by cluster name, value is cluster index */
+struct slName *name;
+int i;
+for (name = nameList, i=0; name != NULL; name = name->next, ++i)
+    hashAddInt(clusterHash, name->name, i);
+int clusterCount = job->clusterCount = clusterHash->elCount;
+
+/* Make up array that has -1 where no cluster available, otherwise output index */
+int colCount = v->colCount;
+int *colToCluster = job->colToCluster = needHugeMem(colCount * sizeof(colToCluster[0]));
+int colIx;
+int unclusteredColumns = 0, missCount = 0;
+for (colIx=0; colIx < colCount; colIx = colIx+1)
+    {
+    char *colName = v->colLabels[colIx];
+    char *clusterName = hashFindVal(sampleHash, colName);
+    colToCluster[colIx] = -1;
+    if (clusterName != NULL)
+        {
+	int clusterId = hashIntValDefault(clusterHash, clusterName, -1);
+	colToCluster[colIx] = clusterId;
+	if (clusterId == -1)
+	    {
+	    verbose(3, "no cluster index found for cluster %s\n", clusterName);
+	    ++missCount;
+	    }
+	}
+    else
+	unclusteredColumns += 1;
+    }
+verbose(2, "%d total columns, %d unclustered, %d misses\n", 
+    colCount, unclusteredColumns, missCount);
+
+/* Allocate space for results for clustering one line */
+job->clusterResults = needHugeMem(clusterCount * sizeof(job->clusterResults[0]));
+
+/* Allocate a few more things */
+job->clusterTotal = needMem(clusterCount*sizeof(job->clusterTotal[0]));
+job->clusterElements = needMem(clusterCount*sizeof(job->clusterElements[0]));
+
+/* Open file - and write out header */
+FILE *f = job->file = mustOpen(job->outputFile, "w");
+if (v->ft->startsSharp)
+    fputc('#', f);
+
+/* First field name agrees with first column of matrix */
+fputs(v->ft->fields[0], f);
+
+/* Use our clusters for the rest of the names */
+for (name = nameList; name != NULL; name = name->next) 
+    {
+    fputc('\t', f);
+    fputs(name->name, f);
+    }
+fputc('\n', f);
+
+/* Clean up and return result */
+hashFree(&sampleHash);
+hashFree(&clusterHash);
+return job;
+}
+
+void clusterRow(struct clustering *job, struct vMatrix *v, double *a)
+/* Process a row in a, outputting in job->file */
+{
+/* Zero out cluster histogram */
+double *clusterTotal = job->clusterTotal;
+int *clusterElements = job->clusterElements;
+int clusterCount = job->clusterCount;
+int i;
+for (i=0; i<clusterCount; ++i)
+    {
+    clusterTotal[i] = 0.0;
+    clusterElements[i] = 0;
+    }
+
+/* Loop through rest of row filling in histogram */
+int colCount = v->colCount;
+int *colToCluster = job->colToCluster;
+for (i=0; i<colCount; ++i)
+    {
+    int clusterIx = colToCluster[i];
+    if (clusterIx >= 0)
+	{
+	double val = a[i];
+	int valCount = clusterElements[clusterIx];
+	clusterElements[clusterIx] = valCount+1;
+#ifdef SOON
+	if (doMedian)
+	    {
+	    if (valCount >= clusterSize[clusterIx])
+		internalErr();
+	    clusterSamples[clusterIx][valCount] = val;
+	    }
+	else
+#endif /* SOON */
+	    clusterTotal[clusterIx] += val;
+	}
+    /* Columns with no cluster assignment (colToCluster value of -1) are simply skipped. */
+    }
+
+/* Do output to file */
+FILE *f = job->file;
+fprintf(f, "%s", v->rowLabel->string);
+for (i=0; i<clusterCount; ++i)
+    {
+    fprintf(f, "\t");
+    double val;
+#ifdef SOON
+    if (doMedian)
+	val = doubleMedian(clusterElements[i], clusterSamples[i]);
+    else
+#endif /* SOON */
+	val = (clusterElements[i] > 0 ? clusterTotal[i]/clusterElements[i] : 0.0);	// avoid 0/0 for empty clusters
+    fprintf(f, "%g", val);
+    }
+
+/* Data file offset info */
+#ifdef SOON
+if (!clSimple)
+    fprintf(f, "\t%lld\t%lld",  (long long)lineFileTell(lf), (long long)lineLength);
+#endif /* SOON */
+
+fprintf(f, "\n");
+}
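+
+/* Worked example (hypothetical numbers): with colToCluster = {0, 0, 1} and an
+ * input row of 2, 4, 6, cluster 0 gets a total of 6 over 2 elements and
+ * cluster 1 gets 6 over 1 element, so the output row is "label<tab>3<tab>6". */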
+
+
+
+
+void matrixClusterColumns(char *matrixFile, char *metaFile, char *sampleField,
+    int outputCount, char **clusterFields, char **outputFiles)
+/* matrixClusterColumns - Group the columns of a matrix into clusters, and output a matrix
+ * with the same number of rows and generally many fewer columns. */
+{
+/* Load up metadata and make sure we have all of the cluster fields we need 
+ * and fill out array of clusterIx corresponding to clusterFields in metaFile. */
+struct fieldedTable *metaTable = fieldedTableFromTabFile(metaFile, metaFile, NULL, 0);
+int sampleFieldIx = fieldedTableMustFindFieldIx(metaTable, sampleField);
+struct hash *metaHash = fieldedTableIndex(metaTable, sampleField);
+verbose(1, "Read %d rows from %s\n", metaHash->elCount, metaFile);
+
+/* Load up input matrix and labels */
+char *columnFile = optionVal("columnLabels", NULL);
+char *rowFile = optionVal("rowLabels", NULL);
+struct vMatrix *v = vMatrixOpen(matrixFile, columnFile, rowFile);
+verbose(1, "matrix %s has %d fields\n", matrixFile, v->colCount);
+
+/* Create a clustering for each output and find index in metaTable for each. */
+struct clustering *jobList = NULL, *job;
+int i;
+for (i=0; i<outputCount; ++i)
+    {
+    job = clusteringNew(clusterFields[i], outputFiles[i], metaTable, sampleFieldIx, v);
+    slAddTail(&jobList, job);
+    }
+verbose(2, "Made %d clustering jobs\n", outputCount);
+
+
+/* Chug through big matrix a row at a time clustering */
+dotForUserInit(100);
+for (;;)
+    {
+    double *a = vMatrixNextRow(v);
+    if (a == NULL)
+        break;
+    for (job = jobList; job != NULL; job = job->next)
+        clusterRow(job, v, a);
+    dotForUser();
+    }
+fputc('\n', stderr);  // Cover last dotForUser
+
+/* Close files */
+for (job = jobList; job != NULL; job = job->next)
+    carefulClose(&job->file);
+
+vMatrixClose(&v);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+int minArgc = 6;
+if (argc < minArgc || ((argc-minArgc)%2)!=0)  // Force minimum, even number
+    usage();
+int outputCount = 1 + (argc-minArgc)/2;	      // Add one since at minimum have 1
+char *clusterFields[outputCount], *outputFiles[outputCount];
+int i;
+char **pair = argv + minArgc - 2;
+for (i=0; i<outputCount; ++i)
+    {
+    clusterFields[i] = pair[0];
+    outputFiles[i] = pair[1];
+    pair += 2;
+    }
+matrixClusterColumns(argv[1], argv[2], argv[3], outputCount, clusterFields, outputFiles);
+return 0;
+}