ebfdcc9a044d15c0bd9c4355b2c92ff3d6f95024
kent
  Sun Dec 19 08:50:12 2021 -0800
Parallelizing memMatrixToTsv and moving it to a library.  Parallelizing column normalizing.

diff --git src/utils/matrixNormalize/matrixNormalize.c src/utils/matrixNormalize/matrixNormalize.c
index 5cf4322..df1a08a 100644
--- src/utils/matrixNormalize/matrixNormalize.c
+++ src/utils/matrixNormalize/matrixNormalize.c
@@ -1,239 +1,206 @@
 /* matrixNormalize - Normalize a matrix somehow - make it's columns or rows all sum to one or have vector length one.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "vMatrix.h"
 #include "pthreadDoList.h"
 
-int clThreads = 10;
-
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "matrixNormalize - Normalize a matrix somehow - make it's columns or rows all sum to one or have vector length one.\n"
   "usage:\n"
   "   matrixNormalize direction how inMatrix outMatrix\n"
   "where \"direction\" is one of\n"
   "   row - normalize rows to one\n"
   "   column - normalize columns to one\n"
   "and \"how\" is one of\n"
   "   sum - sum adds to one after normalization\n"
   "   length - Euclidian length as a vector adds to one\n"
   "options:\n"
   "   -target=val - use target val instead of one for normalizing\n"
-  "   -threads=N - number of threads to use, Expect limited help if you increase this\n"
-  "                Default is %d.  Only goes multitheaded in column mode and even then\n"
-  "                about 60%% of program is in the non-parallel bit.  Still, it helps\n"
-  , clThreads
+  "   -serial - disable use of pthreads to speed up via parallelization\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"target", OPTION_DOUBLE},
-   {"threads", OPTION_INT},
+   {"serial", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 boolean howIsLength(char *how)
 /* Return TRUE if how is "length",  FALSE if it is "sum", and abort otherwise */
 {
 if (sameWord(how, "length"))
     return TRUE;
 else if (sameWord(how, "sum"))
     return FALSE;
 errAbort("Unrecognized \"how\" %s", how);
 return FALSE;
 }
 
 void matrixNormalizeRows(char *inFile, boolean isLength, double target, char *outFile)
 /* Normalize matrix one row at a time. */
 {
 struct vRowMatrix *m = vRowMatrixOnTsv(inFile);
 FILE *f = mustOpen(outFile, "w");
 int size = m->xSize;
 double *row;
 char *label;
 while ((row = vRowMatrixNextRow(m, &label)) != NULL)
     {
     double scaleVal = target;
     double sum = 0.0;
     int i;
     if (isLength)
        {
        for (i=0; i<size; ++i)
 	    {
 	    double val = row[i];
             sum += val*val;
 	    }
        sum = sqrt(sum);
        }
     else
        {
        for (i=0; i<size; ++i)
             sum += row[i];
        }
     if (sum != 0.0)
 	scaleVal /= sum;
     for (i=0; i<size; ++i)
         row[i] *= scaleVal;
     fprintf(f, "%s", label);
     for (i=0; i<size; ++i)
 	{
 	double val = row[i];
 	if (val == 0.0)
 	   fputs("\t0", f);
 	else
 	    fprintf(f, "\t%g", row[i]);
 	}
     fprintf(f, "\n");
     }
 carefulClose(&f);
 vRowMatrixFree(&m);
 }
 
-void memMatrixToTsv(struct memMatrix *m, char *fileName)
-/* Output memory matrix to tab-sep file with labels */
-{
-FILE *f = mustOpen(fileName, "w");
-
-/* Print label row */
-fprintf(f, "%s", m->centerLabel);
-int x, xSize = m->xSize;
-for (x=0; x<xSize; ++x)
-     fprintf(f, "\t%s", m->xLabels[x]);
-fprintf(f, "\n");
-
-/* Print rest */
-/* Now output going through row by row */
-int y, ySize = m->ySize;
-for (y=0; y<ySize; ++y)
-   {
-   fprintf(f, "%s", m->yLabels[y]);
-   double *row = m->rows[y];
-   for (x=0; x<xSize; ++x)
-       {
-       double val = row[x];
-       if (val == 0.0)
-	   fputs("\t0", f);  // Common special case in these sparse matrices
-       else
-	   fprintf(f, "\t%g", row[x]);
-       }
-   fprintf(f, "\n");
-   }
-carefulClose(&f);
-}
-
 static void normCol(struct memMatrix *m, int x, double target, boolean isLength)
 /* Normalize column x of matrix to sum to target according to measure in isLength */
 {
 /* Calculate how big we are currently */
 double scaleVal = target;
 double sum = 0.0;
 int ySize = m->ySize;
 int y;
 if (isLength)
     {
     for (y=0; y<ySize; ++y)
 	{
 	double val = m->rows[y][x];
 	sum += val*val;
 	}
     sum = sqrt(sum);
     }
 else
     {
     for (y=0; y<ySize; ++y)
 	sum += m->rows[y][x];
     }
 
 /* Multiply ourselves by normalizing scale factor */
 if (sum != 0.0)
     {
     scaleVal /= sum;
     for (y=0; y<ySize; ++y)
 	m->rows[y][x] *= scaleVal;
     }
 }
 
 struct columnJob
 /* Multithreaded job that handles a column */
     {
     struct columnJob *next;
     int x;	/* Index of column withing matrix */
     boolean isLength;	/* Do sqrt(sum(squares)) type normalization */
     double target;	/* Want column to end up summing to this */
     };
 
 void columnWorker(void *item, void *context)
 /* A worker to execute a single column job */
 {
 struct columnJob *job = item;
 struct memMatrix *m = context;
 normCol(m, job->x, job->target, job->isLength);
 }
 
-void matrixNormalizeColumns(char *inFile, boolean isLength, double target, char *outFile)
+void matrixNormalizeColumns(char *inFile, boolean isLength, double target, char *outFile, 
+    int threads)
 /* Normalize matrix one row at a time. */
 {
 /* Open up input file */
+verboseTimeInit();
 struct memMatrix *m = memMatrixFromTsv(inFile);
+verboseTime(1, "Loaded %d x %d matrix from %s", m->xSize, m->ySize, inFile);
 
 /* Go through matrix by column (y-dimension) */
 int xSize = m->xSize;
 int x;
 
-if (clThreads > 1)
+if (threads > 1)
     {
-    int threadCount = min(clThreads, 100);
     struct columnJob *jobList = NULL;
     for (x=0; x<xSize; ++x)
 	{
 	struct columnJob *job;
 	AllocVar(job);
 	job->x = x;
 	job->isLength = isLength;
 	job->target = target;
 	slAddHead(&jobList, job);
 	}
     slReverse(&jobList);
-    pthreadDoList(threadCount, jobList, columnWorker, m);
+    pthreadDoList(threads, jobList, columnWorker, m);
     }
 else
     {
     for (x=0; x<xSize; ++x)
 	{
 	normCol(m, x, target, isLength);
 	}
     }
-/* Output and go home */
+verboseTime(1, "normalized matrix");
 memMatrixToTsv(m, outFile);
+verboseTime(1, "Wrote %s", outFile);
 }
 
 
 void matrixNormalize(char *direction, char *how, char *inFile, char *outFile)
 /* matrixNormalize - Normalize a matrix somehow - make it's columns or rows all sum to one or 
  * have vector length one.. */
 {
 boolean isLength = howIsLength(how);
 double target = optionDouble("target", 1.0);
+boolean serial = optionExists("serial");
+int threads = (serial ? 1 : 50);
 if (sameWord(direction, "row"))
     matrixNormalizeRows(inFile, isLength, target, outFile);
 else if (sameWord(direction, "column"))
-    matrixNormalizeColumns(inFile, isLength,  target, outFile);
+    matrixNormalizeColumns(inFile, isLength,  target, outFile, threads);
 else
     errAbort("Unrecognized direction %s", direction);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
-clThreads = optionInt("threads", clThreads);
 matrixNormalize(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }