a3939547c53f1ccb39910e61bfa90a0c54310273
kent
  Tue Jan 5 00:40:36 2021 -0800
First cut of vRowMatrix to stream through a variety of matrices semi-transparently I hope.  For now just handles tsv though.

diff --git src/hca/hcaUnpack5/hcaUnpack5.c src/hca/hcaUnpack5/hcaUnpack5.c
index 4e221dc..d6d5b3f 100644
--- src/hca/hcaUnpack5/hcaUnpack5.c
+++ src/hca/hcaUnpack5/hcaUnpack5.c
@@ -1,23 +1,25 @@
-/* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser like better.. */
+/* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser 
+ * like better.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "localmem.h"
 #include "portable.h"
 #include "obscure.h"
+#include "sparseMatrix.h"
 #include <hdf5.h>
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hcaUnpack5 - Convert h5ad (scanpy) files to a directory filled with 3 things\n"
   "usage:\n"
   "   hcaUnpack5 input.h5ad outDir\n"
   "The output dir will be populated with exprMatrix.tsv, meta.tsv, and project.tsv\n"
   "where:\n"
   "    exprMatrix.tsv has the cell x gene matrix with cells as columns.  This includes\n"
   "             the cell names in the first row and the gene names in the first column.\n."
   "    meta.tsv has the cell-by-cell metadata.  The first row is labels, and the first\n"
   "             column corresponds with the cell names in exprMatrix\n"
@@ -407,133 +409,30 @@
 int size = dims[0];
 char **array;
 AllocArray(array, size);
 hid_t type = cVarString();
 h5dReadAll(hid, type, array);
 
 /* Clean up and go home */
 H5Tclose(type);
 H5Dclose(hid);
 H5Sclose(space);
 *retData = array;
 return size;
 }
 
 
-struct sparseRowVal
-/* Linked list of things in a row */
-    {
-    struct sparseRowVal *next;
-    int x;
-    float val;
-    };
-
-struct sparseRowMatrix
-/* A sparse matrix with fast row access in memory */
-    {
-    struct sparseRowMatrix *next;
-    int xSize, ySize;	/* Dimensions of our matrix */
-    struct lm *lm;	/* Local memory pool, where row lists come from */
-    struct sparseRowVal **rows;
-    };
-
-struct sparseRowMatrix *sparseRowMatrixNew(int xSize, int ySize)
-/* Make up a new sparseRowMatrix structure */
-{
-struct sparseRowMatrix *matrix;
-AllocVar(matrix);
-matrix->xSize = xSize;
-matrix->ySize = ySize;
-matrix->lm = lmInit(0);
-lmAllocArray(matrix->lm, matrix->rows, ySize);
-return matrix;
-}
-
-void sparseRowMatrixFree(struct sparseRowMatrix **pMatrix)
-/* Free up resources associated with sparse matrix  */
-{
-struct sparseRowMatrix *matrix = *pMatrix;
-if (matrix != NULL)
-    {
-    lmCleanup(&matrix->lm);
-    freez(pMatrix);
-    }
-}
-
-static void inline sparseRowMatrixAdd(struct sparseRowMatrix *matrix, int x, int y, float val)
-/* Add data to our sparse matrix */
-{
-struct sparseRowVal *fv;
-lmAllocVar(matrix->lm, fv);
-fv->x = x;
-fv->val = val;
-slAddHead(&matrix->rows[y], fv);
-}
-
-void sparseRowMatrixTsvBody(struct sparseRowMatrix *matrix, char **rowLabels, 
-    boolean withDots, FILE *f)
-/* Write body (but not header) of matrix to tsv file */
-{
-int xSize = matrix->xSize; 
-int ySize = matrix->ySize;
-int x,y;
-double row[xSize];
-for (y=0; y<ySize; ++y)
-    {
-    zeroBytes(row, sizeof(row));
-    struct sparseRowVal *fv;
-    for (fv = matrix->rows[y]; fv != NULL; fv = fv->next)
-	row[fv->x] = fv->val;
-    fprintf(f, "%s", rowLabels[y]);
-    for (x=0; x<xSize; ++x)
-	{
-        fprintf(f, "\t%g", row[x]);
-	}
-    fprintf(f, "\n");
-    if (withDots)
-	dotForUser();
-    }
-}
-
-static void writeTsvRow(FILE *f, int rowSize, char **row)
-/* Write out row of strings to a line in tab-sep file */
-{
-if (rowSize > 0)
-    {
-    fprintf(f, "%s", row[0]);
-    int i;
-    for (i=1; i<rowSize; ++i)
-        fprintf(f, "\t%s", row[i]);
-    }
-fprintf(f, "\n");
-}
-
-void sparseRowMatrixSaveAsTsv(struct sparseRowMatrix *matrix, 
-    char **columnLabels, char **rowLabels, boolean withDots, char *fileName)
-{
-FILE *f = mustOpen(fileName, "w");
-verbose(1, "outputting %d row matrix, a dot every 100 rows\n", matrix->ySize);
-fprintf(f, "gene\t");
-writeTsvRow(f, matrix->xSize, columnLabels);
-if (withDots)
-    dotForUserInit(100);
-sparseRowMatrixTsvBody(matrix, rowLabels, withDots, f);
-verbose(1, "\n");
-carefulClose(&f);
-}
-
-
 void saveExprMatrix(struct hcaUnpack5 *context, char **rowLabels, char *fileName)
 /* Save out expression matrix.  Just process it one line at a time so as
  * not to run out of memory */
 {
 /* Get column with sample names, aka indexCol */
 struct metaCol *indexCol = context->indexCol;
 
 hid_t file = context->file;
 
 /* Figure out size of primary data - that is number of cells with data */
 hid_t data = h5dOpen(file, "X/data");
 hid_t dataSpace = H5Dget_space(data);
 hsize_t     dataDims[4];
 int dataDimCount = H5Sget_simple_extent_dims(dataSpace, dataDims, NULL);
 if (dataDimCount != 1)
@@ -586,33 +485,31 @@
     hsize_t dim1[1] = {size};
     hsize_t readMemSpace = H5Screate_simple(1, dim1, NULL);
     colOffset[0] = start;
     colSize[0] = size;
     H5Sselect_hyperslab(indiColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL);
     H5Sselect_hyperslab(dataColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL);
     h5dRead(indi, H5T_NATIVE_INT, readMemSpace, indiColSpace, H5P_DEFAULT, intiBuf);
     h5dRead(data, H5T_NATIVE_FLOAT, readMemSpace, dataColSpace, H5P_DEFAULT, fValBuf);
 
     int i;
     for (i=0; i<size; ++i)
         sparseRowMatrixAdd(matrix, colIx, intiBuf[i], fValBuf[i]);
     }
 
 /* Open output and Write out header */
-sparseRowMatrixSaveAsTsv(matrix, indexCol->val.asString, rowLabels, TRUE, fileName);
-
-
+sparseRowMatrixSaveAsTsv(matrix, indexCol->val.asString, rowLabels, fileName);
 
 /* Clean up and go home*/
 sparseRowMatrixFree(&matrix);
 freez(&intiBuf);
 freez(&fValBuf);
 }
 
 void saveArrayOfStrings(char **array, int size, char *fileName)
 /* Write out array of strings, pretty easy */
 {
 FILE *f = mustOpen(fileName, "w");
 int i;
 for (i=0; i<size; ++i)
     fprintf(f, "%s\n", array[i]);
 carefulClose(&f);