42fbe7842ac08e964f712072b62724f7dec7e89c
kent
  Mon Jan 4 17:36:09 2021 -0800
Refactoring in prep for moving sparseRowMatrix stuff to library

diff --git src/hca/hcaUnpack5/hcaUnpack5.c src/hca/hcaUnpack5/hcaUnpack5.c
index 347019d..4e221dc 100644
--- src/hca/hcaUnpack5/hcaUnpack5.c
+++ src/hca/hcaUnpack5/hcaUnpack5.c
@@ -1,32 +1,32 @@
 /* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser like better.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "localmem.h"
 #include "portable.h"
 #include "obscure.h"
 #include <hdf5.h>
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "hcaUnpack5 - Convert cellxgene hdf5 files to a directory filled with 3 things\n"
+  "hcaUnpack5 - Convert h5ad (scanpy) files to a directory filled with 3 things\n"
   "usage:\n"
-  "   hcaUnpack5 input.h5 outDir\n"
+  "   hcaUnpack5 input.h5ad outDir\n"
   "The output dir will be populated with exprMatrix.tsv, meta.tsv, and project.tsv\n"
   "where:\n"
   "    exprMatrix.tsv has the cell x gene matrix with cells as columns.  This includes\n"
   "             the cell names in the first row and the gene names in the first column.\n."
   "    meta.tsv has the cell-by-cell metadata.  The first row is labels, and the first\n"
   "             column corresponds with the cell names in exprMatrix\n"
   "    gene.lst has the list of genes, one per line\n"
   "options:\n"
   "   -noMatrix - skip the matrix making step\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"noMatrix", OPTION_BOOLEAN},
@@ -407,114 +407,139 @@
 int size = dims[0];
 char **array;
 AllocArray(array, size);
 hid_t type = cVarString();
 h5dReadAll(hid, type, array);
 
 /* Clean up and go home */
 H5Tclose(type);
 H5Dclose(hid);
 H5Sclose(space);
 *retData = array;
 return size;
 }
 
 
-struct flipVal
-/* helps us flip around things */
+struct sparseRowVal
+/* Linked list of things in a row */
     {
-    struct flipVal *next;
+    struct sparseRowVal *next;
     int x;
     float val;
     };
 
-struct sparseFlip
-/* Struct to help flip a sparse matrix */
+struct sparseRowMatrix
+/* A sparse matrix with fast row access in memory */
     {
-    struct sparseFlip *next;
+    struct sparseRowMatrix *next;
     int xSize, ySize;	/* Dimensions of our matrix */
     struct lm *lm;	/* Local memory pool, where row lists come from */
-    struct flipVal **rows;
+    struct sparseRowVal **rows;
     };
 
-struct sparseFlip *sparseFlipNew(int xSize, int ySize)
-/* Make up a new sparseFlipper structure */
+struct sparseRowMatrix *sparseRowMatrixNew(int xSize, int ySize)
+/* Make up a new sparseRowMatrix structure */
 {
-struct sparseFlip *flip;
-AllocVar(flip);
-flip->xSize = xSize;
-flip->ySize = ySize;
-flip->lm = lmInit(0);
-lmAllocArray(flip->lm, flip->rows, ySize);
-return flip;
+struct sparseRowMatrix *matrix;
+AllocVar(matrix);
+matrix->xSize = xSize;
+matrix->ySize = ySize;
+matrix->lm = lmInit(0);
+lmAllocArray(matrix->lm, matrix->rows, ySize);
+return matrix;
 }
 
-void sparseFlipFree(struct sparseFlip **pFlip)
-/* Free up resources associated with sparse flipper */
+void sparseRowMatrixFree(struct sparseRowMatrix **pMatrix)
+/* Free up resources associated with sparse matrix  */
 {
-struct sparseFlip *flip = *pFlip;
-if (flip != NULL)
+struct sparseRowMatrix *matrix = *pMatrix;
+if (matrix != NULL)
     {
-    lmCleanup(&flip->lm);
-    freez(pFlip);
+    lmCleanup(&matrix->lm);
+    freez(pMatrix);
     }
 }
 
-static void inline sparseFlipAdd(struct sparseFlip *flip, int x, int y, float val)
+static void inline sparseRowMatrixAdd(struct sparseRowMatrix *matrix, int x, int y, float val)
 /* Add data to our sparse matrix */
 {
-struct flipVal *fv;
-lmAllocVar(flip->lm, fv);
+struct sparseRowVal *fv;
+lmAllocVar(matrix->lm, fv);
 fv->x = x;
 fv->val = val;
-slAddHead(&flip->rows[y], fv);
+slAddHead(&matrix->rows[y], fv);
 }
 
-void sparseFlipTsvBody(struct sparseFlip *flip, char **rowLabels, FILE *f)
+void sparseRowMatrixTsvBody(struct sparseRowMatrix *matrix, char **rowLabels, 
+    boolean withDots, FILE *f)
 /* Write body (but not header) of matrix to tsv file */
 {
-int xSize = flip->xSize; 
-int ySize = flip->ySize;
+int xSize = matrix->xSize; 
+int ySize = matrix->ySize;
 int x,y;
 double row[xSize];
 for (y=0; y<ySize; ++y)
     {
     zeroBytes(row, sizeof(row));
-    struct flipVal *fv;
-    for (fv = flip->rows[y]; fv != NULL; fv = fv->next)
+    struct sparseRowVal *fv;
+    for (fv = matrix->rows[y]; fv != NULL; fv = fv->next)
 	row[fv->x] = fv->val;
     fprintf(f, "%s", rowLabels[y]);
     for (x=0; x<xSize; ++x)
+	{
         fprintf(f, "\t%g", row[x]);
+	}
     fprintf(f, "\n");
+    if (withDots)
 	dotForUser();
     }
 }
 
+static void writeTsvRow(FILE *f, int rowSize, char **row)
+/* Write out row of strings to a line in tab-sep file */
+{
+if (rowSize > 0)
+    {
+    fprintf(f, "%s", row[0]);
+    int i;
+    for (i=1; i<rowSize; ++i)
+        fprintf(f, "\t%s", row[i]);
+    }
+fprintf(f, "\n");
+}
+
+void sparseRowMatrixSaveAsTsv(struct sparseRowMatrix *matrix, 
+    char **columnLabels, char **rowLabels, boolean withDots, char *fileName)
+{
+FILE *f = mustOpen(fileName, "w");
+verbose(1, "outputting %d row matrix, a dot every 100 rows\n", matrix->ySize);
+fprintf(f, "gene\t");
+writeTsvRow(f, matrix->xSize, columnLabels);
+if (withDots)
+    dotForUserInit(100);
+sparseRowMatrixTsvBody(matrix, rowLabels, withDots, f);
+verbose(1, "\n");
+carefulClose(&f);
+}
+
 
 void saveExprMatrix(struct hcaUnpack5 *context, char **rowLabels, char *fileName)
 /* Save out expression matrix.  Just process it one line at a time so as
  * not to run out of memory */
 {
-/* Open output and Write out header */
-FILE *f = mustOpen(fileName, "w");
-fprintf(f, "gene_id");
+/* Get column with sample names, aka indexCol */
 struct metaCol *indexCol = context->indexCol;
-int i;
-for (i=0; i<indexCol->size; ++i)
-    fprintf(f, "\t%s", indexCol->val.asString[i]);
-fprintf(f, "\n");
 
 hid_t file = context->file;
 
 /* Figure out size of primary data - that is number of cells with data */
 hid_t data = h5dOpen(file, "X/data");
 hid_t dataSpace = H5Dget_space(data);
 hsize_t     dataDims[4];
 int dataDimCount = H5Sget_simple_extent_dims(dataSpace, dataDims, NULL);
 if (dataDimCount != 1)
     internalErr();
 long long dataSize = dataDims[0];
 verbose(1, "Sparse matrix with %lld elements\n", dataSize);
 
 /* Load up array that contains offsets of each column within column-major data */
 int *indptr = NULL;
@@ -529,70 +554,70 @@
 hsize_t indiDims[4];
 int indiDimCount = H5Sget_simple_extent_dims(indiSpace, indiDims, NULL);
 if (indiDimCount != 1)
     internalErr();
 int indiSize = indiDims[0];
 verbose(2, "indi size is %d\n", indiSize);
 
 hsize_t indiColSpace = H5Dget_space(indi);
 hsize_t colOffset[1], colSize[1];
 hsize_t dataColSpace = H5Dget_space(data);
 
 int sizeAlloc = 0;
 int *intiBuf = NULL;
 float *fValBuf = NULL;
 
-/* Scan through matrix a column at a time, reading it into flipper */
-struct sparseFlip *flip = sparseFlipNew(context->cellCount, context->geneCount);
+/* Scan through matrix a column at a time, reading it into matrix  */
+struct sparseRowMatrix *matrix = sparseRowMatrixNew(context->cellCount, context->geneCount);
 int colIx;
 int colCount = context->cellCount;
 for (colIx = 0; colIx <colCount; ++colIx)
     {
     int start = indptr[colIx];
     int end = indptr[colIx+1];
     int size = end - start;
     if (size > sizeAlloc)
         {
 	freeMem(intiBuf);
 	freeMem(fValBuf);
 	intiBuf = needMem(size * sizeof(intiBuf[0]));
 	fValBuf = needMem(size * sizeof(fValBuf[0]));
 	sizeAlloc = size;
 	}
     hsize_t dim1[1] = {size};
     hsize_t readMemSpace = H5Screate_simple(1, dim1, NULL);
     colOffset[0] = start;
     colSize[0] = size;
     H5Sselect_hyperslab(indiColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL);
     H5Sselect_hyperslab(dataColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL);
     h5dRead(indi, H5T_NATIVE_INT, readMemSpace, indiColSpace, H5P_DEFAULT, intiBuf);
     h5dRead(data, H5T_NATIVE_FLOAT, readMemSpace, dataColSpace, H5P_DEFAULT, fValBuf);
 
     int i;
     for (i=0; i<size; ++i)
-        sparseFlipAdd(flip, colIx, intiBuf[i], fValBuf[i]);
+        sparseRowMatrixAdd(matrix, colIx, intiBuf[i], fValBuf[i]);
     }
-verbose(1, "outputting %d row matrix, a dot every 100 rows\n", context->geneCount);
-dotForUserInit(100);
-sparseFlipTsvBody(flip, rowLabels, f);
-verbose(1, "\n");
+
+/* Open output and Write out header */
+sparseRowMatrixSaveAsTsv(matrix, indexCol->val.asString, rowLabels, TRUE, fileName);
+
+
 
 /* Clean up and go home*/
-sparseFlipFree(&flip);
+sparseRowMatrixFree(&matrix);
 freez(&intiBuf);
 freez(&fValBuf);
-carefulClose(&f);
 }
 
 void saveArrayOfStrings(char **array, int size, char *fileName)
 /* Write out array of strings, pretty easy */
 {
 FILE *f = mustOpen(fileName, "w");
 int i;
 for (i=0; i<size; ++i)
     fprintf(f, "%s\n", array[i]);
 carefulClose(&f);
 }
 
 void hcaUnpack5(char *input, char *output, boolean withMatrix)
 /* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser like 
  * better.. */