a3939547c53f1ccb39910e61bfa90a0c54310273 kent Tue Jan 5 00:40:36 2021 -0800 First cut of vRowMatrix to stream through a variety of matrices semi-transparently I hope. For now just handles tsv though. diff --git src/hca/hcaUnpack5/hcaUnpack5.c src/hca/hcaUnpack5/hcaUnpack5.c index 4e221dc..d6d5b3f 100644 --- src/hca/hcaUnpack5/hcaUnpack5.c +++ src/hca/hcaUnpack5/hcaUnpack5.c @@ -1,23 +1,25 @@ -/* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser like better.. */ +/* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser + * like better.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" #include "portable.h" #include "obscure.h" +#include "sparseMatrix.h" #include void usage() /* Explain usage and exit. */ { errAbort( "hcaUnpack5 - Convert h5ad (scanpy) files to a directory filled with 3 things\n" "usage:\n" " hcaUnpack5 input.h5ad outDir\n" "The output dir will be populated with exprMatrix.tsv, meta.tsv, and project.tsv\n" "where:\n" " exprMatrix.tsv has the cell x gene matrix with cells as columns. This includes\n" " the cell names in the first row and the gene names in the first column.\n." " meta.tsv has the cell-by-cell metadata. 
The first row is labels, and the first\n" " column corresponds with the cell names in exprMatrix\n" @@ -407,133 +409,30 @@ int size = dims[0]; char **array; AllocArray(array, size); hid_t type = cVarString(); h5dReadAll(hid, type, array); /* Clean up and go home */ H5Tclose(type); H5Dclose(hid); H5Sclose(space); *retData = array; return size; } -struct sparseRowVal -/* Linked list of things in a row */ - { - struct sparseRowVal *next; - int x; - float val; - }; - -struct sparseRowMatrix -/* A sparse matrix with fast row access in memory */ - { - struct sparseRowMatrix *next; - int xSize, ySize; /* Dimensions of our matrix */ - struct lm *lm; /* Local memory pool, where row lists come from */ - struct sparseRowVal **rows; - }; - -struct sparseRowMatrix *sparseRowMatrixNew(int xSize, int ySize) -/* Make up a new sparseRowMatrix structure */ -{ -struct sparseRowMatrix *matrix; -AllocVar(matrix); -matrix->xSize = xSize; -matrix->ySize = ySize; -matrix->lm = lmInit(0); -lmAllocArray(matrix->lm, matrix->rows, ySize); -return matrix; -} - -void sparseRowMatrixFree(struct sparseRowMatrix **pMatrix) -/* Free up resources associated with sparse matrix */ -{ -struct sparseRowMatrix *matrix = *pMatrix; -if (matrix != NULL) - { - lmCleanup(&matrix->lm); - freez(pMatrix); - } -} - -static void inline sparseRowMatrixAdd(struct sparseRowMatrix *matrix, int x, int y, float val) -/* Add data to our sparse matrix */ -{ -struct sparseRowVal *fv; -lmAllocVar(matrix->lm, fv); -fv->x = x; -fv->val = val; -slAddHead(&matrix->rows[y], fv); -} - -void sparseRowMatrixTsvBody(struct sparseRowMatrix *matrix, char **rowLabels, - boolean withDots, FILE *f) -/* Write body (but not header) of matrix to tsv file */ -{ -int xSize = matrix->xSize; -int ySize = matrix->ySize; -int x,y; -double row[xSize]; -for (y=0; yrows[y]; fv != NULL; fv = fv->next) - row[fv->x] = fv->val; - fprintf(f, "%s", rowLabels[y]); - for (x=0; x 0) - { - fprintf(f, "%s", row[0]); - int i; - for (i=1; iySize); -fprintf(f, 
"gene\t"); -writeTsvRow(f, matrix->xSize, columnLabels); -if (withDots) - dotForUserInit(100); -sparseRowMatrixTsvBody(matrix, rowLabels, withDots, f); -verbose(1, "\n"); -carefulClose(&f); -} - - void saveExprMatrix(struct hcaUnpack5 *context, char **rowLabels, char *fileName) /* Save out expression matrix. Just process it one line at a time so as * not to run out of memory */ { /* Get column with sample names, aka indexCol */ struct metaCol *indexCol = context->indexCol; hid_t file = context->file; /* Figure out size of primary data - that is number of cells with data */ hid_t data = h5dOpen(file, "X/data"); hid_t dataSpace = H5Dget_space(data); hsize_t dataDims[4]; int dataDimCount = H5Sget_simple_extent_dims(dataSpace, dataDims, NULL); if (dataDimCount != 1) @@ -586,33 +485,31 @@ hsize_t dim1[1] = {size}; hsize_t readMemSpace = H5Screate_simple(1, dim1, NULL); colOffset[0] = start; colSize[0] = size; H5Sselect_hyperslab(indiColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL); H5Sselect_hyperslab(dataColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL); h5dRead(indi, H5T_NATIVE_INT, readMemSpace, indiColSpace, H5P_DEFAULT, intiBuf); h5dRead(data, H5T_NATIVE_FLOAT, readMemSpace, dataColSpace, H5P_DEFAULT, fValBuf); int i; for (i=0; ival.asString, rowLabels, TRUE, fileName); - - +sparseRowMatrixSaveAsTsv(matrix, indexCol->val.asString, rowLabels, fileName); /* Clean up and go home*/ sparseRowMatrixFree(&matrix); freez(&intiBuf); freez(&fValBuf); } void saveArrayOfStrings(char **array, int size, char *fileName) /* Write out array of strings, pretty easy */ { FILE *f = mustOpen(fileName, "w"); int i; for (i=0; i