42fbe7842ac08e964f712072b62724f7dec7e89c kent Mon Jan 4 17:36:09 2021 -0800 Refactoring in prep for moving sparseRowMatrix stuff to library diff --git src/hca/hcaUnpack5/hcaUnpack5.c src/hca/hcaUnpack5/hcaUnpack5.c index 347019d..4e221dc 100644 --- src/hca/hcaUnpack5/hcaUnpack5.c +++ src/hca/hcaUnpack5/hcaUnpack5.c @@ -1,32 +1,32 @@ /* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser like better.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" #include "portable.h" #include "obscure.h" #include <hdf5.h> void usage() /* Explain usage and exit. */ { errAbort( - "hcaUnpack5 - Convert cellxgene hdf5 files to a directory filled with 3 things\n" + "hcaUnpack5 - Convert h5ad (scanpy) files to a directory filled with 3 things\n" "usage:\n" - " hcaUnpack5 input.h5 outDir\n" + " hcaUnpack5 input.h5ad outDir\n" "The output dir will be populated with exprMatrix.tsv, meta.tsv, and project.tsv\n" "where:\n" " exprMatrix.tsv has the cell x gene matrix with cells as columns. This includes\n" " the cell names in the first row and the gene names in the first column.\n." " meta.tsv has the cell-by-cell metadata. The first row is labels, and the first\n" " column corresponds with the cell names in exprMatrix\n" " gene.lst has the list of genes, one per line\n" "options:\n" " -noMatrix - skip the matrix making step\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"noMatrix", OPTION_BOOLEAN}, @@ -407,114 +407,139 @@ int size = dims[0]; char **array; AllocArray(array, size); hid_t type = cVarString(); h5dReadAll(hid, type, array); /* Clean up and go home */ H5Tclose(type); H5Dclose(hid); H5Sclose(space); *retData = array; return size; } -struct flipVal -/* helps us flip around things */ +struct sparseRowVal +/* Linked list of things in a row */ { - struct flipVal *next; + struct sparseRowVal *next; int x; float val; }; -struct sparseFlip -/* Struct to help flip a sparse matrix */ +struct sparseRowMatrix +/* A sparse matrix with fast row access in memory */ { - struct sparseFlip *next; + struct sparseRowMatrix *next; int xSize, ySize; /* Dimensions of our matrix */ struct lm *lm; /* Local memory pool, where row lists come from */ - struct flipVal **rows; + struct sparseRowVal **rows; }; -struct sparseFlip *sparseFlipNew(int xSize, int ySize) -/* Make up a new sparseFlipper structure */ +struct sparseRowMatrix *sparseRowMatrixNew(int xSize, int ySize) +/* Make up a new sparseRowMatrix structure */ { -struct sparseFlip *flip; -AllocVar(flip); -flip->xSize = xSize; -flip->ySize = ySize; -flip->lm = lmInit(0); -lmAllocArray(flip->lm, flip->rows, ySize); -return flip; +struct sparseRowMatrix *matrix; +AllocVar(matrix); +matrix->xSize = xSize; +matrix->ySize = ySize; +matrix->lm = lmInit(0); +lmAllocArray(matrix->lm, matrix->rows, ySize); +return matrix; } -void sparseFlipFree(struct sparseFlip **pFlip) -/* Free up resources associated with sparse flipper */ +void sparseRowMatrixFree(struct sparseRowMatrix **pMatrix) +/* Free up resources associated with sparse matrix */ { -struct sparseFlip *flip = *pFlip; -if (flip != NULL) +struct sparseRowMatrix *matrix = *pMatrix; +if (matrix != NULL) { - lmCleanup(&flip->lm); - freez(pFlip); + lmCleanup(&matrix->lm); + freez(pMatrix); } } -static void inline sparseFlipAdd(struct sparseFlip *flip, int x, int y, float val) +static void inline sparseRowMatrixAdd(struct sparseRowMatrix *matrix, int x, int y, float val) /* Add data to our sparse matrix */ { -struct flipVal *fv; -lmAllocVar(flip->lm, fv); +struct sparseRowVal *fv; +lmAllocVar(matrix->lm, fv); fv->x = x; fv->val = val; -slAddHead(&flip->rows[y], fv); +slAddHead(&matrix->rows[y], fv); } -void sparseFlipTsvBody(struct sparseFlip *flip, char **rowLabels, FILE *f) +void sparseRowMatrixTsvBody(struct sparseRowMatrix *matrix, char **rowLabels, + boolean withDots, FILE *f) /* Write body (but not header) of matrix to tsv file */ { -int xSize = flip->xSize; -int ySize = flip->ySize; +int xSize = matrix->xSize; +int ySize = matrix->ySize; int x,y; double row[xSize]; for (y=0; y<ySize; ++y) { zeroBytes(row, sizeof(row)); - struct flipVal *fv; - for (fv = flip->rows[y]; fv != NULL; fv = fv->next) + struct sparseRowVal *fv; + for (fv = matrix->rows[y]; fv != NULL; fv = fv->next) row[fv->x] = fv->val; fprintf(f, "%s", rowLabels[y]); for (x=0; x<xSize; ++x) + { fprintf(f, "\t%g", row[x]); + } fprintf(f, "\n"); + if (withDots) dotForUser(); } } +static void writeTsvRow(FILE *f, int rowSize, char **row) +/* Write out row of strings to a line in tab-sep file */ +{ +if (rowSize > 0) + { + fprintf(f, "%s", row[0]); + int i; + for (i=1; i<rowSize; ++i) + fprintf(f, "\t%s", row[i]); + } +fprintf(f, "\n"); +} + +void sparseRowMatrixSaveAsTsv(struct sparseRowMatrix *matrix, + char **columnLabels, char **rowLabels, boolean withDots, char *fileName) +{ +FILE *f = mustOpen(fileName, "w"); +verbose(1, "outputting %d row matrix, a dot every 100 rows\n", matrix->ySize); +fprintf(f, "gene\t"); +writeTsvRow(f, matrix->xSize, columnLabels); +if (withDots) + dotForUserInit(100); +sparseRowMatrixTsvBody(matrix, rowLabels, withDots, f); +verbose(1, "\n"); +carefulClose(&f); +} + void saveExprMatrix(struct hcaUnpack5 *context, char **rowLabels, char *fileName) /* Save out expression matrix. Just process it one line at a time so as * not to run out of memory */ { -/* Open output and Write out header */ -FILE *f = mustOpen(fileName, "w"); -fprintf(f, "gene_id"); +/* Get column with sample names, aka indexCol */ struct metaCol *indexCol = context->indexCol; -int i; -for (i=0; i<indexCol->size; ++i) - fprintf(f, "\t%s", indexCol->val.asString[i]); -fprintf(f, "\n"); hid_t file = context->file; /* Figure out size of primary data - that is number of cells with data */ hid_t data = h5dOpen(file, "X/data"); hid_t dataSpace = H5Dget_space(data); hsize_t dataDims[4]; int dataDimCount = H5Sget_simple_extent_dims(dataSpace, dataDims, NULL); if (dataDimCount != 1) internalErr(); long long dataSize = dataDims[0]; verbose(1, "Sparse matrix with %lld elements\n", dataSize); /* Load up array that contains offsets of each column within column-major data */ int *indptr = NULL; @@ -529,70 +554,70 @@ hsize_t indiDims[4]; int indiDimCount = H5Sget_simple_extent_dims(indiSpace, indiDims, NULL); if (indiDimCount != 1) internalErr(); int indiSize = indiDims[0]; verbose(2, "indi size is %d\n", indiSize); hsize_t indiColSpace = H5Dget_space(indi); hsize_t colOffset[1], colSize[1]; hsize_t dataColSpace = H5Dget_space(data); int sizeAlloc = 0; int *intiBuf = NULL; float *fValBuf = NULL; -/* Scan through matrix a column at a time, reading it into flipper */ -struct sparseFlip *flip = sparseFlipNew(context->cellCount, context->geneCount); +/* Scan through matrix a column at a time, reading it into matrix */ +struct sparseRowMatrix *matrix = sparseRowMatrixNew(context->cellCount, context->geneCount); int colIx; int colCount = context->cellCount; for (colIx = 0; colIx <colCount; ++colIx) { int start = indptr[colIx]; int end = indptr[colIx+1]; int size = end - start; if (size > sizeAlloc) { freeMem(intiBuf); freeMem(fValBuf); intiBuf = needMem(size * sizeof(intiBuf[0])); fValBuf = needMem(size * sizeof(fValBuf[0])); sizeAlloc = size; } hsize_t dim1[1] = {size}; hsize_t readMemSpace = H5Screate_simple(1, dim1, NULL); colOffset[0] = start; colSize[0] = size; H5Sselect_hyperslab(indiColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL); H5Sselect_hyperslab(dataColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL); h5dRead(indi, H5T_NATIVE_INT, readMemSpace, indiColSpace, H5P_DEFAULT, intiBuf); h5dRead(data, H5T_NATIVE_FLOAT, readMemSpace, dataColSpace, H5P_DEFAULT, fValBuf); int i; for (i=0; i<size; ++i) - sparseFlipAdd(flip, colIx, intiBuf[i], fValBuf[i]); + sparseRowMatrixAdd(matrix, colIx, intiBuf[i], fValBuf[i]); } -verbose(1, "outputting %d row matrix, a dot every 100 rows\n", context->geneCount); -dotForUserInit(100); -sparseFlipTsvBody(flip, rowLabels, f); -verbose(1, "\n"); + +/* Open output and Write out header */ +sparseRowMatrixSaveAsTsv(matrix, indexCol->val.asString, rowLabels, TRUE, fileName); + + /* Clean up and go home*/ -sparseFlipFree(&flip); +sparseRowMatrixFree(&matrix); freez(&intiBuf); freez(&fValBuf); -carefulClose(&f); } void saveArrayOfStrings(char **array, int size, char *fileName) /* Write out array of strings, pretty easy */ { FILE *f = mustOpen(fileName, "w"); int i; for (i=0; i<size; ++i) fprintf(f, "%s\n", array[i]); carefulClose(&f); } void hcaUnpack5(char *input, char *output, boolean withMatrix) /* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser like * better.. */