42fbe7842ac08e964f712072b62724f7dec7e89c kent Mon Jan 4 17:36:09 2021 -0800 Refactoring in prep for moving sparseRowMatrix stuff to library diff --git src/hca/hcaUnpack5/hcaUnpack5.c src/hca/hcaUnpack5/hcaUnpack5.c index 347019d..4e221dc 100644 --- src/hca/hcaUnpack5/hcaUnpack5.c +++ src/hca/hcaUnpack5/hcaUnpack5.c @@ -1,699 +1,724 @@ /* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser like better.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" #include "portable.h" #include "obscure.h" #include <hdf5.h> void usage() /* Explain usage and exit. */ { errAbort( - "hcaUnpack5 - Convert cellxgene hdf5 files to a directory filled with 3 things\n" + "hcaUnpack5 - Convert h5ad (scanpy) files to a directory filled with 3 things\n" "usage:\n" - " hcaUnpack5 input.h5 outDir\n" + " hcaUnpack5 input.h5ad outDir\n" "The output dir will be populated with exprMatrix.tsv, meta.tsv, and project.tsv\n" "where:\n" " exprMatrix.tsv has the cell x gene matrix with cells as columns. This includes\n" " the cell names in the first row and the gene names in the first column.\n." " meta.tsv has the cell-by-cell metadata. The first row is labels, and the first\n" " column corresponds with the cell names in exprMatrix\n" " gene.lst has the list of genes, one per line\n" "options:\n" " -noMatrix - skip the matrix making step\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"noMatrix", OPTION_BOOLEAN}, {NULL, 0}, }; hid_t h5fOpen(char *fileName, int accessType) /* Open up file or die trying. accessType should be * H5F_ACC_RDONLY, H5F_ACC_RDWR, etc. */ { hid_t ret = H5Fopen(fileName, accessType, H5P_DEFAULT); if (ret < 0) errAbort("Couldn't open %s", fileName); return ret; } hid_t h5dOpen(hid_t parent, char *name) /* Open up a subpart or die trying */ { hid_t ret = H5Dopen(parent, name, H5P_DEFAULT); H5_DLL ssize_t H5Iget_name(hid_t id, char *name/*out*/, size_t size); if (ret < 0) { char parentName[256]; H5Iget_name(parent, parentName, sizeof(parentName)); errAbort("ERROR: Couldn't find dataset %s in %s", name, parentName); } return ret; } hid_t h5gOpen(hid_t parent, char *name) /* Open up a subpart or die trying */ { hid_t ret = H5Gopen(parent, name, H5P_DEFAULT); if (ret < 0) { char parentName[256]; H5Iget_name(parent, parentName, sizeof(parentName)); errAbort("ERROR: Couldn't find group %s in %s", name, parentName); } return ret; } void h5lIterate(hid_t hid, H5L_iterate_t callback, void *context) /* Iterate over a list. Requires a callback */ { herr_t status = H5Literate(hid, H5_INDEX_NAME, H5_ITER_NATIVE, NULL, callback, context); if (status < 0) internalErr(); // does this ever happen? } void h5dReadAll(hid_t hid, hid_t type, void *buf) /* Do the read, and pray the buf is pointing to someplace that can hold it... */ { herr_t status = H5Dread(hid, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf); if (status < 0) { char name[256]; H5Iget_name(hid, name, sizeof(name)); errAbort("ERROR: Couldn't read from %s", name); } } void h5dRead(hid_t file, hid_t memType, hid_t memSpace, hid_t fileSpace, hid_t pList, void *buf) /* Do the read, and pray the buf is pointing to someplace that can hold it... */ { herr_t status = H5Dread(file, memType, memSpace, fileSpace, pList, buf); if (status < 0) { char name[256]; H5Iget_name(file, name, sizeof(name)); errAbort("ERROR: Couldn't read from %s", name); } } struct vocab /* A controlled vocab we can refer to by short numbers instead of strings */ { struct vocab *next; char *name; /* Column this vocab is associated with */ int valCount; /* Number of distinct values */ char **stringVals; /* array of strings */ }; struct metaCol /* One of these for each metadata column /field */ { struct metaCol *next; char *name; /* field name */ int size; /* size of column, ie number of rows */ H5T_class_t classt; /* class */ struct vocab *vocab; /* Some columns have associated vocabs */ union { int *asVocab; int *asInt; double *asDouble; char **asString;} val; }; struct metaCol *metaColNew(const char *name) /* Build up a new metaCol of given name */ { struct metaCol *col; AllocVar(col); col->name = cloneString(name); return col; } struct hcaUnpack5 /* There is just one of these for our program - it's the context * we give most of our h5 callbacks */ { struct hcaUnpack5 *next; hid_t file; /* Open hdf5 file */ struct vocab *vocabList; /* List of all restricted vocabs */ struct hash *vocabHash; /* Hash of vocabs keyed by vocab->name */ struct metaCol *metaList; /* List of all metaCols */ struct hash *metaHash; /* Hash of all metaCols keyed by metaCol->name */ struct metaCol *indexCol; /* Column that has cell IDs */ int cellCount; /* Number of cells - rows in meta.tsv cols-1 in exprHash */ int geneCount; /* Number of genes */ }; struct vocab *vocabNew(const char *name) /* Make a new, largely empty, vocab structure */ { struct vocab *vocab; AllocVar(vocab); vocab->name = cloneString(name); return vocab; } hid_t cVarString() /* Return hid_t that'll be usual C variable length 0 terminated strings */ { hid_t memtype = H5Tcopy (H5T_C_S1); H5Tset_size (memtype, H5T_VARIABLE); H5Tset_cset(memtype, H5T_CSET_UTF8) ; return memtype; } void readVocab(struct vocab *vocab, hid_t hid, struct hcaUnpack5 *context) /* Read vocab from given hid */ { hid_t space = H5Dget_space (hid); hsize_t dims[1]; H5Sget_simple_extent_dims (space, dims, NULL); vocab->valCount = dims[0]; AllocArray(vocab->stringVals, vocab->valCount); /* Set us up for zero-terminated UTF8 (or ascii) strings */ hid_t memtype = cVarString(); /* Read it */ h5dReadAll(hid, memtype, vocab->stringVals); /* Clean up */ H5Sclose(space); H5Tclose(memtype); } int readCol(struct metaCol *col, hid_t hid, struct hcaUnpack5 *context) /* Read metaCol from given hid */ { hid_t space = H5Dget_space (hid); hsize_t dims[1]; H5Sget_simple_extent_dims (space, dims, NULL); int size = col->size = dims[0]; hid_t type = H5Dget_type(hid); H5T_class_t classt = col->classt = H5Tget_class(type); struct vocab *vocab = hashFindVal(context->vocabHash, col->name); if (vocab != NULL) { int *buf; AllocArray(buf, size); col->val.asVocab = buf; h5dReadAll(hid, H5T_NATIVE_INT, buf); } else { switch (classt) { case H5T_INTEGER: { int *buf; AllocArray(buf, size); col->val.asInt = buf; h5dReadAll(hid, H5T_NATIVE_INT, buf); break; } case H5T_FLOAT: { double *buf; AllocArray(buf, size); col->val.asDouble = buf; h5dReadAll(hid, H5T_NATIVE_DOUBLE, buf); break; } case H5T_STRING: { char **buf; AllocArray(buf, size); col->val.asString = buf; h5dReadAll(hid, cVarString(), buf); break; } default: warn("Skipping type %d", type); break; } } /* Clean up and go home */ H5Sclose(space); H5Tclose(type); return size; } herr_t vocabCallback(hid_t loc_id, const char *name, const H5L_info_t *info, void *operator_data) /* Callback from H5Iterate that we use to build up a column that has * an assocated vocabulary */ { struct hcaUnpack5 *context = operator_data; herr_t status; H5O_info_t infobuf; /* * Get type of the object and display its name and type. * The name of the object is passed to this function by * the Library. */ status = H5Oget_info_by_name(loc_id, name, &infobuf, H5P_DEFAULT); if (status < 0) errAbort("missing status for %s", name); if (infobuf.type == H5O_TYPE_DATASET) { struct vocab *vocab = vocabNew(name); hashAdd(context->vocabHash, vocab->name, vocab); slAddHead(&context->vocabList, vocab); } return 0; } herr_t obsCallback(hid_t loc_id, const char *name, const H5L_info_t *info, void *operator_data) /* Callback from H5Iterate that we use to build up a column that has * an assocated vocabulary */ { struct hcaUnpack5 *context = operator_data; herr_t status; H5O_info_t infobuf; /* * Get type of the object and display its name and type. * The name of the object is passed to this function by * the Library. */ status = H5Oget_info_by_name(loc_id, name, &infobuf, H5P_DEFAULT); if (status < 0) errAbort("missing status for %s", name); if (infobuf.type == H5O_TYPE_DATASET) { struct metaCol *col = metaColNew(name); hashAdd(context->metaHash, col->name, col); slAddHead(&context->metaList, col); struct vocab *vocab = hashFindVal(context->vocabHash, col->name); if (vocab != NULL) col->vocab = vocab; } return 0; } void saveMeta(struct hcaUnpack5 *context, char *fileName) /* Save the metaList to fileName */ { FILE *f = mustOpen(fileName, "w"); fprintf(f, "#"); fprintf(f, "_index"); struct metaCol *col; for (col = context->metaList; col != NULL; col = col->next) { fprintf(f, "\t%s", col->name); } fprintf(f, "\n"); struct metaCol *indexCol = context->indexCol; int colSize = context->cellCount; int i; for (i=0; i<colSize; ++i) { fprintf(f, "%s", indexCol->val.asString[i]); for (col = context->metaList; col != NULL; col = col->next) { struct vocab *vocab = col->vocab; if (vocab != NULL) fprintf(f, "\t%s", vocab->stringVals[col->val.asVocab[i]]); else { switch (col->classt) { case H5T_INTEGER: fprintf(f, "\t%d", col->val.asInt[i]); break; case H5T_FLOAT: fprintf(f, "\t%g", col->val.asDouble[i]); break; case H5T_STRING: fprintf(f, "\t%s", col->val.asString[i]); break; default: internalErr(); break; } } } fprintf(f, "\n"); } carefulClose(&f); } int h5dReadFloatArray(hid_t file, char *path, float **retData) /* Load up float array from file into memory */ { hid_t hid = h5dOpen(file, path); hsize_t space = H5Dget_space(hid); hsize_t dims[8]; int dimCount = H5Sget_simple_extent_dims(space, dims, NULL); if (dimCount != 1) errAbort("Expecting %s to be one dimensional, but it has %d dimensions", path, dimCount); int size = dims[0]; float *array; AllocArray(array, size); h5dReadAll(hid, H5T_NATIVE_FLOAT, array); /* Clean up and go home */ H5Dclose(hid); H5Sclose(space); *retData = array; return size; } int h5dReadIntArray(hid_t file, char *path, int **retData) /* Load up integer array from file into memory */ { hid_t hid = h5dOpen(file, path); hsize_t space = H5Dget_space(hid); hsize_t dims[8]; int dimCount = H5Sget_simple_extent_dims(space, dims, NULL); if (dimCount != 1) errAbort("Expecting %s to be one dimensional, but it has %d dimensions", path, dimCount); int size = dims[0]; int *array; AllocArray(array, size); h5dReadAll(hid, H5T_NATIVE_INT, array); /* Clean up and go home */ H5Dclose(hid); H5Sclose(space); *retData = array; return size; } int h5dReadStringArray(hid_t file, char *path, char ***retData) /* Load up string array from file into memory */ { hid_t hid = h5dOpen(file, path); hsize_t space = H5Dget_space(hid); hsize_t dims[8]; int dimCount = H5Sget_simple_extent_dims(space, dims, NULL); if (dimCount != 1) errAbort("Expecting %s to be one dimensional, but it has %d dimensions", path, dimCount); int size = dims[0]; char **array; AllocArray(array, size); hid_t type = cVarString(); h5dReadAll(hid, type, array); /* Clean up and go home */ H5Tclose(type); H5Dclose(hid); H5Sclose(space); *retData = array; return size; } -struct flipVal -/* helps us flip around things */ +struct sparseRowVal +/* Linked list of things in a row */ { - struct flipVal *next; + struct sparseRowVal *next; int x; float val; }; -struct sparseFlip -/* Struct to help flip a sparse matrix */ +struct sparseRowMatrix +/* A sparse matrix with fast row access in memory */ { - struct sparseFlip *next; + struct sparseRowMatrix *next; int xSize, ySize; /* Dimensions of our matrix */ struct lm *lm; /* Local memory pool, where row lists come from */ - struct flipVal **rows; + struct sparseRowVal **rows; }; -struct sparseFlip *sparseFlipNew(int xSize, int ySize) -/* Make up a new sparseFlipper structure */ +struct sparseRowMatrix *sparseRowMatrixNew(int xSize, int ySize) +/* Make up a new sparseRowMatrix structure */ { -struct sparseFlip *flip; -AllocVar(flip); -flip->xSize = xSize; -flip->ySize = ySize; -flip->lm = lmInit(0); -lmAllocArray(flip->lm, flip->rows, ySize); -return flip; +struct sparseRowMatrix *matrix; +AllocVar(matrix); +matrix->xSize = xSize; +matrix->ySize = ySize; +matrix->lm = lmInit(0); +lmAllocArray(matrix->lm, matrix->rows, ySize); +return matrix; } -void sparseFlipFree(struct sparseFlip **pFlip) -/* Free up resources associated with sparse flipper */ +void sparseRowMatrixFree(struct sparseRowMatrix **pMatrix) +/* Free up resources associated with sparse matrix */ { -struct sparseFlip *flip = *pFlip; -if (flip != NULL) +struct sparseRowMatrix *matrix = *pMatrix; +if (matrix != NULL) { - lmCleanup(&flip->lm); - freez(pFlip); + lmCleanup(&matrix->lm); + freez(pMatrix); } } -static void inline sparseFlipAdd(struct sparseFlip *flip, int x, int y, float val) +static void inline sparseRowMatrixAdd(struct sparseRowMatrix *matrix, int x, int y, float val) /* Add data to our sparse matrix */ { -struct flipVal *fv; -lmAllocVar(flip->lm, fv); +struct sparseRowVal *fv; +lmAllocVar(matrix->lm, fv); fv->x = x; fv->val = val; -slAddHead(&flip->rows[y], fv); +slAddHead(&matrix->rows[y], fv); } -void sparseFlipTsvBody(struct sparseFlip *flip, char **rowLabels, FILE *f) +void sparseRowMatrixTsvBody(struct sparseRowMatrix *matrix, char **rowLabels, + boolean withDots, FILE *f) /* Write body (but not header) of matrix to tsv file */ { -int xSize = flip->xSize; -int ySize = flip->ySize; +int xSize = matrix->xSize; +int ySize = matrix->ySize; int x,y; double row[xSize]; for (y=0; y<ySize; ++y) { zeroBytes(row, sizeof(row)); - struct flipVal *fv; - for (fv = flip->rows[y]; fv != NULL; fv = fv->next) + struct sparseRowVal *fv; + for (fv = matrix->rows[y]; fv != NULL; fv = fv->next) row[fv->x] = fv->val; fprintf(f, "%s", rowLabels[y]); for (x=0; x<xSize; ++x) + { fprintf(f, "\t%g", row[x]); + } fprintf(f, "\n"); + if (withDots) dotForUser(); } } +static void writeTsvRow(FILE *f, int rowSize, char **row) +/* Write out row of strings to a line in tab-sep file */ +{ +if (rowSize > 0) + { + fprintf(f, "%s", row[0]); + int i; + for (i=1; i<rowSize; ++i) + fprintf(f, "\t%s", row[i]); + } +fprintf(f, "\n"); +} + +void sparseRowMatrixSaveAsTsv(struct sparseRowMatrix *matrix, + char **columnLabels, char **rowLabels, boolean withDots, char *fileName) +{ +FILE *f = mustOpen(fileName, "w"); +verbose(1, "outputting %d row matrix, a dot every 100 rows\n", matrix->ySize); +fprintf(f, "gene\t"); +writeTsvRow(f, matrix->xSize, columnLabels); +if (withDots) + dotForUserInit(100); +sparseRowMatrixTsvBody(matrix, rowLabels, withDots, f); +verbose(1, "\n"); +carefulClose(&f); +} + void saveExprMatrix(struct hcaUnpack5 *context, char **rowLabels, char *fileName) /* Save out expression matrix. Just process it one line at a time so as * not to run out of memory */ { -/* Open output and Write out header */ -FILE *f = mustOpen(fileName, "w"); -fprintf(f, "gene_id"); +/* Get column with sample names, aka indexCol */ struct metaCol *indexCol = context->indexCol; -int i; -for (i=0; i<indexCol->size; ++i) - fprintf(f, "\t%s", indexCol->val.asString[i]); -fprintf(f, "\n"); hid_t file = context->file; /* Figure out size of primary data - that is number of cells with data */ hid_t data = h5dOpen(file, "X/data"); hid_t dataSpace = H5Dget_space(data); hsize_t dataDims[4]; int dataDimCount = H5Sget_simple_extent_dims(dataSpace, dataDims, NULL); if (dataDimCount != 1) internalErr(); long long dataSize = dataDims[0]; verbose(1, "Sparse matrix with %lld elements\n", dataSize); /* Load up array that contains offsets of each column within column-major data */ int *indptr = NULL; int indptrSize = h5dReadIntArray(file, "X/indptr", &indptr); verbose(2, "indptr size is %d\n", indptrSize); if (indptrSize - 1 != indexCol->size) errAbort("Disagreement between X/indptr size (%d) and obs/_index size (%d)", indptrSize - 1, indexCol->size); hid_t indi = h5dOpen(file, "X/indices"); hsize_t indiSpace = H5Dget_space(indi); hsize_t indiDims[4]; int indiDimCount = H5Sget_simple_extent_dims(indiSpace, indiDims, NULL); if (indiDimCount != 1) internalErr(); int indiSize = indiDims[0]; verbose(2, "indi size is %d\n", indiSize); hsize_t indiColSpace = H5Dget_space(indi); hsize_t colOffset[1], colSize[1]; hsize_t dataColSpace = H5Dget_space(data); int sizeAlloc = 0; int *intiBuf = NULL; float *fValBuf = NULL; -/* Scan through matrix a column at a time, reading it into flipper */ -struct sparseFlip *flip = sparseFlipNew(context->cellCount, context->geneCount); +/* Scan through matrix a column at a time, reading it into matrix */ +struct sparseRowMatrix *matrix = sparseRowMatrixNew(context->cellCount, context->geneCount); int colIx; int colCount = context->cellCount; for (colIx = 0; colIx <colCount; ++colIx) { int start = indptr[colIx]; int end = indptr[colIx+1]; int size = end - start; if (size > sizeAlloc) { freeMem(intiBuf); freeMem(fValBuf); intiBuf = needMem(size * sizeof(intiBuf[0])); fValBuf = needMem(size * sizeof(fValBuf[0])); sizeAlloc = size; } hsize_t dim1[1] = {size}; hsize_t readMemSpace = H5Screate_simple(1, dim1, NULL); colOffset[0] = start; colSize[0] = size; H5Sselect_hyperslab(indiColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL); H5Sselect_hyperslab(dataColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL); h5dRead(indi, H5T_NATIVE_INT, readMemSpace, indiColSpace, H5P_DEFAULT, intiBuf); h5dRead(data, H5T_NATIVE_FLOAT, readMemSpace, dataColSpace, H5P_DEFAULT, fValBuf); int i; for (i=0; i<size; ++i) - sparseFlipAdd(flip, colIx, intiBuf[i], fValBuf[i]); + sparseRowMatrixAdd(matrix, colIx, intiBuf[i], fValBuf[i]); } -verbose(1, "outputting %d row matrix, a dot every 100 rows\n", context->geneCount); -dotForUserInit(100); -sparseFlipTsvBody(flip, rowLabels, f); -verbose(1, "\n"); + +/* Open output and Write out header */ +sparseRowMatrixSaveAsTsv(matrix, indexCol->val.asString, rowLabels, TRUE, fileName); + + /* Clean up and go home*/ -sparseFlipFree(&flip); +sparseRowMatrixFree(&matrix); freez(&intiBuf); freez(&fValBuf); -carefulClose(&f); } void saveArrayOfStrings(char **array, int size, char *fileName) /* Write out array of strings, pretty easy */ { FILE *f = mustOpen(fileName, "w"); int i; for (i=0; i<size; ++i) fprintf(f, "%s\n", array[i]); carefulClose(&f); } void hcaUnpack5(char *input, char *output, boolean withMatrix) /* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser like * better.. */ { struct hcaUnpack5 *context; AllocVar(context); context->vocabHash = hashNew(0); context->metaHash = hashNew(0); hid_t file = context->file = h5fOpen(input, H5F_ACC_RDONLY); hid_t cats = h5gOpen(file, "obs/__categories"); h5lIterate(cats, vocabCallback, context); slReverse(&context->vocabList); verbose(2, "And now to attach vocabs?\n"); struct vocab *vocab; for (vocab = context->vocabList; vocab != NULL; vocab = vocab->next) { hid_t v = h5dOpen(cats, vocab->name); readVocab(vocab, v, context); verbose(2, "%s has %d items\n", vocab->name, vocab->valCount); int i; for (i=0; i<vocab->valCount; ++i) verbose(2, " %s\n", vocab->stringVals[i]); } verbose(2, "And now for looping through the real thingies\n"); hid_t obs = h5gOpen(file, "obs"); h5lIterate(obs, obsCallback, context); struct metaCol *col; int firstColSize = 0; struct metaCol *indexCol = NULL; for (col = context->metaList; col != NULL; col = col->next) { char *name = col->name; if (sameString(name, "_index")) indexCol = context->indexCol = col; if (differentString(name, "__categories")) { hid_t hid = h5dOpen(obs, col->name); int size = readCol(col, hid, context); if (firstColSize == 0) firstColSize = size; else { if (size != firstColSize) errAbort("size mismatch %d vs %d", size, firstColSize); } } } /* Rip indexCol out of list */ if (indexCol == NULL) internalErr(); slRemoveEl(&context->metaList, indexCol); /* Check all columns are same size */ int colSize = indexCol->size; for (col = context->metaList; col != NULL; col = col->next) { if (col->size != colSize) errAbort("ERROR: %s has %d cells but %s has %d", col->name, col->size, indexCol->name, indexCol->size); } context->cellCount = colSize; /* Get list of gene names*/ char **geneNames; int geneCount = h5dReadStringArray(file, "var/_index", &geneNames); context->geneCount = geneCount; verbose(1, "Data on %d cells x %d genes\n", context->cellCount, context->geneCount); /* Make output directory */ makeDirsOnPath(output); char path[PATH_LEN]; safef(path, sizeof(path), "%s/meta.tsv", output); saveMeta(context, path); safef(path, sizeof(path), "%s/gene.lst", output); saveArrayOfStrings(geneNames, geneCount, path); if (withMatrix) { safef(path, sizeof(path), "%s/exprMatrix.tsv", output); saveExprMatrix(context, geneNames, path); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); hcaUnpack5(argv[1], argv[2], !optionExists("noMatrix")); return 0; }