be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/hca/hcaUnpack5/hcaUnpack5.c src/hca/hcaUnpack5/hcaUnpack5.c new file mode 100644 index 0000000..ddd474f --- /dev/null +++ src/hca/hcaUnpack5/hcaUnpack5.c @@ -0,0 +1,621 @@ +/* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser + * like better.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "localmem.h" +#include "portable.h" +#include "obscure.h" +#include "sparseMatrix.h" +#include + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "hcaUnpack5 - Convert h5ad (scanpy) files to a directory filled with 3 things\n" + "usage:\n" + " hcaUnpack5 input.h5ad outDir\n" + "The output dir will be populated with exprMatrix.tsv, meta.tsv, and gene.lst\n" + "where:\n" + " exprMatrix.tsv has the cell x gene matrix with cells as columns. This includes\n" + " the cell names in the first row and the gene names in the first column.\n." + " meta.tsv has the cell-by-cell metadata. The first row is labels, and the first\n" + " column corresponds with the cell names in exprMatrix\n" + " gene.lst has the list of genes, one per line\n" + "options:\n" + " -noMatrix - skip the matrix making step\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"noMatrix", OPTION_BOOLEAN}, + {NULL, 0}, +}; + + +hid_t h5fOpen(char *fileName, int accessType) +/* Open up file or die trying. accessType should be + * H5F_ACC_RDONLY, H5F_ACC_RDWR, etc. */ +{ +hid_t ret = H5Fopen(fileName, accessType, H5P_DEFAULT); +if (ret < 0) + errAbort("Couldn't open %s", fileName); +return ret; +} + +hid_t h5dOpen(hid_t parent, char *name) +/* Open up a subpart or die trying */ +{ +hid_t ret = H5Dopen(parent, name, H5P_DEFAULT); +H5_DLL ssize_t H5Iget_name(hid_t id, char *name/*out*/, size_t size); +if (ret < 0) + { + char parentName[256]; + H5Iget_name(parent, parentName, sizeof(parentName)); + errAbort("ERROR: Couldn't find dataset %s in %s", name, parentName); + } +return ret; +} + +hid_t h5gOpen(hid_t parent, char *name) +/* Open up a subpart or die trying */ +{ +hid_t ret = H5Gopen(parent, name, H5P_DEFAULT); +if (ret < 0) + { + char parentName[256]; + H5Iget_name(parent, parentName, sizeof(parentName)); + errAbort("ERROR: Couldn't find group %s in %s", name, parentName); + } +return ret; +} + + +void h5lIterate(hid_t hid, H5L_iterate_t callback, void *context) +/* Iterate over a list. Requires a callback */ +{ +herr_t status = H5Literate(hid, H5_INDEX_NAME, H5_ITER_NATIVE, NULL, callback, context); +if (status < 0) + internalErr(); // does this ever happen? +} + +void h5dReadAll(hid_t hid, hid_t type, void *buf) +/* Do the read, and pray the buf is pointing to someplace that can hold it... */ +{ +herr_t status = H5Dread(hid, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf); +if (status < 0) + { + char name[256]; + H5Iget_name(hid, name, sizeof(name)); + errAbort("ERROR: Couldn't read from %s", name); + } +} + +void h5dRead(hid_t file, hid_t memType, hid_t memSpace, hid_t fileSpace, hid_t pList, void *buf) +/* Do the read, and pray the buf is pointing to someplace that can hold it... */ +{ +herr_t status = H5Dread(file, memType, memSpace, fileSpace, pList, buf); +if (status < 0) + { + char name[256]; + H5Iget_name(file, name, sizeof(name)); + errAbort("ERROR: Couldn't read from %s", name); + } +} + +struct vocab +/* A controlled vocab we can refer to by short numbers instead of strings */ + { + struct vocab *next; + char *name; /* Column this vocab is associated with */ + int valCount; /* Number of distinct values */ + char **stringVals; /* array of strings */ + }; + +struct metaCol +/* One of these for each metadata column /field */ + { + struct metaCol *next; + char *name; /* field name */ + int size; /* size of column, ie number of rows */ + H5T_class_t classt; /* class */ + struct vocab *vocab; /* Some columns have associated vocabs */ + union { int *asVocab; int *asInt; double *asDouble; char **asString;} val; + }; + +struct metaCol *metaColNew(const char *name) +/* Build up a new metaCol of given name */ +{ +struct metaCol *col; +AllocVar(col); +col->name = cloneString(name); +return col; +} + +struct hcaUnpack5 +/* There is just one of these for our program - it's the context + * we give most of our h5 callbacks */ + { + struct hcaUnpack5 *next; + hid_t file; /* Open hdf5 file */ + struct vocab *vocabList; /* List of all restricted vocabs */ + struct hash *vocabHash; /* Hash of vocabs keyed by vocab->name */ + struct metaCol *metaList; /* List of all metaCols */ + struct hash *metaHash; /* Hash of all metaCols keyed by metaCol->name */ + struct metaCol *indexCol; /* Column that has cell IDs */ + int cellCount; /* Number of cells - rows in meta.tsv cols-1 in exprHash */ + int geneCount; /* Number of genes */ + }; + +struct vocab *vocabNew(const char *name) +/* Make a new, largely empty, vocab structure */ +{ +struct vocab *vocab; +AllocVar(vocab); +vocab->name = cloneString(name); +return vocab; +} + +hid_t cVarString() +/* Return hid_t that'll be usual C variable length 0 terminated strings */ +{ +hid_t memtype = H5Tcopy (H5T_C_S1); +H5Tset_size (memtype, H5T_VARIABLE); +H5Tset_cset(memtype, H5T_CSET_UTF8) ; +return memtype; +} + +void readVocab(struct vocab *vocab, hid_t hid, struct hcaUnpack5 *context) +/* Read vocab from given hid */ +{ +hid_t space = H5Dget_space (hid); +hsize_t dims[1]; +H5Sget_simple_extent_dims (space, dims, NULL); +vocab->valCount = dims[0]; +AllocArray(vocab->stringVals, vocab->valCount); + +/* Set us up for zero-terminated UTF8 (or ascii) strings */ +hid_t memtype = cVarString(); + +/* Read it */ +h5dReadAll(hid, memtype, vocab->stringVals); + +/* Clean up */ +H5Sclose(space); +H5Tclose(memtype); +} + +int readCol(struct metaCol *col, hid_t hid, struct hcaUnpack5 *context) +/* Read metaCol from given hid */ +{ +hid_t space = H5Dget_space (hid); +hsize_t dims[1]; +H5Sget_simple_extent_dims (space, dims, NULL); +int size = col->size = dims[0]; +hid_t type = H5Dget_type(hid); +H5T_class_t classt = col->classt = H5Tget_class(type); + +struct vocab *vocab = hashFindVal(context->vocabHash, col->name); +if (vocab != NULL) + { + int *buf; + AllocArray(buf, size); + col->val.asVocab = buf; + h5dReadAll(hid, H5T_NATIVE_INT, buf); + } +else + { + switch (classt) + { + case H5T_INTEGER: + { + int *buf; + AllocArray(buf, size); + col->val.asInt = buf; + h5dReadAll(hid, H5T_NATIVE_INT, buf); + break; + } + case H5T_FLOAT: + { + double *buf; + AllocArray(buf, size); + col->val.asDouble = buf; + h5dReadAll(hid, H5T_NATIVE_DOUBLE, buf); + break; + } + case H5T_STRING: + { + char **buf; + AllocArray(buf, size); + col->val.asString = buf; + h5dReadAll(hid, cVarString(), buf); + break; + } + default: + warn("Skipping type %d", type); + break; + } + } + +/* Clean up and go home */ +H5Sclose(space); +H5Tclose(type); +return size; +} + + +herr_t vocabCallback(hid_t loc_id, const char *name, const H5L_info_t *info, + void *operator_data) +/* Callback from H5Iterate that we use to build up a column that has + * an assocated vocabulary */ +{ +struct hcaUnpack5 *context = operator_data; +herr_t status; +H5O_info_t infobuf; + +/* + * Get type of the object and display its name and type. + * The name of the object is passed to this function by + * the Library. + */ +status = H5Oget_info_by_name(loc_id, name, &infobuf, H5P_DEFAULT); +if (status < 0) + errAbort("missing status for %s", name); + +if (infobuf.type == H5O_TYPE_DATASET) + { + struct vocab *vocab = vocabNew(name); + hashAdd(context->vocabHash, vocab->name, vocab); + slAddHead(&context->vocabList, vocab); + } + +return 0; +} + +herr_t obsCallback(hid_t loc_id, const char *name, const H5L_info_t *info, + void *operator_data) +/* Callback from H5Iterate that we use to build up a column that has + * an assocated vocabulary */ +{ +struct hcaUnpack5 *context = operator_data; +herr_t status; +H5O_info_t infobuf; + +/* + * Get type of the object and display its name and type. + * The name of the object is passed to this function by + * the Library. + */ +status = H5Oget_info_by_name(loc_id, name, &infobuf, H5P_DEFAULT); +if (status < 0) + errAbort("missing status for %s", name); + + +if (infobuf.type == H5O_TYPE_DATASET) + { + struct metaCol *col = metaColNew(name); + hashAdd(context->metaHash, col->name, col); + slAddHead(&context->metaList, col); + struct vocab *vocab = hashFindVal(context->vocabHash, col->name); + if (vocab != NULL) + col->vocab = vocab; + } + +return 0; +} + +void saveMeta(struct hcaUnpack5 *context, char *fileName) +/* Save the metaList to fileName */ +{ +FILE *f = mustOpen(fileName, "w"); +fprintf(f, "#"); +fprintf(f, "_index"); +struct metaCol *col; +for (col = context->metaList; col != NULL; col = col->next) + { + fprintf(f, "\t%s", col->name); + } +fprintf(f, "\n"); + +struct metaCol *indexCol = context->indexCol; +int colSize = context->cellCount; +int i; +for (i=0; ival.asString[i]); + for (col = context->metaList; col != NULL; col = col->next) + { + struct vocab *vocab = col->vocab; + if (vocab != NULL) + fprintf(f, "\t%s", vocab->stringVals[col->val.asVocab[i]]); + else + { + switch (col->classt) + { + case H5T_INTEGER: + fprintf(f, "\t%d", col->val.asInt[i]); + break; + case H5T_FLOAT: + fprintf(f, "\t%g", col->val.asDouble[i]); + break; + case H5T_STRING: + fprintf(f, "\t%s", col->val.asString[i]); + break; + default: + internalErr(); + break; + } + } + } + fprintf(f, "\n"); + } +carefulClose(&f); +} + +int h5dReadFloatArray(hid_t file, char *path, float **retData) +/* Load up float array from file into memory */ +{ +hid_t hid = h5dOpen(file, path); +hsize_t space = H5Dget_space(hid); +hsize_t dims[8]; +int dimCount = H5Sget_simple_extent_dims(space, dims, NULL); +if (dimCount != 1) + errAbort("Expecting %s to be one dimensional, but it has %d dimensions", path, dimCount); +int size = dims[0]; +float *array; +AllocArray(array, size); +h5dReadAll(hid, H5T_NATIVE_FLOAT, array); + +/* Clean up and go home */ +H5Dclose(hid); +H5Sclose(space); +*retData = array; +return size; +} + +int h5dReadIntArray(hid_t file, char *path, int **retData) +/* Load up integer array from file into memory */ +{ +hid_t hid = h5dOpen(file, path); +hsize_t space = H5Dget_space(hid); +hsize_t dims[8]; +int dimCount = H5Sget_simple_extent_dims(space, dims, NULL); +if (dimCount != 1) + errAbort("Expecting %s to be one dimensional, but it has %d dimensions", path, dimCount); +int size = dims[0]; +int *array; +AllocArray(array, size); +h5dReadAll(hid, H5T_NATIVE_INT, array); + +/* Clean up and go home */ +H5Dclose(hid); +H5Sclose(space); +*retData = array; +return size; +} + +int h5dReadStringArray(hid_t file, char *path, char ***retData) +/* Load up string array from file into memory */ +{ +hid_t hid = h5dOpen(file, path); +hsize_t space = H5Dget_space(hid); +hsize_t dims[8]; +int dimCount = H5Sget_simple_extent_dims(space, dims, NULL); +if (dimCount != 1) + errAbort("Expecting %s to be one dimensional, but it has %d dimensions", path, dimCount); +int size = dims[0]; +char **array; +AllocArray(array, size); +hid_t type = cVarString(); +h5dReadAll(hid, type, array); + +/* Clean up and go home */ +H5Tclose(type); +H5Dclose(hid); +H5Sclose(space); +*retData = array; +return size; +} + + +void saveExprMatrix(struct hcaUnpack5 *context, char **rowLabels, char *fileName) +/* Save out expression matrix. Just process it one line at a time so as + * not to run out of memory */ +{ +/* Get column with sample names, aka indexCol */ +struct metaCol *indexCol = context->indexCol; + +hid_t file = context->file; + +/* Figure out size of primary data - that is number of cells with data */ +hid_t data = h5dOpen(file, "X/data"); +hid_t dataSpace = H5Dget_space(data); +hsize_t dataDims[4]; +int dataDimCount = H5Sget_simple_extent_dims(dataSpace, dataDims, NULL); +if (dataDimCount != 1) + internalErr(); +long long dataSize = dataDims[0]; +verbose(1, "Sparse matrix with %lld elements\n", dataSize); + +/* Load up array that contains offsets of each column within column-major data */ +int *indptr = NULL; +int indptrSize = h5dReadIntArray(file, "X/indptr", &indptr); +verbose(2, "indptr size is %d\n", indptrSize); +if (indptrSize - 1 != indexCol->size) + errAbort("Disagreement between X/indptr size (%d) and obs/_index size (%d)", + indptrSize - 1, indexCol->size); + +hid_t indi = h5dOpen(file, "X/indices"); +hsize_t indiSpace = H5Dget_space(indi); +hsize_t indiDims[4]; +int indiDimCount = H5Sget_simple_extent_dims(indiSpace, indiDims, NULL); +if (indiDimCount != 1) + internalErr(); +int indiSize = indiDims[0]; +verbose(2, "indi size is %d\n", indiSize); + +hsize_t indiColSpace = H5Dget_space(indi); +hsize_t colOffset[1], colSize[1]; +hsize_t dataColSpace = H5Dget_space(data); + +int sizeAlloc = 0; +int *intiBuf = NULL; +float *fValBuf = NULL; + +/* Scan through matrix a column at a time, reading it into matrix */ +struct sparseRowMatrix *matrix = sparseRowMatrixNew(context->cellCount, context->geneCount); +int colIx; +int colCount = context->cellCount; +for (colIx = 0; colIx sizeAlloc) + { + freeMem(intiBuf); + freeMem(fValBuf); + intiBuf = needMem(size * sizeof(intiBuf[0])); + fValBuf = needMem(size * sizeof(fValBuf[0])); + sizeAlloc = size; + } + hsize_t dim1[1] = {size}; + hsize_t readMemSpace = H5Screate_simple(1, dim1, NULL); + colOffset[0] = start; + colSize[0] = size; + H5Sselect_hyperslab(indiColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL); + H5Sselect_hyperslab(dataColSpace, H5S_SELECT_SET, colOffset, NULL, colSize, NULL); + h5dRead(indi, H5T_NATIVE_INT, readMemSpace, indiColSpace, H5P_DEFAULT, intiBuf); + h5dRead(data, H5T_NATIVE_FLOAT, readMemSpace, dataColSpace, H5P_DEFAULT, fValBuf); + + int i; + for (i=0; ival.asString, rowLabels, fileName); + +/* Clean up and go home*/ +sparseRowMatrixFree(&matrix); +freez(&intiBuf); +freez(&fValBuf); +} + +void saveArrayOfStrings(char **array, int size, char *fileName) +/* Write out array of strings, pretty easy */ +{ +FILE *f = mustOpen(fileName, "w"); +int i; +for (i=0; ivocabHash = hashNew(0); +context->metaHash = hashNew(0); + +hid_t file = context->file = h5fOpen(input, H5F_ACC_RDONLY); +hid_t cats = h5gOpen(file, "obs/__categories"); +h5lIterate(cats, vocabCallback, context); +slReverse(&context->vocabList); + +verbose(2, "And now to attach vocabs?\n"); +struct vocab *vocab; +for (vocab = context->vocabList; vocab != NULL; vocab = vocab->next) + { + hid_t v = h5dOpen(cats, vocab->name); + readVocab(vocab, v, context); + verbose(2, "%s has %d items\n", vocab->name, vocab->valCount); + int i; + for (i=0; ivalCount; ++i) + verbose(2, " %s\n", vocab->stringVals[i]); + } + +verbose(2, "And now for looping through the real thingies\n"); +hid_t obs = h5gOpen(file, "obs"); +h5lIterate(obs, obsCallback, context); + + +struct metaCol *col; +int firstColSize = 0; +struct metaCol *indexCol = NULL; +for (col = context->metaList; col != NULL; col = col->next) + { + char *name = col->name; + if (sameString(name, "_index")) + indexCol = context->indexCol = col; + if (differentString(name, "__categories")) + { + hid_t hid = h5dOpen(obs, col->name); + int size = readCol(col, hid, context); + if (firstColSize == 0) + firstColSize = size; + else + { + if (size != firstColSize) + errAbort("size mismatch %d vs %d", size, firstColSize); + } + } + } + +/* Rip indexCol out of list */ +if (indexCol == NULL) + internalErr(); + +slRemoveEl(&context->metaList, indexCol); + +/* Check all columns are same size */ +int colSize = indexCol->size; +for (col = context->metaList; col != NULL; col = col->next) + { + if (col->size != colSize) + errAbort("ERROR: %s has %d cells but %s has %d", + col->name, col->size, indexCol->name, indexCol->size); + } +context->cellCount = colSize; + +/* Get list of gene names*/ +char **geneNames; +int geneCount = h5dReadStringArray(file, "var/_index", &geneNames); +context->geneCount = geneCount; + +verbose(1, "Data on %d cells x %d genes\n", context->cellCount, context->geneCount); + +/* Make output directory */ +makeDirsOnPath(output); +char path[PATH_LEN]; +safef(path, sizeof(path), "%s/meta.tsv", output); +saveMeta(context, path); + +safef(path, sizeof(path), "%s/gene.lst", output); +saveArrayOfStrings(geneNames, geneCount, path); + + +if (withMatrix) + { + safef(path, sizeof(path), "%s/exprMatrix.tsv", output); + saveExprMatrix(context, geneNames, path); + } + +} + + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +hcaUnpack5(argv[1], argv[2], !optionExists("noMatrix")); +return 0; +}