ab7847c47ac5507a7e126691bb78826ed4ba345d chmalee Mon Sep 13 16:34:47 2021 -0700 Staging GTEx single nuclei gene expression track, refs #29954 diff --git src/hca/hcaUnpack5/hcaUnpack5.c src/hca/hcaUnpack5/hcaUnpack5.c index ddd474f..3dc730a 100644 --- src/hca/hcaUnpack5/hcaUnpack5.c +++ src/hca/hcaUnpack5/hcaUnpack5.c @@ -1,26 +1,27 @@ /* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser * like better.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" #include "portable.h" #include "obscure.h" #include "sparseMatrix.h" #include <hdf5.h> +#include <hdf5_hl.h> void usage() /* Explain usage and exit. */ { errAbort( "hcaUnpack5 - Convert h5ad (scanpy) files to a directory filled with 3 things\n" "usage:\n" " hcaUnpack5 input.h5ad outDir\n" "The output dir will be populated with exprMatrix.tsv, meta.tsv, and gene.lst\n" "where:\n" " exprMatrix.tsv has the cell x gene matrix with cells as columns. This includes\n" " the cell names in the first row and the gene names in the first column.\n." " meta.tsv has the cell-by-cell metadata. The first row is labels, and the first\n" " column corresponds with the cell names in exprMatrix\n" " gene.lst has the list of genes, one per line\n" @@ -100,31 +101,32 @@ herr_t status = H5Dread(file, memType, memSpace, fileSpace, pList, buf); if (status < 0) { char name[256]; H5Iget_name(file, name, sizeof(name)); errAbort("ERROR: Couldn't read from %s", name); } } struct vocab /* A controlled vocab we can refer to by short numbers instead of strings */ { struct vocab *next; char *name; /* Column this vocab is associated with */ int valCount; /* Number of distinct values */ - char **stringVals; /* array of strings */ + H5T_class_t classt; /* class of values */ + union { int *asInt; double *asDouble; char **asString;} val; // array of values }; struct metaCol /* One of these for each metadata column /field */ { struct metaCol *next; char *name; /* field name */ int size; /* size of column, ie number of rows */ H5T_class_t classt; /* class */ struct vocab *vocab; /* Some columns have associated vocabs */ union { int *asVocab; int *asInt; double *asDouble; char **asString;} val; }; struct metaCol *metaColNew(const char *name) /* Build up a new metaCol of given name */ @@ -163,94 +165,112 @@ /* Return hid_t that'll be usual C variable length 0 terminated strings */ { hid_t memtype = H5Tcopy (H5T_C_S1); H5Tset_size (memtype, H5T_VARIABLE); H5Tset_cset(memtype, H5T_CSET_UTF8) ; return memtype; } void readVocab(struct vocab *vocab, hid_t hid, struct hcaUnpack5 *context) /* Read vocab from given hid */ { hid_t space = H5Dget_space (hid); hsize_t dims[1]; H5Sget_simple_extent_dims (space, dims, NULL); vocab->valCount = dims[0]; -AllocArray(vocab->stringVals, vocab->valCount); - -/* Set us up for zero-terminated UTF8 (or ascii) strings */ -hid_t memtype = cVarString(); - -/* Read it */ -h5dReadAll(hid, memtype, vocab->stringVals); - +hid_t type = H5Dget_type(hid); +H5T_class_t classt = vocab->classt = H5Tget_class(type); +switch (classt) + { + case H5T_INTEGER: + { + int *buf; + AllocArray(buf, vocab->valCount); + h5dReadAll(hid, H5T_NATIVE_INT, buf); + vocab->val.asInt = buf; + break; + } + case H5T_FLOAT: + { + double *buf; + AllocArray(buf, vocab->valCount); + h5dReadAll(hid, H5T_NATIVE_DOUBLE, buf); + vocab->val.asDouble = buf; + break; + } + case H5T_STRING: + { + char **buf; + AllocArray(buf, vocab->valCount); + h5dReadAll(hid, cVarString(), buf); + vocab->val.asString = buf; + break; + } + default: + { + size_t msgSize = H5LTdtype_to_text(type, NULL, H5LT_DDL, &msgSize); + char msg[msgSize]; + H5LTdtype_to_text(type, msg, H5LT_DDL, &msgSize); + errAbort("ERROR: unsupported vocab type: %s\n", msg); + break; + } + } /* Clean up */ H5Sclose(space); -H5Tclose(memtype); +H5Tclose(type); } int readCol(struct metaCol *col, hid_t hid, struct hcaUnpack5 *context) /* Read metaCol from given hid */ { hid_t space = H5Dget_space (hid); hsize_t dims[1]; H5Sget_simple_extent_dims (space, dims, NULL); int size = col->size = dims[0]; hid_t type = H5Dget_type(hid); H5T_class_t classt = col->classt = H5Tget_class(type); -struct vocab *vocab = hashFindVal(context->vocabHash, col->name); -if (vocab != NULL) - { - int *buf; - AllocArray(buf, size); - col->val.asVocab = buf; - h5dReadAll(hid, H5T_NATIVE_INT, buf); - } -else - { switch (classt) { case H5T_INTEGER: { int *buf; AllocArray(buf, size); col->val.asInt = buf; h5dReadAll(hid, H5T_NATIVE_INT, buf); break; } case H5T_FLOAT: { double *buf; AllocArray(buf, size); col->val.asDouble = buf; h5dReadAll(hid, H5T_NATIVE_DOUBLE, buf); break; } case H5T_STRING: { char **buf; AllocArray(buf, size); col->val.asString = buf; h5dReadAll(hid, cVarString(), buf); break; } default: warn("Skipping type %d", type); break; } - } /* Clean up and go home */ H5Sclose(space); H5Tclose(type); return size; } herr_t vocabCallback(hid_t loc_id, const char *name, const H5L_info_t *info, void *operator_data) /* Callback from H5Iterate that we use to build up a column that has * an assocated vocabulary */ { struct hcaUnpack5 *context = operator_data; herr_t status; @@ -316,33 +336,64 @@ struct metaCol *col; for (col = context->metaList; col != NULL; col = col->next) { fprintf(f, "\t%s", col->name); } fprintf(f, "\n"); struct metaCol *indexCol = context->indexCol; int colSize = context->cellCount; int i; for (i=0; i<colSize; ++i) { fprintf(f, "%s", indexCol->val.asString[i]); for (col = context->metaList; col != NULL; col = col->next) { - struct vocab *vocab = col->vocab; - if (vocab != NULL) - fprintf(f, "\t%s", vocab->stringVals[col->val.asVocab[i]]); + if (col->vocab) + { + int vocabIx = col->val.asInt[i]; + if (vocabIx < 0) + { + // why does this happen? Is this data invalid? Is it unknown? + // it's not a read error, see the following comparisons: + // cd /hive/data/outside/gtex/V9/snRnaSeq/ + // h5dump -y -d/obs/__categories/granular GTEx_8_tissues_snRNAseq_immune_atlas_071421.public_obs.h5ad | less -S + // clearly an array of tissue-like names? then how come the data contains + // indices like -1? + // h5dump -d/obs/granular GTEx_8_tissues_snRNAseq_immune_atlas_071421.public_obs.h5ad | less -S + // for now just print "unkown val" as a placeholder + fprintf(f, "\tunknown value"); + } + else + { + switch (col->vocab->classt) + { + case H5T_INTEGER: + fprintf(f, "\t%d", col->vocab->val.asInt[vocabIx]); + break; + case H5T_FLOAT: + fprintf(f, "\t%g", col->vocab->val.asDouble[vocabIx]); + break; + case H5T_STRING: + fprintf(f, "\t%s", col->vocab->val.asString[vocabIx]); + break; + default: + internalErr(); + break; + } + } + } else { switch (col->classt) { case H5T_INTEGER: fprintf(f, "\t%d", col->val.asInt[i]); break; case H5T_FLOAT: fprintf(f, "\t%g", col->val.asDouble[i]); break; case H5T_STRING: fprintf(f, "\t%s", col->val.asString[i]); break; default: internalErr(); @@ -526,31 +577,44 @@ hid_t file = context->file = h5fOpen(input, H5F_ACC_RDONLY); hid_t cats = h5gOpen(file, "obs/__categories"); h5lIterate(cats, vocabCallback, context); slReverse(&context->vocabList); verbose(2, "And now to attach vocabs?\n"); struct vocab *vocab; for (vocab = context->vocabList; vocab != NULL; vocab = vocab->next) { hid_t v = h5dOpen(cats, vocab->name); readVocab(vocab, v, context); verbose(2, "%s has %d items\n", vocab->name, vocab->valCount); int i; for (i=0; i<vocab->valCount; ++i) - verbose(2, " %s\n", vocab->stringVals[i]); + { + switch (vocab->classt) + { + case H5T_INTEGER: + verbose(2, " %d\n", vocab->val.asInt[i]); + break; + case H5T_FLOAT: + verbose(2, " %g\n", vocab->val.asDouble[i]); + break; + default: + verbose(2, " %s\n", vocab->val.asString[i]); + break; + } + } } verbose(2, "And now for looping through the real thingies\n"); hid_t obs = h5gOpen(file, "obs"); h5lIterate(obs, obsCallback, context); struct metaCol *col; int firstColSize = 0; struct metaCol *indexCol = NULL; for (col = context->metaList; col != NULL; col = col->next) { char *name = col->name; if (sameString(name, "_index")) indexCol = context->indexCol = col;