ab7847c47ac5507a7e126691bb78826ed4ba345d
chmalee
  Mon Sep 13 16:34:47 2021 -0700
Staging GTEx single nuclei gene expression track, refs #29954

diff --git src/hca/hcaUnpack5/hcaUnpack5.c src/hca/hcaUnpack5/hcaUnpack5.c
index ddd474f..3dc730a 100644
--- src/hca/hcaUnpack5/hcaUnpack5.c
+++ src/hca/hcaUnpack5/hcaUnpack5.c
@@ -1,26 +1,27 @@
 /* hcaUnpack5 - Convert cellxgene hdf5 files to something cell browser and genome browser 
  * like better.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "localmem.h"
 #include "portable.h"
 #include "obscure.h"
 #include "sparseMatrix.h"
 #include <hdf5.h>
+#include <hdf5_hl.h>
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hcaUnpack5 - Convert h5ad (scanpy) files to a directory filled with 3 things\n"
   "usage:\n"
   "   hcaUnpack5 input.h5ad outDir\n"
   "The output dir will be populated with exprMatrix.tsv, meta.tsv, and gene.lst\n"
   "where:\n"
   "    exprMatrix.tsv has the cell x gene matrix with cells as columns.  This includes\n"
   "             the cell names in the first row and the gene names in the first column.\n."
   "    meta.tsv has the cell-by-cell metadata.  The first row is labels, and the first\n"
   "             column corresponds with the cell names in exprMatrix\n"
   "    gene.lst has the list of genes, one per line\n"
@@ -100,31 +101,32 @@
 herr_t status = H5Dread(file, memType, memSpace, fileSpace, pList, buf);
 if (status < 0)
    {
    char name[256];
    H5Iget_name(file, name, sizeof(name));
    errAbort("ERROR: Couldn't read from %s", name);
    }
 }
 
 struct vocab
 /* A controlled vocab we can refer to by short numbers instead of strings */
     {
     struct vocab *next;
     char *name;	    /* Column this vocab is associated with */
     int valCount;   /* Number of distinct values */
-    char **stringVals;    /* array of strings */
+    H5T_class_t classt;	/* class of values */
+    union { int *asInt; double *asDouble; char **asString;} val; // array of values
     };
 
 struct metaCol 
 /* One of these for each metadata column /field */
     {
     struct metaCol *next;
     char *name;	    /* field name */
     int size;	    /* size of column, ie number of rows */
     H5T_class_t classt;	/* class */
     struct vocab *vocab;    /* Some columns have associated vocabs */
     union { int *asVocab; int *asInt; double *asDouble; char **asString;} val;
     };
 
 struct metaCol *metaColNew(const char *name)
 /* Build up a new metaCol of given name */
@@ -163,94 +165,112 @@
 /* Return hid_t that'll be usual C variable length 0 terminated strings */
 {
 hid_t memtype = H5Tcopy (H5T_C_S1);
 H5Tset_size (memtype, H5T_VARIABLE);
 H5Tset_cset(memtype, H5T_CSET_UTF8) ;
 return memtype;
 }
 
 void readVocab(struct vocab *vocab, hid_t hid, struct hcaUnpack5 *context)
 /* Read vocab from given hid */
 {
 hid_t space = H5Dget_space (hid);
 hsize_t     dims[1];
 H5Sget_simple_extent_dims (space, dims, NULL);
 vocab->valCount = dims[0];
-AllocArray(vocab->stringVals, vocab->valCount);
-
-/* Set us up for zero-terminated UTF8 (or ascii) strings */
-hid_t memtype = cVarString();
-
-/* Read it */
-h5dReadAll(hid, memtype, vocab->stringVals);
-
+hid_t type = H5Dget_type(hid);
+H5T_class_t classt = vocab->classt = H5Tget_class(type);
+switch (classt)
+    {
+    case H5T_INTEGER:
+        {
+        int *buf;
+        AllocArray(buf, vocab->valCount);
+        h5dReadAll(hid, H5T_NATIVE_INT, buf);
+        vocab->val.asInt = buf;
+        break;
+        }
+    case H5T_FLOAT:
+        {
+        double *buf;
+        AllocArray(buf, vocab->valCount);
+        h5dReadAll(hid, H5T_NATIVE_DOUBLE, buf);
+        vocab->val.asDouble = buf;
+        break;
+        }
+    case H5T_STRING:
+        {
+        char **buf;
+        AllocArray(buf, vocab->valCount);
+        h5dReadAll(hid, cVarString(), buf);
+        vocab->val.asString = buf;
+        break;
+        }
+    default:
+        {
+        size_t msgSize = H5LTdtype_to_text(type, NULL, H5LT_DDL, &msgSize);
+        char msg[msgSize];
+        H5LTdtype_to_text(type, msg, H5LT_DDL, &msgSize);
+        errAbort("ERROR: unsupported vocab type: %s\n", msg);
+        break;
+        }
+    }
 /* Clean up */
 H5Sclose(space);
-H5Tclose(memtype);
+H5Tclose(type);
 }
 
 int readCol(struct metaCol *col, hid_t hid, struct hcaUnpack5 *context)
 /* Read metaCol from given hid */
 {
 hid_t space = H5Dget_space (hid);
 hsize_t     dims[1];
 H5Sget_simple_extent_dims (space, dims, NULL);
 int size = col->size = dims[0];
 hid_t type = H5Dget_type(hid);
 H5T_class_t classt = col->classt = H5Tget_class(type);
 
-struct vocab *vocab = hashFindVal(context->vocabHash, col->name);
-if (vocab != NULL)
-    {
-    int *buf;
-    AllocArray(buf, size);
-    col->val.asVocab = buf;
-    h5dReadAll(hid, H5T_NATIVE_INT, buf);
-    }
-else
-    {
 switch (classt)
     {
     case H5T_INTEGER:
         {
         int *buf;
         AllocArray(buf, size);
         col->val.asInt = buf;
         h5dReadAll(hid, H5T_NATIVE_INT, buf);
         break;
         }
     case H5T_FLOAT:
         {
         double *buf;
         AllocArray(buf, size);
         col->val.asDouble = buf;
         h5dReadAll(hid, H5T_NATIVE_DOUBLE, buf);
         break;
         }
     case H5T_STRING:
         {
         char **buf;
         AllocArray(buf, size);
         col->val.asString = buf;
         h5dReadAll(hid, cVarString(), buf);
         break;
         }
     default:
         warn("Skipping type %d", type);
         break;
     }
-    }
 
 /* Clean up and go home */
 H5Sclose(space);
 H5Tclose(type);
 return size;
 }
 
 
 herr_t vocabCallback(hid_t loc_id, const char *name, const H5L_info_t *info,
             void *operator_data)
 /* Callback from H5Iterate that we use to build up a column that has
  * an assocated vocabulary */
 {
 struct hcaUnpack5 *context = operator_data;
 herr_t          status;
@@ -316,33 +336,64 @@
 struct metaCol *col;
 for (col = context->metaList; col != NULL; col = col->next)
     {
     fprintf(f, "\t%s", col->name);
     }
 fprintf(f, "\n");
 
 struct metaCol *indexCol = context->indexCol;
 int colSize = context->cellCount;
 int i;
 for (i=0; i<colSize; ++i)
     {
     fprintf(f, "%s", indexCol->val.asString[i]);
     for (col = context->metaList; col != NULL; col = col->next)
         {
-	struct vocab *vocab = col->vocab;
-	if (vocab != NULL)
-	    fprintf(f, "\t%s", vocab->stringVals[col->val.asVocab[i]]);
+        if (col->vocab)
+            {
+            int vocabIx = col->val.asInt[i];
+            if (vocabIx < 0)
+                {
+                // why does this happen? Is this data invalid? Is it unknown?
+                // it's not a read error, see the following comparisons:
+                // cd /hive/data/outside/gtex/V9/snRnaSeq/
+                // h5dump -y -d/obs/__categories/granular GTEx_8_tissues_snRNAseq_immune_atlas_071421.public_obs.h5ad | less -S
+                // clearly an array of tissue-like names? then how come the data contains
+                // indices like -1?
+                // h5dump -d/obs/granular GTEx_8_tissues_snRNAseq_immune_atlas_071421.public_obs.h5ad | less -S
+                // for now just print "unkown val" as a placeholder
+                fprintf(f, "\tunknown value");
+                }
+            else
+                {
+                switch (col->vocab->classt)
+                    {
+                    case H5T_INTEGER:
+                        fprintf(f, "\t%d", col->vocab->val.asInt[vocabIx]);
+                        break;
+                    case H5T_FLOAT:
+                        fprintf(f, "\t%g", col->vocab->val.asDouble[vocabIx]);
+                        break;
+                    case H5T_STRING:
+                        fprintf(f, "\t%s", col->vocab->val.asString[vocabIx]);
+                        break;
+                    default:
+                        internalErr();
+                        break;
+                    }
+                }
+            }
         else
             {
             switch (col->classt)
                 {
                 case H5T_INTEGER:
                     fprintf(f, "\t%d", col->val.asInt[i]);
                     break;
                 case H5T_FLOAT:
                     fprintf(f, "\t%g", col->val.asDouble[i]);
                     break;
                 case H5T_STRING:
                     fprintf(f, "\t%s", col->val.asString[i]);
                     break;
                 default:
                     internalErr();
@@ -526,31 +577,44 @@
 
 hid_t file = context->file = h5fOpen(input, H5F_ACC_RDONLY);
 hid_t cats = h5gOpen(file, "obs/__categories");
 h5lIterate(cats, vocabCallback, context);
 slReverse(&context->vocabList);
 
 verbose(2, "And now to attach vocabs?\n");
 struct vocab *vocab;
 for (vocab = context->vocabList; vocab != NULL; vocab = vocab->next)
     {
     hid_t v = h5dOpen(cats, vocab->name);
     readVocab(vocab, v, context);
     verbose(2, "%s has %d items\n", vocab->name, vocab->valCount);
     int i;
     for (i=0; i<vocab->valCount; ++i)
-        verbose(2, "  %s\n", vocab->stringVals[i]);
+        {
+        switch (vocab->classt)
+            {
+            case H5T_INTEGER:
+                verbose(2, "  %d\n", vocab->val.asInt[i]);
+                break;
+            case H5T_FLOAT:
+                verbose(2, "  %g\n", vocab->val.asDouble[i]);
+                break;
+            default:
+                verbose(2, "  %s\n", vocab->val.asString[i]);
+                break;
+            }
+        }
     }
 
 verbose(2, "And now for looping through the real thingies\n");
 hid_t obs = h5gOpen(file, "obs");
 h5lIterate(obs, obsCallback, context);
 
 
 struct metaCol *col;
 int firstColSize = 0;
 struct metaCol *indexCol = NULL;
 for (col = context->metaList; col != NULL; col = col->next)
     {
     char *name = col->name;
     if (sameString(name, "_index"))
         indexCol = context->indexCol = col;