src/hg/hgc/hgc.c 46169b41deecd48121198e1911c41dc0a3f96b47

46169b41deecd48121198e1911c41dc0a3f96b47
chmalee
  Tue Jan 19 18:12:04 2021 -0800
Allow variable size data tables on hgc. Allow these tables to be JSON or
pipe and semi-colon encoded. Add more support for external data
references in bigBeds: allow relevant trackDb settings like
skipEmptyFields, allow variable size tables in external files, allow
gzip compressed external files.

diff --git src/hg/hgc/hgc.c src/hg/hgc/hgc.c
index 3c62247..e16e8ae 100644
--- src/hg/hgc/hgc.c
+++ src/hg/hgc/hgc.c
@@ -247,30 +247,34 @@
 #include "itemDetailsHtml.h"
 #include "trackVersion.h"
 #include "numtsClick.h"
 #include "geneReviewsClick.h"
 #include "bigBed.h"
 #include "bigPsl.h"
 #include "bedTabix.h"
 #include "longRange.h"
 #include "hmmstats.h"
 #include "aveStats.h"
 #include "trix.h"
 #include "bPlusTree.h"
 #include "customFactory.h"
 #include "iupac.h"
 #include "clinvarSubLolly.h"
+#include "jsHelper.h"
+#include "errCatch.h"
+#include "htslib/bgzf.h"
+#include "htslib/kstring.h"
 
 static char *rootDir = "hgcData";
 
 #define LINESIZE 70  /* size of lines in comp seq feature */
 
 struct cart *cart;	/* User's settings. */
 char *seqName;		/* Name of sequence we're working on. */
 int winStart, winEnd;   /* Bounds of sequence. */
 char *database;		/* Name of mySQL database. */
 char *organism;		/* Colloquial name of organism. */
 char *genome;		/* common name, e.g. Mouse, Human */
 char *scientificName;	/* Scientific name of organism. */
 
 struct hash *trackHash;	/* A hash of all tracks - trackDb valued */
 
@@ -287,30 +291,31 @@
 Color shadesOfRed[16];
 boolean exprBedColorsMade = FALSE; /* Have the shades of red been made? */
 int maxRGBShade = 16;
 
 struct bed *sageExpList = NULL;
 char ncbiOmimUrl[255] = {"https://www.ncbi.nlm.nih.gov/omim/"};
 
 struct palInfo
 {
     char *chrom;
     int left;
     int right;
     char *rnaName;
 };
 
+
 /* See this NCBI web doc for more info about entrezFormat:
  * https://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html */
 char *entrezFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=%s&term=%s&doptcmdl=%s&tool=genome.ucsc.edu";
 char *entrezPureSearchFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=PureSearch&db=%s&details_term=%s[%s] ";
 char *ncbiGeneFormat = "https://www.ncbi.nlm.nih.gov/gene/%s";
 char *entrezUidFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=%s&list_uids=%d&dopt=%s&tool=genome.ucsc.edu";
 /* db=unists is not mentioned in NCBI's doc... so stick with this usage: */
 char *unistsnameScript = "https://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?db=unists";
 char *unistsScript = "https://www.ncbi.nlm.nih.gov/genome/sts/sts.cgi?uid=";
 char *gdbScript = "http://www.gdb.org/gdb-bin/genera/accno?accessionNum=";
 char *cloneDbScript = "https://www.ncbi.nlm.nih.gov/clone?term=";
 char *traceScript = "https://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=retrieve&val=";
 char *genMapDbScript = "http://genomics.med.upenn.edu/perl/genmapdb/byclonesearch.pl?clone=";
 char *uniprotFormat = "http://www.uniprot.org/uniprot/%s";
 char *dbSnpFormat = "https://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=%s";
@@ -1476,30 +1481,69 @@
         itemName = parts[1];
         encode = FALSE; // assume the link is already encoded
         }
     if (startsWith("http", itemName)) // the ID may be a full URL already, encoding would destroy it
         encode = FALSE;
 
     char *idUrl = replaceInUrl(url, idForUrl, cart, database, seqName, winStart, 
                     winEnd, tdb->track, encode, NULL);
     printf("<a href=\"%s\" target=\"_blank\">%s</a>", idUrl, itemName);
     } 
 printf("</td></tr>\n");
 freeMem(slIds);
 //freeMem(idNames);
 }
 
+char *readOneLineMaybeBgzip(char *fileOrUrl, bits64 offset, bits64 len)
+/* Read one line starting at uncompressed byte offset 'offset' of fileOrUrl. Names ending in
+ * ".gz" are read with htslib's bgzf functions (a .gzi index is required); anything else is
+ * plain text read via udc. A non-zero len is only used to sanity-check the bgzf line length. */
+{
+char *line = NULL;  // each branch supplies its own buffer, so nothing is allocated up front
+if (endsWith(fileOrUrl, ".gz"))
+    {
+    BGZF *fp = bgzf_open(fileOrUrl, "r");
+    if (fp == NULL)
+        errAbort("bgzf_open failed to open %s", fileOrUrl);
+    kstring_t str = { 0, 0, NULL };
+    if (bgzf_index_load(fp, fileOrUrl, ".gzi") < 0)
+        errAbort("bgzf_index_load failed to load .gzi index for %s", fileOrUrl);
+    if (bgzf_useek(fp, offset, SEEK_SET) < 0)
+        errAbort("bgzf_useek failed to seek to uncompressed offset %llu in %s", offset, fileOrUrl);
+    int count = bgzf_getline(fp, '\n', &str);  // signed: -1 is end of file, <= -2 is an error
+    if (count == -1)
+        errAbort("bgzf_getline unexpected end of file while parsing '%s'", fileOrUrl);
+    else if (count < -1)
+        errAbort("bgzf_getline unexpected error while parsing '%s'", fileOrUrl);
+    else if (len > 0 && (bits64)count != len)
+        errAbort("bgzf_getline failed to read %llu bytes at uncompressed offset %llu in %s, got %d",
+                     len, offset, fileOrUrl, count);
+    line = ks_release(&str);
+    bgzf_close(fp);
+    }
+else
+    {
+    struct udcFile *udcF = udcFileOpen(fileOrUrl, NULL);
+    udcSeek(udcF, offset);
+    line = udcReadLine(udcF);
+    if (line == NULL)
+        errAbort("error reading line from '%s'", fileOrUrl);
+    udcFileClose(&udcF);
+    }
+return line;
+}
+
 int extraFieldsStart(struct trackDb *tdb, int fieldCount, struct asObject *as)
-/* return the index of the first extra field */
+/* Return the index of the first extra field (0 unless tdb->type starts with "bed" or "big") */
 {
 int start = 0;
-char *type = cloneString(tdb->type);
-char *word = nextWord(&type);
+char *typeCopy = cloneString(tdb->type), *type = typeCopy, *word = nextWord(&type);
 if (word && (sameWord(word,"bed") || startsWith("big", word)))
     {
     if (NULL != (word = nextWord(&type)))
         start = sqlUnsigned(word);
-    else // custom beds and bigBeds may not have full type "begBed 9 +"
+    else // custom beds and bigBeds may not have full type "bigBed 9 +"
         start = max(0,slCount(as->columnList) - fieldCount);
     }
+freeMem(typeCopy);  // nextWord advanced type, so free through the saved start of the clone
 return start;
 }
@@ -1550,74 +1594,140 @@
 for (count = 0; col != NULL && count < fieldCount; col = col->next)
     {
     struct slPair *field;
     AllocVar(field);
     char *fieldName = col->name;
     char *fieldVal = row[count];
     field->name = fieldName;
     field->val = fieldVal;
     slAddHead(&fields, field);
     count++;
     }
 slReverse(fields);
 return fields;
 }
 
+void printEmbeddedTable(struct trackDb *tdb, struct embeddedTbl *thisTbl, struct dyString *dy)
+// Pretty print a '|' and ';' encoded table or a JSON encoded table from a bigBed field.
+// JSON fields (field name starting with "json" or "_json") print nothing here: once the JSON
+// parses, a "{label: ..., data: ...}," entry is appended to dy for the caller to emit as
+// JavaScript. Any other field is printed right away as a table row holding a nested HTML
+// table, with ';' separating rows and '|' separating the cells of a row.
+{
+jsIncludeFile("hgc.js", NULL);
+if (isEmpty(thisTbl->encodedTbl))
+    return;
+// fall back to the field name when no title was given, for both kinds of table
+char *label = thisTbl->title != NULL ? thisTbl->title : thisTbl->field;
+if (startsWith("_json", thisTbl->field) || startsWith("json", thisTbl->field))
+    {
+    // parse only to validate: a malformed field produces a warning instead of broken JavaScript
+    struct jsonElement *jsElem = NULL;
+    struct errCatch *errCatch = errCatchNew();
+    if (errCatchStart(errCatch))
+        jsElem = jsonParse(thisTbl->encodedTbl);
+    errCatchEnd(errCatch);
+    if (errCatch->gotError)
+        warn("ERROR: JSON field '%s' for track '%s' is malformed: %s", thisTbl->field, tdb->track, errCatch->message->string);
+    else if (errCatch->gotWarning)
+        warn("Warning: %s", errCatch->message->string);
+    errCatchFree(&errCatch);
+    if (jsElem != NULL)
+        dyStringPrintf(dy, "{label: \"%s\", data: %s},", label, thisTbl->encodedTbl);
+    }
+else
+    {
+    // replaceChars allocates its result, so the table may be any size (no fixed buffer)
+    char *rows = replaceChars(thisTbl->encodedTbl, ";", "</td></tr><tr><td>");
+    char *cells = replaceChars(rows, "|", "</td><td>");
+    printf("<tr><td>%s</td><td>", label);
+    printf("<table class='jsonTable'>\n");
+    printf("<tr><td>%s</td></tr>\n", cells);  // close the last cell before its row
+    printf("</table>\n");
+    printf("</td></tr>\n");
+    freeMem(rows);
+    freeMem(cells);
+    }
+}
+
 void printExtraDetailsTable(char *trackName, char *tableName, char *fileName, struct dyString *tableText)
-// convert a tab-sep table to HTML
+// Print the tab-separated text in tableText as an HTML table, one row per line, under a bold
+// heading: tableName, or "Additional Details" if that is NULL. trackName is not used.
 {
 struct lineFile *lf = lineFileOnString(fileName, TRUE, tableText->string);
 char *description = tableName != NULL ? tableName : "Additional Details";
-printf("<table>");
-jsBeginCollapsibleSection(cart, trackName, "extraTbl", description, FALSE);
+printf("<p><b>%s</b></p>\n", description);
 printf("<table class=\"bedExtraTbl\">\n");
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
-    printf("<tr><td>");
     char *toPrint = replaceChars(line, "\t", "</td><td>");
-    printf("%s", toPrint);
-    printf("</td></tr>\n");
+    printf("<tr><td>%s</td></tr>\n", toPrint);
+    freeMem(toPrint);  // replaceChars hands back a new string for every row
     }
 printf("</table>\n"); // closes bedExtraTbl
-jsEndCollapsibleSection();
-printf("</table>\n"); // close wrapper table
 }
 
 static struct slName *findFieldsInExtraFile(char *detailsTableUrl, struct asColumn *col, struct dyString *ds)
-// return a list of the ${}-enclosed fields from an extra file
+// Append the text read from detailsTableUrl to ds and return the names of the columns in col
+// that occur in that text as ${name}, in column order. NULL if nothing was read or matched.
 {
 struct slName *foundFields = NULL;
 char *table = netReadTextFileIfExists(hReplaceGbdb(detailsTableUrl));
 if (table)
     {
     for (; col != NULL; col = col->next)
         {
         char field[256];
         safef(field, sizeof(field), "${%s}", col->name);
         if (stringIn(field, table))
             {
             struct slName *replaceField = slNameNew(col->name);
             slAddHead(&foundFields, replaceField);
             }
         }
     dyStringPrintf(ds, "%s", table);
-        slReverse(foundFields);
+    slReverse(&foundFields);  // needs the address of the list pointer; an empty list is fine
     }
 return foundFields;
 }
 
+void getExtraTableFields(struct trackDb *tdb, struct slName **retFieldNames, struct embeddedTbl **retList, struct hash *embeddedTblHash)
+/* Parse the trackDb setting "extraTableFields", a comma-separated list of fieldName or
+ * fieldName|title entries. For each entry, add a struct embeddedTbl to the head of *retList,
+ * add the field name to the head of *retFieldNames, and store the struct in embeddedTblHash
+ * keyed on field name (the field may be in an external file and not in the bigBed itself). */
+{
+struct slName *tmp, *embeddedTblSetting = slNameListFromComma(trackDbSetting(tdb, "extraTableFields"));
+for (tmp = embeddedTblSetting; tmp != NULL; tmp = tmp->next)
+    {
+    char *fieldName = cloneString(tmp->name);
+    // title is scoped to this entry so an entry without '|' cannot inherit an earlier title
+    char *title = strchr(fieldName, '|');
+    if (title != NULL)
+        *title++ = 0;   // cut fieldName at the '|'; title now points just past it
+    struct embeddedTbl *new;
+    AllocVar(new);
+    new->field = fieldName;
+    new->title = title != NULL ? cloneString(title) : fieldName;
+    slAddHead(retList, new);
+    slNameAddHead(retFieldNames, fieldName);
+    hashAdd(embeddedTblHash, fieldName, new);
+    }
+slFreeList(&embeddedTblSetting);  // names were cloned above, so the parsed list can go
+}
+
 int extraFieldsPrintAs(struct trackDb *tdb,struct sqlResult *sr,char **fields,int fieldCount, struct asObject *as)
 // Any extra bed or bigBed fields (defined in as and occurring after N in bed N + types.
 // sr may be null for bigBeds.
 // Returns number of extra fields actually printed.
 {
 // We are trying to print extra fields so we need to figure out how many fields to skip
 int start = extraFieldsStart(tdb, fieldCount, as);
 
 struct asColumn *col = as->columnList;
 char *urlsStr = trackDbSettingClosestToHomeOrDefault(tdb, "urls", NULL);
 struct hash* fieldToUrl = hashFromString(urlsStr);
 boolean skipEmptyFields = trackDbSettingOn(tdb, "skipEmptyFields");
 
 // make list of fields to skip
 char *skipFieldsStr = trackDbSetting(tdb, "skipFields");
@@ -1633,120 +1743,152 @@
 
 // make list of fields that we want to substitute
 // this setting has format description|URLorFilePath, with the stuff before the pipe optional
 char *extraDetailsTableName = NULL, *extraDetails = cloneString(trackDbSetting(tdb, "extraDetailsTable"));
 if (extraDetails && strchr(extraDetails,'|'))
     {
     extraDetailsTableName = extraDetails;
     extraDetails = strchr(extraDetails,'|');
     *extraDetails++ = 0;
     }
 struct dyString *extraTblStr = dyStringNew(0);
 struct slName *detailsTableFields = NULL;
 if (extraDetails)
     detailsTableFields = findFieldsInExtraFile(extraDetails, col, extraTblStr);
 
+struct hash *embeddedTblHash = hashNew(0);
+struct slName *embeddedTblFields = NULL;
+struct embeddedTbl *embeddedTblList = NULL;
+getExtraTableFields(tdb, &embeddedTblFields, &embeddedTblList, embeddedTblHash);
+
 // iterate over fields, print as table rows
 int count = 0;
+int printCount = 0;
 for (;col != NULL && count < fieldCount;col=col->next)
     {
     if (start > 0)  // skip past already known fields
         {
         start--;
         continue;
         }
     int ix = count;
     if (sr != NULL)
         {
         ix = sqlFieldColumn(sr, col->name); // If sr provided, name must match sql columnn name!
         if (ix == -1 || ix > fieldCount)      // so extraField really just provides a label
             continue;
         }
 
     char *fieldName = col->name;
-
-    if (count == 0)
-        printf("<br><table class='bedExtraTbl'>");
-    
     count++;
 
-    // do not print a row if the fieldName from the .as file is in the "skipFields" list
-    // or if a field name starts with _. This maked bigBed extra fields consistent with
-    // external extra fields in that _ field names have some meaning and are not shown
-    if (startsWith("_", fieldName) || (skipIds && slNameInList(skipIds, fieldName)))
-        continue;
-
     // don't print this field if we are gonna print it later in a custom table
     if (detailsTableFields && slNameInList(detailsTableFields, fieldName))
         {
         int fieldLen = strlen(fieldName);
         char *replaceField = needMem(fieldLen+4);
         replaceField[0] = '$';
         replaceField[1] = '{';
         strcpy(replaceField+2, fieldName);
         replaceField[fieldLen+2] = '}';
         replaceField[fieldLen+3] = 0;
         extraTblStr = dyStringSub(extraTblStr->string, replaceField, fields[ix]);
         continue;
         }
 
+    // similar to above, if the field contains an embedded table skip it here
+    // and print it later
+    if (embeddedTblFields)
+        {
+        struct embeddedTbl *new = hashFindVal(embeddedTblHash, fieldName);
+        if (new)
+            {
+            new->encodedTbl = fields[ix];
+            continue;
+            }
+        }
+
+    // do not print a row if the fieldName from the .as file is in the "skipFields" list
+    // or if a field name starts with _. This makes bigBed extra fields consistent with
+    // external extra fields in that _ field names have some meaning and are not shown
+    if (startsWith("_", fieldName) || (skipIds && slNameInList(skipIds, fieldName)))
+        continue;
+
     // skip this row if it's empty and "skipEmptyFields" option is set
     if (skipEmptyFields && isEmpty(fields[ix]))
         continue;
 
+    if (printCount == 0)
+        printf("<br><table class='bedExtraTbl'>");
+
     // split this table to separate current row from the previous one, if the trackDb option is set
     if (sepFields && slNameInList(sepFields, fieldName))
         printf("</tr></table>\n<p>\n<table class='bedExtraTbl'>");
 
     // field description
     char *entry;
     if (sameString(fieldName, "cdsStartStat") && sameString("enum('none','unk','incmpl','cmpl')", col->comment))
         entry = "Status of CDS start annotation (none, unknown, incomplete, or complete)";
     else if (sameString(fieldName, "cdsEndStat") && sameString("enum('none','unk','incmpl','cmpl')", col->comment))
         entry = "Status of CDS end annotation (none, unknown, incomplete, or complete)";
     else
         entry = col->comment;
     printf("<tr><td>%s</td>", entry); // bold style now in HGStyle.css
 
     if (col->isList || col->isArray || col->lowType->stringy || asTypesIsInt(col->lowType->type))
         printIdOrLinks(col, fieldToUrl, tdb, fields[ix]);
     else if (asTypesIsFloating(col->lowType->type))
         {
         double valDouble = strtod(fields[ix],NULL);
         if (errno == 0 && valDouble != 0)
             printf("<td>%g</td></tr>\n", valDouble);
         else
             printf("<td>%s</td></tr>\n", fields[ix]); // decided not to print error
         }
     else
         printf("<td>%s</td></tr>\n", fields[ix]);
+    printCount++;
     }
 if (skipIds)
     slFreeList(skipIds);
 if (sepFields)
     slFreeList(sepFields);
 
-if (count > 0)
+if (embeddedTblFields)
+    {
+    struct embeddedTbl *thisTbl;
+    struct dyString *tableLabelsDy = dyStringNew(0);
+    for (thisTbl = embeddedTblList; thisTbl != NULL; thisTbl = thisTbl->next)
+        {
+        if (thisTbl->encodedTbl)
+            {
+            printEmbeddedTable(tdb, thisTbl, tableLabelsDy);
+            }
+        }
+    jsInline(dyStringCannibalize(&tableLabelsDy));
+    }
+
+if (printCount > 0)
     printf("</table>\n");
 
+
 if (detailsTableFields)
     {
-    printf("<br>\n");
     printExtraDetailsTable(tdb->track, extraDetailsTableName, extraDetails, extraTblStr);
     }
 
-return count;
+return printCount;
 }
 
 int extraFieldsPrint(struct trackDb *tdb,struct sqlResult *sr,char **fields,int fieldCount)
 // Any extra bed or bigBed fields (defined in as and occurring after N in bed N + types.
 // sr may be null for bigBeds.
 // Returns number of extra fields actually printed.
 {
 struct asObject *as = asForDb(tdb, database);
 if (as == NULL)
     return 0;
 
 int ret =  extraFieldsPrintAs(tdb, sr, fields,fieldCount, as);
 //asObjectFree(&as);
 
 return ret;