46169b41deecd48121198e1911c41dc0a3f96b47
chmalee
Tue Jan 19 18:12:04 2021 -0800
Allow variable size data tables on hgc. Allow these tables to be JSON or
pipe and semi-colon encoded. Add more support for external data
references in bigBeds: allow relevant trackDb settings like
skipEmptyFields, allow variable size tables in external files, allow
gzip compressed external files.
diff --git src/hg/hgc/hgc.c src/hg/hgc/hgc.c
index 3c62247..e16e8ae 100644
--- src/hg/hgc/hgc.c
+++ src/hg/hgc/hgc.c
@@ -247,30 +247,34 @@
#include "itemDetailsHtml.h"
#include "trackVersion.h"
#include "numtsClick.h"
#include "geneReviewsClick.h"
#include "bigBed.h"
#include "bigPsl.h"
#include "bedTabix.h"
#include "longRange.h"
#include "hmmstats.h"
#include "aveStats.h"
#include "trix.h"
#include "bPlusTree.h"
#include "customFactory.h"
#include "iupac.h"
#include "clinvarSubLolly.h"
+#include "jsHelper.h"
+#include "errCatch.h"
+#include "htslib/bgzf.h"
+#include "htslib/kstring.h"
static char *rootDir = "hgcData";
#define LINESIZE 70 /* size of lines in comp seq feature */
struct cart *cart; /* User's settings. */
char *seqName; /* Name of sequence we're working on. */
int winStart, winEnd; /* Bounds of sequence. */
char *database; /* Name of mySQL database. */
char *organism; /* Colloquial name of organism. */
char *genome; /* common name, e.g. Mouse, Human */
char *scientificName; /* Scientific name of organism. */
struct hash *trackHash; /* A hash of all tracks - trackDb valued */
@@ -287,30 +291,31 @@
Color shadesOfRed[16];
boolean exprBedColorsMade = FALSE; /* Have the shades of red been made? */
int maxRGBShade = 16;
struct bed *sageExpList = NULL;
char ncbiOmimUrl[255] = {"https://www.ncbi.nlm.nih.gov/omim/"};
struct palInfo
/* Bundle of a chromosome name, two integer coordinates and an RNA name.
 * NOTE(review): no use of this struct is visible in this chunk; the field
 * meanings below are inferred from the names only -- confirm against callers. */
{
char *chrom;      /* presumably the chromosome name */
int left;         /* presumably the left (start) coordinate -- TODO confirm */
int right;        /* presumably the right (end) coordinate -- TODO confirm */
char *rnaName;    /* presumably the name of the associated RNA */
};
+
/* See this NCBI web doc for more info about entrezFormat:
* https://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html */
char *entrezFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=%s&term=%s&doptcmdl=%s&tool=genome.ucsc.edu";
char *entrezPureSearchFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=PureSearch&db=%s&details_term=%s[%s] ";
char *ncbiGeneFormat = "https://www.ncbi.nlm.nih.gov/gene/%s";
char *entrezUidFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=%s&list_uids=%d&dopt=%s&tool=genome.ucsc.edu";
/* db=unists is not mentioned in NCBI's doc... so stick with this usage: */
char *unistsnameScript = "https://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?db=unists";
char *unistsScript = "https://www.ncbi.nlm.nih.gov/genome/sts/sts.cgi?uid=";
char *gdbScript = "http://www.gdb.org/gdb-bin/genera/accno?accessionNum=";
char *cloneDbScript = "https://www.ncbi.nlm.nih.gov/clone?term=";
char *traceScript = "https://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=retrieve&val=";
char *genMapDbScript = "http://genomics.med.upenn.edu/perl/genmapdb/byclonesearch.pl?clone=";
char *uniprotFormat = "http://www.uniprot.org/uniprot/%s";
char *dbSnpFormat = "https://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=%s";
@@ -1476,30 +1481,69 @@
itemName = parts[1];
encode = FALSE; // assume the link is already encoded
}
if (startsWith("http", itemName)) // the ID may be a full URL already, encoding would destroy it
encode = FALSE;
char *idUrl = replaceInUrl(url, idForUrl, cart, database, seqName, winStart,
winEnd, tdb->track, encode, NULL);
printf("%s", idUrl, itemName);
}
printf("\n");
freeMem(slIds);
//freeMem(idNames);
}
+char *readOneLineMaybeBgzip(char *fileOrUrl, bits64 offset, bits64 len)
+/* Return one line of text read from fileOrUrl, starting at (uncompressed) byte offset.
+ * If fileOrUrl ends in ".gz" it is read with htslib's bgzf functions and must have a
+ * .gzi index; otherwise (plain text) it is read with udc.
+ * If len is 0, read up to the next '\n' delimiter.  If len is non-zero, the bgzf path
+ * errAborts unless the line that was read is exactly len bytes long; the udc path
+ * does not look at len.
+ * errAborts when the bgzf file cannot be opened, indexed or positioned, and when
+ * either reader hits end of file or a read error. */
+{
+char *line = NULL;
+if (endsWith(fileOrUrl, ".gz"))
+    {
+    BGZF *fp = bgzf_open(fileOrUrl, "r");
+    if (fp == NULL)
+        errAbort("bgzf_open failed to open %s", fileOrUrl);
+    kstring_t str = { 0, 0, NULL };
+    if (bgzf_index_load(fp, fileOrUrl, ".gzi") < 0)
+        errAbort("bgzf_index_load failed to load .gzi index for %s", fileOrUrl);
+    if (bgzf_useek(fp, offset, SEEK_SET) < 0)
+        errAbort("bgzf_useek failed to seek to uncompressed offset %lld in %s",
+                 (long long)offset, fileOrUrl);
+
+    // bgzf_getline is faster than bgzf_read(), so we only use the len param for error checking.
+    // It returns a signed int: the line length on success (0 for an empty line),
+    // -1 at end of file and <= -2 on error, so the result must be kept in a signed type.
+    int count = bgzf_getline(fp, '\n', &str);
+    if (count == -1)
+        errAbort("bgzf_getline unexpected end of file while parsing '%s'", fileOrUrl);
+    else if (count < 0)
+        errAbort("bgzf_getline unexpected error while parsing '%s'", fileOrUrl);
+    else if (len > 0 && (bits64)count != len)
+        errAbort("bgzf_getline failed to read %lld bytes at uncompressed offset %lld in %s, got %lld",
+                 (long long)len, (long long)offset, fileOrUrl, (long long)count);
+    // Take ownership of the buffer htslib allocated for the line.
+    line = ks_release(&str);
+    bgzf_close(fp);
+    }
+else
+    {
+    struct udcFile *udcF = udcFileOpen(fileOrUrl, NULL);
+    udcSeek(udcF, offset);
+    line = udcReadLine(udcF);
+    if (line == NULL)
+        errAbort("error reading line from '%s'", fileOrUrl);
+    udcFileClose(&udcF);
+    }
+return line;
+}
+
int extraFieldsStart(struct trackDb *tdb, int fieldCount, struct asObject *as)
/* Return the index of the first extra field.
 * For a tdb->type whose first word is "bed" or starts with "big", the second word
 * of the type (e.g. the 9 in "bigBed 9 +") is the index.  When that second word is
 * missing, fall back to the number of autoSql columns minus fieldCount, floored at 0.
 * For any other type the result is 0. */
{
int start = 0;
// nextWord() advances the pointer it is handed, so keep the original
// allocation in typeCopy to be able to free it afterwards.
char *typeCopy = cloneString(tdb->type);
char *cursor = typeCopy;
char *word = nextWord(&cursor);
if (word && (sameWord(word,"bed") || startsWith("big", word)))
    {
    word = nextWord(&cursor);
    if (word != NULL)
        start = sqlUnsigned(word);
    else // custom beds and bigBeds may not have full type "bigBed 9 +"
        start = max(0,slCount(as->columnList) - fieldCount);
    }
// word points into typeCopy, so only free once parsing is finished
freeMem(typeCopy);
return start;
}
@@ -1550,74 +1594,140 @@
for (count = 0; col != NULL && count < fieldCount; col = col->next)
{
struct slPair *field;
AllocVar(field);
char *fieldName = col->name;
char *fieldVal = row[count];
field->name = fieldName;
field->val = fieldVal;
slAddHead(&fields, field);
count++;
}
slReverse(fields);
return fields;
}
+void printEmbeddedTable(struct trackDb *tdb, struct embeddedTbl *thisTbl, struct dyString *dy)
+// Pretty print a '|' and ';' encoded table or a JSON encoded table from a bigBed field. Empty tables are ignored; a field named "_json..." or "json..." is checked with jsonParse() (warning if malformed) and appended to dy as a {label, data} entry, anything else is copied to a 4096 byte buffer, has its ';' and '|' separators substituted and is printed with printf
+{
+jsIncludeFile("hgc.js", NULL);
+if (isNotEmpty(thisTbl->encodedTbl))
+ {
+ if (startsWith("_json", thisTbl->field) || startsWith("json", thisTbl->field))
+ {
+ struct jsonElement *jsElem = NULL;
+ struct errCatch *errCatch = errCatchNew();
+ if (errCatchStart(errCatch))
+ jsElem = jsonParse(thisTbl->encodedTbl);
+ errCatchEnd(errCatch);
+ if (errCatch->gotError)
+ warn("ERROR: JSON field '%s' for track '%s' is malformed: %s", thisTbl->field, tdb->track, errCatch->message->string);
+ else if (errCatch->gotWarning)
+ warn("Warning: %s", errCatch->message->string);
+ errCatchFree(&errCatch);
+ if (jsElem != NULL)
+ {
+ dyStringPrintf(dy, "{label: \"%s\", data: %s},", thisTbl->title != NULL ? thisTbl->title : thisTbl->field, thisTbl->encodedTbl);
+ }
+ }
+ else
+ {
+ printf("
%s | ", thisTbl->title);
+ printf("\n");
+ printf("");
+ char table[4096];
+ safef(table, sizeof(table), "%s", thisTbl->encodedTbl);
+ int swapped = strSwapStrs(table, 4096, ";", " | ");
+ if (swapped == -1)
+ errAbort("Error substituting ';' for ' | ' in hgc.c:printEmbeddedTable()");
+ swapped = strSwapStrs(table, 4096, "|", "");
+ if (swapped == -1)
+ errAbort("Error substituting '|' for ' | ' in hgc.c:printEmbeddedTable()");
+ printf("%s | \n", table);
+ printf(" \n");
+ printf(" |
\n");
+ }
+ }
+}
+
void printExtraDetailsTable(char *trackName, char *tableName, char *fileName, struct dyString *tableText)
// Convert the tab-sep table held in tableText to HTML, titled with tableName or "Additional Details" when tableName is NULL
{
struct lineFile *lf = lineFileOnString(fileName, TRUE, tableText->string);
char *description = tableName != NULL ? tableName : "Additional Details";
-printf("");
-jsBeginCollapsibleSection(cart, trackName, "extraTbl", description, FALSE);
+printf("%s
\n", description);
printf("\n"); // closes bedExtraTbl
-jsEndCollapsibleSection();
-printf("
\n"); // close wrapper table
}
static struct slName *findFieldsInExtraFile(char *detailsTableUrl, struct asColumn *col, struct dyString *ds)
// Return a list of the ${}-enclosed fields from an extra file.
// The text of detailsTableUrl (after hReplaceGbdb) is read, and every column in the
// col list whose name occurs in it as "${name}" is added to the result, in column
// order.  The file text is also appended to ds.  Returns NULL, leaving ds untouched,
// when the file could not be read.
{
struct slName *foundFields = NULL;
char *table = netReadTextFileIfExists(hReplaceGbdb(detailsTableUrl));
if (table)
    {
    for (; col != NULL; col = col->next)
        {
        char field[256];
        safef(field, sizeof(field), "${%s}", col->name);
        if (stringIn(field, table))
            {
            struct slName *replaceField = slNameNew(col->name);
            slAddHead(&foundFields, replaceField);
            }
        }
    dyStringPrintf(ds, "%s", table);
    // slReverse() needs the address of the list pointer; handing it the list itself
    // leaves the head in place and only reverses the elements after it.
+    if (foundFields)
        slReverse(&foundFields);
    }
return foundFields;
}
+void getExtraTableFields(struct trackDb *tdb, struct slName **retFieldNames, struct embeddedTbl **retList, struct hash *embeddedTblHash)
+/* Parse the trackDb setting "extraTableFields" into the field names and titles specified.
+ * The setting is a comma separated list whose entries are either "fieldName" or
+ * "fieldName|title".  For every entry a new embeddedTbl is added to the head of *retList,
+ * the field name is added to the head of *retFieldNames, and the embeddedTbl is stored in
+ * embeddedTblHash keyed on the bigBed field name (which may be in an external file
+ * and not in the bigBed itself).  An entry without a title uses its field name as title.
+ * Because entries are added at the head, both lists end up in reverse setting order. */
+{
+struct slName *tmp, *embeddedTblSetting = slNameListFromComma(trackDbSetting(tdb, "extraTableFields"));
+for (tmp = embeddedTblSetting; tmp != NULL; tmp = tmp->next)
+    {
+    // title has to start out NULL for every entry, otherwise an entry without
+    // a '|' would silently pick up the title of the entry before it
+    char *title = NULL;
+    char *fieldName = cloneString(tmp->name);
+    char *sep = strchr(fieldName, '|');
+    if (sep != NULL)
+        {
+        // split "fieldName|title" in place
+        *sep = 0;
+        title = sep + 1;
+        }
+    struct embeddedTbl *tbl;
+    AllocVar(tbl);
+    tbl->field = fieldName;
+    tbl->title = title != NULL ? cloneString(title) : fieldName;
+    slAddHead(retList, tbl);
+    slNameAddHead(retFieldNames, fieldName);
+    hashAdd(embeddedTblHash, fieldName, tbl);
+    }
+// every name was cloned above, so the parsed setting list is no longer needed
+slNameFreeList(&embeddedTblSetting);
+}
+
int extraFieldsPrintAs(struct trackDb *tdb,struct sqlResult *sr,char **fields,int fieldCount, struct asObject *as)
// Any extra bed or bigBed fields (defined in as and occurring after N in bed N + types).
// sr may be null for bigBeds.
// Returns number of extra fields actually printed.
{
// We are trying to print extra fields so we need to figure out how many fields to skip
int start = extraFieldsStart(tdb, fieldCount, as);
struct asColumn *col = as->columnList;
char *urlsStr = trackDbSettingClosestToHomeOrDefault(tdb, "urls", NULL);
struct hash* fieldToUrl = hashFromString(urlsStr);
boolean skipEmptyFields = trackDbSettingOn(tdb, "skipEmptyFields");
// make list of fields to skip
char *skipFieldsStr = trackDbSetting(tdb, "skipFields");
@@ -1633,120 +1743,152 @@
// make list of fields that we want to substitute
// this setting has format description|URLorFilePath, with the stuff before the pipe optional
char *extraDetailsTableName = NULL, *extraDetails = cloneString(trackDbSetting(tdb, "extraDetailsTable"));
if (extraDetails && strchr(extraDetails,'|'))
{
extraDetailsTableName = extraDetails;
extraDetails = strchr(extraDetails,'|');
*extraDetails++ = 0;
}
struct dyString *extraTblStr = dyStringNew(0);
struct slName *detailsTableFields = NULL;
if (extraDetails)
detailsTableFields = findFieldsInExtraFile(extraDetails, col, extraTblStr);
+struct hash *embeddedTblHash = hashNew(0);
+struct slName *embeddedTblFields = NULL;
+struct embeddedTbl *embeddedTblList = NULL;
+getExtraTableFields(tdb, &embeddedTblFields, &embeddedTblList, embeddedTblHash);
+
// iterate over fields, print as table rows
int count = 0;
+int printCount = 0;
for (;col != NULL && count < fieldCount;col=col->next)
{
if (start > 0) // skip past already known fields
{
start--;
continue;
}
int ix = count;
if (sr != NULL)
{
ix = sqlFieldColumn(sr, col->name); // If sr provided, name must match sql columnn name!
if (ix == -1 || ix > fieldCount) // so extraField really just provides a label
continue;
}
char *fieldName = col->name;
-
- if (count == 0)
- printf("