46169b41deecd48121198e1911c41dc0a3f96b47 chmalee Tue Jan 19 18:12:04 2021 -0800 Allow variable size data tables on hgc. Allow these tables to be JSON or pipe and semi-colon encoded. Add more support for external data references in bigBeds: allow relevant trackDb settings like skipEmptyFields, allow variable size tables in external files, allow gzip compressed external files. diff --git src/hg/hgc/hgc.c src/hg/hgc/hgc.c index 3c62247..e16e8ae 100644 --- src/hg/hgc/hgc.c +++ src/hg/hgc/hgc.c @@ -247,30 +247,34 @@ #include "itemDetailsHtml.h" #include "trackVersion.h" #include "numtsClick.h" #include "geneReviewsClick.h" #include "bigBed.h" #include "bigPsl.h" #include "bedTabix.h" #include "longRange.h" #include "hmmstats.h" #include "aveStats.h" #include "trix.h" #include "bPlusTree.h" #include "customFactory.h" #include "iupac.h" #include "clinvarSubLolly.h" +#include "jsHelper.h" +#include "errCatch.h" +#include "htslib/bgzf.h" +#include "htslib/kstring.h" static char *rootDir = "hgcData"; #define LINESIZE 70 /* size of lines in comp seq feature */ struct cart *cart; /* User's settings. */ char *seqName; /* Name of sequence we're working on. */ int winStart, winEnd; /* Bounds of sequence. */ char *database; /* Name of mySQL database. */ char *organism; /* Colloquial name of organism. */ char *genome; /* common name, e.g. Mouse, Human */ char *scientificName; /* Scientific name of organism. */ struct hash *trackHash; /* A hash of all tracks - trackDb valued */ @@ -287,30 +291,31 @@ Color shadesOfRed[16]; boolean exprBedColorsMade = FALSE; /* Have the shades of red been made? */ int maxRGBShade = 16; struct bed *sageExpList = NULL; char ncbiOmimUrl[255] = {"https://www.ncbi.nlm.nih.gov/omim/"}; struct palInfo { char *chrom; int left; int right; char *rnaName; }; + /* See this NCBI web doc for more info about entrezFormat: * https://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html */ char *entrezFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=%s&term=%s&doptcmdl=%s&tool=genome.ucsc.edu"; char *entrezPureSearchFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=PureSearch&db=%s&details_term=%s[%s] "; char *ncbiGeneFormat = "https://www.ncbi.nlm.nih.gov/gene/%s"; char *entrezUidFormat = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=%s&list_uids=%d&dopt=%s&tool=genome.ucsc.edu"; /* db=unists is not mentioned in NCBI's doc... so stick with this usage: */ char *unistsnameScript = "https://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?db=unists"; char *unistsScript = "https://www.ncbi.nlm.nih.gov/genome/sts/sts.cgi?uid="; char *gdbScript = "http://www.gdb.org/gdb-bin/genera/accno?accessionNum="; char *cloneDbScript = "https://www.ncbi.nlm.nih.gov/clone?term="; char *traceScript = "https://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=retrieve&val="; char *genMapDbScript = "http://genomics.med.upenn.edu/perl/genmapdb/byclonesearch.pl?clone="; char *uniprotFormat = "http://www.uniprot.org/uniprot/%s"; char *dbSnpFormat = "https://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=%s"; @@ -1476,30 +1481,69 @@ itemName = parts[1]; encode = FALSE; // assume the link is already encoded } if (startsWith("http", itemName)) // the ID may be a full URL already, encoding would destroy it encode = FALSE; char *idUrl = replaceInUrl(url, idForUrl, cart, database, seqName, winStart, winEnd, tdb->track, encode, NULL); printf("<a href=\"%s\" target=\"_blank\">%s</a>", idUrl, itemName); } printf("</td></tr>\n"); freeMem(slIds); //freeMem(idNames); } +char *readOneLineMaybeBgzip(char *fileOrUrl, bits64 offset, bits64 len) +/* If fileOrUrl is bgzip-compressed and indexed, then use htslib's bgzf functions to + * retrieve uncompressed data from offset; otherwise (plain text) use udc. If len is 0, + * read up to next '\n' delimiter. */ +{ +char *line = needMem(len+1); +if (endsWith(fileOrUrl, ".gz")) + { + BGZF *fp = bgzf_open(fileOrUrl, "r"); + kstring_t str = { 0, 0, NULL }; + if (bgzf_index_load(fp, fileOrUrl, ".gzi") < 0) + errAbort("bgzf_index_load failed to load .gzi index for %s", fileOrUrl); + if (bgzf_useek(fp, offset, SEEK_SET) < 0) + errAbort("bgzf_useek failed to seek to uncompressed offset %lld in %s", offset, fileOrUrl); + + // bgzf_getline is faster than bgzf_read(), so we only use the len param for error checking + bits64 count = bgzf_getline(fp, '\n', &str); + if (count == 0) + errAbort("bgzf_getline unexpected end of file while parsing '%s'", fileOrUrl); + else if (count < 0) + errAbort("bgzf_getline unexpected error while parsing '%s'", fileOrUrl); + else if (len > 0 && count != len) + errAbort("bgzf_getline failed to read %lld bytes at uncompressed offset %lld in %s, got %lld", + len, offset, fileOrUrl, count); + line = ks_release(&str); + bgzf_close(fp); + } +else + { + struct udcFile *udcF = udcFileOpen(fileOrUrl, NULL); + udcSeek(udcF, offset); + line = udcReadLine(udcF); + if (line == NULL) + errAbort("error reading line from '%s'", fileOrUrl); + udcFileClose(&udcF); + } +return line; +} + int extraFieldsStart(struct trackDb *tdb, int fieldCount, struct asObject *as) /* return the index of the first extra field */ { int start = 0; char *type = cloneString(tdb->type); char *word = nextWord(&type); if (word && (sameWord(word,"bed") || startsWith("big", word))) { if (NULL != (word = nextWord(&type))) start = sqlUnsigned(word); else // custom beds and bigBeds may not have full type "begBed 9 +" start = max(0,slCount(as->columnList) - fieldCount); } return start; } @@ -1550,74 +1594,140 @@ for (count = 0; col != NULL && count < fieldCount; col = col->next) { struct slPair *field; AllocVar(field); char *fieldName = col->name; char *fieldVal = row[count]; field->name = fieldName; field->val = fieldVal; slAddHead(&fields, field); count++; } slReverse(fields); return fields; } +void printEmbeddedTable(struct trackDb *tdb, struct embeddedTbl *thisTbl, struct dyString *dy) +// Pretty print a '|' and ';' encoded table or a JSON encoded table from a bigBed field +{ +jsIncludeFile("hgc.js", NULL); +if (isNotEmpty(thisTbl->encodedTbl)) + { + if (startsWith("_json", thisTbl->field) || startsWith("json", thisTbl->field)) + { + struct jsonElement *jsElem = NULL; + struct errCatch *errCatch = errCatchNew(); + if (errCatchStart(errCatch)) + jsElem = jsonParse(thisTbl->encodedTbl); + errCatchEnd(errCatch); + if (errCatch->gotError) + warn("ERROR: JSON field '%s' for track '%s' is malformed: %s", thisTbl->field, tdb->track, errCatch->message->string); + else if (errCatch->gotWarning) + warn("Warning: %s", errCatch->message->string); + errCatchFree(&errCatch); + if (jsElem != NULL) + { + dyStringPrintf(dy, "{label: \"%s\", data: %s},", thisTbl->title != NULL ? thisTbl->title : thisTbl->field, thisTbl->encodedTbl); + } + } + else + { + printf("<tr><td>%s</td><td>", thisTbl->title); + printf("<table class='jsonTable'>\n"); + printf("<tr><td>"); + char table[4096]; + safef(table, sizeof(table), "%s", thisTbl->encodedTbl); + int swapped = strSwapStrs(table, 4096, ";", "</td></tr><tr><td>"); + if (swapped == -1) + errAbort("Error substituting ';' for '</tr><tr>' in hgc.c:printEmbeddedTable()"); + swapped = strSwapStrs(table, 4096, "|", "</td><td>"); + if (swapped == -1) + errAbort("Error substituting '|' for '</td><td>' in hgc.c:printEmbeddedTable()"); + printf("%s</tr>\n", table); + printf("</table>\n"); + printf("</td></tr>\n"); + } + } +} + void printExtraDetailsTable(char *trackName, char *tableName, char *fileName, struct dyString *tableText) // convert a tab-sep table to HTML { struct lineFile *lf = lineFileOnString(fileName, TRUE, tableText->string); char *description = tableName != NULL ? tableName : "Additional Details"; -printf("<table>"); -jsBeginCollapsibleSection(cart, trackName, "extraTbl", description, FALSE); +printf("<p><b>%s</b></p>\n", description); printf("<table class=\"bedExtraTbl\">\n"); char *line; while (lineFileNext(lf, &line, NULL)) { printf("<tr><td>"); char *toPrint = replaceChars(line, "\t", "</td><td>"); printf("%s", toPrint); printf("</td></tr>\n"); } printf("</table>\n"); // closes bedExtraTbl -jsEndCollapsibleSection(); -printf("</table>\n"); // close wrapper table } static struct slName *findFieldsInExtraFile(char *detailsTableUrl, struct asColumn *col, struct dyString *ds) // return a list of the ${}-enclosed fields from an extra file { struct slName *foundFields = NULL; char *table = netReadTextFileIfExists(hReplaceGbdb(detailsTableUrl)); if (table) { for (; col != NULL; col = col->next) { char field[256]; safef(field, sizeof(field), "${%s}", col->name); if (stringIn(field, table)) { struct slName *replaceField = slNameNew(col->name); slAddHead(&foundFields, replaceField); } } dyStringPrintf(ds, "%s", table); + if (foundFields) slReverse(foundFields); } return foundFields; } +void getExtraTableFields(struct trackDb *tdb, struct slName **retFieldNames, struct embeddedTbl **retList, struct hash *embeddedTblHash) +/* Parse the trackDb field "extraTableFields" into the field names and titles specified, + * and fill out a hash keyed on the bigBed field name (which may be in an external file + * and not in the bigBed itself) to a helper struct for storing user defined tables. */ +{ +struct slName *tmp, *embeddedTblSetting = slNameListFromComma(trackDbSetting(tdb, "extraTableFields")); +char *title = NULL, *fieldName = NULL; +for (tmp = embeddedTblSetting; tmp != NULL; tmp = tmp->next) + { + fieldName = cloneString(tmp->name); + if (strchr(tmp->name, '|')) + { + title = strchr(fieldName, '|'); + *title++ = 0; + } + struct embeddedTbl *new; + AllocVar(new); + new->field = fieldName; + new->title = title != NULL ? cloneString(title) : fieldName; + slAddHead(retList, new); + slNameAddHead(retFieldNames, fieldName); + hashAdd(embeddedTblHash, fieldName, new); + } +} + int extraFieldsPrintAs(struct trackDb *tdb,struct sqlResult *sr,char **fields,int fieldCount, struct asObject *as) // Any extra bed or bigBed fields (defined in as and occurring after N in bed N + types. // sr may be null for bigBeds. // Returns number of extra fields actually printed. { // We are trying to print extra fields so we need to figure out how many fields to skip int start = extraFieldsStart(tdb, fieldCount, as); struct asColumn *col = as->columnList; char *urlsStr = trackDbSettingClosestToHomeOrDefault(tdb, "urls", NULL); struct hash* fieldToUrl = hashFromString(urlsStr); boolean skipEmptyFields = trackDbSettingOn(tdb, "skipEmptyFields"); // make list of fields to skip char *skipFieldsStr = trackDbSetting(tdb, "skipFields"); @@ -1633,120 +1743,152 @@ // make list of fields that we want to substitute // this setting has format description|URLorFilePath, with the stuff before the pipe optional char *extraDetailsTableName = NULL, *extraDetails = cloneString(trackDbSetting(tdb, "extraDetailsTable")); if (extraDetails && strchr(extraDetails,'|')) { extraDetailsTableName = extraDetails; extraDetails = strchr(extraDetails,'|'); *extraDetails++ = 0; } struct dyString *extraTblStr = dyStringNew(0); struct slName *detailsTableFields = NULL; if (extraDetails) detailsTableFields = findFieldsInExtraFile(extraDetails, col, extraTblStr); +struct hash *embeddedTblHash = hashNew(0); +struct slName *embeddedTblFields = NULL; +struct embeddedTbl *embeddedTblList = NULL; +getExtraTableFields(tdb, &embeddedTblFields, &embeddedTblList, embeddedTblHash); + // iterate over fields, print as table rows int count = 0; +int printCount = 0; for (;col != NULL && count < fieldCount;col=col->next) { if (start > 0) // skip past already known fields { start--; continue; } int ix = count; if (sr != NULL) { ix = sqlFieldColumn(sr, col->name); // If sr provided, name must match sql columnn name! if (ix == -1 || ix > fieldCount) // so extraField really just provides a label continue; } char *fieldName = col->name; - - if (count == 0) - printf("<br><table class='bedExtraTbl'>"); - count++; - // do not print a row if the fieldName from the .as file is in the "skipFields" list - // or if a field name starts with _. This maked bigBed extra fields consistent with - // external extra fields in that _ field names have some meaning and are not shown - if (startsWith("_", fieldName) || (skipIds && slNameInList(skipIds, fieldName))) - continue; - // don't print this field if we are gonna print it later in a custom table if (detailsTableFields && slNameInList(detailsTableFields, fieldName)) { int fieldLen = strlen(fieldName); char *replaceField = needMem(fieldLen+4); replaceField[0] = '$'; replaceField[1] = '{'; strcpy(replaceField+2, fieldName); replaceField[fieldLen+2] = '}'; replaceField[fieldLen+3] = 0; extraTblStr = dyStringSub(extraTblStr->string, replaceField, fields[ix]); continue; } + // similar to above, if the field contains an embedded table skip it here + // and print it later + if (embeddedTblFields) + { + struct embeddedTbl *new = hashFindVal(embeddedTblHash, fieldName); + if (new) + { + new->encodedTbl = fields[ix]; + continue; + } + } + + // do not print a row if the fieldName from the .as file is in the "skipFields" list + // or if a field name starts with _. This makes bigBed extra fields consistent with + // external extra fields in that _ field names have some meaning and are not shown + if (startsWith("_", fieldName) || (skipIds && slNameInList(skipIds, fieldName))) + continue; + // skip this row if it's empty and "skipEmptyFields" option is set if (skipEmptyFields && isEmpty(fields[ix])) continue; + if (printCount == 0) + printf("<br><table class='bedExtraTbl'>"); + // split this table to separate current row from the previous one, if the trackDb option is set if (sepFields && slNameInList(sepFields, fieldName)) printf("</tr></table>\n<p>\n<table class='bedExtraTbl'>"); // field description char *entry; if (sameString(fieldName, "cdsStartStat") && sameString("enum('none','unk','incmpl','cmpl')", col->comment)) entry = "Status of CDS start annotation (none, unknown, incomplete, or complete)"; else if (sameString(fieldName, "cdsEndStat") && sameString("enum('none','unk','incmpl','cmpl')", col->comment)) entry = "Status of CDS end annotation (none, unknown, incomplete, or complete)"; else entry = col->comment; printf("<tr><td>%s</td>", entry); // bold style now in HGStyle.css if (col->isList || col->isArray || col->lowType->stringy || asTypesIsInt(col->lowType->type)) printIdOrLinks(col, fieldToUrl, tdb, fields[ix]); else if (asTypesIsFloating(col->lowType->type)) { double valDouble = strtod(fields[ix],NULL); if (errno == 0 && valDouble != 0) printf("<td>%g</td></tr>\n", valDouble); else printf("<td>%s</td></tr>\n", fields[ix]); // decided not to print error } else printf("<td>%s</td></tr>\n", fields[ix]); + printCount++; } if (skipIds) slFreeList(skipIds); if (sepFields) slFreeList(sepFields); -if (count > 0) +if (embeddedTblFields) + { + struct embeddedTbl *thisTbl; + struct dyString *tableLabelsDy = dyStringNew(0); + for (thisTbl = embeddedTblList; thisTbl != NULL; thisTbl = thisTbl->next) + { + if (thisTbl->encodedTbl) + { + printEmbeddedTable(tdb, thisTbl, tableLabelsDy); + } + } + jsInline(dyStringCannibalize(&tableLabelsDy)); + } + +if (printCount > 0) printf("</table>\n"); + if (detailsTableFields) { - printf("<br>\n"); printExtraDetailsTable(tdb->track, extraDetailsTableName, extraDetails, extraTblStr); } -return count; +return printCount; } int extraFieldsPrint(struct trackDb *tdb,struct sqlResult *sr,char **fields,int fieldCount) // Any extra bed or bigBed fields (defined in as and occurring after N in bed N + types. // sr may be null for bigBeds. // Returns number of extra fields actually printed. { struct asObject *as = asForDb(tdb, database); if (as == NULL) return 0; int ret = extraFieldsPrintAs(tdb, sr, fields,fieldCount, as); //asObjectFree(&as); return ret;