46169b41deecd48121198e1911c41dc0a3f96b47 chmalee Tue Jan 19 18:12:04 2021 -0800 Allow variable size data tables on hgc. Allow these tables to be JSON or pipe and semi-colon encoded. Add more support for external data references in bigBeds: allow relevant trackDb settings like skipEmptyFields, allow variable size tables in external files, allow gzip compressed external files. diff --git src/hg/hgc/bigDbSnpClick.c src/hg/hgc/bigDbSnpClick.c index 88d3435..dc736e1 100644 --- src/hg/hgc/bigDbSnpClick.c +++ src/hg/hgc/bigDbSnpClick.c @@ -1,453 +1,422 @@ /* Show details for bigDbSnp track items. */ /* Copyright (C) 2019 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hgc.h" #include "bigDbSnp.h" #include "dbSnpDetails.h" #include "bPlusTree.h" #include "htslib/bgzf.h" #include "soTerm.h" -static char *readMaybeBgzip(char *fileOrUrl, bits64 offset, bits64 len) -/* If fileOrUrl is bgzip-compressed and indexed, then use htslib's bgzf functions to - * retrieve uncompressed data from offset; otherwise (plain text) use udc. */ -{ -char *line = needMem(len+1); -if (endsWith(fileOrUrl, ".gz")) - { - BGZF *fp = bgzf_open(fileOrUrl, "r"); - if (bgzf_index_load(fp, fileOrUrl, ".gzi") < 0) - errAbort("bgzf_index_load failed to load .gzi index for %s", fileOrUrl); - if (bgzf_useek(fp, offset, SEEK_SET) < 0) - errAbort("bgzf_useek failed to seek to uncompressed offset %lld in %s", offset, fileOrUrl); - bits64 count = bgzf_read(fp, line, len); - if (count != len) - errAbort("bgzf_read failed to read %lld bytes at uncompressed offset %lld in %s, got %lld", - len, offset, fileOrUrl, count); - bgzf_close(fp); - } -else - { - struct udcFile *udcF = udcFileOpen(fileOrUrl, NULL); - udcSeek(udcF, offset); - bits64 count = udcRead(udcF, line, len); - if (count != len) - errAbort("expected %Ld bytes at offset %Ld in %s, got %Ld. 
", - len, offset, fileOrUrl, count); - udcFileClose(&udcF); - } -return line; -} - static struct dbSnpDetails *getDetails(struct bigDbSnp *bds, char *detailsFileOrUrl) /* Seek to the offset for this variant in detailsFileOrUrl, read the line and load as * struct dbSnpDetails. */ { bits64 offset = bds->_dataOffset; bits64 len = bds->_dataLen; -char *line = readMaybeBgzip(detailsFileOrUrl, offset, len); +char *line = readOneLineMaybeBgzip(detailsFileOrUrl, offset, len); // Newline must be trimmed or else it messes up parsing of final column if empty! if (line[len-1] == '\n') line[len-1] = '\0'; char *row[DBSNPDETAILS_NUM_COLS+1]; int wordCount = chopTabs(line, row); if (wordCount != DBSNPDETAILS_NUM_COLS) errAbort("dbSnpDetails: expected %d tab-separated words at offset %Ld in %s, got %d", DBSNPDETAILS_NUM_COLS, offset, detailsFileOrUrl, wordCount); return dbSnpDetailsLoad(row); } struct slName *getFreqSourceOrder(struct trackDb *tdb, char *rsId, int expectedCount) /* If tdb has freqSourceOrder*/ { struct slName *sourceList = NULL; char *sourceNames = trackDbSetting(tdb, "freqSourceOrder"); if (sourceNames) { sourceList = slNameListFromComma(sourceNames); int settingCount = slCount(sourceList); if (settingCount != expectedCount) errAbort("bigDbSnp freqSourceCount for %s is %d, " "but trackDb setting freqSourceOrder lists %d sources", rsId, expectedCount, settingCount); } return sourceList; } INLINE boolean freqSourceHasData(struct dbSnpDetails *details, int sourceIx) /* Return TRUE if freqSource sourceIx has any data for this variant. */ { return details->alleleTotals[sourceIx] > 0; } struct alCounts { int obsCount; // Number of chromosomes on which allele was observed by a project int totalCount; // Number of chromosomes on which project observed some allele of this variant }; static struct hash *makePerAlleleCounts(struct dbSnpDetails *details, boolean isRc) /* Return a hash of allele to array of alCounts (each source's frequency count data). 
*/ { struct hash *perAlleleCounts = hashNew(0); int sIx; for (sIx = 0; sIx < details->freqSourceCount; sIx++) { if (freqSourceHasData(details, sIx)) { int totalCount = details->alleleTotals[sIx]; char *blob = cloneString(details->alleleCounts[sIx]); struct slName *pab, *perAlleleBlobs = slNameListFromString(blob, '|'); for (pab = perAlleleBlobs; pab != NULL; pab = pab->next) { char *words[3]; int wordCount = chopByChar(pab->name, ':', words, ArraySize(words)); if (wordCount != 2) errAbort("Malformed allele:count in |-separated '%s': " "expecting two :-separated words but got %d", pab->name, wordCount); char *allele = words[0]; if (isRc) reverseComplement(allele, strlen(allele)); int obsCount = atoi(words[1]); struct alCounts *alArray = hashFindVal(perAlleleCounts, allele); if (alArray == NULL) { AllocArray(alArray, details->freqSourceCount); hashAdd(perAlleleCounts, allele, alArray); } alArray[sIx].obsCount = obsCount; alArray[sIx].totalCount = totalCount; } } } return perAlleleCounts; } static void printAlleleRow(struct hash *perAlleleCounts, char *allele, struct dbSnpDetails *details, char **perSourceMajorAl) /* Print the allele and its counts/freqs from each freqSource for allele, if any. */ { puts("
Allele | "); struct slName *source = sourceList; for (sIx = 0; sIx < bds->freqSourceCount; sIx++, source = source->next) if (freqSourceHasData(details, sIx)) printf("%s | ", source->name); puts("
---|
Interesting or anomalous conditions noted by UCSC:
");
puts("
This variant maps to additional locations:
");
// Walk every bigBed interval in bbList and print the position of each one that
// is located somewhere other than bds itself.
// chromName is sized from the chrom B+ tree key size plus one byte
// (presumably room for the string terminator -- TODO confirm).
char chromName[bbi->chromBpt->keySize+1];
// NOTE(review): lastChromId is passed to bbiCachedChromLookup below but is never
// reassigned in the visible loop, so it is -1 on every call -- confirm whether
// it was meant to be updated to bb->chromId after each lookup.
int lastChromId = -1;
struct bigBedInterval *bb;
for (bb = bbList; bb != NULL; bb = bb->next)
{
// Sanity check: the rest-of-line text of each interval must begin with the
// name of the variant being displayed; otherwise abort with an error.
if (!startsWithWord(bds->name, bb->rest))
errAbort("Error: bigBedNameQuery search for name '%s' yielded '%s'",
bds->name, bb->rest);
// Fill chromName for this interval from its numeric chromId.
bbiCachedChromLookup(bbi, bb->chromId, lastChromId, chromName, sizeof(chromName));
// Expand the interval into an array of column strings; startBuf and endBuf are
// scratch buffers handed to bigBedIntervalToRow along with chromName.
char startBuf[16], endBuf[16];
char *row[BIGDBSNP_NUM_COLS];
int bbFieldCount = bigBedIntervalToRow(bb, chromName, startBuf, endBuf, row, ArraySize(row));
// The row must have exactly the bigDbSnp column count before it is loaded.
if (bbFieldCount != BIGDBSNP_NUM_COLS)
errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS,
bbFieldCount);
// NOTE(review): otherBds is not released anywhere in the visible code.
struct bigDbSnp *otherBds = bigDbSnpLoad(row);
// Only report mappings whose chrom, start or end differs from bds.
if (differentString(bds->chrom, otherBds->chrom) ||
bds->chromStart != otherBds->chromStart ||
bds->chromEnd != otherBds->chromEnd)
{
// The cast treats otherBds as a struct bed; 3 is passed as the second argument
// (presumably the number of bed fields to use -- TODO confirm).
bedPrintPos((struct bed *)otherBds, 3, tdb);
// A separator is written only when another interval follows in the list.
// NOTE(review): the literal passed to puts below is split across a line break
// in this copy; its original contents cannot be determined from here.
if (bb->next != NULL)
puts("
");
}
}
puts("
Reference allele: | %s | |
altCount > 1) printf(" style='vertical-align:top'"); printf(">Alternate allele%s: | ", (bds->altCount > 1 ? "s" : ""));
if (bds->altCount == 0)
printf("none");
else
{
int i;
for (i = 0; i < bds->altCount; i++)
{
char *alt = bds->alts[i];
char *abbrevAlt = isEmpty(alt) ? "-" : bigDbSnpAbbrevAllele(alt, abbrev, sizeof abbrev);
printf("%s%s", (i > 0 ? ", \n" : ""), abbrevAlt); char *minRepLeft = getMinRep(ref, alt, TRUE); if (minRepLeft) { char *minRepRight = getMinRep(ref, alt, FALSE); if (sameString(minRepLeft, minRepRight)) printf(" [%s]", minRepLeft); else printf(" [%s (left-shifted), %s (right-shifted)]", minRepLeft, minRepRight); } printf(""); } puts(" | Uncertainty in indel placement: | %d base%s | \n", bds->shiftBases, (bds->shiftBases > 1 ? "s" : "")); if (details) printDbSnpDetails(bds, details, tdb); printf("
Variation class/type: | %s |