d3752edc12da1bf08427946150f564dbdd5d2254 angie Thu Oct 24 13:55:51 2019 -0700 bigDbSnp track handler code - initial commit. refs #23283 * dnautil: Added trimRefAltLeft to get left-justified trimming (a la VCF not HGVS). * bigBedClick: do hReplaceGbdb up front in parseDetailsTablUrls instead of waiting until endpoint. * trackDbCustom.c: consolidating type-handling for wig/bigWig vs. bigBed-based big*. diff --git src/hg/hgc/bigDbSnpClick.c src/hg/hgc/bigDbSnpClick.c new file mode 100644 index 0000000..88cafe8 --- /dev/null +++ src/hg/hgc/bigDbSnpClick.c @@ -0,0 +1,451 @@ +/* Show details for bigDbSnp track items. */ + +/* Copyright (C) 2019 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#include "common.h" +#include "hgc.h" +#include "bigDbSnp.h" +#include "dbSnpDetails.h" +#include "bPlusTree.h" +#include "htslib/bgzf.h" +#include "soTerm.h" + +static char *readMaybeBgzip(char *fileOrUrl, bits64 offset, bits64 len) +/* If fileOrUrl is bgzip-compressed and indexed, then use htslib's bgzf functions to + * retrieve uncompressed data from offset; otherwise (plain text) use udc. */ +{ +char *line = needMem(len+1); +if (endsWith(fileOrUrl, ".gz")) + { + BGZF *fp = bgzf_open(fileOrUrl, "r"); + if (bgzf_index_load(fp, fileOrUrl, ".gzi") < 0) + errAbort("bgzf_index_load failed to load .gzi index for %s", fileOrUrl); + if (bgzf_useek(fp, offset, SEEK_SET) < 0) + errAbort("bgzf_useek failed to seek to uncompressed offset %lld in %s", offset, fileOrUrl); + bits64 count = bgzf_read(fp, line, len); + if (count != len) + errAbort("bgzf_read failed to read %lld bytes at uncompressed offset %lld in %s, got %lld", + len, offset, fileOrUrl, count); + bgzf_close(fp); + } +else + { + struct udcFile *udcF = udcFileOpen(fileOrUrl, NULL); + udcSeek(udcF, offset); + bits64 count = udcRead(udcF, line, len); + if (count != len) + errAbort("expected %Ld bytes at offset %Ld in %s, got %Ld. ", + len, offset, fileOrUrl, count); + udcFileClose(&udcF); + } +return line; +} + +static struct dbSnpDetails *getDetails(struct bigDbSnp *bds, char *detailsFileOrUrl) +/* Seek to the offset for this variant in detailsFileOrUrl, read the line and load as + * struct dbSnpDetails. */ +{ +bits64 offset = bds->_dataOffset; +bits64 len = bds->_dataLen; +char *line = readMaybeBgzip(detailsFileOrUrl, offset, len); +// Newline must be trimmed or else it messes up parsing of final column if empty! +if (line[len-1] == '\n') + line[len-1] = '\0'; +char *row[DBSNPDETAILS_NUM_COLS+1]; +int wordCount = chopTabs(line, row); +if (wordCount != DBSNPDETAILS_NUM_COLS) + errAbort("dbSnpDetails: expected %d tab-separated words at offset %Ld in %s, got %d", + DBSNPDETAILS_NUM_COLS, offset, detailsFileOrUrl, wordCount); +return dbSnpDetailsLoad(row); +} + +struct slName *getFreqSourceOrder(struct trackDb *tdb, char *rsId, int expectedCount) +/* If tdb has freqSourceOrder*/ +{ +struct slName *sourceList = NULL; +char *sourceNames = trackDbSetting(tdb, "freqSourceOrder"); +if (sourceNames) + { + sourceList = slNameListFromComma(sourceNames); + int settingCount = slCount(sourceList); + if (settingCount != expectedCount) + errAbort("bigDbSnp freqSourceCount for %s is %d, " + "but trackDb setting freqSourceOrder lists %d sources", + rsId, expectedCount, settingCount); + } +return sourceList; +} + +INLINE boolean freqSourceHasData(struct dbSnpDetails *details, int sourceIx) +/* Return TRUE if freqSource sourceIx has any data for this variant. */ +{ +return details->alleleTotals[sourceIx] > 0; +} + +struct alCounts + { + int obsCount; // Number of chromosomes on which allele was observed by a project + int totalCount; // Number of chromosomes on which project observed some allele of this variant + }; + +static struct hash *makePerAlleleCounts(struct dbSnpDetails *details, boolean isRc) +/* Return a hash of allele to array of alCounts (each source's frequency count data). */ +{ +struct hash *perAlleleCounts = hashNew(0); +int sIx; +for (sIx = 0; sIx < details->freqSourceCount; sIx++) + { + if (freqSourceHasData(details, sIx)) + { + int totalCount = details->alleleTotals[sIx]; + char *blob = cloneString(details->alleleCounts[sIx]); + struct slName *pab, *perAlleleBlobs = slNameListFromString(blob, '|'); + for (pab = perAlleleBlobs; pab != NULL; pab = pab->next) + { + char *words[3]; + int wordCount = chopByChar(pab->name, ':', words, ArraySize(words)); + if (wordCount != 2) + errAbort("Malformed allele:count in |-separated '%s': " + "expecting two :-separated words but got %d", pab->name, wordCount); + char *allele = words[0]; + if (isRc) + reverseComplement(allele, strlen(allele)); + int obsCount = atoi(words[1]); + struct alCounts *alArray = hashFindVal(perAlleleCounts, allele); + if (alArray == NULL) + { + AllocArray(alArray, details->freqSourceCount); + hashAdd(perAlleleCounts, allele, alArray); + } + alArray[sIx].obsCount = obsCount; + alArray[sIx].totalCount = totalCount; + } + } + } +return perAlleleCounts; +} + +static void printAlleleRow(struct hash *perAlleleCounts, char *allele, + struct dbSnpDetails *details, char **perSourceMajorAl) +/* Print the allele and its counts/freqs from each freqSource for allele, if any. */ +{ +puts("<tr>"); +char buf[1024]; +char *abbrevAl = isEmpty(allele) ? "-" : bigDbSnpAbbrevAllele(allele, buf, sizeof buf); +printf("<td><span class='inlineCode'>%s</span></td>", abbrevAl); +struct alCounts *alleleCounts = hashFindVal(perAlleleCounts, allele); +int sIx; +for (sIx = 0; sIx < details->freqSourceCount; sIx++) + { + if (freqSourceHasData(details, sIx)) + { + struct alCounts *counts = NULL; + if (alleleCounts) + counts = &alleleCounts[sIx]; + if (counts != NULL && counts->totalCount != 0) + { + boolean isMajor = sameString(perSourceMajorAl[sIx], allele); + printf("<td>"); + if (isMajor) + printf("<b>"); + printf("%d/%d (%f)", counts->obsCount, counts->totalCount, + ((double)counts->obsCount / (double)counts->totalCount)); + if (isMajor) + printf("</b>"); + } + else + printf("<td>-</td>"); + } + } +puts("</tr>"); +} + +static void printAlleleCountsAndFreqs(struct bigDbSnp *bds, struct dbSnpDetails *details, + struct trackDb *tdb) +/* Print a table row containing a table of alleles, counts & frequencies if applicable. */ +{ +// Allele counts & frequencies +if (bds->freqSourceCount > 0) + { + puts("<tr><td colspan=2><b>Allele frequency counts:</b></td></tr>"); + puts("<tr><td colspan=2><table class='stdTbl'>"); + struct slName *sourceList = getFreqSourceOrder(tdb, bds->name, bds->freqSourceCount); + int sIx; + if (sourceList) + { + puts("<tr><th>Allele</th>"); + struct slName *source = sourceList; + for (sIx = 0; sIx < bds->freqSourceCount; sIx++, source = source->next) + if (freqSourceHasData(details, sIx)) + printf("<th>%s</th> ", source->name); + puts("</tr>"); + } + boolean isRc = (stringIn(bdsRevStrand, bds->ucscNotes) != NULL); + struct hash *perAlleleCounts = makePerAlleleCounts(details, isRc); + // Row for reference allele + printAlleleRow(perAlleleCounts, bds->ref, details, bds->majorAllele); + hashRemove(perAlleleCounts, bds->ref); + // Rows for alternate alleles + int aIx; + for (aIx = 0; aIx < bds->altCount; aIx++) + { + char *alt = bds->alts[aIx]; + printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele); + hashRemove(perAlleleCounts, alt); + } + // Rows for frequency alleles not included in ref/alt, if any + struct hashEl *hel, *helList = hashElListHash(perAlleleCounts); + for (hel = helList; hel != NULL; hel = hel->next) + { + char *alt = hel->name; + printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele); + } + puts("</table>"); + puts("</td></tr>"); + } +} + +static void printOneSoTerm(struct dbSnpDetails *details, int ix) +/* Print a SO term with link to MISO browser for details->soTerms[ix]. */ +{ +printf("%s", soTermToMisoLink((enum soTerm)details->soTerms[ix])); +} + +static void printOneClinVar(struct dbSnpDetails *details, int ix) +/* Print a link to ClinVar for details->clinVar{Accs,Sigs}[ix]. */ +{ +printf("<a href='https://www.ncbi.nlm.nih.gov/clinvar/%s' target=_blank>%s</a> (%s)", + details->clinVarAccs[ix], details->clinVarAccs[ix], details->clinVarSigs[ix]); +} + +static void printOneSubmitter(struct dbSnpDetails *details, int ix) +/* Print a link to dbSNP submitter page for details->submitters[ix]. */ +{ +printf("<a href=\"https://www.ncbi.nlm.nih.gov/SNP/snp_viewTable.cgi?h=%s\" target=_blank>" + "%s</a>", details->submitters[ix], details->submitters[ix]); +} + +static void printOnePub(struct dbSnpDetails *details, int ix) +/* Print a link to Pubmed for details->pubMedIds[ix]. */ +{ +printf("<a href=\"https://www.ncbi.nlm.nih.gov/pubmed/%d\" target=_blank>PMID%d</a>\n", + details->pubMedIds[ix], details->pubMedIds[ix]); +} + +static void printDetailsRow(struct dbSnpDetails *details, char *label, int itemCount, + void (*printOneValue)(struct dbSnpDetails *details, int ix)) +/* Print out one row of a details table using callback to print comma-sep list of values. */ +{ +if (itemCount > 0) + { + printf("<tr><td><b>%s:</b></td><td>", label); + int ix; + for (ix = 0; ix < itemCount; ix++) + { + if (ix != 0) + puts(", "); + printOneValue(details, ix); + } + puts("</td></tr>"); + } +} + +static void printDbSnpDetails(struct bigDbSnp *bds, struct dbSnpDetails *details, + struct trackDb *tdb) +/* Print out extras from dbSnpDetails file. */ +{ +printAlleleCountsAndFreqs(bds, details, tdb); +printDetailsRow(details, "Functional effects", details->soTermCount, printOneSoTerm); +printDetailsRow(details, "ClinVar", details->clinVarCount, printOneClinVar); +printDetailsRow(details, "Submitted by", details->submitterCount, printOneSubmitter); +printDetailsRow(details, "Publications in PubMed", details->pubMedIdCount, printOnePub); +} + +static void printUcscNotes(char *ucscNotes) +/* Print explanations of ucscNotes items. */ +{ +if (isNotEmpty(ucscNotes)) + { + puts("<p>Interesting or anomalous conditions noted by UCSC:</b><br>"); + puts("<ul>"); + boolean isCommonAll = (stringIn(bdsCommonAll, ucscNotes) != NULL); + struct slName *note, *noteList = slNameListFromComma(ucscNotes); + for (note = noteList; note != NULL; note = note->next) + { + // When commonAll is true, commonSome is also true but not informative, + // so skip commonSome if commonAll is true. + if (! (isCommonAll && sameString(note->name, bdsCommonSome))) + { + char *desc = bigDbSnpDescribeUcscNote(note->name); + printf("<li>%s\n", desc ? desc : note->name); + } + } + puts("</ul>"); + } +} + +static char *getMinRep(char *ref, char *alt, boolean leftJustify) +/* If ref and alt can be trimmed down to a shorter representation then return that, othw NULL. */ +{ +char *minRep = NULL; +int refLen = strlen(ref); +int altLen = strlen(alt); +if ((refLen > 1 && altLen > 0) || (altLen > 1 && refLen > 0)) + { + char *refCpy = cloneStringZ(ref, refLen); + char *altCpy = cloneStringZ(alt, altLen); + uint start=0, end=refLen; + int refCpyLen = refLen, altCpyLen = altLen; + if (leftJustify) + trimRefAltLeft(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen); + else + trimRefAlt(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen); + if (refCpyLen < refLen) + { + struct dyString *dy = dyStringNew(0); + char bufRef[1024], bufAlt[1024]; + if (refCpyLen == 0) + dyStringPrintf(dy, "ins%s", + bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt)); + else if (altCpyLen == 0) + dyStringPrintf(dy, "del%s", + bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef)); + else + dyStringPrintf(dy, "del%sins%s", + bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef), + bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt)); + minRep = dyStringCannibalize(&dy); + } + } +return minRep; +} + +static void printOtherMappings(struct bbiFile *bbi, struct bigDbSnp *bds, struct trackDb *tdb) +/* If the variant maps to other genomic sequences (alts/fixes/PAR), link to those positions. */ +{ +int fieldIx; +struct bptFile *bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx); +struct lm *lm = lmInit(0); +struct bigBedInterval *bbList = bigBedNameQuery(bbi, bpt, fieldIx, bds->name, lm); +if (slCount(bbList) > 1) + { + puts("<p><b>This variant maps to additional locations:</b><br><br>"); + char chromName[bbi->chromBpt->keySize+1]; + int lastChromId = -1; + struct bigBedInterval *bb; + for (bb = bbList; bb != NULL; bb = bb->next) + { + if (!startsWithWord(bds->name, bb->rest)) + errAbort("Error: bigBedNameQuery search for name '%s' yielded '%s'", + bds->name, bb->rest); + bbiCachedChromLookup(bbi, bb->chromId, lastChromId, chromName, sizeof(chromName)); + char startBuf[16], endBuf[16]; + char *row[BIGDBSNP_NUM_COLS]; + int bbFieldCount = bigBedIntervalToRow(bb, chromName, startBuf, endBuf, row, ArraySize(row)); + if (bbFieldCount != BIGDBSNP_NUM_COLS) + errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, + bbFieldCount); + struct bigDbSnp *otherBds = bigDbSnpLoad(row); + if (differentString(bds->chrom, otherBds->chrom) || + bds->chromStart != otherBds->chromStart || + bds->chromEnd != otherBds->chromEnd) + { + bedPrintPos((struct bed *)otherBds, 3, tdb); + if (bb->next != NULL) + puts("<br>"); + } + } + puts("</p>"); + } +bptFileDetach(&bpt); +lmCleanup(&lm); +} + +void doBigDbSnp(struct trackDb *tdb, char *rsId) +/* Show details for bigDbSnp item. */ +{ +int start = cartInt(cart, "o"); +int end = cartInt(cart, "t"); +char *fileOrUrl = trackDbSetting(tdb, "bigDataUrl"); +if (isEmpty(fileOrUrl)) + errAbort("bigDbSnpClick: trackDb is missing bigDataUrl setting"); +struct bbiFile *bbi = bigBedFileOpen(fileOrUrl); +boolean found = FALSE; +char *chrom = cartString(cart, "c"); +int ivStart = start, ivEnd = end; +if (start == end) + { + // item is an insertion; expand the search range from 0 bases to 2 so we catch it: + ivStart = max(0, start-1); + ivEnd++; + } +struct lm *lm = lmInit(0); +struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, ivStart, ivEnd, 0, lm); +struct bigBedInterval *bb; +for (bb = bbList; bb != NULL; bb = bb->next) + { + if (!startsWithWord(rsId, bb->rest)) + continue; + found = TRUE; + char startBuf[16], endBuf[16]; + char *row[BIGDBSNP_NUM_COLS]; + int bbFieldCount = bigBedIntervalToRow(bb, chrom, startBuf, endBuf, row, ArraySize(row)); + if (bbFieldCount != BIGDBSNP_NUM_COLS) + errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount); + struct bigDbSnp *bds = bigDbSnpLoad(row); + struct dbSnpDetails *details = NULL; + struct slPair *detailsUrls = parseDetailsTablUrls(tdb); + if (detailsUrls) + details = getDetails(bds, detailsUrls->val); + bedPrintPos((struct bed *)bds, 3, tdb); + puts("<br>"); + puts("<table>"); + char *ref = bds->ref; + char abbrev[1024]; + char *abbrevRef = isEmpty(ref) ? "-" : bigDbSnpAbbrevAllele(ref, abbrev, sizeof abbrev); + printf("<tr><td><b>Reference allele:</b></td><td><span class='inlineCode'>%s</span><td></tr>\n", + abbrevRef); + printf("<tr><td"); + if (bds->altCount > 1) + printf(" style='vertical-align:top'"); + printf("><b>Alternate allele%s:</b></td><td>", (bds->altCount > 1 ? "s" : "")); + if (bds->altCount == 0) + printf("<em>none</em>"); + else + { + int i; + for (i = 0; i < bds->altCount; i++) + { + char *alt = bds->alts[i]; + char *abbrevAlt = isEmpty(alt) ? "-" : bigDbSnpAbbrevAllele(alt, abbrev, sizeof abbrev); + printf("%s<span class='inlineCode'>%s", (i > 0 ? ",<br>\n" : ""), abbrevAlt); + char *minRepLeft = getMinRep(ref, alt, TRUE); + if (minRepLeft) + { + char *minRepRight = getMinRep(ref, alt, FALSE); + if (sameString(minRepLeft, minRepRight)) + printf(" [%s]", minRepLeft); + else + printf(" [%s (left-shifted), %s (right-shifted)]", minRepLeft, minRepRight); + } + printf("</span>"); + } + puts("</td></tr>"); + } + if (bds->shiftBases > 0) + printf("<td><b>Uncertainty in indel placement:</b></td><td>%d base%s</td></tr>\n", + bds->shiftBases, (bds->shiftBases > 1 ? "s" : "")); + if (details) + printDbSnpDetails(bds, details, tdb); + printf("<tr><td><b>Variation class/type:</b></td><td>%s</td></tr>\n", + bigDbSnpClassToString(bds->class)); + puts("</table>"); + printUcscNotes(bds->ucscNotes); + printOtherMappings(bbi, bds, tdb); + } +if (!found) + printf("No item %s starting at %s:%d\n", rsId, chrom, start+1); +lmCleanup(&lm); +bbiFileClose(&bbi); +}