d3752edc12da1bf08427946150f564dbdd5d2254
angie
Thu Oct 24 13:55:51 2019 -0700
bigDbSnp track handler code - initial commit. refs #23283
* dnautil: Added trimRefAltLeft to get left-justified trimming (a la VCF not HGVS).
* bigBedClick: do hReplaceGbdb up front in parseDetailsTablUrls instead of waiting until endpoint.
* trackDbCustom.c: consolidating type-handling for wig/bigWig vs. bigBed-based big*.
diff --git src/hg/hgc/bigDbSnpClick.c src/hg/hgc/bigDbSnpClick.c
new file mode 100644
index 0000000..88cafe8
--- /dev/null
+++ src/hg/hgc/bigDbSnpClick.c
@@ -0,0 +1,451 @@
+/* Show details for bigDbSnp track items. */
+
+/* Copyright (C) 2019 The Regents of the University of California
+ * See README in this or parent directory for licensing information. */
+
+#include "common.h"
+#include "hgc.h"
+#include "bigDbSnp.h"
+#include "dbSnpDetails.h"
+#include "bPlusTree.h"
+#include "htslib/bgzf.h"
+#include "soTerm.h"
+
+static char *readMaybeBgzip(char *fileOrUrl, bits64 offset, bits64 len)
+/* If fileOrUrl is bgzip-compressed and indexed, then use htslib's bgzf functions to
+ * retrieve uncompressed data from offset; otherwise (plain text) use udc. */
+{
+char *line = needMem(len+1);
+if (endsWith(fileOrUrl, ".gz"))
+ {
+ BGZF *fp = bgzf_open(fileOrUrl, "r");
+ if (bgzf_index_load(fp, fileOrUrl, ".gzi") < 0)
+ errAbort("bgzf_index_load failed to load .gzi index for %s", fileOrUrl);
+ if (bgzf_useek(fp, offset, SEEK_SET) < 0)
+ errAbort("bgzf_useek failed to seek to uncompressed offset %lld in %s", offset, fileOrUrl);
+ bits64 count = bgzf_read(fp, line, len);
+ if (count != len)
+ errAbort("bgzf_read failed to read %lld bytes at uncompressed offset %lld in %s, got %lld",
+ len, offset, fileOrUrl, count);
+ bgzf_close(fp);
+ }
+else
+ {
+ struct udcFile *udcF = udcFileOpen(fileOrUrl, NULL);
+ udcSeek(udcF, offset);
+ bits64 count = udcRead(udcF, line, len);
+ if (count != len)
+ errAbort("expected %Ld bytes at offset %Ld in %s, got %Ld. ",
+ len, offset, fileOrUrl, count);
+ udcFileClose(&udcF);
+ }
+return line;
+}
+
+static struct dbSnpDetails *getDetails(struct bigDbSnp *bds, char *detailsFileOrUrl)
+/* Seek to the offset for this variant in detailsFileOrUrl, read the line and load as
+ * struct dbSnpDetails. */
+{
+bits64 offset = bds->_dataOffset;
+bits64 len = bds->_dataLen;
+char *line = readMaybeBgzip(detailsFileOrUrl, offset, len);
+// Newline must be trimmed or else it messes up parsing of final column if empty!
+if (line[len-1] == '\n')
+ line[len-1] = '\0';
+char *row[DBSNPDETAILS_NUM_COLS+1];
+int wordCount = chopTabs(line, row);
+if (wordCount != DBSNPDETAILS_NUM_COLS)
+ errAbort("dbSnpDetails: expected %d tab-separated words at offset %Ld in %s, got %d",
+ DBSNPDETAILS_NUM_COLS, offset, detailsFileOrUrl, wordCount);
+return dbSnpDetailsLoad(row);
+}
+
+struct slName *getFreqSourceOrder(struct trackDb *tdb, char *rsId, int expectedCount)
+/* If tdb has freqSourceOrder*/
+{
+struct slName *sourceList = NULL;
+char *sourceNames = trackDbSetting(tdb, "freqSourceOrder");
+if (sourceNames)
+ {
+ sourceList = slNameListFromComma(sourceNames);
+ int settingCount = slCount(sourceList);
+ if (settingCount != expectedCount)
+ errAbort("bigDbSnp freqSourceCount for %s is %d, "
+ "but trackDb setting freqSourceOrder lists %d sources",
+ rsId, expectedCount, settingCount);
+ }
+return sourceList;
+}
+
+INLINE boolean freqSourceHasData(struct dbSnpDetails *details, int sourceIx)
+/* Return TRUE if freqSource sourceIx has any data for this variant. */
+{
+return details->alleleTotals[sourceIx] > 0;
+}
+
+struct alCounts
+ {
+ int obsCount; // Number of chromosomes on which allele was observed by a project
+ int totalCount; // Number of chromosomes on which project observed some allele of this variant
+ };
+
+static struct hash *makePerAlleleCounts(struct dbSnpDetails *details, boolean isRc)
+/* Return a hash of allele to array of alCounts (each source's frequency count data). */
+{
+struct hash *perAlleleCounts = hashNew(0);
+int sIx;
+for (sIx = 0; sIx < details->freqSourceCount; sIx++)
+ {
+ if (freqSourceHasData(details, sIx))
+ {
+ int totalCount = details->alleleTotals[sIx];
+ char *blob = cloneString(details->alleleCounts[sIx]);
+ struct slName *pab, *perAlleleBlobs = slNameListFromString(blob, '|');
+ for (pab = perAlleleBlobs; pab != NULL; pab = pab->next)
+ {
+ char *words[3];
+ int wordCount = chopByChar(pab->name, ':', words, ArraySize(words));
+ if (wordCount != 2)
+ errAbort("Malformed allele:count in |-separated '%s': "
+ "expecting two :-separated words but got %d", pab->name, wordCount);
+ char *allele = words[0];
+ if (isRc)
+ reverseComplement(allele, strlen(allele));
+ int obsCount = atoi(words[1]);
+ struct alCounts *alArray = hashFindVal(perAlleleCounts, allele);
+ if (alArray == NULL)
+ {
+ AllocArray(alArray, details->freqSourceCount);
+ hashAdd(perAlleleCounts, allele, alArray);
+ }
+ alArray[sIx].obsCount = obsCount;
+ alArray[sIx].totalCount = totalCount;
+ }
+ }
+ }
+return perAlleleCounts;
+}
+
+static void printAlleleRow(struct hash *perAlleleCounts, char *allele,
+ struct dbSnpDetails *details, char **perSourceMajorAl)
+/* Print the allele and its counts/freqs from each freqSource for allele, if any. */
+{
+puts("
");
+char buf[1024];
+char *abbrevAl = isEmpty(allele) ? "-" : bigDbSnpAbbrevAllele(allele, buf, sizeof buf);
+printf("%s | ", abbrevAl);
+struct alCounts *alleleCounts = hashFindVal(perAlleleCounts, allele);
+int sIx;
+for (sIx = 0; sIx < details->freqSourceCount; sIx++)
+ {
+ if (freqSourceHasData(details, sIx))
+ {
+ struct alCounts *counts = NULL;
+ if (alleleCounts)
+ counts = &alleleCounts[sIx];
+ if (counts != NULL && counts->totalCount != 0)
+ {
+ boolean isMajor = sameString(perSourceMajorAl[sIx], allele);
+ printf("");
+ if (isMajor)
+ printf("");
+ printf("%d/%d (%f)", counts->obsCount, counts->totalCount,
+ ((double)counts->obsCount / (double)counts->totalCount));
+ if (isMajor)
+ printf("");
+ }
+ else
+ printf(" | - | ");
+ }
+ }
+puts("
");
+}
+
+static void printAlleleCountsAndFreqs(struct bigDbSnp *bds, struct dbSnpDetails *details,
+ struct trackDb *tdb)
+/* Print a table row containing a table of alleles, counts & frequencies if applicable. */
+{
+// Allele counts & frequencies
+if (bds->freqSourceCount > 0)
+ {
+ puts("Allele frequency counts: |
");
+ puts("");
+ struct slName *sourceList = getFreqSourceOrder(tdb, bds->name, bds->freqSourceCount);
+ int sIx;
+ if (sourceList)
+ {
+ puts("Allele | ");
+ struct slName *source = sourceList;
+ for (sIx = 0; sIx < bds->freqSourceCount; sIx++, source = source->next)
+ if (freqSourceHasData(details, sIx))
+ printf("%s | ", source->name);
+ puts(" ");
+ }
+ boolean isRc = (stringIn(bdsRevStrand, bds->ucscNotes) != NULL);
+ struct hash *perAlleleCounts = makePerAlleleCounts(details, isRc);
+ // Row for reference allele
+ printAlleleRow(perAlleleCounts, bds->ref, details, bds->majorAllele);
+ hashRemove(perAlleleCounts, bds->ref);
+ // Rows for alternate alleles
+ int aIx;
+ for (aIx = 0; aIx < bds->altCount; aIx++)
+ {
+ char *alt = bds->alts[aIx];
+ printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele);
+ hashRemove(perAlleleCounts, alt);
+ }
+ // Rows for frequency alleles not included in ref/alt, if any
+ struct hashEl *hel, *helList = hashElListHash(perAlleleCounts);
+ for (hel = helList; hel != NULL; hel = hel->next)
+ {
+ char *alt = hel->name;
+ printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele);
+ }
+ puts(" ");
+ puts(" |
");
+ }
+}
+
+static void printOneSoTerm(struct dbSnpDetails *details, int ix)
+/* Print a SO term with link to MISO browser for details->soTerms[ix]. */
+{
+printf("%s", soTermToMisoLink((enum soTerm)details->soTerms[ix]));
+}
+
+static void printOneClinVar(struct dbSnpDetails *details, int ix)
+/* Print a link to ClinVar for details->clinVar{Accs,Sigs}[ix]. */
+{
+printf("%s (%s)",
+ details->clinVarAccs[ix], details->clinVarAccs[ix], details->clinVarSigs[ix]);
+}
+
+static void printOneSubmitter(struct dbSnpDetails *details, int ix)
+/* Print a link to dbSNP submitter page for details->submitters[ix]. */
+{
+printf(""
+ "%s", details->submitters[ix], details->submitters[ix]);
+}
+
+static void printOnePub(struct dbSnpDetails *details, int ix)
+/* Print a link to Pubmed for details->pubMedIds[ix]. */
+{
+printf("PMID%d\n",
+ details->pubMedIds[ix], details->pubMedIds[ix]);
+}
+
+static void printDetailsRow(struct dbSnpDetails *details, char *label, int itemCount,
+ void (*printOneValue)(struct dbSnpDetails *details, int ix))
+/* Print out one row of a details table using callback to print comma-sep list of values. */
+{
+if (itemCount > 0)
+ {
+ printf("%s: | ", label);
+ int ix;
+ for (ix = 0; ix < itemCount; ix++)
+ {
+ if (ix != 0)
+ puts(", ");
+ printOneValue(details, ix);
+ }
+ puts(" |
");
+ }
+}
+
+static void printDbSnpDetails(struct bigDbSnp *bds, struct dbSnpDetails *details,
+ struct trackDb *tdb)
+/* Print out extras from dbSnpDetails file. */
+{
+printAlleleCountsAndFreqs(bds, details, tdb);
+printDetailsRow(details, "Functional effects", details->soTermCount, printOneSoTerm);
+printDetailsRow(details, "ClinVar", details->clinVarCount, printOneClinVar);
+printDetailsRow(details, "Submitted by", details->submitterCount, printOneSubmitter);
+printDetailsRow(details, "Publications in PubMed", details->pubMedIdCount, printOnePub);
+}
+
+static void printUcscNotes(char *ucscNotes)
+/* Print explanations of ucscNotes items. */
+{
+if (isNotEmpty(ucscNotes))
+ {
+ puts("Interesting or anomalous conditions noted by UCSC:
");
+ puts("
");
+ boolean isCommonAll = (stringIn(bdsCommonAll, ucscNotes) != NULL);
+ struct slName *note, *noteList = slNameListFromComma(ucscNotes);
+ for (note = noteList; note != NULL; note = note->next)
+ {
+ // When commonAll is true, commonSome is also true but not informative,
+ // so skip commonSome if commonAll is true.
+ if (! (isCommonAll && sameString(note->name, bdsCommonSome)))
+ {
+ char *desc = bigDbSnpDescribeUcscNote(note->name);
+ printf("- %s\n", desc ? desc : note->name);
+ }
+ }
+ puts("
");
+ }
+}
+
+static char *getMinRep(char *ref, char *alt, boolean leftJustify)
+/* If ref and alt can be trimmed down to a shorter representation then return that, othw NULL. */
+{
+char *minRep = NULL;
+int refLen = strlen(ref);
+int altLen = strlen(alt);
+if ((refLen > 1 && altLen > 0) || (altLen > 1 && refLen > 0))
+ {
+ char *refCpy = cloneStringZ(ref, refLen);
+ char *altCpy = cloneStringZ(alt, altLen);
+ uint start=0, end=refLen;
+ int refCpyLen = refLen, altCpyLen = altLen;
+ if (leftJustify)
+ trimRefAltLeft(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen);
+ else
+ trimRefAlt(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen);
+ if (refCpyLen < refLen)
+ {
+ struct dyString *dy = dyStringNew(0);
+ char bufRef[1024], bufAlt[1024];
+ if (refCpyLen == 0)
+ dyStringPrintf(dy, "ins%s",
+ bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt));
+ else if (altCpyLen == 0)
+ dyStringPrintf(dy, "del%s",
+ bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef));
+ else
+ dyStringPrintf(dy, "del%sins%s",
+ bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef),
+ bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt));
+ minRep = dyStringCannibalize(&dy);
+ }
+ }
+return minRep;
+}
+
+static void printOtherMappings(struct bbiFile *bbi, struct bigDbSnp *bds, struct trackDb *tdb)
+/* If the variant maps to other genomic sequences (alts/fixes/PAR), link to those positions. */
+{
+int fieldIx;
+struct bptFile *bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx);
+struct lm *lm = lmInit(0);
+struct bigBedInterval *bbList = bigBedNameQuery(bbi, bpt, fieldIx, bds->name, lm);
+if (slCount(bbList) > 1)
+ {
+ puts("This variant maps to additional locations:
");
+ char chromName[bbi->chromBpt->keySize+1];
+ int lastChromId = -1;
+ struct bigBedInterval *bb;
+ for (bb = bbList; bb != NULL; bb = bb->next)
+ {
+ if (!startsWithWord(bds->name, bb->rest))
+ errAbort("Error: bigBedNameQuery search for name '%s' yielded '%s'",
+ bds->name, bb->rest);
+ bbiCachedChromLookup(bbi, bb->chromId, lastChromId, chromName, sizeof(chromName));
+ char startBuf[16], endBuf[16];
+ char *row[BIGDBSNP_NUM_COLS];
+ int bbFieldCount = bigBedIntervalToRow(bb, chromName, startBuf, endBuf, row, ArraySize(row));
+ if (bbFieldCount != BIGDBSNP_NUM_COLS)
+ errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS,
+ bbFieldCount);
+ struct bigDbSnp *otherBds = bigDbSnpLoad(row);
+ if (differentString(bds->chrom, otherBds->chrom) ||
+ bds->chromStart != otherBds->chromStart ||
+ bds->chromEnd != otherBds->chromEnd)
+ {
+ bedPrintPos((struct bed *)otherBds, 3, tdb);
+ if (bb->next != NULL)
+ puts("
");
+ }
+ }
+ puts("
");
+ }
+bptFileDetach(&bpt);
+lmCleanup(&lm);
+}
+
+void doBigDbSnp(struct trackDb *tdb, char *rsId)
+/* Show details for bigDbSnp item. */
+{
+int start = cartInt(cart, "o");
+int end = cartInt(cart, "t");
+char *fileOrUrl = trackDbSetting(tdb, "bigDataUrl");
+if (isEmpty(fileOrUrl))
+ errAbort("bigDbSnpClick: trackDb is missing bigDataUrl setting");
+struct bbiFile *bbi = bigBedFileOpen(fileOrUrl);
+boolean found = FALSE;
+char *chrom = cartString(cart, "c");
+int ivStart = start, ivEnd = end;
+if (start == end)
+ {
+ // item is an insertion; expand the search range from 0 bases to 2 so we catch it:
+ ivStart = max(0, start-1);
+ ivEnd++;
+ }
+struct lm *lm = lmInit(0);
+struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, ivStart, ivEnd, 0, lm);
+struct bigBedInterval *bb;
+for (bb = bbList; bb != NULL; bb = bb->next)
+ {
+ if (!startsWithWord(rsId, bb->rest))
+ continue;
+ found = TRUE;
+ char startBuf[16], endBuf[16];
+ char *row[BIGDBSNP_NUM_COLS];
+ int bbFieldCount = bigBedIntervalToRow(bb, chrom, startBuf, endBuf, row, ArraySize(row));
+ if (bbFieldCount != BIGDBSNP_NUM_COLS)
+ errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount);
+ struct bigDbSnp *bds = bigDbSnpLoad(row);
+ struct dbSnpDetails *details = NULL;
+ struct slPair *detailsUrls = parseDetailsTablUrls(tdb);
+ if (detailsUrls)
+ details = getDetails(bds, detailsUrls->val);
+ bedPrintPos((struct bed *)bds, 3, tdb);
+ puts("
");
+ puts("");
+ char *ref = bds->ref;
+ char abbrev[1024];
+ char *abbrevRef = isEmpty(ref) ? "-" : bigDbSnpAbbrevAllele(ref, abbrev, sizeof abbrev);
+ printf("Reference allele: | %s | |
\n",
+ abbrevRef);
+ printf("altCount > 1)
+ printf(" style='vertical-align:top'");
+ printf(">Alternate allele%s: | ", (bds->altCount > 1 ? "s" : ""));
+ if (bds->altCount == 0)
+ printf("none");
+ else
+ {
+ int i;
+ for (i = 0; i < bds->altCount; i++)
+ {
+ char *alt = bds->alts[i];
+ char *abbrevAlt = isEmpty(alt) ? "-" : bigDbSnpAbbrevAllele(alt, abbrev, sizeof abbrev);
+ printf("%s%s", (i > 0 ? ", \n" : ""), abbrevAlt);
+ char *minRepLeft = getMinRep(ref, alt, TRUE);
+ if (minRepLeft)
+ {
+ char *minRepRight = getMinRep(ref, alt, FALSE);
+ if (sameString(minRepLeft, minRepRight))
+ printf(" [%s]", minRepLeft);
+ else
+ printf(" [%s (left-shifted), %s (right-shifted)]", minRepLeft, minRepRight);
+ }
+ printf("");
+ }
+ puts(" |
");
+ }
+ if (bds->shiftBases > 0)
+ printf("Uncertainty in indel placement: | %d base%s | \n",
+ bds->shiftBases, (bds->shiftBases > 1 ? "s" : ""));
+ if (details)
+ printDbSnpDetails(bds, details, tdb);
+ printf("Variation class/type: | %s |
\n",
+ bigDbSnpClassToString(bds->class));
+ puts("
");
+ printUcscNotes(bds->ucscNotes);
+ printOtherMappings(bbi, bds, tdb);
+ }
+if (!found)
+ printf("No item %s starting at %s:%d\n", rsId, chrom, start+1);
+lmCleanup(&lm);
+bbiFileClose(&bbi);
+}