acca3deffc05c4d8d11590a1cf3d893763254712
angie
Thu Oct 31 13:43:05 2019 -0700
dbSnp153: Adding new ucscNotes suggested by Ana Benet: clinvar{Benign,Conflicting,Pathogenic}, rareAll, rareSome. refs #23283
diff --git src/hg/hgc/bigDbSnpClick.c src/hg/hgc/bigDbSnpClick.c
index 88cafe8..cde26f1 100644
--- src/hg/hgc/bigDbSnpClick.c
+++ src/hg/hgc/bigDbSnpClick.c
@@ -1,451 +1,453 @@
/* Show details for bigDbSnp track items. */
/* Copyright (C) 2019 The Regents of the University of California
* See README in this or parent directory for licensing information. */
#include "common.h"
#include "hgc.h"
#include "bigDbSnp.h"
#include "dbSnpDetails.h"
#include "bPlusTree.h"
#include "htslib/bgzf.h"
#include "soTerm.h"
static char *readMaybeBgzip(char *fileOrUrl, bits64 offset, bits64 len)
/* If fileOrUrl is bgzip-compressed and indexed, then use htslib's bgzf functions to
* retrieve uncompressed data from offset; otherwise (plain text) use udc. */
{
char *line = needMem(len+1);
if (endsWith(fileOrUrl, ".gz"))
{
BGZF *fp = bgzf_open(fileOrUrl, "r");
if (bgzf_index_load(fp, fileOrUrl, ".gzi") < 0)
errAbort("bgzf_index_load failed to load .gzi index for %s", fileOrUrl);
if (bgzf_useek(fp, offset, SEEK_SET) < 0)
errAbort("bgzf_useek failed to seek to uncompressed offset %lld in %s", offset, fileOrUrl);
bits64 count = bgzf_read(fp, line, len);
if (count != len)
errAbort("bgzf_read failed to read %lld bytes at uncompressed offset %lld in %s, got %lld",
len, offset, fileOrUrl, count);
bgzf_close(fp);
}
else
{
struct udcFile *udcF = udcFileOpen(fileOrUrl, NULL);
udcSeek(udcF, offset);
bits64 count = udcRead(udcF, line, len);
if (count != len)
errAbort("expected %Ld bytes at offset %Ld in %s, got %Ld. ",
len, offset, fileOrUrl, count);
udcFileClose(&udcF);
}
return line;
}
static struct dbSnpDetails *getDetails(struct bigDbSnp *bds, char *detailsFileOrUrl)
/* Seek to the offset for this variant in detailsFileOrUrl, read the line and load as
* struct dbSnpDetails. */
{
bits64 offset = bds->_dataOffset;
bits64 len = bds->_dataLen;
char *line = readMaybeBgzip(detailsFileOrUrl, offset, len);
// Newline must be trimmed or else it messes up parsing of final column if empty!
if (line[len-1] == '\n')
line[len-1] = '\0';
char *row[DBSNPDETAILS_NUM_COLS+1];
int wordCount = chopTabs(line, row);
if (wordCount != DBSNPDETAILS_NUM_COLS)
errAbort("dbSnpDetails: expected %d tab-separated words at offset %Ld in %s, got %d",
DBSNPDETAILS_NUM_COLS, offset, detailsFileOrUrl, wordCount);
return dbSnpDetailsLoad(row);
}
struct slName *getFreqSourceOrder(struct trackDb *tdb, char *rsId, int expectedCount)
/* If tdb has freqSourceOrder*/
{
struct slName *sourceList = NULL;
char *sourceNames = trackDbSetting(tdb, "freqSourceOrder");
if (sourceNames)
{
sourceList = slNameListFromComma(sourceNames);
int settingCount = slCount(sourceList);
if (settingCount != expectedCount)
errAbort("bigDbSnp freqSourceCount for %s is %d, "
"but trackDb setting freqSourceOrder lists %d sources",
rsId, expectedCount, settingCount);
}
return sourceList;
}
INLINE boolean freqSourceHasData(struct dbSnpDetails *details, int sourceIx)
/* Return TRUE if freqSource sourceIx has any data for this variant. */
{
return details->alleleTotals[sourceIx] > 0;
}
struct alCounts
{
int obsCount; // Number of chromosomes on which allele was observed by a project
int totalCount; // Number of chromosomes on which project observed some allele of this variant
};
static struct hash *makePerAlleleCounts(struct dbSnpDetails *details, boolean isRc)
/* Return a hash of allele to array of alCounts (each source's frequency count data). */
{
struct hash *perAlleleCounts = hashNew(0);
int sIx;
for (sIx = 0; sIx < details->freqSourceCount; sIx++)
{
if (freqSourceHasData(details, sIx))
{
int totalCount = details->alleleTotals[sIx];
char *blob = cloneString(details->alleleCounts[sIx]);
struct slName *pab, *perAlleleBlobs = slNameListFromString(blob, '|');
for (pab = perAlleleBlobs; pab != NULL; pab = pab->next)
{
char *words[3];
int wordCount = chopByChar(pab->name, ':', words, ArraySize(words));
if (wordCount != 2)
errAbort("Malformed allele:count in |-separated '%s': "
"expecting two :-separated words but got %d", pab->name, wordCount);
char *allele = words[0];
if (isRc)
reverseComplement(allele, strlen(allele));
int obsCount = atoi(words[1]);
struct alCounts *alArray = hashFindVal(perAlleleCounts, allele);
if (alArray == NULL)
{
AllocArray(alArray, details->freqSourceCount);
hashAdd(perAlleleCounts, allele, alArray);
}
alArray[sIx].obsCount = obsCount;
alArray[sIx].totalCount = totalCount;
}
}
}
return perAlleleCounts;
}
static void printAlleleRow(struct hash *perAlleleCounts, char *allele,
struct dbSnpDetails *details, char **perSourceMajorAl)
/* Print the allele and its counts/freqs from each freqSource for allele, if any. */
{
puts("
");
char buf[1024];
char *abbrevAl = isEmpty(allele) ? "-" : bigDbSnpAbbrevAllele(allele, buf, sizeof buf);
printf("%s | ", abbrevAl);
struct alCounts *alleleCounts = hashFindVal(perAlleleCounts, allele);
int sIx;
for (sIx = 0; sIx < details->freqSourceCount; sIx++)
{
if (freqSourceHasData(details, sIx))
{
struct alCounts *counts = NULL;
if (alleleCounts)
counts = &alleleCounts[sIx];
if (counts != NULL && counts->totalCount != 0)
{
boolean isMajor = sameString(perSourceMajorAl[sIx], allele);
printf("");
if (isMajor)
printf("");
printf("%d/%d (%f)", counts->obsCount, counts->totalCount,
((double)counts->obsCount / (double)counts->totalCount));
if (isMajor)
printf("");
}
else
printf(" | - | ");
}
}
puts("
");
}
static void printAlleleCountsAndFreqs(struct bigDbSnp *bds, struct dbSnpDetails *details,
struct trackDb *tdb)
/* Print a table row containing a table of alleles, counts & frequencies if applicable. */
{
// Allele counts & frequencies
if (bds->freqSourceCount > 0)
{
puts("Allele frequency counts: |
");
puts("");
struct slName *sourceList = getFreqSourceOrder(tdb, bds->name, bds->freqSourceCount);
int sIx;
if (sourceList)
{
puts("Allele | ");
struct slName *source = sourceList;
for (sIx = 0; sIx < bds->freqSourceCount; sIx++, source = source->next)
if (freqSourceHasData(details, sIx))
printf("%s | ", source->name);
puts(" ");
}
boolean isRc = (stringIn(bdsRevStrand, bds->ucscNotes) != NULL);
struct hash *perAlleleCounts = makePerAlleleCounts(details, isRc);
// Row for reference allele
printAlleleRow(perAlleleCounts, bds->ref, details, bds->majorAllele);
hashRemove(perAlleleCounts, bds->ref);
// Rows for alternate alleles
int aIx;
for (aIx = 0; aIx < bds->altCount; aIx++)
{
char *alt = bds->alts[aIx];
printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele);
hashRemove(perAlleleCounts, alt);
}
// Rows for frequency alleles not included in ref/alt, if any
struct hashEl *hel, *helList = hashElListHash(perAlleleCounts);
for (hel = helList; hel != NULL; hel = hel->next)
{
char *alt = hel->name;
printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele);
}
puts(" ");
puts(" |
");
}
}
static void printOneSoTerm(struct dbSnpDetails *details, int ix)
/* Print a SO term with link to MISO browser for details->soTerms[ix]. */
{
printf("%s", soTermToMisoLink((enum soTerm)details->soTerms[ix]));
}
static void printOneClinVar(struct dbSnpDetails *details, int ix)
/* Print a link to ClinVar for details->clinVar{Accs,Sigs}[ix]. */
{
printf("%s (%s)",
details->clinVarAccs[ix], details->clinVarAccs[ix], details->clinVarSigs[ix]);
}
static void printOneSubmitter(struct dbSnpDetails *details, int ix)
/* Print a link to dbSNP submitter page for details->submitters[ix]. */
{
printf(""
"%s", details->submitters[ix], details->submitters[ix]);
}
static void printOnePub(struct dbSnpDetails *details, int ix)
/* Print a link to Pubmed for details->pubMedIds[ix]. */
{
printf("PMID%d\n",
details->pubMedIds[ix], details->pubMedIds[ix]);
}
static void printDetailsRow(struct dbSnpDetails *details, char *label, int itemCount,
void (*printOneValue)(struct dbSnpDetails *details, int ix))
/* Print out one row of a details table using callback to print comma-sep list of values. */
{
if (itemCount > 0)
{
printf("%s: | ", label);
int ix;
for (ix = 0; ix < itemCount; ix++)
{
if (ix != 0)
puts(", ");
printOneValue(details, ix);
}
puts(" |
");
}
}
static void printDbSnpDetails(struct bigDbSnp *bds, struct dbSnpDetails *details,
struct trackDb *tdb)
/* Print out extras from dbSnpDetails file. */
{
printAlleleCountsAndFreqs(bds, details, tdb);
printDetailsRow(details, "Functional effects", details->soTermCount, printOneSoTerm);
printDetailsRow(details, "ClinVar", details->clinVarCount, printOneClinVar);
printDetailsRow(details, "Submitted by", details->submitterCount, printOneSubmitter);
printDetailsRow(details, "Publications in PubMed", details->pubMedIdCount, printOnePub);
}
static void printUcscNotes(char *ucscNotes)
/* Print explanations of ucscNotes items. */
{
if (isNotEmpty(ucscNotes))
{
puts("Interesting or anomalous conditions noted by UCSC:
");
puts("
");
boolean isCommonAll = (stringIn(bdsCommonAll, ucscNotes) != NULL);
+ boolean isRareAll = (stringIn(bdsRareAll, ucscNotes) != NULL);
struct slName *note, *noteList = slNameListFromComma(ucscNotes);
for (note = noteList; note != NULL; note = note->next)
{
// When commonAll is true, commonSome is also true but not informative,
- // so skip commonSome if commonAll is true.
- if (! (isCommonAll && sameString(note->name, bdsCommonSome)))
+ // so skip commonSome if commonAll is true. Likewise for rareAll & rareSome.
+ if (! ((isCommonAll && sameString(note->name, bdsCommonSome)) ||
+ (isRareAll && sameString(note->name, bdsRareSome))))
{
char *desc = bigDbSnpDescribeUcscNote(note->name);
printf("- %s\n", desc ? desc : note->name);
}
}
puts("
");
}
}
static char *getMinRep(char *ref, char *alt, boolean leftJustify)
/* If ref and alt can be trimmed down to a shorter representation then return that, othw NULL. */
{
char *minRep = NULL;
int refLen = strlen(ref);
int altLen = strlen(alt);
if ((refLen > 1 && altLen > 0) || (altLen > 1 && refLen > 0))
{
char *refCpy = cloneStringZ(ref, refLen);
char *altCpy = cloneStringZ(alt, altLen);
uint start=0, end=refLen;
int refCpyLen = refLen, altCpyLen = altLen;
if (leftJustify)
trimRefAltLeft(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen);
else
trimRefAlt(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen);
if (refCpyLen < refLen)
{
struct dyString *dy = dyStringNew(0);
char bufRef[1024], bufAlt[1024];
if (refCpyLen == 0)
dyStringPrintf(dy, "ins%s",
bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt));
else if (altCpyLen == 0)
dyStringPrintf(dy, "del%s",
bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef));
else
dyStringPrintf(dy, "del%sins%s",
bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef),
bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt));
minRep = dyStringCannibalize(&dy);
}
}
return minRep;
}
static void printOtherMappings(struct bbiFile *bbi, struct bigDbSnp *bds, struct trackDb *tdb)
/* If the variant maps to other genomic sequences (alts/fixes/PAR), link to those positions. */
{
int fieldIx;
struct bptFile *bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx);
struct lm *lm = lmInit(0);
struct bigBedInterval *bbList = bigBedNameQuery(bbi, bpt, fieldIx, bds->name, lm);
if (slCount(bbList) > 1)
{
puts("This variant maps to additional locations:
");
char chromName[bbi->chromBpt->keySize+1];
int lastChromId = -1;
struct bigBedInterval *bb;
for (bb = bbList; bb != NULL; bb = bb->next)
{
if (!startsWithWord(bds->name, bb->rest))
errAbort("Error: bigBedNameQuery search for name '%s' yielded '%s'",
bds->name, bb->rest);
bbiCachedChromLookup(bbi, bb->chromId, lastChromId, chromName, sizeof(chromName));
char startBuf[16], endBuf[16];
char *row[BIGDBSNP_NUM_COLS];
int bbFieldCount = bigBedIntervalToRow(bb, chromName, startBuf, endBuf, row, ArraySize(row));
if (bbFieldCount != BIGDBSNP_NUM_COLS)
errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS,
bbFieldCount);
struct bigDbSnp *otherBds = bigDbSnpLoad(row);
if (differentString(bds->chrom, otherBds->chrom) ||
bds->chromStart != otherBds->chromStart ||
bds->chromEnd != otherBds->chromEnd)
{
bedPrintPos((struct bed *)otherBds, 3, tdb);
if (bb->next != NULL)
puts("
");
}
}
puts("
");
}
bptFileDetach(&bpt);
lmCleanup(&lm);
}
void doBigDbSnp(struct trackDb *tdb, char *rsId)
/* Show details for bigDbSnp item. */
{
int start = cartInt(cart, "o");
int end = cartInt(cart, "t");
char *fileOrUrl = trackDbSetting(tdb, "bigDataUrl");
if (isEmpty(fileOrUrl))
errAbort("bigDbSnpClick: trackDb is missing bigDataUrl setting");
struct bbiFile *bbi = bigBedFileOpen(fileOrUrl);
boolean found = FALSE;
char *chrom = cartString(cart, "c");
int ivStart = start, ivEnd = end;
if (start == end)
{
// item is an insertion; expand the search range from 0 bases to 2 so we catch it:
ivStart = max(0, start-1);
ivEnd++;
}
struct lm *lm = lmInit(0);
struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, ivStart, ivEnd, 0, lm);
struct bigBedInterval *bb;
for (bb = bbList; bb != NULL; bb = bb->next)
{
if (!startsWithWord(rsId, bb->rest))
continue;
found = TRUE;
char startBuf[16], endBuf[16];
char *row[BIGDBSNP_NUM_COLS];
int bbFieldCount = bigBedIntervalToRow(bb, chrom, startBuf, endBuf, row, ArraySize(row));
if (bbFieldCount != BIGDBSNP_NUM_COLS)
errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount);
struct bigDbSnp *bds = bigDbSnpLoad(row);
struct dbSnpDetails *details = NULL;
struct slPair *detailsUrls = parseDetailsTablUrls(tdb);
if (detailsUrls)
details = getDetails(bds, detailsUrls->val);
bedPrintPos((struct bed *)bds, 3, tdb);
puts("
");
puts("");
char *ref = bds->ref;
char abbrev[1024];
char *abbrevRef = isEmpty(ref) ? "-" : bigDbSnpAbbrevAllele(ref, abbrev, sizeof abbrev);
printf("Reference allele: | %s | |
\n",
abbrevRef);
printf("altCount > 1)
printf(" style='vertical-align:top'");
printf(">Alternate allele%s: | ", (bds->altCount > 1 ? "s" : ""));
if (bds->altCount == 0)
printf("none");
else
{
int i;
for (i = 0; i < bds->altCount; i++)
{
char *alt = bds->alts[i];
char *abbrevAlt = isEmpty(alt) ? "-" : bigDbSnpAbbrevAllele(alt, abbrev, sizeof abbrev);
printf("%s%s", (i > 0 ? ", \n" : ""), abbrevAlt);
char *minRepLeft = getMinRep(ref, alt, TRUE);
if (minRepLeft)
{
char *minRepRight = getMinRep(ref, alt, FALSE);
if (sameString(minRepLeft, minRepRight))
printf(" [%s]", minRepLeft);
else
printf(" [%s (left-shifted), %s (right-shifted)]", minRepLeft, minRepRight);
}
printf("");
}
puts(" |
");
}
if (bds->shiftBases > 0)
printf("Uncertainty in indel placement: | %d base%s | \n",
bds->shiftBases, (bds->shiftBases > 1 ? "s" : ""));
if (details)
printDbSnpDetails(bds, details, tdb);
printf("Variation class/type: | %s |
\n",
bigDbSnpClassToString(bds->class));
puts("
");
printUcscNotes(bds->ucscNotes);
printOtherMappings(bbi, bds, tdb);
}
if (!found)
printf("No item %s starting at %s:%d\n", rsId, chrom, start+1);
lmCleanup(&lm);
bbiFileClose(&bbi);
}