0564395ec363631f2ff8d295da6f6b50f873fda4 braney Mon Jan 24 17:01:53 2022 -0800 more chromAlias work: some name changes and support for the new genark chromAlias format diff --git src/hg/hgc/bigDbSnpClick.c src/hg/hgc/bigDbSnpClick.c index cebb059..d1cb44a 100644 --- src/hg/hgc/bigDbSnpClick.c +++ src/hg/hgc/bigDbSnpClick.c @@ -1,423 +1,423 @@ /* Show details for bigDbSnp track items. */ /* Copyright (C) 2019 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "hgc.h" #include "bigDbSnp.h" #include "dbSnpDetails.h" #include "bPlusTree.h" #include "htslib/bgzf.h" #include "soTerm.h" #include "chromAlias.h" static struct dbSnpDetails *getDetails(struct bigDbSnp *bds, char *detailsFileOrUrl) /* Seek to the offset for this variant in detailsFileOrUrl, read the line and load as * struct dbSnpDetails. */ { bits64 offset = bds->_dataOffset; bits64 len = bds->_dataLen; char *line = readOneLineMaybeBgzip(detailsFileOrUrl, offset, len); // Newline must be trimmed or else it messes up parsing of final column if empty! if (line[len-1] == '\n') line[len-1] = '\0'; char *row[DBSNPDETAILS_NUM_COLS+1]; int wordCount = chopTabs(line, row); if (wordCount != DBSNPDETAILS_NUM_COLS) errAbort("dbSnpDetails: expected %d tab-separated words at offset %Ld in %s, got %d", DBSNPDETAILS_NUM_COLS, offset, detailsFileOrUrl, wordCount); return dbSnpDetailsLoad(row); } struct slName *getFreqSourceOrder(struct trackDb *tdb, char *rsId, int expectedCount) /* If tdb has freqSourceOrder*/ { struct slName *sourceList = NULL; char *sourceNames = trackDbSetting(tdb, "freqSourceOrder"); if (sourceNames) { sourceList = slNameListFromComma(sourceNames); int settingCount = slCount(sourceList); if (settingCount != expectedCount) errAbort("bigDbSnp freqSourceCount for %s is %d, " "but trackDb setting freqSourceOrder lists %d sources", rsId, expectedCount, settingCount); } return sourceList; } INLINE boolean freqSourceHasData(struct dbSnpDetails *details, int sourceIx) /* Return TRUE if freqSource sourceIx has any data for this variant. */ { return details->alleleTotals[sourceIx] > 0; } struct alCounts { int obsCount; // Number of chromosomes on which allele was observed by a project int totalCount; // Number of chromosomes on which project observed some allele of this variant }; static struct hash *makePerAlleleCounts(struct dbSnpDetails *details, boolean isRc) /* Return a hash of allele to array of alCounts (each source's frequency count data). */ { struct hash *perAlleleCounts = hashNew(0); int sIx; for (sIx = 0; sIx < details->freqSourceCount; sIx++) { if (freqSourceHasData(details, sIx)) { int totalCount = details->alleleTotals[sIx]; char *blob = cloneString(details->alleleCounts[sIx]); struct slName *pab, *perAlleleBlobs = slNameListFromString(blob, '|'); for (pab = perAlleleBlobs; pab != NULL; pab = pab->next) { char *words[3]; int wordCount = chopByChar(pab->name, ':', words, ArraySize(words)); if (wordCount != 2) errAbort("Malformed allele:count in |-separated '%s': " "expecting two :-separated words but got %d", pab->name, wordCount); char *allele = words[0]; if (isRc) reverseComplement(allele, strlen(allele)); int obsCount = atoi(words[1]); struct alCounts *alArray = hashFindVal(perAlleleCounts, allele); if (alArray == NULL) { AllocArray(alArray, details->freqSourceCount); hashAdd(perAlleleCounts, allele, alArray); } alArray[sIx].obsCount = obsCount; alArray[sIx].totalCount = totalCount; } } } return perAlleleCounts; } static void printAlleleRow(struct hash *perAlleleCounts, char *allele, struct dbSnpDetails *details, char **perSourceMajorAl) /* Print the allele and its counts/freqs from each freqSource for allele, if any. */ { puts("<tr>"); char buf[1024]; char *abbrevAl = isEmpty(allele) ? "-" : bigDbSnpAbbrevAllele(allele, buf, sizeof buf); printf("<td><span class='inlineCode'>%s</span></td>", abbrevAl); struct alCounts *alleleCounts = hashFindVal(perAlleleCounts, allele); int sIx; for (sIx = 0; sIx < details->freqSourceCount; sIx++) { if (freqSourceHasData(details, sIx)) { struct alCounts *counts = NULL; if (alleleCounts) counts = &alleleCounts[sIx]; if (counts != NULL && counts->totalCount != 0) { boolean isMajor = sameString(perSourceMajorAl[sIx], allele); printf("<td>"); if (isMajor) printf("<b>"); printf("%d/%d (%f)", counts->obsCount, counts->totalCount, ((double)counts->obsCount / (double)counts->totalCount)); if (isMajor) printf("</b>"); } else printf("<td>-</td>"); } } puts("</tr>"); } static void printAlleleCountsAndFreqs(struct bigDbSnp *bds, struct dbSnpDetails *details, struct trackDb *tdb) /* Print a table row containing a table of alleles, counts & frequencies if applicable. */ { // Allele counts & frequencies if (bds->freqSourceCount > 0) { puts("<tr><td colspan=2><b>Allele frequency counts:</b></td></tr>"); puts("<tr><td colspan=2><table class='stdTbl'>"); struct slName *sourceList = getFreqSourceOrder(tdb, bds->name, bds->freqSourceCount); int sIx; if (sourceList) { puts("<tr><th>Allele</th>"); struct slName *source = sourceList; for (sIx = 0; sIx < bds->freqSourceCount; sIx++, source = source->next) if (freqSourceHasData(details, sIx)) printf("<th>%s</th> ", source->name); puts("</tr>"); } boolean isRc = (stringIn(bdsRevStrand, bds->ucscNotes) != NULL); struct hash *perAlleleCounts = makePerAlleleCounts(details, isRc); // Row for reference allele printAlleleRow(perAlleleCounts, bds->ref, details, bds->majorAllele); hashRemove(perAlleleCounts, bds->ref); // Rows for alternate alleles int aIx; for (aIx = 0; aIx < bds->altCount; aIx++) { char *alt = bds->alts[aIx]; printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele); hashRemove(perAlleleCounts, alt); } // Rows for frequency alleles not included in ref/alt, if any struct hashEl *hel, *helList = hashElListHash(perAlleleCounts); for (hel = helList; hel != NULL; hel = hel->next) { char *alt = hel->name; printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele); } puts("</table>"); puts("</td></tr>"); } } static void printOneSoTerm(struct dbSnpDetails *details, int ix) /* Print a SO term with link to MISO browser for details->soTerms[ix]. */ { printf("%s", soTermToMisoLink((enum soTerm)details->soTerms[ix])); } static void printOneClinVar(struct dbSnpDetails *details, int ix) /* Print a link to ClinVar for details->clinVar{Accs,Sigs}[ix]. */ { printf("<a href='https://www.ncbi.nlm.nih.gov/clinvar/%s' target=_blank>%s</a> (%s)", details->clinVarAccs[ix], details->clinVarAccs[ix], details->clinVarSigs[ix]); } static void printOneSubmitter(struct dbSnpDetails *details, int ix) /* Print a link to dbSNP submitter page for details->submitters[ix]. */ { printf("<a href=\"https://www.ncbi.nlm.nih.gov/SNP/snp_viewTable.cgi?h=%s\" target=_blank>" "%s</a>", details->submitters[ix], details->submitters[ix]); } static void printOnePub(struct dbSnpDetails *details, int ix) /* Print a link to Pubmed for details->pubMedIds[ix]. */ { printf("<a href=\"https://www.ncbi.nlm.nih.gov/pubmed/%d\" target=_blank>PMID%d</a>\n", details->pubMedIds[ix], details->pubMedIds[ix]); } static void printDetailsRow(struct dbSnpDetails *details, char *label, int itemCount, void (*printOneValue)(struct dbSnpDetails *details, int ix)) /* Print out one row of a details table using callback to print comma-sep list of values. */ { if (itemCount > 0) { printf("<tr><td><b>%s:</b></td><td>", label); int ix; for (ix = 0; ix < itemCount; ix++) { if (ix != 0) puts(", "); printOneValue(details, ix); } puts("</td></tr>"); } } static void printDbSnpDetails(struct bigDbSnp *bds, struct dbSnpDetails *details, struct trackDb *tdb) /* Print out extras from dbSnpDetails file. */ { printAlleleCountsAndFreqs(bds, details, tdb); printDetailsRow(details, "Functional effects", details->soTermCount, printOneSoTerm); printDetailsRow(details, "ClinVar", details->clinVarCount, printOneClinVar); printDetailsRow(details, "Submitted by", details->submitterCount, printOneSubmitter); printDetailsRow(details, "Publications in PubMed", details->pubMedIdCount, printOnePub); } static void printUcscNotes(char *ucscNotes) /* Print explanations of ucscNotes items. */ { if (isNotEmpty(ucscNotes)) { puts("<p>Interesting or anomalous conditions noted by UCSC:</b><br>"); puts("<ul>"); boolean isCommonAll = (stringIn(bdsCommonAll, ucscNotes) != NULL); boolean isRareAll = (stringIn(bdsRareAll, ucscNotes) != NULL); struct slName *note, *noteList = slNameListFromComma(ucscNotes); for (note = noteList; note != NULL; note = note->next) { // When commonAll is true, commonSome is also true but not informative, // so skip commonSome if commonAll is true. Likewise for rareAll & rareSome. if (! ((isCommonAll && sameString(note->name, bdsCommonSome)) || (isRareAll && sameString(note->name, bdsRareSome)))) { char *desc = bigDbSnpDescribeUcscNote(note->name); printf("<li>%s\n", desc ? desc : note->name); } } puts("</ul>"); } } static char *getMinRep(char *ref, char *alt, boolean leftJustify) /* If ref and alt can be trimmed down to a shorter representation then return that, othw NULL. */ { char *minRep = NULL; int refLen = strlen(ref); int altLen = strlen(alt); if ((refLen > 1 && altLen > 0) || (altLen > 1 && refLen > 0)) { char *refCpy = cloneStringZ(ref, refLen); char *altCpy = cloneStringZ(alt, altLen); uint start=0, end=refLen; int refCpyLen = refLen, altCpyLen = altLen; if (leftJustify) trimRefAltLeft(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen); else trimRefAlt(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen); if (refCpyLen < refLen) { struct dyString *dy = dyStringNew(0); char bufRef[1024], bufAlt[1024]; if (refCpyLen == 0) dyStringPrintf(dy, "ins%s", bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt)); else if (altCpyLen == 0) dyStringPrintf(dy, "del%s", bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef)); else dyStringPrintf(dy, "del%sins%s", bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef), bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt)); minRep = dyStringCannibalize(&dy); } } return minRep; } static void printOtherMappings(struct bbiFile *bbi, struct bigDbSnp *bds, struct trackDb *tdb) /* If the variant maps to other genomic sequences (alts/fixes/PAR), link to those positions. */ { int fieldIx; struct bptFile *bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx); struct lm *lm = lmInit(0); struct bigBedInterval *bbList = bigBedNameQuery(bbi, bpt, fieldIx, bds->name, lm); if (slCount(bbList) > 1) { puts("<p><b>This variant maps to additional locations:</b><br><br>"); char chromName[bbi->chromBpt->keySize+1]; int lastChromId = -1; struct bigBedInterval *bb; for (bb = bbList; bb != NULL; bb = bb->next) { if (!startsWithWord(bds->name, bb->rest)) errAbort("Error: bigBedNameQuery search for name '%s' yielded '%s'", bds->name, bb->rest); bbiCachedChromLookup(bbi, bb->chromId, lastChromId, chromName, sizeof(chromName)); char startBuf[16], endBuf[16]; char *row[BIGDBSNP_NUM_COLS]; int bbFieldCount = bigBedIntervalToRow(bb, chromName, startBuf, endBuf, row, ArraySize(row)); if (bbFieldCount != BIGDBSNP_NUM_COLS) errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount); struct bigDbSnp *otherBds = bigDbSnpLoad(row); if (differentString(bds->chrom, otherBds->chrom) || bds->chromStart != otherBds->chromStart || bds->chromEnd != otherBds->chromEnd) { bedPrintPos((struct bed *)otherBds, 3, tdb); if (bb->next != NULL) puts("<br>"); } } puts("</p>"); } bptFileDetach(&bpt); lmCleanup(&lm); } void doBigDbSnp(struct trackDb *tdb, char *rsId) /* Show details for bigDbSnp item. */ { int start = cartInt(cart, "o"); int end = cartInt(cart, "t"); char *fileOrUrl = hReplaceGbdb(trackDbSetting(tdb, "bigDataUrl")); if (isEmpty(fileOrUrl)) errAbort("bigDbSnpClick: trackDb is missing bigDataUrl setting"); -struct bbiFile *bbi = bigBedFileOpenAlias(fileOrUrl, chromAliasGetHash(database)); +struct bbiFile *bbi = bigBedFileOpenAlias(fileOrUrl, chromAliasChromToAliasHash(database)); boolean found = FALSE; char *chrom = cartString(cart, "c"); int ivStart = start, ivEnd = end; if (start == end) { // item is an insertion; expand the search range from 0 bases to 2 so we catch it: ivStart = max(0, start-1); ivEnd++; } struct lm *lm = lmInit(0); struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, ivStart, ivEnd, 0, lm); struct bigBedInterval *bb; for (bb = bbList; bb != NULL; bb = bb->next) { if (!startsWithWord(rsId, bb->rest)) continue; found = TRUE; char startBuf[16], endBuf[16]; char *row[BIGDBSNP_NUM_COLS]; int bbFieldCount = bigBedIntervalToRow(bb, chrom, startBuf, endBuf, row, ArraySize(row)); if (bbFieldCount != BIGDBSNP_NUM_COLS) errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount); struct bigDbSnp *bds = bigDbSnpLoad(row); struct dbSnpDetails *details = NULL; struct slPair *detailsUrls = parseDetailsTablUrls(tdb); if (detailsUrls) details = getDetails(bds, detailsUrls->val); bedPrintPos((struct bed *)bds, 3, tdb); puts("<br>"); puts("<table>"); char *ref = bds->ref; char abbrev[1024]; char *abbrevRef = isEmpty(ref) ? "-" : bigDbSnpAbbrevAllele(ref, abbrev, sizeof abbrev); printf("<tr><td><b>Reference allele:</b></td><td><span class='inlineCode'>%s</span><td></tr>\n", abbrevRef); printf("<tr><td"); if (bds->altCount > 1) printf(" style='vertical-align:top'"); printf("><b>Alternate allele%s:</b></td><td>", (bds->altCount > 1 ? "s" : "")); if (bds->altCount == 0) printf("<em>none</em>"); else { int i; for (i = 0; i < bds->altCount; i++) { char *alt = bds->alts[i]; char *abbrevAlt = isEmpty(alt) ? "-" : bigDbSnpAbbrevAllele(alt, abbrev, sizeof abbrev); printf("%s<span class='inlineCode'>%s", (i > 0 ? ",<br>\n" : ""), abbrevAlt); char *minRepLeft = getMinRep(ref, alt, TRUE); if (minRepLeft) { char *minRepRight = getMinRep(ref, alt, FALSE); if (sameString(minRepLeft, minRepRight)) printf(" [%s]", minRepLeft); else printf(" [%s (left-shifted), %s (right-shifted)]", minRepLeft, minRepRight); } printf("</span>"); } puts("</td></tr>"); } if (bds->shiftBases > 0) printf("<td><b>Uncertainty in indel placement:</b></td><td>%d base%s</td></tr>\n", bds->shiftBases, (bds->shiftBases > 1 ? "s" : "")); if (details) printDbSnpDetails(bds, details, tdb); printf("<tr><td><b>Variation class/type:</b></td><td>%s</td></tr>\n", bigDbSnpClassToString(bds->class)); puts("</table>"); printUcscNotes(bds->ucscNotes); printOtherMappings(bbi, bds, tdb); } if (!found) printf("No item %s starting at %s:%d\n", rsId, chrom, start+1); lmCleanup(&lm); bbiFileClose(&bbi); }