b622d147b7dbac52dbf3ba26928cd18e02d42bd8 braney Sat Feb 26 12:34:37 2022 -0800 add support for using a bigBed as the chromAlias file diff --git src/hg/hgc/bigDbSnpClick.c src/hg/hgc/bigDbSnpClick.c index d1cb44a..25ca778 100644 --- src/hg/hgc/bigDbSnpClick.c +++ src/hg/hgc/bigDbSnpClick.c @@ -1,423 +1,423 @@ /* Show details for bigDbSnp track items. */ /* Copyright (C) 2019 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "hgc.h" #include "bigDbSnp.h" #include "dbSnpDetails.h" #include "bPlusTree.h" #include "htslib/bgzf.h" #include "soTerm.h" #include "chromAlias.h" static struct dbSnpDetails *getDetails(struct bigDbSnp *bds, char *detailsFileOrUrl) /* Seek to the offset for this variant in detailsFileOrUrl, read the line and load as * struct dbSnpDetails. */ { bits64 offset = bds->_dataOffset; bits64 len = bds->_dataLen; char *line = readOneLineMaybeBgzip(detailsFileOrUrl, offset, len); // Newline must be trimmed or else it messes up parsing of final column if empty! if (line[len-1] == '\n') line[len-1] = '\0'; char *row[DBSNPDETAILS_NUM_COLS+1]; int wordCount = chopTabs(line, row); if (wordCount != DBSNPDETAILS_NUM_COLS) errAbort("dbSnpDetails: expected %d tab-separated words at offset %Ld in %s, got %d", DBSNPDETAILS_NUM_COLS, offset, detailsFileOrUrl, wordCount); return dbSnpDetailsLoad(row); } struct slName *getFreqSourceOrder(struct trackDb *tdb, char *rsId, int expectedCount) /* If tdb has freqSourceOrder*/ { struct slName *sourceList = NULL; char *sourceNames = trackDbSetting(tdb, "freqSourceOrder"); if (sourceNames) { sourceList = slNameListFromComma(sourceNames); int settingCount = slCount(sourceList); if (settingCount != expectedCount) errAbort("bigDbSnp freqSourceCount for %s is %d, " "but trackDb setting freqSourceOrder lists %d sources", rsId, expectedCount, settingCount); } return sourceList; } INLINE boolean freqSourceHasData(struct dbSnpDetails *details, int sourceIx) /* Return TRUE if freqSource sourceIx has any data for this variant. */ { return details->alleleTotals[sourceIx] > 0; } struct alCounts { int obsCount; // Number of chromosomes on which allele was observed by a project int totalCount; // Number of chromosomes on which project observed some allele of this variant }; static struct hash *makePerAlleleCounts(struct dbSnpDetails *details, boolean isRc) /* Return a hash of allele to array of alCounts (each source's frequency count data). */ { struct hash *perAlleleCounts = hashNew(0); int sIx; for (sIx = 0; sIx < details->freqSourceCount; sIx++) { if (freqSourceHasData(details, sIx)) { int totalCount = details->alleleTotals[sIx]; char *blob = cloneString(details->alleleCounts[sIx]); struct slName *pab, *perAlleleBlobs = slNameListFromString(blob, '|'); for (pab = perAlleleBlobs; pab != NULL; pab = pab->next) { char *words[3]; int wordCount = chopByChar(pab->name, ':', words, ArraySize(words)); if (wordCount != 2) errAbort("Malformed allele:count in |-separated '%s': " "expecting two :-separated words but got %d", pab->name, wordCount); char *allele = words[0]; if (isRc) reverseComplement(allele, strlen(allele)); int obsCount = atoi(words[1]); struct alCounts *alArray = hashFindVal(perAlleleCounts, allele); if (alArray == NULL) { AllocArray(alArray, details->freqSourceCount); hashAdd(perAlleleCounts, allele, alArray); } alArray[sIx].obsCount = obsCount; alArray[sIx].totalCount = totalCount; } } } return perAlleleCounts; } static void printAlleleRow(struct hash *perAlleleCounts, char *allele, struct dbSnpDetails *details, char **perSourceMajorAl) /* Print the allele and its counts/freqs from each freqSource for allele, if any. */ { puts(""); char buf[1024]; char *abbrevAl = isEmpty(allele) ? "-" : bigDbSnpAbbrevAllele(allele, buf, sizeof buf); printf("%s", abbrevAl); struct alCounts *alleleCounts = hashFindVal(perAlleleCounts, allele); int sIx; for (sIx = 0; sIx < details->freqSourceCount; sIx++) { if (freqSourceHasData(details, sIx)) { struct alCounts *counts = NULL; if (alleleCounts) counts = &alleleCounts[sIx]; if (counts != NULL && counts->totalCount != 0) { boolean isMajor = sameString(perSourceMajorAl[sIx], allele); printf(""); if (isMajor) printf(""); printf("%d/%d (%f)", counts->obsCount, counts->totalCount, ((double)counts->obsCount / (double)counts->totalCount)); if (isMajor) printf(""); } else printf("-"); } } puts(""); } static void printAlleleCountsAndFreqs(struct bigDbSnp *bds, struct dbSnpDetails *details, struct trackDb *tdb) /* Print a table row containing a table of alleles, counts & frequencies if applicable. */ { // Allele counts & frequencies if (bds->freqSourceCount > 0) { puts("Allele frequency counts:"); puts(""); struct slName *sourceList = getFreqSourceOrder(tdb, bds->name, bds->freqSourceCount); int sIx; if (sourceList) { puts(""); struct slName *source = sourceList; for (sIx = 0; sIx < bds->freqSourceCount; sIx++, source = source->next) if (freqSourceHasData(details, sIx)) printf(" ", source->name); puts(""); } boolean isRc = (stringIn(bdsRevStrand, bds->ucscNotes) != NULL); struct hash *perAlleleCounts = makePerAlleleCounts(details, isRc); // Row for reference allele printAlleleRow(perAlleleCounts, bds->ref, details, bds->majorAllele); hashRemove(perAlleleCounts, bds->ref); // Rows for alternate alleles int aIx; for (aIx = 0; aIx < bds->altCount; aIx++) { char *alt = bds->alts[aIx]; printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele); hashRemove(perAlleleCounts, alt); } // Rows for frequency alleles not included in ref/alt, if any struct hashEl *hel, *helList = hashElListHash(perAlleleCounts); for (hel = helList; hel != NULL; hel = hel->next) { char *alt = hel->name; printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele); } puts("
Allele%s
"); puts(""); } } static void printOneSoTerm(struct dbSnpDetails *details, int ix) /* Print a SO term with link to MISO browser for details->soTerms[ix]. */ { printf("%s", soTermToMisoLink((enum soTerm)details->soTerms[ix])); } static void printOneClinVar(struct dbSnpDetails *details, int ix) /* Print a link to ClinVar for details->clinVar{Accs,Sigs}[ix]. */ { printf("%s (%s)", details->clinVarAccs[ix], details->clinVarAccs[ix], details->clinVarSigs[ix]); } static void printOneSubmitter(struct dbSnpDetails *details, int ix) /* Print a link to dbSNP submitter page for details->submitters[ix]. */ { printf("" "%s", details->submitters[ix], details->submitters[ix]); } static void printOnePub(struct dbSnpDetails *details, int ix) /* Print a link to Pubmed for details->pubMedIds[ix]. */ { printf("PMID%d\n", details->pubMedIds[ix], details->pubMedIds[ix]); } static void printDetailsRow(struct dbSnpDetails *details, char *label, int itemCount, void (*printOneValue)(struct dbSnpDetails *details, int ix)) /* Print out one row of a details table using callback to print comma-sep list of values. */ { if (itemCount > 0) { printf("%s:", label); int ix; for (ix = 0; ix < itemCount; ix++) { if (ix != 0) puts(", "); printOneValue(details, ix); } puts(""); } } static void printDbSnpDetails(struct bigDbSnp *bds, struct dbSnpDetails *details, struct trackDb *tdb) /* Print out extras from dbSnpDetails file. */ { printAlleleCountsAndFreqs(bds, details, tdb); printDetailsRow(details, "Functional effects", details->soTermCount, printOneSoTerm); printDetailsRow(details, "ClinVar", details->clinVarCount, printOneClinVar); printDetailsRow(details, "Submitted by", details->submitterCount, printOneSubmitter); printDetailsRow(details, "Publications in PubMed", details->pubMedIdCount, printOnePub); } static void printUcscNotes(char *ucscNotes) /* Print explanations of ucscNotes items. */ { if (isNotEmpty(ucscNotes)) { puts("

Interesting or anomalous conditions noted by UCSC:
"); puts("

"); } } static char *getMinRep(char *ref, char *alt, boolean leftJustify) /* If ref and alt can be trimmed down to a shorter representation then return that, othw NULL. */ { char *minRep = NULL; int refLen = strlen(ref); int altLen = strlen(alt); if ((refLen > 1 && altLen > 0) || (altLen > 1 && refLen > 0)) { char *refCpy = cloneStringZ(ref, refLen); char *altCpy = cloneStringZ(alt, altLen); uint start=0, end=refLen; int refCpyLen = refLen, altCpyLen = altLen; if (leftJustify) trimRefAltLeft(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen); else trimRefAlt(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen); if (refCpyLen < refLen) { struct dyString *dy = dyStringNew(0); char bufRef[1024], bufAlt[1024]; if (refCpyLen == 0) dyStringPrintf(dy, "ins%s", bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt)); else if (altCpyLen == 0) dyStringPrintf(dy, "del%s", bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef)); else dyStringPrintf(dy, "del%sins%s", bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef), bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt)); minRep = dyStringCannibalize(&dy); } } return minRep; } static void printOtherMappings(struct bbiFile *bbi, struct bigDbSnp *bds, struct trackDb *tdb) /* If the variant maps to other genomic sequences (alts/fixes/PAR), link to those positions. */ { int fieldIx; struct bptFile *bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx); struct lm *lm = lmInit(0); struct bigBedInterval *bbList = bigBedNameQuery(bbi, bpt, fieldIx, bds->name, lm); if (slCount(bbList) > 1) { puts("

This variant maps to additional locations:

"); char chromName[bbi->chromBpt->keySize+1]; int lastChromId = -1; struct bigBedInterval *bb; for (bb = bbList; bb != NULL; bb = bb->next) { if (!startsWithWord(bds->name, bb->rest)) errAbort("Error: bigBedNameQuery search for name '%s' yielded '%s'", bds->name, bb->rest); bbiCachedChromLookup(bbi, bb->chromId, lastChromId, chromName, sizeof(chromName)); char startBuf[16], endBuf[16]; char *row[BIGDBSNP_NUM_COLS]; int bbFieldCount = bigBedIntervalToRow(bb, chromName, startBuf, endBuf, row, ArraySize(row)); if (bbFieldCount != BIGDBSNP_NUM_COLS) errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount); struct bigDbSnp *otherBds = bigDbSnpLoad(row); if (differentString(bds->chrom, otherBds->chrom) || bds->chromStart != otherBds->chromStart || bds->chromEnd != otherBds->chromEnd) { bedPrintPos((struct bed *)otherBds, 3, tdb); if (bb->next != NULL) puts("
"); } } puts("

"); } bptFileDetach(&bpt); lmCleanup(&lm); } void doBigDbSnp(struct trackDb *tdb, char *rsId) /* Show details for bigDbSnp item. */ { int start = cartInt(cart, "o"); int end = cartInt(cart, "t"); char *fileOrUrl = hReplaceGbdb(trackDbSetting(tdb, "bigDataUrl")); if (isEmpty(fileOrUrl)) errAbort("bigDbSnpClick: trackDb is missing bigDataUrl setting"); -struct bbiFile *bbi = bigBedFileOpenAlias(fileOrUrl, chromAliasChromToAliasHash(database)); +struct bbiFile *bbi = bigBedFileOpenAlias(fileOrUrl, chromAliasFindAliases); boolean found = FALSE; char *chrom = cartString(cart, "c"); int ivStart = start, ivEnd = end; if (start == end) { // item is an insertion; expand the search range from 0 bases to 2 so we catch it: ivStart = max(0, start-1); ivEnd++; } struct lm *lm = lmInit(0); struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, ivStart, ivEnd, 0, lm); struct bigBedInterval *bb; for (bb = bbList; bb != NULL; bb = bb->next) { if (!startsWithWord(rsId, bb->rest)) continue; found = TRUE; char startBuf[16], endBuf[16]; char *row[BIGDBSNP_NUM_COLS]; int bbFieldCount = bigBedIntervalToRow(bb, chrom, startBuf, endBuf, row, ArraySize(row)); if (bbFieldCount != BIGDBSNP_NUM_COLS) errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount); struct bigDbSnp *bds = bigDbSnpLoad(row); struct dbSnpDetails *details = NULL; struct slPair *detailsUrls = parseDetailsTablUrls(tdb); if (detailsUrls) details = getDetails(bds, detailsUrls->val); bedPrintPos((struct bed *)bds, 3, tdb); puts("
"); puts(""); char *ref = bds->ref; char abbrev[1024]; char *abbrevRef = isEmpty(ref) ? "-" : bigDbSnpAbbrevAllele(ref, abbrev, sizeof abbrev); printf("\n", abbrevRef); printf("altCount > 1) printf(" style='vertical-align:top'"); printf(">Alternate allele%s:"); } if (bds->shiftBases > 0) printf("\n", bds->shiftBases, (bds->shiftBases > 1 ? "s" : "")); if (details) printDbSnpDetails(bds, details, tdb); printf("\n", bigDbSnpClassToString(bds->class)); puts("
Reference allele:%s
", (bds->altCount > 1 ? "s" : "")); if (bds->altCount == 0) printf("none"); else { int i; for (i = 0; i < bds->altCount; i++) { char *alt = bds->alts[i]; char *abbrevAlt = isEmpty(alt) ? "-" : bigDbSnpAbbrevAllele(alt, abbrev, sizeof abbrev); printf("%s%s", (i > 0 ? ",
\n" : ""), abbrevAlt); char *minRepLeft = getMinRep(ref, alt, TRUE); if (minRepLeft) { char *minRepRight = getMinRep(ref, alt, FALSE); if (sameString(minRepLeft, minRepRight)) printf(" [%s]", minRepLeft); else printf(" [%s (left-shifted), %s (right-shifted)]", minRepLeft, minRepRight); } printf("
"); } puts("
Uncertainty in indel placement:%d base%s
Variation class/type:%s
"); printUcscNotes(bds->ucscNotes); printOtherMappings(bbi, bds, tdb); } if (!found) printf("No item %s starting at %s:%d\n", rsId, chrom, start+1); lmCleanup(&lm); bbiFileClose(&bbi); }