d3752edc12da1bf08427946150f564dbdd5d2254
angie
  Thu Oct 24 13:55:51 2019 -0700
bigDbSnp track handler code - initial commit.  refs #23283
* dnautil: Added trimRefAltLeft to get left-justified trimming (a la VCF not HGVS).
* bigBedClick: do hReplaceGbdb up front in parseDetailsTablUrls instead of waiting until endpoint.
* trackDbCustom.c: consolidating type-handling for wig/bigWig vs. bigBed-based big*.

diff --git src/hg/hgc/bigDbSnpClick.c src/hg/hgc/bigDbSnpClick.c
new file mode 100644
index 0000000..88cafe8
--- /dev/null
+++ src/hg/hgc/bigDbSnpClick.c
@@ -0,0 +1,451 @@
+/* Show details for bigDbSnp track items. */
+
+/* Copyright (C) 2019 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
+#include "common.h"
+#include "hgc.h"
+#include "bigDbSnp.h"
+#include "dbSnpDetails.h"
+#include "bPlusTree.h"
+#include "htslib/bgzf.h"
+#include "soTerm.h"
+
+static char *readMaybeBgzip(char *fileOrUrl, bits64 offset, bits64 len)
+/* If fileOrUrl is bgzip-compressed and indexed, then use htslib's bgzf functions to
+ * retrieve uncompressed data from offset; otherwise (plain text) use udc. */
+{
+char *line = needMem(len+1);
+if (endsWith(fileOrUrl, ".gz"))
+    {
+    BGZF *fp = bgzf_open(fileOrUrl, "r");
+    if (bgzf_index_load(fp, fileOrUrl, ".gzi") < 0)
+        errAbort("bgzf_index_load failed to load .gzi index for %s", fileOrUrl);
+    if (bgzf_useek(fp, offset, SEEK_SET) < 0)
+        errAbort("bgzf_useek failed to seek to uncompressed offset %lld in %s", offset, fileOrUrl);
+    bits64 count = bgzf_read(fp, line, len);
+    if (count != len)
+        errAbort("bgzf_read failed to read %lld bytes at uncompressed offset %lld in %s, got %lld",
+                 len, offset, fileOrUrl, count);
+    bgzf_close(fp);
+    }
+else
+    {
+    struct udcFile *udcF = udcFileOpen(fileOrUrl, NULL);
+    udcSeek(udcF, offset);
+    bits64 count = udcRead(udcF, line, len);
+    if (count != len)
+        errAbort("expected %Ld bytes at offset %Ld in %s, got %Ld. ",
+                 len, offset, fileOrUrl, count);
+    udcFileClose(&udcF);
+    }
+return line;
+}
+
+static struct dbSnpDetails *getDetails(struct bigDbSnp *bds, char *detailsFileOrUrl)
+/* Seek to the offset for this variant in detailsFileOrUrl, read the line and load as
+ * struct dbSnpDetails.  */
+{
+bits64 offset = bds->_dataOffset;
+bits64 len = bds->_dataLen;
+char *line = readMaybeBgzip(detailsFileOrUrl, offset, len);
+// Newline must be trimmed or else it messes up parsing of final column if empty!
+if (line[len-1] == '\n')
+    line[len-1] = '\0';
+char *row[DBSNPDETAILS_NUM_COLS+1];
+int wordCount = chopTabs(line, row);
+if (wordCount != DBSNPDETAILS_NUM_COLS)
+    errAbort("dbSnpDetails: expected %d tab-separated words at offset %Ld in %s, got %d",
+             DBSNPDETAILS_NUM_COLS, offset, detailsFileOrUrl, wordCount);
+return dbSnpDetailsLoad(row);
+}
+
+struct slName *getFreqSourceOrder(struct trackDb *tdb, char *rsId, int expectedCount)
+/* If tdb has freqSourceOrder*/
+{
+struct slName *sourceList = NULL;
+char *sourceNames = trackDbSetting(tdb, "freqSourceOrder");
+if (sourceNames)
+    {
+    sourceList = slNameListFromComma(sourceNames);
+    int settingCount = slCount(sourceList);
+    if (settingCount != expectedCount)
+        errAbort("bigDbSnp freqSourceCount for %s is %d, "
+                 "but trackDb setting freqSourceOrder lists %d sources",
+                 rsId, expectedCount, settingCount);
+    }
+return sourceList;
+}
+
+INLINE boolean freqSourceHasData(struct dbSnpDetails *details, int sourceIx)
+/* Return TRUE if freqSource sourceIx has any data for this variant. */
+{
+return details->alleleTotals[sourceIx] > 0;
+}
+
+struct alCounts
+    {
+    int obsCount;   // Number of chromosomes on which allele was observed by a project
+    int totalCount; // Number of chromosomes on which project observed some allele of this variant
+    };
+
+static struct hash *makePerAlleleCounts(struct dbSnpDetails *details, boolean isRc)
+/* Return a hash of allele to array of alCounts (each source's frequency count data). */
+{
+struct hash *perAlleleCounts = hashNew(0);
+int sIx;
+for (sIx = 0;  sIx < details->freqSourceCount;  sIx++)
+    {
+    if (freqSourceHasData(details, sIx))
+        {
+        int totalCount = details->alleleTotals[sIx];
+        char *blob = cloneString(details->alleleCounts[sIx]);
+        struct slName *pab, *perAlleleBlobs = slNameListFromString(blob, '|');
+        for (pab = perAlleleBlobs;  pab != NULL;  pab = pab->next)
+            {
+            char *words[3];
+            int wordCount = chopByChar(pab->name, ':', words, ArraySize(words));
+            if (wordCount != 2)
+                errAbort("Malformed allele:count in |-separated '%s': "
+                         "expecting two :-separated words but got %d", pab->name, wordCount);
+            char *allele = words[0];
+            if (isRc)
+                reverseComplement(allele, strlen(allele));
+            int obsCount = atoi(words[1]);
+            struct alCounts *alArray = hashFindVal(perAlleleCounts, allele);
+            if (alArray == NULL)
+                {
+                AllocArray(alArray, details->freqSourceCount);
+                hashAdd(perAlleleCounts, allele, alArray);
+                }
+            alArray[sIx].obsCount = obsCount;
+            alArray[sIx].totalCount = totalCount;
+            }
+        }
+    }
+return perAlleleCounts;
+}
+
+static void printAlleleRow(struct hash *perAlleleCounts, char *allele,
+                           struct dbSnpDetails *details, char **perSourceMajorAl)
+/* Print the allele and its counts/freqs from each freqSource for allele, if any. */
+{
+puts("<tr>");
+char buf[1024];
+char *abbrevAl = isEmpty(allele) ? "-" : bigDbSnpAbbrevAllele(allele, buf, sizeof buf);
+printf("<td><span class='inlineCode'>%s</span></td>", abbrevAl);
+struct alCounts *alleleCounts = hashFindVal(perAlleleCounts, allele);
+int sIx;
+for (sIx = 0;  sIx < details->freqSourceCount;  sIx++)
+    {
+    if (freqSourceHasData(details, sIx))
+        {
+        struct alCounts *counts = NULL;
+        if (alleleCounts)
+            counts = &alleleCounts[sIx];
+        if (counts != NULL && counts->totalCount != 0)
+            {
+            boolean isMajor = sameString(perSourceMajorAl[sIx], allele);
+            printf("<td>");
+            if (isMajor)
+                printf("<b>");
+            printf("%d/%d (%f)", counts->obsCount, counts->totalCount,
+                   ((double)counts->obsCount / (double)counts->totalCount));
+            if (isMajor)
+                printf("</b>");
+            }
+        else
+            printf("<td>-</td>");
+        }
+    }
+puts("</tr>");
+}
+
+static void printAlleleCountsAndFreqs(struct bigDbSnp *bds, struct dbSnpDetails *details,
+                                      struct trackDb *tdb)
+/* Print a table row containing a table of alleles, counts & frequencies if applicable. */
+{
+// Allele counts & frequencies
+if (bds->freqSourceCount > 0)
+    {
+    puts("<tr><td colspan=2><b>Allele frequency counts:</b></td></tr>");
+    puts("<tr><td colspan=2><table class='stdTbl'>");
+    struct slName *sourceList = getFreqSourceOrder(tdb, bds->name, bds->freqSourceCount);
+    int sIx;
+    if (sourceList)
+        {
+        puts("<tr><th>Allele</th>");
+        struct slName *source = sourceList;
+        for (sIx = 0;  sIx < bds->freqSourceCount;  sIx++, source = source->next)
+            if (freqSourceHasData(details, sIx))
+                printf("<th>%s</th> ", source->name);
+        puts("</tr>");
+        }
+    boolean isRc = (stringIn(bdsRevStrand, bds->ucscNotes) != NULL);
+    struct hash *perAlleleCounts = makePerAlleleCounts(details, isRc);
+    // Row for reference allele
+    printAlleleRow(perAlleleCounts, bds->ref, details, bds->majorAllele);
+    hashRemove(perAlleleCounts, bds->ref);
+    // Rows for alternate alleles
+    int aIx;
+    for (aIx = 0;  aIx < bds->altCount;  aIx++)
+        {
+        char *alt = bds->alts[aIx];
+        printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele);
+        hashRemove(perAlleleCounts, alt);
+        }
+    // Rows for frequency alleles not included in ref/alt, if any
+    struct hashEl *hel, *helList = hashElListHash(perAlleleCounts);
+    for (hel = helList;  hel != NULL;  hel = hel->next)
+        {
+        char *alt = hel->name;
+        printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele);
+        }
+    puts("</table>");
+    puts("</td></tr>");
+    }
+}
+
+static void printOneSoTerm(struct dbSnpDetails *details, int ix)
+/* Print a SO term with link to MISO browser for details->soTerms[ix]. */
+{
+printf("%s", soTermToMisoLink((enum soTerm)details->soTerms[ix]));
+}
+
+static void printOneClinVar(struct dbSnpDetails *details, int ix)
+/* Print a link to ClinVar for details->clinVar{Accs,Sigs}[ix]. */
+{
+printf("<a href='https://www.ncbi.nlm.nih.gov/clinvar/%s' target=_blank>%s</a> (%s)",
+       details->clinVarAccs[ix], details->clinVarAccs[ix], details->clinVarSigs[ix]);
+}
+
+static void printOneSubmitter(struct dbSnpDetails *details, int ix)
+/* Print a link to dbSNP submitter page for details->submitters[ix]. */
+{
+printf("<a href=\"https://www.ncbi.nlm.nih.gov/SNP/snp_viewTable.cgi?h=%s\" target=_blank>"
+       "%s</a>", details->submitters[ix], details->submitters[ix]);
+}
+
+static void printOnePub(struct dbSnpDetails *details, int ix)
+/* Print a link to Pubmed for details->pubMedIds[ix]. */
+{
+printf("<a href=\"https://www.ncbi.nlm.nih.gov/pubmed/%d\" target=_blank>PMID%d</a>\n",
+       details->pubMedIds[ix], details->pubMedIds[ix]);
+}
+
+static void printDetailsRow(struct dbSnpDetails *details, char *label, int itemCount,
+                            void (*printOneValue)(struct dbSnpDetails *details, int ix))
+/* Print out one row of a details table using callback to print comma-sep list of values. */
+{
+if (itemCount > 0)
+    {
+    printf("<tr><td><b>%s:</b></td><td>", label);
+    int ix;
+    for (ix = 0;  ix < itemCount;  ix++)
+        {
+        if (ix != 0)
+            puts(", ");
+        printOneValue(details, ix);
+        }
+    puts("</td></tr>");
+    }
+}
+
+static void printDbSnpDetails(struct bigDbSnp *bds, struct dbSnpDetails *details,
+                              struct trackDb *tdb)
+/* Print out extras from dbSnpDetails file. */
+{
+printAlleleCountsAndFreqs(bds, details, tdb);
+printDetailsRow(details, "Functional effects", details->soTermCount, printOneSoTerm);
+printDetailsRow(details, "ClinVar", details->clinVarCount, printOneClinVar);
+printDetailsRow(details, "Submitted by", details->submitterCount, printOneSubmitter);
+printDetailsRow(details, "Publications in PubMed", details->pubMedIdCount, printOnePub);
+}
+
+static void printUcscNotes(char *ucscNotes)
+/* Print explanations of ucscNotes items. */
+{
+if (isNotEmpty(ucscNotes))
+    {
+    puts("<p>Interesting or anomalous conditions noted by UCSC:</b><br>");
+    puts("<ul>");
+    boolean isCommonAll = (stringIn(bdsCommonAll, ucscNotes) != NULL);
+    struct slName *note, *noteList = slNameListFromComma(ucscNotes);
+    for (note = noteList;  note != NULL;  note = note->next)
+        {
+        // When commonAll is true, commonSome is also true but not informative,
+        // so skip commonSome if commonAll is true.
+        if (! (isCommonAll && sameString(note->name, bdsCommonSome)))
+            {
+            char *desc = bigDbSnpDescribeUcscNote(note->name);
+            printf("<li>%s\n", desc ? desc : note->name);
+            }
+        }
+    puts("</ul>");
+    }
+}
+
+static char *getMinRep(char *ref, char *alt, boolean leftJustify)
+/* If ref and alt can be trimmed down to a shorter representation then return that, othw NULL. */
+{
+char *minRep = NULL;
+int refLen = strlen(ref);
+int altLen = strlen(alt);
+if ((refLen > 1 && altLen > 0) || (altLen > 1 && refLen > 0))
+    {
+    char *refCpy = cloneStringZ(ref, refLen);
+    char *altCpy = cloneStringZ(alt, altLen);
+    uint start=0, end=refLen;
+    int refCpyLen = refLen, altCpyLen = altLen;
+    if (leftJustify)
+        trimRefAltLeft(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen);
+    else
+        trimRefAlt(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen);
+    if (refCpyLen < refLen)
+        {
+        struct dyString *dy = dyStringNew(0);
+        char bufRef[1024], bufAlt[1024];
+        if (refCpyLen == 0)
+            dyStringPrintf(dy, "ins%s",
+                           bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt));
+        else if (altCpyLen == 0)
+            dyStringPrintf(dy, "del%s",
+                           bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef));
+        else
+            dyStringPrintf(dy, "del%sins%s",
+                           bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef),
+                           bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt));
+        minRep = dyStringCannibalize(&dy);
+        }
+    }
+return minRep;
+}
+
+static void printOtherMappings(struct bbiFile *bbi, struct bigDbSnp *bds, struct trackDb *tdb)
+/* If the variant maps to other genomic sequences (alts/fixes/PAR), link to those positions. */
+{
+int fieldIx;
+struct bptFile *bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx);
+struct lm *lm = lmInit(0);
+struct bigBedInterval *bbList = bigBedNameQuery(bbi, bpt, fieldIx, bds->name, lm);
+if (slCount(bbList) > 1)
+    {
+    puts("<p><b>This variant maps to additional locations:</b><br><br>");
+    char chromName[bbi->chromBpt->keySize+1];
+    int lastChromId = -1;
+    struct bigBedInterval *bb;
+    for (bb = bbList; bb != NULL; bb = bb->next)
+        {
+        if (!startsWithWord(bds->name, bb->rest))
+            errAbort("Error: bigBedNameQuery search for name '%s' yielded '%s'",
+                     bds->name, bb->rest);
+        bbiCachedChromLookup(bbi, bb->chromId, lastChromId, chromName, sizeof(chromName));
+        char startBuf[16], endBuf[16];
+        char *row[BIGDBSNP_NUM_COLS];
+        int bbFieldCount = bigBedIntervalToRow(bb, chromName, startBuf, endBuf, row, ArraySize(row));
+        if (bbFieldCount != BIGDBSNP_NUM_COLS)
+            errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS,
+                     bbFieldCount);
+        struct bigDbSnp *otherBds = bigDbSnpLoad(row);
+        if (differentString(bds->chrom, otherBds->chrom) ||
+            bds->chromStart != otherBds->chromStart ||
+            bds->chromEnd != otherBds->chromEnd)
+            {
+            bedPrintPos((struct bed *)otherBds, 3, tdb);
+            if (bb->next != NULL)
+                puts("<br>");
+            }
+        }
+    puts("</p>");
+    }
+bptFileDetach(&bpt);
+lmCleanup(&lm);
+}
+
+void doBigDbSnp(struct trackDb *tdb, char *rsId)
+/* Show details for bigDbSnp item. */
+{
+int start = cartInt(cart, "o");
+int end = cartInt(cart, "t");
+char *fileOrUrl = trackDbSetting(tdb, "bigDataUrl");
+if (isEmpty(fileOrUrl))
+    errAbort("bigDbSnpClick: trackDb is missing bigDataUrl setting");
+struct bbiFile *bbi = bigBedFileOpen(fileOrUrl);
+boolean found = FALSE;
+char *chrom = cartString(cart, "c");
+int ivStart = start, ivEnd = end;
+if (start == end)
+    {
+    // item is an insertion; expand the search range from 0 bases to 2 so we catch it:
+    ivStart = max(0, start-1);
+    ivEnd++;
+    }
+struct lm *lm = lmInit(0);
+struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, ivStart, ivEnd, 0, lm);
+struct bigBedInterval *bb;
+for (bb = bbList; bb != NULL; bb = bb->next)
+    {
+    if (!startsWithWord(rsId, bb->rest))
+	continue;
+    found = TRUE;
+    char startBuf[16], endBuf[16];
+    char *row[BIGDBSNP_NUM_COLS];
+    int bbFieldCount = bigBedIntervalToRow(bb, chrom, startBuf, endBuf, row, ArraySize(row));
+    if (bbFieldCount != BIGDBSNP_NUM_COLS)
+        errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount);
+    struct bigDbSnp *bds = bigDbSnpLoad(row);
+    struct dbSnpDetails *details = NULL;
+    struct slPair *detailsUrls = parseDetailsTablUrls(tdb);
+    if (detailsUrls)
+        details = getDetails(bds, detailsUrls->val);
+    bedPrintPos((struct bed *)bds, 3, tdb);
+    puts("<br>");
+    puts("<table>");
+    char *ref = bds->ref;
+    char abbrev[1024];
+    char *abbrevRef = isEmpty(ref) ? "-" : bigDbSnpAbbrevAllele(ref, abbrev, sizeof abbrev);
+    printf("<tr><td><b>Reference allele:</b></td><td><span class='inlineCode'>%s</span><td></tr>\n",
+           abbrevRef);
+    printf("<tr><td");
+    if (bds->altCount > 1)
+        printf(" style='vertical-align:top'");
+    printf("><b>Alternate allele%s:</b></td><td>", (bds->altCount > 1 ? "s" : ""));
+    if (bds->altCount == 0)
+        printf("<em>none</em>");
+    else
+        {
+        int i;
+        for (i = 0;  i < bds->altCount;  i++)
+            {
+            char *alt = bds->alts[i];
+            char *abbrevAlt = isEmpty(alt) ? "-" : bigDbSnpAbbrevAllele(alt, abbrev, sizeof abbrev);
+            printf("%s<span class='inlineCode'>%s", (i > 0 ? ",<br>\n" : ""), abbrevAlt);
+            char *minRepLeft = getMinRep(ref, alt, TRUE);
+            if (minRepLeft)
+                {
+                char *minRepRight = getMinRep(ref, alt, FALSE);
+                if (sameString(minRepLeft, minRepRight))
+                    printf(" [%s]", minRepLeft);
+                else
+                    printf(" [%s (left-shifted), %s (right-shifted)]", minRepLeft, minRepRight);
+                }
+            printf("</span>");
+            }
+        puts("</td></tr>");
+        }
+    if (bds->shiftBases > 0)
+        printf("<td><b>Uncertainty in indel placement:</b></td><td>%d base%s</td></tr>\n",
+               bds->shiftBases, (bds->shiftBases > 1 ? "s" : ""));
+    if (details)
+        printDbSnpDetails(bds, details, tdb);
+    printf("<tr><td><b>Variation class/type:</b></td><td>%s</td></tr>\n",
+           bigDbSnpClassToString(bds->class));
+    puts("</table>");
+    printUcscNotes(bds->ucscNotes);
+    printOtherMappings(bbi, bds, tdb);
+    }
+if (!found)
+    printf("No item %s starting at %s:%d\n", rsId, chrom, start+1);
+lmCleanup(&lm);
+bbiFileClose(&bbi);
+}