1b074613b1c6b7f84953c3f490631b91d844c9a9
galt
  Thu Sep 8 22:40:20 2022 -0700
Adapting Angie's existing variant effects to the new dbSnp that uses SPDI and provided via json formatted data. So users can see variant effects with version 153 or later. refs #29989

diff --git src/hg/hgc/bigDbSnpClick.c src/hg/hgc/bigDbSnpClick.c
index 25ca778..4800424 100644
--- src/hg/hgc/bigDbSnpClick.c
+++ src/hg/hgc/bigDbSnpClick.c
@@ -1,423 +1,456 @@
 /* Show details for bigDbSnp track items. */
 
 /* Copyright (C) 2019 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "hgc.h"
 #include "bigDbSnp.h"
 #include "dbSnpDetails.h"
 #include "bPlusTree.h"
 #include "htslib/bgzf.h"
 #include "soTerm.h"
 #include "chromAlias.h"
 
 static struct dbSnpDetails *getDetails(struct bigDbSnp *bds, char *detailsFileOrUrl)
 /* Seek to the offset for this variant in detailsFileOrUrl, read the line and load as
  * struct dbSnpDetails.  */
 {
 bits64 offset = bds->_dataOffset;
 bits64 len = bds->_dataLen;
 char *line = readOneLineMaybeBgzip(detailsFileOrUrl, offset, len);
 // Newline must be trimmed or else it messes up parsing of final column if empty!
 if (line[len-1] == '\n')
     line[len-1] = '\0';
 char *row[DBSNPDETAILS_NUM_COLS+1];
 int wordCount = chopTabs(line, row);
 if (wordCount != DBSNPDETAILS_NUM_COLS)
     errAbort("dbSnpDetails: expected %d tab-separated words at offset %Ld in %s, got %d",
              DBSNPDETAILS_NUM_COLS, offset, detailsFileOrUrl, wordCount);
 return dbSnpDetailsLoad(row);
 }
 
 struct slName *getFreqSourceOrder(struct trackDb *tdb, char *rsId, int expectedCount)
 /* If tdb has freqSourceOrder*/
 {
 struct slName *sourceList = NULL;
 char *sourceNames = trackDbSetting(tdb, "freqSourceOrder");
 if (sourceNames)
     {
     sourceList = slNameListFromComma(sourceNames);
     int settingCount = slCount(sourceList);
     if (settingCount != expectedCount)
         errAbort("bigDbSnp freqSourceCount for %s is %d, "
                  "but trackDb setting freqSourceOrder lists %d sources",
                  rsId, expectedCount, settingCount);
     }
 return sourceList;
 }
 
 INLINE boolean freqSourceHasData(struct dbSnpDetails *details, int sourceIx)
 /* Return TRUE if freqSource sourceIx has any data for this variant. */
 {
 return details->alleleTotals[sourceIx] > 0;
 }
 
 struct alCounts
     {
     int obsCount;   // Number of chromosomes on which allele was observed by a project
     int totalCount; // Number of chromosomes on which project observed some allele of this variant
     };
 
 static struct hash *makePerAlleleCounts(struct dbSnpDetails *details, boolean isRc)
 /* Return a hash of allele to array of alCounts (each source's frequency count data). */
 {
 struct hash *perAlleleCounts = hashNew(0);
 int sIx;
 for (sIx = 0;  sIx < details->freqSourceCount;  sIx++)
     {
     if (freqSourceHasData(details, sIx))
         {
         int totalCount = details->alleleTotals[sIx];
         char *blob = cloneString(details->alleleCounts[sIx]);
         struct slName *pab, *perAlleleBlobs = slNameListFromString(blob, '|');
         for (pab = perAlleleBlobs;  pab != NULL;  pab = pab->next)
             {
             char *words[3];
             int wordCount = chopByChar(pab->name, ':', words, ArraySize(words));
             if (wordCount != 2)
                 errAbort("Malformed allele:count in |-separated '%s': "
                          "expecting two :-separated words but got %d", pab->name, wordCount);
             char *allele = words[0];
             if (isRc)
                 reverseComplement(allele, strlen(allele));
             int obsCount = atoi(words[1]);
             struct alCounts *alArray = hashFindVal(perAlleleCounts, allele);
             if (alArray == NULL)
                 {
                 AllocArray(alArray, details->freqSourceCount);
                 hashAdd(perAlleleCounts, allele, alArray);
                 }
             alArray[sIx].obsCount = obsCount;
             alArray[sIx].totalCount = totalCount;
             }
         }
     }
 return perAlleleCounts;
 }
 
 static void printAlleleRow(struct hash *perAlleleCounts, char *allele,
                            struct dbSnpDetails *details, char **perSourceMajorAl)
 /* Print the allele and its counts/freqs from each freqSource for allele, if any. */
 {
 puts("<tr>");
 char buf[1024];
 char *abbrevAl = isEmpty(allele) ? "-" : bigDbSnpAbbrevAllele(allele, buf, sizeof buf);
 printf("<td><span class='inlineCode'>%s</span></td>", abbrevAl);
 struct alCounts *alleleCounts = hashFindVal(perAlleleCounts, allele);
 int sIx;
 for (sIx = 0;  sIx < details->freqSourceCount;  sIx++)
     {
     if (freqSourceHasData(details, sIx))
         {
         struct alCounts *counts = NULL;
         if (alleleCounts)
             counts = &alleleCounts[sIx];
         if (counts != NULL && counts->totalCount != 0)
             {
             boolean isMajor = sameString(perSourceMajorAl[sIx], allele);
             printf("<td>");
             if (isMajor)
                 printf("<b>");
             printf("%d/%d (%f)", counts->obsCount, counts->totalCount,
                    ((double)counts->obsCount / (double)counts->totalCount));
             if (isMajor)
                 printf("</b>");
             }
         else
             printf("<td>-</td>");
         }
     }
 puts("</tr>");
 }
 
 static void printAlleleCountsAndFreqs(struct bigDbSnp *bds, struct dbSnpDetails *details,
                                       struct trackDb *tdb)
 /* Print a table row containing a table of alleles, counts & frequencies if applicable. */
 {
 // Allele counts & frequencies
 if (bds->freqSourceCount > 0)
     {
     puts("<tr><td colspan=2><b>Allele frequency counts:</b></td></tr>");
     puts("<tr><td colspan=2><table class='stdTbl'>");
     struct slName *sourceList = getFreqSourceOrder(tdb, bds->name, bds->freqSourceCount);
     int sIx;
     if (sourceList)
         {
         puts("<tr><th>Allele</th>");
         struct slName *source = sourceList;
         for (sIx = 0;  sIx < bds->freqSourceCount;  sIx++, source = source->next)
             if (freqSourceHasData(details, sIx))
                 printf("<th>%s</th> ", source->name);
         puts("</tr>");
         }
     boolean isRc = (stringIn(bdsRevStrand, bds->ucscNotes) != NULL);
     struct hash *perAlleleCounts = makePerAlleleCounts(details, isRc);
     // Row for reference allele
     printAlleleRow(perAlleleCounts, bds->ref, details, bds->majorAllele);
     hashRemove(perAlleleCounts, bds->ref);
     // Rows for alternate alleles
     int aIx;
     for (aIx = 0;  aIx < bds->altCount;  aIx++)
         {
         char *alt = bds->alts[aIx];
         printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele);
         hashRemove(perAlleleCounts, alt);
         }
     // Rows for frequency alleles not included in ref/alt, if any
     struct hashEl *hel, *helList = hashElListHash(perAlleleCounts);
     for (hel = helList;  hel != NULL;  hel = hel->next)
         {
         char *alt = hel->name;
         printAlleleRow(perAlleleCounts, alt, details, bds->majorAllele);
         }
     puts("</table>");
     puts("</td></tr>");
     }
 }
 
 static void printOneSoTerm(struct dbSnpDetails *details, int ix)
 /* Print a SO term with link to MISO browser for details->soTerms[ix]. */
 {
 printf("%s", soTermToMisoLink((enum soTerm)details->soTerms[ix]));
 }
 
 static void printOneClinVar(struct dbSnpDetails *details, int ix)
 /* Print a link to ClinVar for details->clinVar{Accs,Sigs}[ix]. */
 {
 printf("<a href='https://www.ncbi.nlm.nih.gov/clinvar/%s' target=_blank>%s</a> (%s)",
        details->clinVarAccs[ix], details->clinVarAccs[ix], details->clinVarSigs[ix]);
 }
 
 static void printOneSubmitter(struct dbSnpDetails *details, int ix)
 /* Print a link to dbSNP submitter page for details->submitters[ix]. */
 {
 printf("<a href=\"https://www.ncbi.nlm.nih.gov/SNP/snp_viewTable.cgi?h=%s\" target=_blank>"
        "%s</a>", details->submitters[ix], details->submitters[ix]);
 }
 
 static void printOnePub(struct dbSnpDetails *details, int ix)
 /* Print a link to Pubmed for details->pubMedIds[ix]. */
 {
 printf("<a href=\"https://www.ncbi.nlm.nih.gov/pubmed/%d\" target=_blank>PMID%d</a>\n",
        details->pubMedIds[ix], details->pubMedIds[ix]);
 }
 
 static void printDetailsRow(struct dbSnpDetails *details, char *label, int itemCount,
                             void (*printOneValue)(struct dbSnpDetails *details, int ix))
 /* Print out one row of a details table using callback to print comma-sep list of values. */
 {
 if (itemCount > 0)
     {
     printf("<tr><td><b>%s:</b></td><td>", label);
     int ix;
     for (ix = 0;  ix < itemCount;  ix++)
         {
         if (ix != 0)
             puts(", ");
         printOneValue(details, ix);
         }
     puts("</td></tr>");
     }
 }
 
 static void printDbSnpDetails(struct bigDbSnp *bds, struct dbSnpDetails *details,
                               struct trackDb *tdb)
 /* Print out extras from dbSnpDetails file. */
 {
 printAlleleCountsAndFreqs(bds, details, tdb);
 printDetailsRow(details, "Functional effects", details->soTermCount, printOneSoTerm);
 printDetailsRow(details, "ClinVar", details->clinVarCount, printOneClinVar);
 printDetailsRow(details, "Submitted by", details->submitterCount, printOneSubmitter);
 printDetailsRow(details, "Publications in PubMed", details->pubMedIdCount, printOnePub);
 }
 
 static void printUcscNotes(char *ucscNotes)
 /* Print explanations of ucscNotes items. */
 {
 if (isNotEmpty(ucscNotes))
     {
     puts("<p>Interesting or anomalous conditions noted by UCSC:</b><br>");
     puts("<ul>");
     boolean isCommonAll = (stringIn(bdsCommonAll, ucscNotes) != NULL);
     boolean isRareAll = (stringIn(bdsRareAll, ucscNotes) != NULL);
     struct slName *note, *noteList = slNameListFromComma(ucscNotes);
     for (note = noteList;  note != NULL;  note = note->next)
         {
         // When commonAll is true, commonSome is also true but not informative,
         // so skip commonSome if commonAll is true.  Likewise for rareAll & rareSome.
         if (! ((isCommonAll && sameString(note->name, bdsCommonSome)) ||
                (isRareAll && sameString(note->name, bdsRareSome))))
             {
             char *desc = bigDbSnpDescribeUcscNote(note->name);
             printf("<li>%s\n", desc ? desc : note->name);
             }
         }
     puts("</ul>");
     }
 }
 
 static char *getMinRep(char *ref, char *alt, boolean leftJustify)
 /* If ref and alt can be trimmed down to a shorter representation then return that, othw NULL. */
 {
 char *minRep = NULL;
 int refLen = strlen(ref);
 int altLen = strlen(alt);
 if ((refLen > 1 && altLen > 0) || (altLen > 1 && refLen > 0))
     {
     char *refCpy = cloneStringZ(ref, refLen);
     char *altCpy = cloneStringZ(alt, altLen);
     uint start=0, end=refLen;
     int refCpyLen = refLen, altCpyLen = altLen;
     if (leftJustify)
         trimRefAltLeft(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen);
     else
         trimRefAlt(refCpy, altCpy, &start, &end, &refCpyLen, &altCpyLen);
     if (refCpyLen < refLen)
         {
         struct dyString *dy = dyStringNew(0);
         char bufRef[1024], bufAlt[1024];
         if (refCpyLen == 0)
             dyStringPrintf(dy, "ins%s",
                            bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt));
         else if (altCpyLen == 0)
             dyStringPrintf(dy, "del%s",
                            bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef));
         else
             dyStringPrintf(dy, "del%sins%s",
                            bigDbSnpAbbrevAllele(refCpy, bufRef, sizeof bufRef),
                            bigDbSnpAbbrevAllele(altCpy, bufAlt, sizeof bufAlt));
         minRep = dyStringCannibalize(&dy);
         }
     }
 return minRep;
 }
 
 static void printOtherMappings(struct bbiFile *bbi, struct bigDbSnp *bds, struct trackDb *tdb)
 /* If the variant maps to other genomic sequences (alts/fixes/PAR), link to those positions. */
 {
 int fieldIx;
 struct bptFile *bpt = bigBedOpenExtraIndex(bbi, "name", &fieldIx);
 struct lm *lm = lmInit(0);
 struct bigBedInterval *bbList = bigBedNameQuery(bbi, bpt, fieldIx, bds->name, lm);
 if (slCount(bbList) > 1)
     {
     puts("<p><b>This variant maps to additional locations:</b><br><br>");
     char chromName[bbi->chromBpt->keySize+1];
     int lastChromId = -1;
     struct bigBedInterval *bb;
     for (bb = bbList; bb != NULL; bb = bb->next)
         {
         if (!startsWithWord(bds->name, bb->rest))
             errAbort("Error: bigBedNameQuery search for name '%s' yielded '%s'",
                      bds->name, bb->rest);
         bbiCachedChromLookup(bbi, bb->chromId, lastChromId, chromName, sizeof(chromName));
         char startBuf[16], endBuf[16];
         char *row[BIGDBSNP_NUM_COLS];
         int bbFieldCount = bigBedIntervalToRow(bb, chromName, startBuf, endBuf, row, ArraySize(row));
         if (bbFieldCount != BIGDBSNP_NUM_COLS)
             errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS,
                      bbFieldCount);
         struct bigDbSnp *otherBds = bigDbSnpLoad(row);
         if (differentString(bds->chrom, otherBds->chrom) ||
             bds->chromStart != otherBds->chromStart ||
             bds->chromEnd != otherBds->chromEnd)
             {
             bedPrintPos((struct bed *)otherBds, 3, tdb);
             if (bb->next != NULL)
                 puts("<br>");
             }
         }
     puts("</p>");
     }
 bptFileDetach(&bpt);
 lmCleanup(&lm);
 }
 
+struct snp125 *bdsTosnp125(struct bigDbSnp *bds)
+/* Copy over the bed6 plus observed fields. */
+{
+struct snp125 *snp125;
+AllocVar(snp125);
+snp125->chrom = cloneString(bds->chrom);
+snp125->chromStart = bds->chromStart;
+snp125->chromEnd = bds->chromEnd;
+snp125->name = cloneString(bds->name);
+snp125->refUCSC = cloneString(bds->ref);
+
+snp125->strand = cloneString("+");
+
+struct dyString *dy = dyStringNew(0);
+int i;
+for (i = 0;  i < bds->altCount;  i++)
+    {
+    char *alt = bds->alts[i];
+    if (i > 0)
+    dyStringPrintf(dy, "/");
+    dyStringPrintf(dy, "%s", alt);
+    }
+snp125->observed = cloneString(dyStringCannibalize(&dy));
+
+return snp125;
+}
+
+
 void doBigDbSnp(struct trackDb *tdb, char *rsId)
 /* Show details for bigDbSnp item. */
 {
 int start = cartInt(cart, "o");
 int end = cartInt(cart, "t");
 char *fileOrUrl = hReplaceGbdb(trackDbSetting(tdb, "bigDataUrl"));
 if (isEmpty(fileOrUrl))
     errAbort("bigDbSnpClick: trackDb is missing bigDataUrl setting");
 struct bbiFile *bbi =  bigBedFileOpenAlias(fileOrUrl, chromAliasFindAliases);
 boolean found = FALSE;
 char *chrom = cartString(cart, "c");
 int ivStart = start, ivEnd = end;
 if (start == end)
     {
     // item is an insertion; expand the search range from 0 bases to 2 so we catch it:
     ivStart = max(0, start-1);
     ivEnd++;
     }
 struct lm *lm = lmInit(0);
 struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, ivStart, ivEnd, 0, lm);
 struct bigBedInterval *bb;
 for (bb = bbList; bb != NULL; bb = bb->next)
     {
     if (!startsWithWord(rsId, bb->rest))
 	continue;
     found = TRUE;
     char startBuf[16], endBuf[16];
     char *row[BIGDBSNP_NUM_COLS];
     int bbFieldCount = bigBedIntervalToRow(bb, chrom, startBuf, endBuf, row, ArraySize(row));
     if (bbFieldCount != BIGDBSNP_NUM_COLS)
         errAbort("bigDbSnpClick: expected %d columns but got %d", BIGDBSNP_NUM_COLS, bbFieldCount);
     struct bigDbSnp *bds = bigDbSnpLoad(row);
     struct dbSnpDetails *details = NULL;
     struct slPair *detailsUrls = parseDetailsTablUrls(tdb);
     if (detailsUrls)
         details = getDetails(bds, detailsUrls->val);
     bedPrintPos((struct bed *)bds, 3, tdb);
     puts("<br>");
     puts("<table>");
     char *ref = bds->ref;
     char abbrev[1024];
     char *abbrevRef = isEmpty(ref) ? "-" : bigDbSnpAbbrevAllele(ref, abbrev, sizeof abbrev);
     printf("<tr><td><b>Reference allele:</b></td><td><span class='inlineCode'>%s</span><td></tr>\n",
            abbrevRef);
     printf("<tr><td");
     if (bds->altCount > 1)
         printf(" style='vertical-align:top'");
     printf("><b>Alternate allele%s:</b></td><td>", (bds->altCount > 1 ? "s" : ""));
     if (bds->altCount == 0)
         printf("<em>none</em>");
     else
         {
         int i;
         for (i = 0;  i < bds->altCount;  i++)
             {
             char *alt = bds->alts[i];
             char *abbrevAlt = isEmpty(alt) ? "-" : bigDbSnpAbbrevAllele(alt, abbrev, sizeof abbrev);
             printf("%s<span class='inlineCode'>%s", (i > 0 ? ",<br>\n" : ""), abbrevAlt);
             char *minRepLeft = getMinRep(ref, alt, TRUE);
             if (minRepLeft)
                 {
                 char *minRepRight = getMinRep(ref, alt, FALSE);
                 if (sameString(minRepLeft, minRepRight))
                     printf(" [%s]", minRepLeft);
                 else
                     printf(" [%s (left-shifted), %s (right-shifted)]", minRepLeft, minRepRight);
                 }
             printf("</span>");
             }
         puts("</td></tr>");
         }
     if (bds->shiftBases > 0)
         printf("<td><b>Uncertainty in indel placement:</b></td><td>%d base%s</td></tr>\n",
                bds->shiftBases, (bds->shiftBases > 1 ? "s" : ""));
     if (details)
         printDbSnpDetails(bds, details, tdb);
     printf("<tr><td><b>Variation class/type:</b></td><td>%s</td></tr>\n",
            bigDbSnpClassToString(bds->class));
+
     puts("</table>");
     printUcscNotes(bds->ucscNotes);
+
+    struct snp125 *snp125 = bdsTosnp125(bds);
+    printSnp153Function(tdb, snp125);
+
     printOtherMappings(bbi, bds, tdb);
     }
 if (!found)
     printf("No item %s starting at %s:%d\n", rsId, chrom, start+1);
 lmCleanup(&lm);
 bbiFileClose(&bbi);
 }