src/hg/lib/bigDbSnp.c d3752edc12da1bf08427946150f564dbdd5d2254

d3752edc12da1bf08427946150f564dbdd5d2254
angie
  Thu Oct 24 13:55:51 2019 -0700
bigDbSnp track handler code - initial commit.  refs #23283
* dnautil: Added trimRefAltLeft to get left-justified trimming (a la VCF not HGVS).
* bigBedClick: do hReplaceGbdb up front in parseDetailsTablUrls instead of waiting until endpoint.
* trackDbCustom.c: consolidating type-handling for wig/bigWig vs. bigBed-based big*.

diff --git src/hg/lib/bigDbSnp.c src/hg/lib/bigDbSnp.c
index 71c30b7..863d8af 100644
--- src/hg/lib/bigDbSnp.c
+++ src/hg/lib/bigDbSnp.c
@@ -285,15 +285,193 @@
 if (sep == ',') fputc('"',f);
 sqlEnumPrint(f, el->class, values_class);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->ucscNotes);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 fprintf(f, "%lld", el->_dataOffset);
 fputc(sep,f);
 fprintf(f, "%d", el->_dataLen);
 fputc(lastSep,f);
 }
 
 /* -------------------------------- End autoSql Generated Code -------------------------------- */
+
+struct symbolDesc  /* A symbol paired with human-readable text describing it. */
+{
+    char *symbol;       /* Symbol string; compared by bigDbSnpDescribeUcscNote using sameString */
+    char *description;  /* Text that bigDbSnpDescribeUcscNote returns for this symbol */
+};
+
+struct symbolDesc ucscNotesDesc[] =
+/* Description of each ucscNotes symbol; searched by bigDbSnpDescribeUcscNote. */
+    {
+    { bdsAltIsAmbiguous,
+      "At least one alternate allele contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
+    { bdsClassMismatch,
+      "Variation class/type is inconsistent with alleles mapped to this genome assembly." },
+    { bdsClinvar,
+      "Variant is in ClinVar." },
+    { bdsClusterError,
+      "This variant has the same start, end and class as another variant; "
+      "they probably should have been merged into one variant." },
+    { bdsCommonAll,
+      "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
+      "in all projects reporting frequencies." },
+    { bdsCommonSome,
+      "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
+      "in some, but not all, projects reporting frequencies." },
+    { bdsDiffMajor,
+      "Different frequency sources have different major alleles "
+      "(see table of allele frequencies above)." },
+    { bdsFreqIsAmbiguous,
+      "At least one allele reported by at least one project "
+      "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
+    { bdsFreqNotRefAlt,
+      "At least one allele reported by at least one project "
+      "does not match any of the reference or alternate alleles listed by dbSNP." },
+    { bdsMultiMap,
+      "This variant has been mapped to more than one distinct genomic location." },
+    { bdsOverlapDiffClass,
+      "This variant overlaps another variant with a different type/class." },
+    { bdsOverlapSameClass,
+      "This variant overlaps another with the same type/class but different start/end." },
+    { bdsRefIsAmbiguous,
+      "The reference genome allele contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
+    { bdsRefIsMinor,
+      "The reference genome allele is not the major allele in at least one project." },
+    { bdsRefIsRare,
+      "The reference genome allele is rare (i.e. allele frequency < 1%)." },
+    { bdsRefIsSingleton,
+      "The reference genome allele has never been observed "
+      "in a population sequencing project reporting frequencies." },
+    { bdsRefMismatch,
+      "The reference genome allele reported by dbSNP differs from the GenBank assembly sequence." },
+    { bdsRevStrand,
+      "The orientation of the currently viewed reference genome sequence is different from "
+      "the orientation of dbSNP's preferred assembly; alleles are "
+      "presented on the forward strand of the currently viewed reference sequence." },
+    };
+
+char *bigDbSnpDescribeUcscNote(char *ucscNote)
+/* Look up ucscNote among the symbols in ucscNotesDesc.  Return the matching entry's
+ * description, or NULL if no symbol matches.  The result belongs to the table;
+ * do not free it. */
+{
+struct symbolDesc *desc = ucscNotesDesc;
+struct symbolDesc *descEnd = desc + ArraySize(ucscNotesDesc);
+// Advance until a symbol matches or the table is exhausted.
+while (desc < descEnd && !sameString(ucscNote, desc->symbol))
+    desc++;
+return (desc < descEnd) ? desc->description : NULL;
+}
+
+char *bigDbSnpClassToString(enum bigDbSnpClass class)
+/* Map an enum bigDbSnpClass value to its lowercase name: "snv", "mnv", "ins", "del",
+ * "delins" or "identity".
+ * Any other value is reported through errAbort.
+ * The result is a string literal; do not free it. */
+{
+// Every recognized value returns from inside the switch.
+switch (class)
+    {
+    case bigDbSnpSnv:
+        return "snv";
+    case bigDbSnpMnv:
+        return "mnv";
+    case bigDbSnpIns:
+        return "ins";
+    case bigDbSnpDel:
+        return "del";
+    case bigDbSnpDelins:
+        return "delins";
+    case bigDbSnpIdentity:
+        return "identity";
+    default:
+        // Not a value this function knows how to name.
+        errAbort("bigDbSnpClassToString: unrecognized value %d", (int)class);
+        break;
+    }
+// Only reached if errAbort above returns to its caller.
+return NULL;
+}
+
+static boolean abbrevNRepeat(char *allele, int n, char *buf, size_t bufLen)
+/* If allele begins with a run of its first n bases repeated, write "(unit)count" followed by
+ * the rest of allele (abbreviated the same way when possible) into buf.  Return TRUE if the
+ * result fits in bufLen bytes; otherwise return FALSE, with buf emptied if it was written. */
+{
+boolean canAbbrev = FALSE;
+int len = strlen(allele);
+int minAbbrevLen = max(n*2, n+4);
+if (n > 0 && len >= minAbbrevLen && bufLen >= (size_t)minAbbrevLen)
+    {
+    // Count complete copies of the unit; i stops at the first base that breaks the run.
+    int i, reps = 1;
+    for (i = n;  i < len && allele[i] == allele[i-n];  i++)
+        if (i % n == n-1)
+            reps++;
+    if (i >= minAbbrevLen)
+        {
+        char repeatUnit[n+1];
+        safencpy(repeatUnit, sizeof repeatUnit, allele, n);
+        int abbrevLen = snprintf(buf, bufLen, "(%s)%d", repeatUnit, reps);
+        // snprintf reports the untruncated length; if the count did not fit, stop before
+        // buf+abbrevLen leaves the buffer and bufLen - abbrevLen wraps around.
+        if (abbrevLen > 0 && (size_t)abbrevLen < bufLen)
+            {
+            char *bufRest = buf+abbrevLen;
+            size_t bufRestLen = bufLen - abbrevLen;
+            char *alRest = allele + (reps * n);
+            if (bufRestLen > 5 && abbrevNRepeat(alRest, n, bufRest, bufRestLen))
+                abbrevLen = strlen(buf);
+            else
+                abbrevLen += snprintf(bufRest, bufRestLen, "%s", alRest);
+            canAbbrev = ((size_t)abbrevLen < bufLen);
+            }
+        if (!canAbbrev)
+            buf[0] = '\0';
+        }
+    }
+return canAbbrev;
+}
+
+char *bigDbSnpAbbrevAllele(char *allele, char *buf, size_t bufLen)
+/* Try to abbreviate allele's repeated bases into buf, using the shortest repeat unit that
+ * abbrevNRepeat accepts.  Return buf if that worked and the part up to the last ')' is at
+ * least a quarter (rounded down) of the result's length; otherwise return allele.
+ * If allele is the empty string, returns "-" (in buf). */
+{
+if (isEmpty(allele))
+    {
+    safecpy(buf, bufLen, "-");
+    return buf;
+    }
+char *abbrev = allele;
+size_t alleleLen = strlen(allele);
+// Guard bufLen - 3 against size_t wraparound when buf is tiny, and skip unit lengths above
+// alleleLen/2: abbrevNRepeat requires at least 2n bases, so those can never match.
+size_t maxByBuf = (bufLen > 3) ? (bufLen - 3) / 2 : 0;
+size_t maxN = (maxByBuf < alleleLen / 2) ? maxByBuf : alleleLen / 2;
+int n;
+for (n = 1;  (size_t)n <= maxN;  n++)
+    if (abbrevNRepeat(allele, n, buf, bufLen))
+        {
+        abbrev = buf;
+        break;
+        }
+if (abbrev == buf)
+    {
+    int alLen = strlen(buf);
+    char *abbrevEnd = strrchr(buf, ')');
+    if (abbrevEnd == NULL)
+        errAbort("bigDbSnpAbbrevAllele: expect abbreviated allele '%s' to contain at least one ')'",
+                 buf);
+    int abbrevLen = abbrevEnd + 1 - buf;
+    // Never mind if the abbreviated portion is much smaller than the unabbreviated portion.
+    if (abbrevLen < alLen>>2)
+        abbrev = allele;
+    }
+return abbrev;
+}