src/hg/lib/bigDbSnp.c d3752edc12da1bf08427946150f564dbdd5d2254

d3752edc12da1bf08427946150f564dbdd5d2254
angie
  Thu Oct 24 13:55:51 2019 -0700
bigDbSnp track handler code - initial commit.  refs #23283
* dnautil: Added trimRefAltLeft to get left-justified trimming (a la VCF not HGVS).
* bigBedClick: do hReplaceGbdb up front in parseDetailsTablUrls instead of waiting until endpoint.
* trackDbCustom.c: consolidating type-handling for wig/bigWig vs. bigBed-based big*.

diff --git src/hg/lib/bigDbSnp.c src/hg/lib/bigDbSnp.c
index 71c30b7..863d8af 100644
--- src/hg/lib/bigDbSnp.c
+++ src/hg/lib/bigDbSnp.c
@@ -1,299 +1,477 @@
 /* bigDbSnp.c was originally generated by the autoSql program, which also 
  * generated bigDbSnp.h and bigDbSnp.sql.  This module links the database and
  * the RAM representation of objects. */
 
 #include "common.h"
 #include "linefile.h"
 #include "dystring.h"
 #include "jksql.h"
 #include "bigDbSnp.h"
 
 
 
 /* Comma-separated names of all bigDbSnp columns.  The order matches the row[] indices
  * read by bigDbSnpLoad (row[0] is chrom ... row[16] is _dataLen). */
 char *bigDbSnpCommaSepFieldNames = "chrom,chromStart,chromEnd,name,ref,altCount,alts,shiftBases,freqSourceCount,minorAlleleFreq,majorAllele,minorAllele,maxFuncImpact,class,ucscNotes,_dataOffset,_dataLen";
 
 /* definitions for class column */
 /* NULL-terminated list of the strings accepted for the class column; handed to
  * sqlEnumParse, sqlEnumComma and sqlEnumPrint below. */
 static char *values_class[] = {"snv", "mnv", "ins", "del", "delins", "identity", NULL};
 /* Passed by address to sqlEnumParse/sqlEnumComma; starts out NULL.
  * NOTE(review): presumably a lookup hash built lazily by those helpers -- confirm in jksql. */
 static struct hash *valhash_class = NULL;
 
 struct bigDbSnp *bigDbSnpLoad(char **row)
 /* Build a newly allocated bigDbSnp from the 17 column strings in row (for example a row
  * fetched with select * from bigDbSnp).  Columns are read by position, in the order
  * chrom, chromStart, chromEnd, name, ref, altCount, alts, ... , _dataOffset, _dataLen.
  * Dispose of the result with bigDbSnpFree(). */
 {
 struct bigDbSnp *ret;
 int sizeOne;    /* element count reported back by each array parser */
 AllocVar(ret);
 /* Both counts are parsed before anything else so that the array lengths parsed
  * further down can be checked against them. */
 ret->altCount = sqlSigned(row[5]);
 ret->freqSourceCount = sqlSigned(row[8]);
 ret->chrom = cloneString(row[0]);
 ret->chromStart = sqlUnsigned(row[1]);
 ret->chromEnd = sqlUnsigned(row[2]);
 ret->name = cloneString(row[3]);
 ret->ref = cloneString(row[4]);
 /* alts must hold exactly altCount entries. */
 sqlStringDynamicArray(row[6], &ret->alts, &sizeOne);
 assert(sizeOne == ret->altCount);
 ret->shiftBases = sqlUnsigned(row[7]);
 /* The three per-frequency-source arrays must each hold freqSourceCount entries. */
 sqlDoubleDynamicArray(row[9], &ret->minorAlleleFreq, &sizeOne);
 assert(sizeOne == ret->freqSourceCount);
 sqlStringDynamicArray(row[10], &ret->majorAllele, &sizeOne);
 assert(sizeOne == ret->freqSourceCount);
 sqlStringDynamicArray(row[11], &ret->minorAllele, &sizeOne);
 assert(sizeOne == ret->freqSourceCount);
 ret->maxFuncImpact = sqlUnsigned(row[12]);
 ret->class = sqlEnumParse(row[13], values_class, &valhash_class);
 ret->ucscNotes = cloneString(row[14]);
 ret->_dataOffset = sqlLongLong(row[15]);
 ret->_dataLen = sqlSigned(row[16]);
 return ret;
 }
 
 struct bigDbSnp *bigDbSnpLoadAll(char *fileName) 
 /* Read every row of the whitespace-separated file fileName and return the rows as a
  * list of bigDbSnp in file order.  Dispose of the list with bigDbSnpFreeList(). */
 {
 char *row[17];
 struct bigDbSnp *head = NULL;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 for (;;)
     {
     if (!lineFileRow(lf, row))
         break;
     struct bigDbSnp *item = bigDbSnpLoad(row);
     /* Items are pushed on the front here; the list is reversed once at the end. */
     slAddHead(&head, item);
     }
 lineFileClose(&lf);
 slReverse(&head);
 return head;
 }
 
 struct bigDbSnp *bigDbSnpLoadAllByChar(char *fileName, char chopper) 
 /* Read every row of fileName, splitting fields on the character chopper, and return
  * the rows as a list of bigDbSnp in file order.
  * Dispose of the list with bigDbSnpFreeList(). */
 {
 char *row[17];
 struct bigDbSnp *head = NULL;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 for (;;)
     {
     if (!lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
         break;
     struct bigDbSnp *item = bigDbSnpLoad(row);
     /* Items are pushed on the front here; the list is reversed once at the end. */
     slAddHead(&head, item);
     }
 lineFileClose(&lf);
 slReverse(&head);
 return head;
 }
 
 struct bigDbSnp *bigDbSnpCommaIn(char **pS, struct bigDbSnp *ret)
 /* Parse one bigDbSnp from the comma separated text at *pS.
  * Fields are filled into ret when it is non-NULL; otherwise a new bigDbSnp is
  * allocated.  On return *pS points just past the text that was consumed, and the
  * filled-in record is returned. */
 {
 char *cursor = *pS;
 int ix;
 if (ret == NULL)
     AllocVar(ret);
 ret->chrom = sqlStringComma(&cursor);
 ret->chromStart = sqlUnsignedComma(&cursor);
 ret->chromEnd = sqlUnsignedComma(&cursor);
 ret->name = sqlStringComma(&cursor);
 ret->ref = sqlStringComma(&cursor);
 ret->altCount = sqlSignedComma(&cursor);
 /* alts: altCount strings inside braces, followed by a comma. */
 cursor = sqlEatChar(cursor, '{');
 AllocArray(ret->alts, ret->altCount);
 for (ix = 0; ix < ret->altCount; ix++)
     ret->alts[ix] = sqlStringComma(&cursor);
 cursor = sqlEatChar(cursor, '}');
 cursor = sqlEatChar(cursor, ',');
 ret->shiftBases = sqlUnsignedComma(&cursor);
 ret->freqSourceCount = sqlSignedComma(&cursor);
 /* minorAlleleFreq: freqSourceCount doubles inside braces, followed by a comma. */
 cursor = sqlEatChar(cursor, '{');
 AllocArray(ret->minorAlleleFreq, ret->freqSourceCount);
 for (ix = 0; ix < ret->freqSourceCount; ix++)
     ret->minorAlleleFreq[ix] = sqlDoubleComma(&cursor);
 cursor = sqlEatChar(cursor, '}');
 cursor = sqlEatChar(cursor, ',');
 /* majorAllele: freqSourceCount strings inside braces, followed by a comma. */
 cursor = sqlEatChar(cursor, '{');
 AllocArray(ret->majorAllele, ret->freqSourceCount);
 for (ix = 0; ix < ret->freqSourceCount; ix++)
     ret->majorAllele[ix] = sqlStringComma(&cursor);
 cursor = sqlEatChar(cursor, '}');
 cursor = sqlEatChar(cursor, ',');
 /* minorAllele: freqSourceCount strings inside braces, followed by a comma. */
 cursor = sqlEatChar(cursor, '{');
 AllocArray(ret->minorAllele, ret->freqSourceCount);
 for (ix = 0; ix < ret->freqSourceCount; ix++)
     ret->minorAllele[ix] = sqlStringComma(&cursor);
 cursor = sqlEatChar(cursor, '}');
 cursor = sqlEatChar(cursor, ',');
 ret->maxFuncImpact = sqlUnsignedComma(&cursor);
 ret->class = sqlEnumComma(&cursor, values_class, &valhash_class);
 ret->ucscNotes = sqlStringComma(&cursor);
 ret->_dataOffset = sqlLongLongComma(&cursor);
 ret->_dataLen = sqlSignedComma(&cursor);
 *pS = cursor;
 return ret;
 }
 
 void bigDbSnpFree(struct bigDbSnp **pEl)
 /* Release one dynamically allocated bigDbSnp, such as one made by bigDbSnpLoad(),
  * and set *pEl to NULL.  Does nothing when *pEl is already NULL. */
 {
 struct bigDbSnp *snp = *pEl;
 if (snp == NULL)
     return;
 freeMem(snp->chrom);
 freeMem(snp->name);
 freeMem(snp->ref);
 /* For each string array only element 0 is freed before the array itself: all of an
  * array's strings are taken to have been allocated at once.
  * NOTE(review): bigDbSnpCommaIn fills each element with its own sqlStringComma call;
  * verify that records built that way are released correctly here. */
 if (snp->alts != NULL)
     freeMem(snp->alts[0]);
 freeMem(snp->alts);
 freeMem(snp->minorAlleleFreq);
 if (snp->majorAllele != NULL)
     freeMem(snp->majorAllele[0]);
 freeMem(snp->majorAllele);
 if (snp->minorAllele != NULL)
     freeMem(snp->minorAllele[0]);
 freeMem(snp->minorAllele);
 freeMem(snp->ucscNotes);
 freez(pEl);
 }
 
 void bigDbSnpFreeList(struct bigDbSnp **pList)
 /* Release every bigDbSnp on the list at *pList, then set *pList to NULL. */
 {
 struct bigDbSnp *cur = *pList;
 while (cur != NULL)
     {
     /* Remember the successor before cur is freed. */
     struct bigDbSnp *following = cur->next;
     bigDbSnpFree(&cur);
     cur = following;
     }
 *pList = NULL;
 }
 
 void bigDbSnpOutput(struct bigDbSnp *el, FILE *f, char sep, char lastSep) 
 /* Write all fields of el to f in column order.  Fields are separated by sep and the
  * final field is followed by lastSep.  When sep is a comma, string values are wrapped
  * in double quotes and array values in braces; array elements are always followed by
  * a comma regardless of sep. */
 {
 /* Decoration that is only emitted for comma-separated output; empty otherwise. */
 const char *quote = (sep == ',') ? "\"" : "";
 const char *lbrace = (sep == ',') ? "{" : "";
 const char *rbrace = (sep == ',') ? "}" : "";
 int ix;
 fprintf(f, "%s%s%s", quote, el->chrom, quote);
 fputc(sep, f);
 fprintf(f, "%u", el->chromStart);
 fputc(sep, f);
 fprintf(f, "%u", el->chromEnd);
 fputc(sep, f);
 fprintf(f, "%s%s%s", quote, el->name, quote);
 fputc(sep, f);
 fprintf(f, "%s%s%s", quote, el->ref, quote);
 fputc(sep, f);
 fprintf(f, "%d", el->altCount);
 fputc(sep, f);
 /* alts */
 fputs(lbrace, f);
 for (ix = 0; ix < el->altCount; ix++)
     fprintf(f, "%s%s%s,", quote, el->alts[ix], quote);
 fputs(rbrace, f);
 fputc(sep, f);
 fprintf(f, "%u", el->shiftBases);
 fputc(sep, f);
 fprintf(f, "%d", el->freqSourceCount);
 fputc(sep, f);
 /* minorAlleleFreq */
 fputs(lbrace, f);
 for (ix = 0; ix < el->freqSourceCount; ix++)
     fprintf(f, "%g,", el->minorAlleleFreq[ix]);
 fputs(rbrace, f);
 fputc(sep, f);
 /* majorAllele */
 fputs(lbrace, f);
 for (ix = 0; ix < el->freqSourceCount; ix++)
     fprintf(f, "%s%s%s,", quote, el->majorAllele[ix], quote);
 fputs(rbrace, f);
 fputc(sep, f);
 /* minorAllele */
 fputs(lbrace, f);
 for (ix = 0; ix < el->freqSourceCount; ix++)
     fprintf(f, "%s%s%s,", quote, el->minorAllele[ix], quote);
 fputs(rbrace, f);
 fputc(sep, f);
 fprintf(f, "%u", el->maxFuncImpact);
 fputc(sep, f);
 /* class is printed through the enum helper, quoted like a string. */
 fputs(quote, f);
 sqlEnumPrint(f, el->class, values_class);
 fputs(quote, f);
 fputc(sep, f);
 fprintf(f, "%s%s%s", quote, el->ucscNotes, quote);
 fputc(sep, f);
 fprintf(f, "%lld", el->_dataOffset);
 fputc(sep, f);
 fprintf(f, "%d", el->_dataLen);
 fputc(lastSep, f);
 }
 
 /* -------------------------------- End autoSql Generated Code -------------------------------- */
+
+/* Pairs a note keyword with the sentence that explains it; element type of ucscNotesDesc. */
+struct symbolDesc
+{
+    char *symbol;       /* keyword that bigDbSnpDescribeUcscNote compares against its argument */
+    char *description;  /* explanatory text that bigDbSnpDescribeUcscNote returns on a match */
+};
+
+/* Table of ucscNotes keywords and their explanations, searched linearly by
+ * bigDbSnpDescribeUcscNote.  Each keyword should have its own distinct description. */
+struct symbolDesc ucscNotesDesc[] =
+    {
+    { bdsAltIsAmbiguous,
+      "At least one alternate allele "
+      "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
+    { bdsClassMismatch,
+      "Variation class/type is inconsistent with alleles mapped to this genome assembly." },
+    { bdsClinvar,
+      "Variant is in ClinVar." },
+    { bdsClusterError,
+      "This variant has the same start, end and class as another variant; "
+      "they probably should have been merged into one variant." },
+    { bdsCommonAll,
+      "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
+      "in all projects reporting frequencies." },
+    { bdsCommonSome,
+      "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
+      "in some, but not all, projects reporting frequencies." },
+    { bdsDiffMajor,
+      "Different frequency sources have different major alleles "
+      "(see table of allele frequencies above)." },
+    { bdsFreqIsAmbiguous,
+      "At least one allele reported by at least one project "
+      "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
+    // This entry previously repeated the bdsRefIsMinor text verbatim; the keyword is about
+    // frequency alleles that are neither the reference nor an alternate allele.
+    { bdsFreqNotRefAlt,
+      "At least one allele reported by at least one project that reports frequencies "
+      "does not match any of the reference or alternate alleles listed by dbSNP." },
+    { bdsMultiMap,
+      "This variant has been mapped to more than one distinct genomic location." },
+    { bdsOverlapDiffClass,
+      "This variant overlaps another variant with a different type/class." },
+    { bdsOverlapSameClass,
+      "This variant overlaps another with the same type/class but different start/end." },
+    { bdsRefIsAmbiguous,
+      "The reference genome allele "
+      "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
+    { bdsRefIsMinor,
+      "The reference genome allele is not the major allele in at least one project." },
+    { bdsRefIsRare,
+      "The reference genome allele is rare (i.e. allele frequency < 1%)." },
+    { bdsRefIsSingleton,
+      "The reference genome allele has never been observed "
+      "in a population sequencing project reporting frequencies." },
+    { bdsRefMismatch,
+      "The reference genome allele reported by dbSNP differs from the GenBank assembly sequence." },
+    { bdsRevStrand,
+      "The orientation of the currently viewed reference genome sequence is different from "
+      "the orientation of dbSNP's preferred assembly; alleles are "
+      "presented on the forward strand of the currently viewed reference sequence." },
+    };
+
+char *bigDbSnpDescribeUcscNote(char *ucscNote)
+/* Return a string describing ucscNote, unless it is unrecognized (or NULL) in which case
+ * return NULL.  The lookup is a linear scan of ucscNotesDesc for an exact keyword match.
+ * Do not free returned value. */
+{
+// A NULL note cannot match any keyword; answer "unrecognized" instead of comparing it.
+if (ucscNote == NULL)
+    return NULL;
+int i;
+for (i = 0;  i < ArraySize(ucscNotesDesc);  i++)
+    {
+    if (sameString(ucscNote, ucscNotesDesc[i].symbol))
+        return ucscNotesDesc[i].description;
+    }
+return NULL;
+}
+
+char *bigDbSnpClassToString(enum bigDbSnpClass class)
+/* Map an enum bigDbSnpClass value to its lowercase name ("snv", "mnv", "ins", "del",
+ * "delins" or "identity").  Any other value is reported through errAbort.
+ * The result is a string constant; do not free it. */
+{
+switch (class)
+    {
+    case bigDbSnpSnv:
+        return "snv";
+    case bigDbSnpMnv:
+        return "mnv";
+    case bigDbSnpIns:
+        return "ins";
+    case bigDbSnpDel:
+        return "del";
+    case bigDbSnpDelins:
+        return "delins";
+    case bigDbSnpIdentity:
+        return "identity";
+    default:
+        errAbort("bigDbSnpClassToString: unrecognized value %d", (int)class);
+    }
+// Only reached if errAbort returns.
+return NULL;
+}
+
+static boolean abbrevNRepeat(char *allele, int n, char *buf, size_t bufLen)
+/* If allele begins with a run of a repeated n-base unit that is long enough to be worth
+ * abbreviating, write "(unit)count" followed by the rest of allele into buf and return TRUE.
+ * The rest is itself abbreviated (recursively, same n) when it starts with another repeat.
+ * Return FALSE when there is no such run or when the result does not fit in buf (bufLen bytes
+ * including the terminator); buf is then either untouched or set to the empty string. */
+{
+boolean canAbbrev = FALSE;
+// A unit length below 1 is meaningless and would divide by zero in the loop below.
+if (n < 1)
+    return FALSE;
+int len = strlen(allele);
+// The repeated stretch must cover at least two units and at least n+4 bases.
+int minAbbrevLen = max(n*2, n+4);
+if (len >= minAbbrevLen && bufLen >= minAbbrevLen)
+    {
+    int reps = 1;
+    int i;
+    // Advance while each base equals the base one unit earlier; count completed units.
+    for (i = n;  i < len;  i++)
+        {
+        if (allele[i] != allele[i-n])
+            break;
+        if (i % n == n-1)
+            reps++;
+        }
+    if (i >= minAbbrevLen)
+        {
+        // End of repeating section; are there enough repeats to make the notation shorter?
+        char repeatUnit[n+1];
+        safencpy(repeatUnit, sizeof repeatUnit, allele, n);
+        int abbrevLen = snprintf(buf, bufLen, "(%s)%d", repeatUnit, reps);
+        // snprintf returns the length it wanted to write.  If the prefix alone was truncated,
+        // buf+abbrevLen would point past the buffer and bufLen-abbrevLen would wrap around
+        // to a huge unsigned value, so give up before computing either.
+        if (abbrevLen < 0 || (size_t)abbrevLen >= bufLen)
+            {
+            buf[0] = '\0';
+            return FALSE;
+            }
+        // Does the rest of the sequence start with a different repeat?
+        char *bufRest = buf+abbrevLen;
+        size_t bufRestLen = bufLen - abbrevLen;
+        // Bases of a trailing partial unit are not counted in reps, so they stay in the rest.
+        char *alRest = allele + (reps * n);
+        if (bufRestLen > 5 && abbrevNRepeat(alRest, n, bufRest, bufRestLen))
+            abbrevLen = strlen(buf);
+        else
+            abbrevLen += snprintf(bufRest, bufRestLen, "%s", alRest);
+        // Accept only if everything, including the terminator, fit in buf.
+        if ((size_t)abbrevLen < bufLen)
+            canAbbrev = TRUE;
+        else
+            buf[0] = '\0';
+        }
+    }
+return canAbbrev;
+}
+
+char *bigDbSnpAbbrevAllele(char *allele, char *buf, size_t bufLen)
+/* If allele can be abbreviated to something shorter than itself that fits in buf,
+ * and doesn't end up with a tiny bit of abbreviation followed by a bunch of unabbreviated
+ * sequence, then put the abbreviation in buf and return buf; otherwise return allele.
+ * Repeat unit lengths are tried from 1 upward and the first one that works is used.
+ * If allele is the empty string, returns "-" (in buf). */
+{
+if (isEmpty(allele))
+    {
+    safecpy(buf, bufLen, "-");
+    return buf;
+    }
+char *abbrev = allele;
+// bufLen is unsigned: guard the subtraction so that a buffer of fewer than 3 bytes gives
+// a limit of 0 instead of wrapping around.
+size_t maxUnit = (bufLen > 3) ? (bufLen - 3) / 2 : 0;
+// abbrevNRepeat needs the allele to hold at least two whole units, so a unit longer than
+// half the allele can never succeed; skip those attempts.
+size_t halfLen = strlen(allele) / 2;
+if (maxUnit > halfLen)
+    maxUnit = halfLen;
+int maxN = (int)maxUnit;
+int n;
+for (n = 1; n <= maxN; n++)
+    {
+    if (abbrevNRepeat(allele, n, buf, bufLen))
+        {
+        abbrev = buf;
+        break;
+        }
+    }
+if (abbrev == buf)
+    {
+    // alLen is the length of the whole abbreviated string now in buf.
+    int alLen = strlen(buf);
+    char *abbrevEnd = strrchr(buf, ')');
+    if (abbrevEnd == NULL)
+        errAbort("bigDbSnpAbbrevAllele: expect abbreviated allele '%s' to contain at least one ')'",
+                 buf);
+    // abbrevLen covers buf up to and including the last ')'; repeat counts after it and any
+    // unabbreviated tail are not included.
+    int abbrevLen = abbrevEnd + 1 - buf;
+    if (abbrevLen < (alLen >> 2))
+        {
+        // Never mind, the abbreviated portion is much smaller than the unabbreviated portion.
+        abbrev = allele;
+        }
+    }
+return abbrev;
+}