src/hg/lib/bigDbSnp.c acca3deffc05c4d8d11590a1cf3d893763254712

acca3deffc05c4d8d11590a1cf3d893763254712
angie
  Thu Oct 31 13:43:05 2019 -0700
dbSnp153: Adding new ucscNotes suggested by Ana Benet: clinvar{Benign,Conflicting,Pathogenic}, rareAll, rareSome.  refs #23283

diff --git src/hg/lib/bigDbSnp.c src/hg/lib/bigDbSnp.c
index 863d8af..8ebf546 100644
--- src/hg/lib/bigDbSnp.c
+++ src/hg/lib/bigDbSnp.c
@@ -1,477 +1,489 @@
 /* bigDbSnp.c was originally generated by the autoSql program, which also 
  * generated bigDbSnp.h and bigDbSnp.sql.  This module links the database and
  * the RAM representation of objects. */
 
 #include "common.h"
 #include "linefile.h"
 #include "dystring.h"
 #include "jksql.h"
 #include "bigDbSnp.h"
 
 
 
 char *bigDbSnpCommaSepFieldNames = "chrom,chromStart,chromEnd,name,ref,altCount,alts,shiftBases,freqSourceCount,minorAlleleFreq,majorAllele,minorAllele,maxFuncImpact,class,ucscNotes,_dataOffset,_dataLen";
 
 /* definitions for class column */
 static char *values_class[] = {"snv", "mnv", "ins", "del", "delins", "identity", NULL};
 static struct hash *valhash_class = NULL;
 
 struct bigDbSnp *bigDbSnpLoad(char **row)
 /* Build a newly allocated bigDbSnp from the 17 string columns of row, which are
  * indexed in the same order as the names in bigDbSnpCommaSepFieldNames.
  * Dispose of this with bigDbSnpFree(). */
 {
 struct bigDbSnp *snp;
 int altsSize, freqSize, majorSize, minorSize;
 
 AllocVar(snp);
 /* Both counts are parsed up front because the array sizes below are checked against them. */
 snp->altCount = sqlSigned(row[5]);
 snp->freqSourceCount = sqlSigned(row[8]);
 snp->chrom = cloneString(row[0]);
 snp->chromStart = sqlUnsigned(row[1]);
 snp->chromEnd = sqlUnsigned(row[2]);
 snp->name = cloneString(row[3]);
 snp->ref = cloneString(row[4]);
 sqlStringDynamicArray(row[6], &snp->alts, &altsSize);
 assert(altsSize == snp->altCount);
 snp->shiftBases = sqlUnsigned(row[7]);
 /* The three per-frequency-source arrays must each have freqSourceCount elements. */
 sqlDoubleDynamicArray(row[9], &snp->minorAlleleFreq, &freqSize);
 assert(freqSize == snp->freqSourceCount);
 sqlStringDynamicArray(row[10], &snp->majorAllele, &majorSize);
 assert(majorSize == snp->freqSourceCount);
 sqlStringDynamicArray(row[11], &snp->minorAllele, &minorSize);
 assert(minorSize == snp->freqSourceCount);
 snp->maxFuncImpact = sqlUnsigned(row[12]);
 snp->class = sqlEnumParse(row[13], values_class, &valhash_class);
 snp->ucscNotes = cloneString(row[14]);
 snp->_dataOffset = sqlLongLong(row[15]);
 snp->_dataLen = sqlSigned(row[16]);
 return snp;
 }
 
 struct bigDbSnp *bigDbSnpLoadAll(char *fileName) 
 /* Read every row of the whitespace-separated file fileName and return the rows
  * as a list of bigDbSnp.  Each row is prepended as it is read and the list is
  * reversed once at the end.  Dispose of this with bigDbSnpFreeList(). */
 {
 char *row[17];
 struct bigDbSnp *snpList = NULL;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 
 for (;;)
     {
     struct bigDbSnp *snp;
     if (!lineFileRow(lf, row))
         break;
     snp = bigDbSnpLoad(row);
     slAddHead(&snpList, snp);
     }
 lineFileClose(&lf);
 slReverse(&snpList);
 return snpList;
 }
 
 struct bigDbSnp *bigDbSnpLoadAllByChar(char *fileName, char chopper) 
 /* Read every row of fileName, splitting fields on the chopper character, and return
  * the rows as a list of bigDbSnp.  Each row is prepended as it is read and the list is
  * reversed once at the end.  Dispose of this with bigDbSnpFreeList(). */
 {
 char *row[17];
 struct bigDbSnp *snpList = NULL;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 
 for (;;)
     {
     struct bigDbSnp *snp;
     if (!lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
         break;
     snp = bigDbSnpLoad(row);
     slAddHead(&snpList, snp);
     }
 lineFileClose(&lf);
 slReverse(&snpList);
 return snpList;
 }
 
 struct bigDbSnp *bigDbSnpCommaIn(char **pS, struct bigDbSnp *ret)
 /* Parse one bigDbSnp from the comma separated text at *pS and advance *pS past
  * the text that was consumed.  Array fields are expected inside '{' ... '}'.
  * Fills in ret if it is non-null, otherwise allocates and returns a new bigDbSnp. */
 {
 char *cursor = *pS;
 int i;
 
 if (ret == NULL)
     AllocVar(ret);
 ret->chrom = sqlStringComma(&cursor);
 ret->chromStart = sqlUnsignedComma(&cursor);
 ret->chromEnd = sqlUnsignedComma(&cursor);
 ret->name = sqlStringComma(&cursor);
 ret->ref = sqlStringComma(&cursor);
 ret->altCount = sqlSignedComma(&cursor);
 
 /* alts: altCount strings */
 cursor = sqlEatChar(cursor, '{');
 AllocArray(ret->alts, ret->altCount);
 for (i = 0;  i < ret->altCount;  i++)
     ret->alts[i] = sqlStringComma(&cursor);
 cursor = sqlEatChar(cursor, '}');
 cursor = sqlEatChar(cursor, ',');
 
 ret->shiftBases = sqlUnsignedComma(&cursor);
 ret->freqSourceCount = sqlSignedComma(&cursor);
 
 /* minorAlleleFreq: freqSourceCount doubles */
 cursor = sqlEatChar(cursor, '{');
 AllocArray(ret->minorAlleleFreq, ret->freqSourceCount);
 for (i = 0;  i < ret->freqSourceCount;  i++)
     ret->minorAlleleFreq[i] = sqlDoubleComma(&cursor);
 cursor = sqlEatChar(cursor, '}');
 cursor = sqlEatChar(cursor, ',');
 
 /* majorAllele: freqSourceCount strings */
 cursor = sqlEatChar(cursor, '{');
 AllocArray(ret->majorAllele, ret->freqSourceCount);
 for (i = 0;  i < ret->freqSourceCount;  i++)
     ret->majorAllele[i] = sqlStringComma(&cursor);
 cursor = sqlEatChar(cursor, '}');
 cursor = sqlEatChar(cursor, ',');
 
 /* minorAllele: freqSourceCount strings */
 cursor = sqlEatChar(cursor, '{');
 AllocArray(ret->minorAllele, ret->freqSourceCount);
 for (i = 0;  i < ret->freqSourceCount;  i++)
     ret->minorAllele[i] = sqlStringComma(&cursor);
 cursor = sqlEatChar(cursor, '}');
 cursor = sqlEatChar(cursor, ',');
 
 ret->maxFuncImpact = sqlUnsignedComma(&cursor);
 ret->class = sqlEnumComma(&cursor, values_class, &valhash_class);
 ret->ucscNotes = sqlStringComma(&cursor);
 ret->_dataOffset = sqlLongLongComma(&cursor);
 ret->_dataLen = sqlSignedComma(&cursor);
 *pS = cursor;
 return ret;
 }
 
 void bigDbSnpFree(struct bigDbSnp **pEl)
 /* Release one dynamically allocated bigDbSnp (such as bigDbSnpLoad() returns) together
  * with the fields it owns, and set *pEl to NULL.  Does nothing if *pEl is already NULL. */
 {
 struct bigDbSnp *snp = *pEl;
 
 if (snp == NULL)
     return;
 freeMem(snp->chrom);
 freeMem(snp->name);
 freeMem(snp->ref);
 /* For each string array only element 0 is freed before the array itself; this relies
  * on all strings of an array having been allocated at once. */
 if (snp->alts != NULL)
     freeMem(snp->alts[0]);
 freeMem(snp->alts);
 freeMem(snp->minorAlleleFreq);
 if (snp->majorAllele != NULL)
     freeMem(snp->majorAllele[0]);
 freeMem(snp->majorAllele);
 if (snp->minorAllele != NULL)
     freeMem(snp->minorAllele[0]);
 freeMem(snp->minorAllele);
 freeMem(snp->ucscNotes);
 freez(pEl);
 }
 
 void bigDbSnpFreeList(struct bigDbSnp **pList)
 /* Free every bigDbSnp on the list at *pList, then set *pList to NULL. */
 {
 struct bigDbSnp *cur = *pList;
 
 while (cur != NULL)
     {
     /* Save the link first: bigDbSnpFree() releases cur and clears the pointer. */
     struct bigDbSnp *following = cur->next;
     bigDbSnpFree(&cur);
     cur = following;
     }
 *pList = NULL;
 }
 
 void bigDbSnpOutput(struct bigDbSnp *el, FILE *f, char sep, char lastSep) 
 /* Write all fields of el to f.  Fields are separated by sep and the last field is
  * followed by lastSep.  When sep is a comma, string fields are wrapped in double quotes
  * and array fields in braces.  Every array element is followed by a comma whatever
  * sep is. */
 {
 const int commaSep = (sep == ',');
 int i;
 
 /* chrom */
 if (commaSep) fputc('"',f);
 fprintf(f, "%s", el->chrom);
 if (commaSep) fputc('"',f);
 fputc(sep,f);
 /* chromStart, chromEnd */
 fprintf(f, "%u", el->chromStart);
 fputc(sep,f);
 fprintf(f, "%u", el->chromEnd);
 fputc(sep,f);
 /* name */
 if (commaSep) fputc('"',f);
 fprintf(f, "%s", el->name);
 if (commaSep) fputc('"',f);
 fputc(sep,f);
 /* ref */
 if (commaSep) fputc('"',f);
 fprintf(f, "%s", el->ref);
 if (commaSep) fputc('"',f);
 fputc(sep,f);
 /* altCount, then the alts array */
 fprintf(f, "%d", el->altCount);
 fputc(sep,f);
 if (commaSep) fputc('{',f);
 for (i = 0;  i < el->altCount;  i++)
     {
     if (commaSep) fputc('"',f);
     fprintf(f, "%s", el->alts[i]);
     if (commaSep) fputc('"',f);
     fputc(',', f);
     }
 if (commaSep) fputc('}',f);
 fputc(sep,f);
 /* shiftBases */
 fprintf(f, "%u", el->shiftBases);
 fputc(sep,f);
 /* freqSourceCount, then the three arrays of that length */
 fprintf(f, "%d", el->freqSourceCount);
 fputc(sep,f);
 if (commaSep) fputc('{',f);
 for (i = 0;  i < el->freqSourceCount;  i++)
     {
     fprintf(f, "%g", el->minorAlleleFreq[i]);
     fputc(',', f);
     }
 if (commaSep) fputc('}',f);
 fputc(sep,f);
 if (commaSep) fputc('{',f);
 for (i = 0;  i < el->freqSourceCount;  i++)
     {
     if (commaSep) fputc('"',f);
     fprintf(f, "%s", el->majorAllele[i]);
     if (commaSep) fputc('"',f);
     fputc(',', f);
     }
 if (commaSep) fputc('}',f);
 fputc(sep,f);
 if (commaSep) fputc('{',f);
 for (i = 0;  i < el->freqSourceCount;  i++)
     {
     if (commaSep) fputc('"',f);
     fprintf(f, "%s", el->minorAllele[i]);
     if (commaSep) fputc('"',f);
     fputc(',', f);
     }
 if (commaSep) fputc('}',f);
 fputc(sep,f);
 /* maxFuncImpact */
 fprintf(f, "%u", el->maxFuncImpact);
 fputc(sep,f);
 /* class, printed as its string value */
 if (commaSep) fputc('"',f);
 sqlEnumPrint(f, el->class, values_class);
 if (commaSep) fputc('"',f);
 fputc(sep,f);
 /* ucscNotes */
 if (commaSep) fputc('"',f);
 fprintf(f, "%s", el->ucscNotes);
 if (commaSep) fputc('"',f);
 fputc(sep,f);
 /* _dataOffset, _dataLen */
 fprintf(f, "%lld", el->_dataOffset);
 fputc(sep,f);
 fprintf(f, "%d", el->_dataLen);
 fputc(lastSep,f);
 }
 
 /* -------------------------------- End autoSql Generated Code -------------------------------- */
 
 struct symbolDesc
 /* Pairs a symbol string with a sentence describing it. */
     {
     char *symbol;       /* String that lookups are matched against */
     char *description;  /* Text returned for that symbol */
     };
 
 /* Description text for each ucscNotes symbol; bigDbSnpDescribeUcscNote() scans this
  * table from the top and returns the description of the first matching symbol. */
 struct symbolDesc ucscNotesDesc[] =
     {
     { bdsAltIsAmbiguous,
       "At least one alternate allele "
       "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
     { bdsClassMismatch,
       "Variation class/type is inconsistent with alleles mapped to this genome assembly." },
     { bdsClinvar,
       "Variant is in ClinVar." },
+    { bdsClinvarBenign,
+      "Variant is in ClinVar with clinical significance of benign and/or likely benign." },
+    { bdsClinvarConflicting,
+      "Variant is in ClinVar with reports of both benign and pathogenic significance." },
+    { bdsClinvarPathogenic,
+      "Variant is in ClinVar with clinical significance of pathogenic and/or likely pathogenic." },
     { bdsClusterError,
       "This variant has the same start, end and class as another variant; "
       "they probably should have been merged into one variant." },
     { bdsCommonAll,
       "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
       "in all projects reporting frequencies." },
     { bdsCommonSome,
       "Variant is \"common\", i.e. has a Minor Allele Frequency of at least 1% "
       "in some, but not all, projects reporting frequencies." },
     { bdsDiffMajor,
       "Different frequency sources have different major alleles "
       "(see table of allele frequencies above)." },
     { bdsFreqIsAmbiguous,
       "At least one allele reported by at least one project "
       "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
     /* NOTE(review): this entry used to repeat the bdsRefIsMinor text verbatim.  The wording
      * below is inferred from the symbol name -- confirm against the code that assigns it. */
     { bdsFreqNotRefAlt,
       "At least one allele reported by at least one project "
       "is neither the reference allele nor an alternate allele." },
     { bdsMultiMap,
       "This variant has been mapped to more than one distinct genomic location." },
     { bdsOverlapDiffClass,
       "This variant overlaps another variant with a different type/class." },
     { bdsOverlapSameClass,
       "This variant overlaps another with the same type/class but different start/end." },
+    { bdsRareAll,
+      "Variant is \"rare\", i.e. has a Minor Allele Frequency of less than 1% "
+      "in all projects reporting frequencies, or has been reported without frequency data." },
+    { bdsRareSome,
+      "Variant is \"rare\", i.e. has a Minor Allele Frequency of less than 1% "
+      "in some, but not all, projects reporting frequencies." },
     { bdsRefIsAmbiguous,
       "The reference genome allele "
       "contains an IUPAC ambiguous base (e.g. 'R' for 'A or G')." },
     { bdsRefIsMinor,
       "The reference genome allele is not the major allele in at least one project." },
     { bdsRefIsRare,
       "The reference genome allele is rare (i.e. allele frequency < 1%)." },
     { bdsRefIsSingleton,
       "The reference genome allele has never been observed "
       "in a population sequencing project reporting frequencies." },
     { bdsRefMismatch,
       "The reference genome allele reported by dbSNP differs from the GenBank assembly sequence." },
     { bdsRevStrand,
       "The orientation of the currently viewed reference genome sequence is different from "
       "the orientation of dbSNP's preferred assembly; alleles are "
       "presented on the forward strand of the currently viewed reference sequence." },
     };
 
 char *bigDbSnpDescribeUcscNote(char *ucscNote)
 /* Return a string describing ucscNote, unless it is unrecognized in which case return NULL.
  * Do not free returned value. */
 {
 int i;
 for (i = 0;  i < ArraySize(ucscNotesDesc);  i++)
     {
     if (sameString(ucscNote, ucscNotesDesc[i].symbol))
         return ucscNotesDesc[i].description;
     }
 return NULL;
 }
 
 char *bigDbSnpClassToString(enum bigDbSnpClass class)
 /* Translate an enum bigDbSnpClass value into its string name; errAbort on a value that
  * is not one of the known classes.  The result is a string constant; do not free it. */
 {
 switch (class)
     {
     case bigDbSnpSnv:
         return "snv";
     case bigDbSnpMnv:
         return "mnv";
     case bigDbSnpIns:
         return "ins";
     case bigDbSnpDel:
         return "del";
     case bigDbSnpDelins:
         return "delins";
     case bigDbSnpIdentity:
         return "identity";
     default:
         errAbort("bigDbSnpClassToString: unrecognized value %d", (int)class);
     }
 // Only reached if errAbort() comes back for an unrecognized value.
 return NULL;
 }
 
 static boolean abbrevNRepeat(char *allele, int n, char *buf, size_t bufLen)
 /* If allele starts with a run of its first n bases repeated for at least max(2n, n+4)
  * characters, write "(unit)count" followed by the rest of allele into buf and return TRUE.
  * The rest is itself abbreviated with the same n when possible.  Return FALSE if there is
  * no such run or if the result does not fit in bufLen bytes including the terminator.
  * When FALSE is returned after writing has started, buf is reset to the empty string;
  * when the run is too short, buf is left untouched. */
 {
 boolean canAbbrev = FALSE;
 int len = strlen(allele);
 // Shortest run worth abbreviating: two full units, and at least as long as "(unit)NN".
 int minAbbrevLen = max(n*2, n+4);
 if (len >= minAbbrevLen && bufLen >= minAbbrevLen)
     {
     int reps = 1;
     int i;
     // Walk while each base repeats the one n positions back; count completed units.
     for (i = n;  i < len;  i++)
         {
         if (allele[i] != allele[i-n])
             break;
         if (i % n == n-1)
             reps++;
         }
     if (i >= minAbbrevLen)
         {
         // End of repeating section; are there enough repeats to make the notation shorter?
         char repeatUnit[n+1];
         safencpy(repeatUnit, sizeof repeatUnit, allele, n);
         int abbrevLen = snprintf(buf, bufLen, "(%s)%d", repeatUnit, reps);
         if (abbrevLen < 0 || (size_t)abbrevLen >= bufLen)
             {
             // "(unit)count" by itself was truncated.  Stop here: buf+abbrevLen could point
             // past the buffer and bufLen-abbrevLen would wrap around as an unsigned value.
             buf[0] = '\0';
             return FALSE;
             }
         // Does the rest of the sequence start with a different repeat?
         char *bufRest = buf+abbrevLen;
         size_t bufRestLen = bufLen - abbrevLen;
         char *alRest = allele + (reps * n);
         if (bufRestLen > 5 && abbrevNRepeat(alRest, n, bufRest, bufRestLen))
             abbrevLen = strlen(buf);
         else
             abbrevLen += snprintf(bufRest, bufRestLen, "%s", alRest);
         // snprintf reports the untruncated length, so this detects a tail that did not fit.
         if ((size_t)abbrevLen < bufLen)
             canAbbrev = TRUE;
         else
             buf[0] = '\0';
         }
     }
 return canAbbrev;
 }
 
 char *bigDbSnpAbbrevAllele(char *allele, char *buf, size_t bufLen)
 /* If allele can be abbreviated to something shorter than itself that fits in buf,
  * and doesn't end up with a tiny bit of abbreviation followed by a bunch of unabbreviated
  * sequence, then put the abbreviation in buf and return buf; otherwise return allele.
  * Repeat unit lengths are tried from 1 upward and the first that works is used.
  * If allele is empty (per isEmpty()), returns "-" (in buf). */
 {
 if (isEmpty(allele))
     {
     safecpy(buf, bufLen, "-");
     return buf;
     }
 char *abbrev = allele;
 // bufLen is unsigned: without the guard, bufLen - 3 wraps around for buffers smaller than 3.
 int maxN = (bufLen > 3) ? (int)((bufLen - 3) / 2) : 0;
 int n;
 for (n = 1; n <= maxN; n++)
     {
     if (abbrevNRepeat(allele, n, buf, bufLen))
         {
         abbrev = buf;
         break;
         }
     }
 if (abbrev == buf)
     {
     int alLen = strlen(buf);
     // Everything up to and including the last ')' is repeat notation.
     char *abbrevEnd = strrchr(buf, ')');
     if (abbrevEnd == NULL)
         errAbort("bigDbSnpAbbrevAllele: expect abbreviated allele '%s' to contain at least one ')'",
                  buf);
     int abbrevLen = abbrevEnd + 1 - buf;
     if (abbrevLen < (alLen >> 2))
         {
         // Never mind, the abbreviated portion is much smaller than the unabbreviated portion.
         abbrev = allele;
         }
     }
 return abbrev;
 }